# tts-markup-utility/main.py

import re
from sys import exit

import argparse

from audiogen import AudioGenerator


class SimpleMarkupParser:
    """Tokenize TTS markup text into a flat list of instruction dicts.

    Bracketed directives are recognised in this order: ``[voice NAME]``,
    ``[silence Ns]``, ``[section ID]``, ``[end_section]`` and
    ``[insert_section ID]``. Any other bracketed token is kept as a
    ``{"type": "none", ...}`` entry, and non-blank text between directives
    becomes a ``{"type": "text", ...}`` entry.
    """

    # Splits on bracketed directives; the capture group keeps the directives
    # themselves in the token list returned by ``split``.
    _TOKEN_RE = re.compile(r"(\[[^]]+])")
    _VOICE_RE = re.compile(r"\[voice ([^]]+)]")
    _SILENCE_RE = re.compile(r"\[silence (\d+)s]")
    _SECTION_RE = re.compile(r"\[section (\d+)]")
    _END_SECTION_RE = re.compile(r"\[end_section]")
    _INSERT_SECTION_RE = re.compile(r"\[insert_section (\d+)]")
    # Fallback for bracketed tokens that match none of the directives above.
    _UNKNOWN_RE = re.compile(r"\[.*]")

    def __init__(self, input_text):
        """Store *input_text* with every whitespace run collapsed to one space."""
        self.input_text = " ".join(input_text.split())
        self.parsed_output = []
        self.sections = {}

    def parse(self):
        """Parse ``self.input_text`` and store the result in ``parsed_output``.

        The output list is rebuilt from scratch on every call, so calling
        ``parse()`` more than once does not duplicate entries.
        """
        parsed = []
        for token in self._TOKEN_RE.split(self.input_text):
            entry = self._parse_token(token)
            if entry is not None:
                parsed.append(entry)
        self.parsed_output = parsed

    def _parse_token(self, token):
        """Return the instruction dict for one token, or None for blank text."""
        match = self._VOICE_RE.match(token)
        if match:
            return {"type": "voice", "voice": match.group(1)}

        match = self._SILENCE_RE.match(token)
        if match:
            # The markup gives whole seconds; the entry stores seconds * 1000.
            return {"type": "silence", "duration": int(match.group(1)) * 1000}

        match = self._SECTION_RE.match(token)
        if match:
            return {"type": "section_start", "section_id": int(match.group(1))}

        if self._END_SECTION_RE.match(token):
            return {"type": "section_end"}

        match = self._INSERT_SECTION_RE.match(token)
        if match:
            return {"type": "insert_section", "section_id": int(match.group(1))}

        if self._UNKNOWN_RE.match(token):
            # Unrecognised directive: keep the raw token, tagged as "none".
            return {"type": "none", "text": token}

        text = token.strip()
        if text:
            return {"type": "text", "text": text}
        return None

    def get_output(self):
        """Return the list of instruction dicts produced by the last parse."""
        return self.parsed_output


def main():
    """Command-line entry point: parse markup input and generate the audio.

    Reads the markup either from the positional ``text`` argument or from
    ``--file``, parses it with ``SimpleMarkupParser`` and hands the result to
    ``AudioGenerator``.

    Exit codes:
        1: missing, conflicting or unreadable input.
        2: voice validation failed.
        3: parsing produced no output.
        4: audio generation failed.
    """
    parser_description = """
    TTS text with voice selection, silence intervals, and section functionality.
    The script supports a simple markup language to change voices, insert silence, define sections, and insert sections within the text.

    Markup Language Syntax:
    - Change Voice: Use [voice VOICE_NAME] to switch to a different voice. 
        Example: [voice alloy] switches to the 'alloy' voice.
    - Insert Silence: Use [silence SECONDSs] to insert a period of silence. 
        Example: [silence 4s] inserts a 4-second silence.
    - Define Section: Use [section SECTION_ID] to start a new section with the given ID. 
        Example: [section 1] starts a new section with ID 1.
    - End Section: Use [end_section] to end the current section.
    - Insert Section: Use [insert_section SECTION_ID] to insert the audio from the specified section ID. 
        Example: [section 1] [voice alloy] Hi there! [end_section] [insert_section 1] inserts the audio from section 1.
                 In effect, this will say "Hi there" with the 'alloy' voice, and then repeat it exactly.

    Supported voices:
    - All OpenAI voices are supported. These are:
        valid_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
        - alloy (male, neutral)
        - echo (male, full-bodied)
        - fable (male, high)
        - onyx (male, deep)
        - nova (female, expressive)
        - shimmer (female, full-bodied)

    Sample Input:
    "[voice alloy] How's it going? [section 1] [voice fable] I love it here! [end_section] [voice alloy] Repeat that please? [insert_section 1]"

    This input will:
    1. Start with the 'alloy' voice saying "How's it going?"
    2. Define a new section (ID 1) with the 'fable' voice saying "I love it here!"
    3. Switch back to the 'alloy' voice saying "Repeat that please?"
    4. Insert fable speaking the audio from section 1 (without regenerating it).
    """

    arg_parser = argparse.ArgumentParser(
        description=parser_description, formatter_class=argparse.RawTextHelpFormatter
    )
    arg_parser.add_argument(
        "--file", type=str, help="File containing the text to parse."
    )
    arg_parser.add_argument("text", nargs="?", default=None, help="Text to parse.")
    arg_parser.add_argument(
        "--out-file",
        type=str,
        default="out.mp3",
        help="Output file to save the audio to (mp3 recommended). Default out.mp3",
    )
    arg_parser.add_argument(
        "--provider",
        type=str,
        default="openai",
        help="AI Provider. Supported: openai, zuki",
    )
    # NOTE(review): --api-key is accepted but not forwarded to AudioGenerator
    # below; confirm how the generator is expected to obtain the key.
    arg_parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="API Key for AI Provider. Alternatively, create a file 'apikey.secret' in the workdir containing your API key.",
    )
    args = arg_parser.parse_args()

    if not args.file and not args.text:
        print("Please provide either a file (using --file <PATH>) or a text input!")
        exit(1)

    if args.file and args.text:
        print(
            "Please provide either a file (using --file <PATH>) or a text input, not both!"
        )
        exit(1)

    input_text = args.text
    if args.file:
        try:
            # Explicit encoding so the result does not depend on the platform
            # default.
            with open(args.file, "r", encoding="utf-8") as handle:
                input_text = handle.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Could not read input file '{args.file}': {e}")
            exit(1)

    markup_parser = SimpleMarkupParser(input_text)
    markup_parser.parse()
    output = markup_parser.get_output()
    print("parsed:", output)

    if not output:
        print("No output found! Does the input text adhere to the expected format?")
        exit(3)

    tts = AudioGenerator(output, args.out_file, ai_provider=args.provider)
    try:
        tts.validate_voices()
    except ValueError as e:
        print(e)
        valid_voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
        print(
            "Voices not valid! Valid voices are: "
            + "'"
            + "', '".join(valid_voices)
            + "'"
        )
        # Stop here: generating audio with voices that failed validation
        # cannot succeed.
        exit(2)

    try:
        tts.generate_audio()
    except ValueError as e:
        print("Generating audio failed:")
        print(e)
        # Report the failure to the calling shell instead of exiting with 0.
        exit(4)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`import re`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`from sys import exit`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00
			`import argparse`

			`from audiogen import AudioGenerator`


			`class SimpleMarkupParser:`
			`def __init__(self, input_text):`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`self.input_text = " ".join(input_text.split())`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`self.parsed_output = []`
			`self.sections = {}`

			`def parse(self):`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`tokens = re.split(r"(\[[^]]+])", self.input_text)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00
			`for token in tokens:`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`voice_match = re.match(r"\[voice ([^]]+)]", token)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`if voice_match:`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`self.parsed_output.append(`
			`{"type": "voice", "voice": voice_match.group(1)}`
			`)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`continue`

fmt: re-linted files 2024-04-25 13:32:07 +00:00			`silence_match = re.match(r"\[silence (\d+)s]", token)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`if silence_match:`
			`duration = int(silence_match.group(1)) * 1000`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`self.parsed_output.append({"type": "silence", "duration": duration})`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`continue`

fmt: re-linted files 2024-04-25 13:32:07 +00:00			`section_match = re.match(r"\[section (\d+)]", token)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`if section_match:`
			`section_id = int(section_match.group(1))`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`self.parsed_output.append(`
			`{"type": "section_start", "section_id": section_id}`
			`)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`continue`

fmt: re-linted files 2024-04-25 13:32:07 +00:00			`end_section_match = re.match(r"\[end_section]", token)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`if end_section_match:`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`self.parsed_output.append({"type": "section_end"})`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`continue`

fmt: re-linted files 2024-04-25 13:32:07 +00:00			`insert_section_match = re.match(r"\[insert_section (\d+)]", token)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`if insert_section_match:`
			`section_id = int(insert_section_match.group(1))`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`self.parsed_output.append(`
			`{"type": "insert_section", "section_id": section_id}`
			`)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`continue`

fmt: re-linted files 2024-04-25 13:32:07 +00:00			`if re.match(r"\[.*]", token):`
			`self.parsed_output.append({"type": "none", "text": token})`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`continue`

			`if token.strip():`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`self.parsed_output.append({"type": "text", "text": token.strip()})`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00
			`def get_output(self):`
			`return self.parsed_output`


			`def main():`
			`parser_description = """`
			`TTS text with voice selection, silence intervals, and section functionality.`
			`The script supports a simple markup language to change voices, insert silence, define sections, and insert sections within the text.`

			`Markup Language Syntax:`
			`- Change Voice: Use [voice VOICE_NAME] to switch to a different voice.`
			`Example: [voice alloy] switches to the 'alloy' voice.`
			`- Insert Silence: Use [silence SECONDSs] to insert a period of silence.`
			`Example: [silence 4s] inserts a 4-second silence.`
			`- Define Section: Use [section SECTION_ID] to start a new section with the given ID.`
			`Example: [section 1] starts a new section with ID 1.`
			`- End Section: Use [end_section] to end the current section.`
			`- Insert Section: Use [insert_section SECTION_ID] to insert the audio from the specified section ID.`
			`Example: [section 1] [voice alloy] Hi there! [end_section] [insert_section 1] inserts the audio from section 1.`
			`In effect, this will say "Hi there" with the 'alloy' voice, and then repeat it exactly.`

			`Supported voices:`
			`- All OpenAI voices are supported. These are:`
			`valid_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']`
			`- alloy (male, neutral)`
			`- echo (male, full-bodied)`
			`- fable (male, high)`
			`- onyx (male, deep)`
			`- nova (female, expressive)`
			`- shimmer (female, full-bodied)`

			`Sample Input:`
			`"[voice alloy] How's it going? [section 1] [voice fable] I love it here! [end_section] [voice alloy] Repeat that please? [insert_section 1]"`

			`This input will:`
			`1. Start with the 'alloy' voice saying "How's it going?"`
			`2. Define a new section (ID 1) with the 'fable' voice saying "I love it here!"`
			`3. Switch back to the 'alloy' voice saying "Repeat that please?"`
			`4. Insert fable speaking the audio from section 1 (without regenerating it).`
			`"""`

fmt: re-linted files 2024-04-25 13:32:07 +00:00			`parser = argparse.ArgumentParser(`
			`description=parser_description, formatter_class=argparse.RawTextHelpFormatter`
			`)`
			`parser.add_argument("--file", type=str, help="File containing the text to parse.")`
			`parser.add_argument("text", nargs="?", default=None, help="Text to parse.")`
			`parser.add_argument(`
			`"--out-file",`
			`type=str,`
			`default="out.mp3",`
			`help="Output file to save the audio to (mp3 recommended). Default out.mp3",`
			`)`
			`parser.add_argument(`
			`"--provider",`
			`type=str,`
			`default="openai",`
			`help="AI Provider. Supported: openai, zuki",`
			`)`
			`parser.add_argument(`
			`"--api-key",`
			`type=str,`
			`default=None,`
			`help="API Key for AI Provider. Alternatively, create a file 'apikey.secret' in the workdir containing your API key.",`
			`)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`args = parser.parse_args()`

			`if not args.file and not args.text:`
			`print("Please provide either a file (using --file <PATH>) or a text input!")`
			`exit(1)`

			`if args.file and args.text:`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`print(`
			`"Please provide either a file (using --file <PATH>) or a text input, not both!"`
			`)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`exit(1)`

			`input_text = args.text`
			`if args.file:`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`with open(args.file, "r") as file:`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00			`input_text = file.read()`

			`parser = SimpleMarkupParser(input_text)`
			`parser.parse()`
			`output = parser.get_output()`
			`print("parsed:", output)`

			`if len(output) == 0:`
			`print("No output found! Does the input text adhere to the expected format?")`
			`exit(3)`

			`tts = AudioGenerator(output, args.out_file, ai_provider=args.provider)`
			`try:`
			`tts.validate_voices()`
			`except ValueError as e:`
			`print(e)`
fmt: re-linted files 2024-04-25 13:32:07 +00:00			`valid_voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]`
			`print(`
			`"Voices not valid! Valid voices are: "`
			`+ "'"`
			`+ "', '".join(valid_voices)`
			`+ "'"`
			`)`
feat: implemented audiogen 2024-04-25 13:13:59 +00:00
			`try:`
			`tts.generate_audio()`
			`except ValueError as e:`
			`print("Generating audio failed:")`
			`print(e)`


			`if __name__ == "__main__":`
			`main()`