# tts-markup-utility/main.py

import re
from sys import exit

import argparse

from audiogen import AudioGenerator


class SimpleMarkupParser:
    """Tokenize TTS markup text into a flat list of instruction dicts.

    The input is split on bracketed tags (``[...]``). Each tag or run of
    plain text becomes one dict in ``parsed_output``:

    - ``[voice NAME]``        -> ``{"type": "voice", "voice": NAME}``
    - ``[silence Ns]``        -> ``{"type": "silence", "duration": N * 1000}``
    - ``[section N]``         -> ``{"type": "section_start", "section_id": N}``
    - ``[end_section]``       -> ``{"type": "section_end"}``
    - ``[insert_section N]``  -> ``{"type": "insert_section", "section_id": N}``
    - any other ``[...]`` tag -> ``{"type": "none", "text": TAG}``
    - non-blank plain text    -> ``{"type": "text", "text": TEXT}`` (stripped)
    """

    # Compiled once at class creation instead of being looked up per token.
    _TAG_SPLIT = re.compile(r"(\[[^]]+])")
    _VOICE = re.compile(r"\[voice ([^]]+)]")
    _SILENCE = re.compile(r"\[silence (\d+)s]")
    _SECTION = re.compile(r"\[section (\d+)]")
    _END_SECTION = re.compile(r"\[end_section]")
    _INSERT_SECTION = re.compile(r"\[insert_section (\d+)]")
    _ANY_TAG = re.compile(r"\[.*]")

    def __init__(self, input_text):
        """Store ``input_text`` with every whitespace run collapsed to one space.

        Args:
            input_text: The raw markup string to parse.
        """
        self.input_text = " ".join(input_text.split())
        self.parsed_output = []
        # Not used by this class; kept because external code may rely on it.
        self.sections = {}

    def parse(self):
        """Parse ``input_text`` and store the result in ``parsed_output``.

        Each call starts from a fresh list, so calling ``parse()`` more than
        once does not duplicate items.
        """
        items = []
        for token in self._TAG_SPLIT.split(self.input_text):
            item = self._parse_token(token)
            if item is not None:
                items.append(item)
        self.parsed_output = items

    @classmethod
    def _parse_token(cls, token):
        """Convert one split token into an instruction dict.

        Patterns are applied with ``fullmatch`` so that a token is only
        treated as a tag when the *entire* token is a tag. This keeps text
        such as ``"[] hello"`` from being swallowed as an unknown tag.

        Returns:
            The instruction dict, or ``None`` for an empty/whitespace token.
        """
        voice = cls._VOICE.fullmatch(token)
        if voice:
            # Strip so "[voice alloy ]" yields "alloy" rather than "alloy ".
            return {"type": "voice", "voice": voice.group(1).strip()}

        silence = cls._SILENCE.fullmatch(token)
        if silence:
            # Markup gives whole seconds; the stored duration is seconds * 1000.
            return {"type": "silence", "duration": int(silence.group(1)) * 1000}

        section = cls._SECTION.fullmatch(token)
        if section:
            return {"type": "section_start", "section_id": int(section.group(1))}

        if cls._END_SECTION.fullmatch(token):
            return {"type": "section_end"}

        insert = cls._INSERT_SECTION.fullmatch(token)
        if insert:
            return {"type": "insert_section", "section_id": int(insert.group(1))}

        # Any other bracketed tag is recorded as "none" with its raw text.
        if cls._ANY_TAG.fullmatch(token):
            return {"type": "none", "text": token}

        text = token.strip()
        if text:
            return {"type": "text", "text": text}
        return None

    def get_output(self):
        """Return the list of instruction dicts built by the last ``parse()``."""
        return self.parsed_output


def main():
    """Command-line entry point: parse marked-up text and generate audio from it.

    Reads the markup either from the positional ``text`` argument or from
    ``--file``, parses it with ``SimpleMarkupParser``, and hands the parsed
    items to ``AudioGenerator`` together with the output path and provider.

    Exit codes:
        1: no input / both inputs given, or the input file could not be read
        2: voice validation failed
        3: parsing produced no items
        4: audio generation failed
    """
    parser_description = """
    TTS text with voice selection, silence intervals, and section functionality.
    The script supports a simple markup language to change voices, insert silence, define sections, and insert sections within the text.

    Markup Language Syntax:
    - Change Voice: Use [voice VOICE_NAME] to switch to a different voice.
        Example: [voice alloy] switches to the 'alloy' voice.
    - Insert Silence: Use [silence SECONDSs] to insert a period of silence.
        Example: [silence 4s] inserts a 4-second silence.
    - Define Section: Use [section SECTION_ID] to start a new section with the given ID.
        Example: [section 1] starts a new section with ID 1.
    - End Section: Use [end_section] to end the current section.
    - Insert Section: Use [insert_section SECTION_ID] to insert the audio from the specified section ID.
        Example: [section 1] [voice alloy] Hi there! [end_section] [insert_section 1] inserts the audio from section 1.
                 In effect, this will say "Hi there" with the 'alloy' voice, and then repeat it exactly.

    Supported voices:
    - All OpenAI voices are supported. These are:
        valid_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
        - alloy (male, neutral)
        - echo (male, full-bodied)
        - fable (male, high)
        - onyx (male, deep)
        - nova (female, expressive)
        - shimmer (female, full-bodied)

    Sample Input:
    "[voice alloy] How's it going? [section 1] [voice fable] I love it here! [end_section] [voice alloy] Repeat that please? [insert_section 1]"

    This input will:
    1. Start with the 'alloy' voice saying "How's it going?"
    2. Define a new section (ID 1) with the 'fable' voice saying "I love it here!"
    3. Switch back to the 'alloy' voice saying "Repeat that please?"
    4. Insert fable speaking the audio from section 1 (without regenerating it).
    """

    # RawTextHelpFormatter keeps the hand-formatted description layout intact.
    arg_parser = argparse.ArgumentParser(
        description=parser_description, formatter_class=argparse.RawTextHelpFormatter
    )
    arg_parser.add_argument(
        "--file", type=str, help="File containing the text to parse."
    )
    arg_parser.add_argument("text", nargs="?", default=None, help="Text to parse.")
    arg_parser.add_argument(
        "--out-file",
        type=str,
        default="out.mp3",
        help="Output file to save the audio to (mp3 recommended). Default out.mp3",
    )
    arg_parser.add_argument(
        "--provider",
        type=str,
        default="openai",
        help="AI Provider. Supported: openai, zuki",
    )
    arg_parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="API Key for AI Provider. Alternatively, create a file 'apikey.secret' in the workdir containing your API key.",
    )
    args = arg_parser.parse_args()

    # Exactly one input source is required: inline text or --file.
    if not args.file and not args.text:
        print("Please provide either a file (using --file <PATH>) or a text input!")
        exit(1)

    if args.file and args.text:
        print(
            "Please provide either a file (using --file <PATH>) or a text input, not both!"
        )
        exit(1)

    input_text = args.text
    if args.file:
        try:
            with open(args.file, "r") as file:
                input_text = file.read()
        except OSError as e:
            # Report a missing/unreadable file cleanly instead of a traceback.
            print("Could not read input file:", e)
            exit(1)

    markup_parser = SimpleMarkupParser(input_text)
    markup_parser.parse()
    output = markup_parser.get_output()
    print("parsed:", output)

    if not output:
        print("No output found! Does the input text adhere to the expected format?")
        exit(3)

    # NOTE(review): args.api_key is parsed above but never forwarded to
    # AudioGenerator here -- confirm whether the generator should receive it.
    tts = AudioGenerator(output, args.out_file, ai_provider=args.provider)
    try:
        tts.validate_voices()
    except ValueError as e:
        print(e)
        valid_voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
        print(
            "Voices not valid! Valid voices are: "
            + "'"
            + "', '".join(valid_voices)
            + "'"
        )
        # Stop here: generating audio with a rejected voice cannot succeed.
        exit(2)

    try:
        tts.generate_audio()
    except ValueError as e:
        print("Generating audio failed:")
        print(e)
        # Signal the failure to the calling shell instead of exiting with 0.
        exit(4)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()