import argparse
import re
import textwrap
from sys import exit

from audiogen import AudioGenerator


class SimpleMarkupParser:
    """Turn marked-up text into a flat list of event dictionaries.

    The input is whitespace-normalised (all runs of whitespace become a single
    space) and then split into bracketed tags and the plain text between them.
    Each piece becomes one dictionary in ``parsed_output``:

    - ``[voice NAME]``         -> ``{"type": "voice", "voice": NAME}``
    - ``[silence Ns]``         -> ``{"type": "silence", "duration": N * 1000}``
    - ``[section N]``          -> ``{"type": "section_start", "section_id": N}``
    - ``[end_section]``        -> ``{"type": "section_end"}``
    - ``[insert_section N]``   -> ``{"type": "insert_section", "section_id": N}``
    - any other ``[...]`` tag  -> ``{"type": "none", "text": TAG}``
    - non-blank plain text     -> ``{"type": "text", "text": STRIPPED_TEXT}``
    """

    # Compiled once at class level instead of on every loop iteration.
    # The split pattern uses ``*`` (not ``+``) so that an empty tag ``[]`` is
    # isolated as its own token rather than staying glued to the text after it.
    _TAG_SPLIT = re.compile(r"(\[[^]]*])")
    _VOICE = re.compile(r"\[voice ([^]]+)]")
    _SILENCE = re.compile(r"\[silence (\d+)s]")
    _SECTION = re.compile(r"\[section (\d+)]")
    _END_SECTION = re.compile(r"\[end_section]")
    _INSERT_SECTION = re.compile(r"\[insert_section (\d+)]")
    _ANY_TAG = re.compile(r"\[.*]")

    def __init__(self, input_text):
        """Store the whitespace-normalised input and initialise empty results.

        Args:
            input_text: The raw marked-up text to parse.
        """
        self.input_text = " ".join(input_text.split())
        self.parsed_output = []
        # Not used by the parser itself; kept so the attribute remains available.
        self.sections = {}

    def parse(self):
        """Parse ``input_text`` and store the resulting events.

        The result list is rebuilt from scratch on every call, so calling
        ``parse()`` more than once does not duplicate events.
        """
        events = []
        for token in self._TAG_SPLIT.split(self.input_text):
            event = self._classify(token)
            if event is not None:
                events.append(event)
        self.parsed_output = events

    def _classify(self, token):
        """Map one token to its event dictionary, or ``None`` for blank text."""
        # fullmatch: a token is a tag only if the *whole* token is the tag, so
        # plain text can never be misread as (and swallowed by) a tag.
        match = self._VOICE.fullmatch(token)
        if match:
            return {"type": "voice", "voice": match.group(1)}

        match = self._SILENCE.fullmatch(token)
        if match:
            # Markup is in whole seconds; the event carries that value * 1000.
            return {"type": "silence", "duration": int(match.group(1)) * 1000}

        match = self._SECTION.fullmatch(token)
        if match:
            return {"type": "section_start", "section_id": int(match.group(1))}

        if self._END_SECTION.fullmatch(token):
            return {"type": "section_end"}

        match = self._INSERT_SECTION.fullmatch(token)
        if match:
            return {"type": "insert_section", "section_id": int(match.group(1))}

        # Any other bracketed tag is unknown; record it without interpreting it.
        if self._ANY_TAG.fullmatch(token):
            return {"type": "none", "text": token}

        stripped = token.strip()
        if stripped:
            return {"type": "text", "text": stripped}
        return None

    def get_output(self):
        """Return the list of event dictionaries built by the last ``parse()``."""
        return self.parsed_output


def main():
    """Command-line entry point: parse marked-up text and generate audio.

    Reads the text either from the positional argument or from ``--file``,
    parses it with ``SimpleMarkupParser``, then hands the events to
    ``AudioGenerator`` for voice validation and audio generation.

    Exit codes:
        1: no input, both inputs given, or the input file could not be read.
        3: parsing produced no events.
        4: voice validation raised ``ValueError``.
        5: audio generation raised ``ValueError``.
    """
    # RawTextHelpFormatter prints this verbatim, so the line breaks matter.
    parser_description = textwrap.dedent(
        """\
        TTS text with voice selection, silence intervals, and section functionality.

        The script supports a simple markup language to change voices, insert silence,
        define sections, and insert sections within the text.

        Markup Language Syntax:
        - Change Voice: Use [voice VOICE_NAME] to switch to a different voice.
          Example: [voice alloy] switches to the 'alloy' voice.
        - Insert Silence: Use [silence SECONDSs] to insert a period of silence.
          Example: [silence 4s] inserts a 4-second silence.
        - Define Section: Use [section SECTION_ID] to start a new section with the given ID.
          Example: [section 1] starts a new section with ID 1.
        - End Section: Use [end_section] to end the current section.
        - Insert Section: Use [insert_section SECTION_ID] to insert the audio from the specified section ID.
          Example: [section 1] [voice alloy] Hi there! [end_section] [insert_section 1] inserts the audio from section 1.
          In effect, this will say "Hi there" with the 'alloy' voice, and then repeat it exactly.

        Supported voices:
        - All OpenAI voices are supported. These are:
          valid_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
          - alloy (male, neutral)
          - echo (male, full-bodied)
          - fable (male, high)
          - onyx (male, deep)
          - nova (female, expressive)
          - shimmer (female, full-bodied)

        Sample Input:
        "[voice alloy] How's it going? [section 1] [voice fable] I love it here! [end_section] [voice alloy] Repeat that please? [insert_section 1]"

        This input will:
        1. Start with the 'alloy' voice saying "How's it going?"
        2. Define a new section (ID 1) with the 'fable' voice saying "I love it here!"
        3. Switch back to the 'alloy' voice saying "Repeat that please?"
        4. Insert fable speaking the audio from section 1 (without regenerating it).
        """
    )

    parser = argparse.ArgumentParser(
        description=parser_description,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("--file", type=str, help="File containing the text to parse.")
    parser.add_argument("text", nargs="?", default=None, help="Text to parse.")
    parser.add_argument(
        "--out-file",
        type=str,
        default="out.mp3",
        help="Output file to save the audio to (mp3 recommended). Default out.mp3",
    )
    parser.add_argument(
        "--provider",
        type=str,
        default="openai",
        help="AI Provider. Supported: openai, zuki",
    )
    # NOTE(review): --api-key is parsed but not forwarded to AudioGenerator in
    # this file; confirm how the generator is expected to obtain the key.
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="API Key for AI Provider. "
        "Alternatively, create a file 'apikey.secret' in the workdir containing your API key.",
    )
    args = parser.parse_args()

    if not args.file and not args.text:
        print("Please provide either a file (using --file ) or a text input!")
        exit(1)
    if args.file and args.text:
        print(
            "Please provide either a file (using --file ) or a text input, not both!"
        )
        exit(1)

    input_text = args.text
    if args.file:
        try:
            # Explicit encoding so the result does not depend on the platform default.
            with open(args.file, "r", encoding="utf-8") as fh:
                input_text = fh.read()
        except OSError as e:
            print(f"Could not read input file: {e}")
            exit(1)

    markup_parser = SimpleMarkupParser(input_text)
    markup_parser.parse()
    output = markup_parser.get_output()
    print("parsed:", output)
    if not output:
        print("No output found! Does the input text adhere to the expected format?")
        exit(3)

    tts = AudioGenerator(output, args.out_file, ai_provider=args.provider)

    try:
        tts.validate_voices()
    except ValueError as e:
        print(e)
        valid_voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
        print(
            "Voices not valid! Valid voices are: "
            + "'"
            + "', '".join(valid_voices)
            + "'"
        )
        # Stop here: generating audio with voices that failed validation
        # would only fail later (or produce unintended output).
        exit(4)

    try:
        tts.generate_audio()
    except ValueError as e:
        print("Generating audio failed:")
        print(e)
        # Report the failure to the calling shell instead of exiting with 0.
        exit(5)


if __name__ == "__main__":
    main()