147 lines
5.7 KiB
Python
147 lines
5.7 KiB
Python
|
import re
|
||
|
|
||
|
import argparse
|
||
|
|
||
|
from audiogen import AudioGenerator
|
||
|
|
||
|
|
||
|
class SimpleMarkupParser:
    """Tokenize text written in the bracket-tag markup into a flat item list.

    Recognized tags (whitespace inside the input is collapsed to single
    spaces before matching):

    - ``[voice NAME]``        -> ``{'type': 'voice', 'voice': NAME}``
    - ``[silence Ns]``        -> ``{'type': 'silence', 'duration': N * 1000}``
    - ``[section ID]``        -> ``{'type': 'section_start', 'section_id': ID}``
    - ``[end_section]``       -> ``{'type': 'section_end'}``
    - ``[insert_section ID]`` -> ``{'type': 'insert_section', 'section_id': ID}``

    Any other bracketed tag becomes ``{'type': 'none', 'text': TAG}`` and any
    non-blank text between tags becomes ``{'type': 'text', 'text': TEXT}``.
    """

    # Compiled once at class creation instead of once per token.
    # The capture group makes re.split keep the tags in its result.
    _TAG_SPLIT = re.compile(r'(\[[^]]+])')
    _VOICE = re.compile(r'\[voice ([^]]+)]')
    _SILENCE = re.compile(r'\[silence (\d+)s]')
    _SECTION = re.compile(r'\[section (\d+)]')
    _END_SECTION = re.compile(r'\[end_section]')
    _INSERT_SECTION = re.compile(r'\[insert_section (\d+)]')
    # A token only counts as an unknown tag when it is exactly one bracket
    # pair; text that merely starts with "[...]" (e.g. "[] world") is text.
    _UNKNOWN_TAG = re.compile(r'\[[^]]*]')

    def __init__(self, input_text):
        """Store *input_text* with every whitespace run collapsed to one space.

        Args:
            input_text: The raw marked-up string to parse.
        """
        self.input_text = ' '.join(input_text.split())
        self.parsed_output = []
        # Not read or written anywhere else in this class; kept because it is
        # a public attribute. Presumably reserved for section bookkeeping.
        self.sections = {}

    def parse(self):
        """Parse ``self.input_text`` and store the items in ``self.parsed_output``.

        The result list is rebuilt from scratch on every call, so calling
        ``parse()`` repeatedly never duplicates items.
        """
        items = []
        for token in self._TAG_SPLIT.split(self.input_text):
            item = self._parse_token(token)
            if item is not None:
                items.append(item)
        # Rebind rather than clear() so a list handed out earlier by
        # get_output() is not mutated behind the caller's back.
        self.parsed_output = items

    def _parse_token(self, token):
        """Return the item dict for a single token, or None for blank text."""
        voice = self._VOICE.fullmatch(token)
        if voice:
            return {'type': 'voice', 'voice': voice.group(1)}

        silence = self._SILENCE.fullmatch(token)
        if silence:
            # The tag is written in whole seconds; the item carries milliseconds.
            return {'type': 'silence', 'duration': int(silence.group(1)) * 1000}

        section = self._SECTION.fullmatch(token)
        if section:
            return {'type': 'section_start', 'section_id': int(section.group(1))}

        if self._END_SECTION.fullmatch(token):
            return {'type': 'section_end'}

        insert = self._INSERT_SECTION.fullmatch(token)
        if insert:
            return {'type': 'insert_section', 'section_id': int(insert.group(1))}

        if self._UNKNOWN_TAG.fullmatch(token):
            return {'type': 'none', 'text': token}

        stripped = token.strip()
        if stripped:
            return {'type': 'text', 'text': stripped}
        return None

    def get_output(self):
        """Return the list of items produced by the most recent ``parse()``."""
        return self.parsed_output
|
||
|
|
||
|
|
||
|
def main():
    """Command-line entry point: parse marked-up text and render it to audio.

    Reads the input either from the positional ``text`` argument or from
    ``--file`` (exactly one of the two must be given), parses it with
    ``SimpleMarkupParser`` and hands the parsed items to ``AudioGenerator``.

    Raises:
        SystemExit: with status 1 when neither or both inputs are supplied,
            3 when parsing produced no items, 4 when voice validation fails
            and 5 when audio generation fails.
    """
    parser_description = """
TTS text with voice selection, silence intervals, and section functionality.
The script supports a simple markup language to change voices, insert silence, define sections, and insert sections within the text.

Markup Language Syntax:
- Change Voice: Use [voice VOICE_NAME] to switch to a different voice.
Example: [voice alloy] switches to the 'alloy' voice.
- Insert Silence: Use [silence SECONDSs] to insert a period of silence.
Example: [silence 4s] inserts a 4-second silence.
- Define Section: Use [section SECTION_ID] to start a new section with the given ID.
Example: [section 1] starts a new section with ID 1.
- End Section: Use [end_section] to end the current section.
- Insert Section: Use [insert_section SECTION_ID] to insert the audio from the specified section ID.
Example: [section 1] [voice alloy] Hi there! [end_section] [insert_section 1] inserts the audio from section 1.
In effect, this will say "Hi there" with the 'alloy' voice, and then repeat it exactly.

Supported voices:
- All OpenAI voices are supported. These are:
valid_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
- alloy (male, neutral)
- echo (male, full-bodied)
- fable (male, high)
- onyx (male, deep)
- nova (female, expressive)
- shimmer (female, full-bodied)

Sample Input:
"[voice alloy] How's it going? [section 1] [voice fable] I love it here! [end_section] [voice alloy] Repeat that please? [insert_section 1]"

This input will:
1. Start with the 'alloy' voice saying "How's it going?"
2. Define a new section (ID 1) with the 'fable' voice saying "I love it here!"
3. Switch back to the 'alloy' voice saying "Repeat that please?"
4. Insert fable speaking the audio from section 1 (without regenerating it).
"""

    # RawTextHelpFormatter keeps the line breaks of the description above.
    arg_parser = argparse.ArgumentParser(description=parser_description,
                                         formatter_class=argparse.RawTextHelpFormatter)
    arg_parser.add_argument('--file', type=str, help="File containing the text to parse.")
    arg_parser.add_argument('text', nargs='?', default=None, help="Text to parse.")
    arg_parser.add_argument('--out-file', type=str, default="out.mp3",
                            help="Output file to save the audio to (mp3 recommended). Default out.mp3")
    arg_parser.add_argument('--provider', type=str, default="openai", help="AI Provider. Supported: openai, zuki")
    arg_parser.add_argument('--api-key', type=str, default=None,
                            help="API Key for AI Provider. Alternatively, create a file 'apikey.secret' in the workdir containing your API key.")
    args = arg_parser.parse_args()

    # Exactly one input source is allowed. SystemExit is raised directly
    # because the exit() builtin is only installed by the site module.
    if not args.file and not args.text:
        print("Please provide either a file (using --file <PATH>) or a text input!")
        raise SystemExit(1)

    if args.file and args.text:
        print("Please provide either a file (using --file <PATH>) or a text input, not both!")
        raise SystemExit(1)

    input_text = args.text
    if args.file:
        # Explicit encoding so the result does not depend on the platform locale.
        with open(args.file, 'r', encoding='utf-8') as file:
            input_text = file.read()

    markup_parser = SimpleMarkupParser(input_text)
    markup_parser.parse()
    output = markup_parser.get_output()
    print("parsed:", output)

    if not output:
        print("No output found! Does the input text adhere to the expected format?")
        raise SystemExit(3)

    # NOTE(review): args.api_key is parsed above but never forwarded here.
    # The AudioGenerator signature is not visible from this file, so it is
    # unclear how the key should be passed - confirm and wire it up.
    tts = AudioGenerator(output, args.out_file, ai_provider=args.provider)
    try:
        tts.validate_voices()
    except ValueError as e:
        print(e)
        valid_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
        print("Voices not valid! Valid voices are: " + "'" + "', '".join(valid_voices) + "'")
        # Stop here: generating audio with voices already reported as invalid
        # cannot succeed, and the failure must be visible in the exit status.
        raise SystemExit(4)

    try:
        tts.generate_audio()
    except ValueError as e:
        print("Generating audio failed:")
        print(e)
        # Report the failure to the calling shell instead of exiting with 0.
        raise SystemExit(5)
|
||
|
|
||
|
|
||
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|