2024-04-25 13:13:59 +00:00
|
|
|
import re
|
2024-04-25 13:32:07 +00:00
|
|
|
from sys import exit
|
2024-04-25 13:13:59 +00:00
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
|
|
|
from audiogen import AudioGenerator
|
|
|
|
|
|
|
|
|
|
|
|
class SimpleMarkupParser:
    """Turn bracket-tag TTS markup into a flat list of instruction dicts.

    The input is whitespace-normalised (all runs of whitespace collapse to a
    single space) and then split on ``[...]`` tags. Each piece becomes one
    dict in ``parsed_output``:

    * ``[voice NAME]``        -> ``{"type": "voice", "voice": NAME}``
    * ``[silence Ns]``        -> ``{"type": "silence", "duration": N * 1000}``
    * ``[section ID]``        -> ``{"type": "section_start", "section_id": ID}``
    * ``[end_section]``       -> ``{"type": "section_end"}``
    * ``[insert_section ID]`` -> ``{"type": "insert_section", "section_id": ID}``
    * any other ``[tag]``     -> ``{"type": "none", "text": "[tag]"}``
    * non-blank text between tags -> ``{"type": "text", "text": stripped}``

    Tag names are case-sensitive; ``N`` and ``ID`` must be whole numbers.
    """

    # Compiled once at class creation instead of on every loop iteration.
    # The capture group makes re.split keep the tags in its result.
    _SPLIT_RE = re.compile(r"(\[[^]]+])")
    _TAG_RE = re.compile(r"\[[^]]+]")
    _VOICE_RE = re.compile(r"\[voice ([^]]+)]")
    _SILENCE_RE = re.compile(r"\[silence (\d+)s]")
    _SECTION_RE = re.compile(r"\[section (\d+)]")
    _END_SECTION_RE = re.compile(r"\[end_section]")
    _INSERT_SECTION_RE = re.compile(r"\[insert_section (\d+)]")

    def __init__(self, input_text):
        """Store *input_text* with every run of whitespace collapsed to one space."""
        self.input_text = " ".join(input_text.split())
        self.parsed_output = []
        # Not used by this class; kept because it is a public attribute.
        self.sections = {}

    def parse(self):
        """Tokenise ``input_text`` and (re)build ``parsed_output``.

        The result list is rebuilt from scratch on every call, so calling
        ``parse()`` more than once does not duplicate items.
        """
        items = []
        for token in self._SPLIT_RE.split(self.input_text):
            # Only a chunk that is *entirely* one ``[...]`` tag is markup.
            # A prefix test would misfile text such as "[] hello" as an
            # unknown tag and lose the words that follow the brackets.
            if self._TAG_RE.fullmatch(token):
                items.append(self._tag_to_item(token))
                continue

            text = token.strip()
            if text:
                items.append({"type": "text", "text": text})

        self.parsed_output = items

    def _tag_to_item(self, tag):
        """Map one complete ``[...]`` tag to its instruction dict."""
        match = self._VOICE_RE.fullmatch(tag)
        if match:
            return {"type": "voice", "voice": match.group(1)}

        match = self._SILENCE_RE.fullmatch(tag)
        if match:
            # The tag carries whole seconds; the stored duration is
            # multiplied by 1000.
            return {"type": "silence", "duration": int(match.group(1)) * 1000}

        match = self._SECTION_RE.fullmatch(tag)
        if match:
            return {"type": "section_start", "section_id": int(match.group(1))}

        if self._END_SECTION_RE.fullmatch(tag):
            return {"type": "section_end"}

        match = self._INSERT_SECTION_RE.fullmatch(tag)
        if match:
            return {"type": "insert_section", "section_id": int(match.group(1))}

        # Unrecognised tag: keep it, marked as "none", rather than dropping it.
        return {"type": "none", "text": tag}

    def get_output(self):
        """Return the list built by the most recent ``parse()`` call."""
        return self.parsed_output
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: parse marked-up text and render it to audio.

    Reads the text either from the positional ``text`` argument or from
    ``--file`` (exactly one of the two), parses it with
    ``SimpleMarkupParser``, and hands the parsed items to ``AudioGenerator``.

    Exit codes:
        1 -- neither or both of ``text`` / ``--file`` were given
        2 -- voice validation failed
        3 -- parsing produced no items
        4 -- audio generation failed
    """
    parser_description = """
    TTS text with voice selection, silence intervals, and section functionality.
    The script supports a simple markup language to change voices, insert silence, define sections, and insert sections within the text.

    Markup Language Syntax:
    - Change Voice: Use [voice VOICE_NAME] to switch to a different voice.
      Example: [voice alloy] switches to the 'alloy' voice.
    - Insert Silence: Use [silence SECONDSs] to insert a period of silence.
      Example: [silence 4s] inserts a 4-second silence.
    - Define Section: Use [section SECTION_ID] to start a new section with the given ID.
      Example: [section 1] starts a new section with ID 1.
    - End Section: Use [end_section] to end the current section.
    - Insert Section: Use [insert_section SECTION_ID] to insert the audio from the specified section ID.
      Example: [section 1] [voice alloy] Hi there! [end_section] [insert_section 1] inserts the audio from section 1.
      In effect, this will say "Hi there" with the 'alloy' voice, and then repeat it exactly.

    Supported voices:
    - All OpenAI voices are supported. These are:
      valid_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
      - alloy (male, neutral)
      - echo (male, full-bodied)
      - fable (male, high)
      - onyx (male, deep)
      - nova (female, expressive)
      - shimmer (female, full-bodied)

    Sample Input:
    "[voice alloy] How's it going? [section 1] [voice fable] I love it here! [end_section] [voice alloy] Repeat that please? [insert_section 1]"

    This input will:
    1. Start with the 'alloy' voice saying "How's it going?"
    2. Define a new section (ID 1) with the 'fable' voice saying "I love it here!"
    3. Switch back to the 'alloy' voice saying "Repeat that please?"
    4. Insert fable speaking the audio from section 1 (without regenerating it).
    """

    # RawTextHelpFormatter keeps the line breaks of the description above.
    arg_parser = argparse.ArgumentParser(
        description=parser_description, formatter_class=argparse.RawTextHelpFormatter
    )
    arg_parser.add_argument(
        "--file", type=str, help="File containing the text to parse."
    )
    arg_parser.add_argument("text", nargs="?", default=None, help="Text to parse.")
    arg_parser.add_argument(
        "--out-file",
        type=str,
        default="out.mp3",
        help="Output file to save the audio to (mp3 recommended). Default out.mp3",
    )
    arg_parser.add_argument(
        "--provider",
        type=str,
        default="openai",
        help="AI Provider. Supported: openai, zuki",
    )
    arg_parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="API Key for AI Provider. Alternatively, create a file 'apikey.secret' in the workdir containing your API key.",
    )
    args = arg_parser.parse_args()

    # Exactly one input source is required.
    if not args.file and not args.text:
        print("Please provide either a file (using --file <PATH>) or a text input!")
        exit(1)

    if args.file and args.text:
        print(
            "Please provide either a file (using --file <PATH>) or a text input, not both!"
        )
        exit(1)

    input_text = args.text
    if args.file:
        # Explicit encoding so the text is decoded the same way on every
        # platform instead of depending on the locale default.
        with open(args.file, "r", encoding="utf-8") as file:
            input_text = file.read()

    # Separate name from arg_parser: the two parsers are unrelated objects.
    markup_parser = SimpleMarkupParser(input_text)
    markup_parser.parse()
    output = markup_parser.get_output()
    print("parsed:", output)

    if not output:
        print("No output found! Does the input text adhere to the expected format?")
        exit(3)

    # NOTE(review): args.api_key is parsed above but never passed on here.
    # Confirm how AudioGenerator expects to receive the key before wiring it.
    tts = AudioGenerator(output, args.out_file, ai_provider=args.provider)
    try:
        tts.validate_voices()
    except ValueError as e:
        print(e)
        valid_voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
        print(
            "Voices not valid! Valid voices are: "
            + "'"
            + "', '".join(valid_voices)
            + "'"
        )
        # Stop here: going on to generate audio with a rejected voice would
        # only fail later, after work has already been started.
        exit(2)

    try:
        tts.generate_audio()
    except ValueError as e:
        print("Generating audio failed:")
        print(e)
        # Report the failure to the calling shell instead of exiting with 0.
        exit(4)
|
|
|
|
|
|
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|