# tts-markup-utility/audiogen.py

import logging
import re
import tempfile
import time
from pathlib import Path

from openai import OpenAI
from pydub import AudioSegment, silence


def get_api_key() -> str:
    """Load the API key stored in the file ``apikey.secret``.

    The path is relative, so it is resolved against the current working
    directory. Leading and trailing whitespace is stripped from the key.

    Returns:
        The stripped contents of the file.

    Raises:
        ValueError: If the file does not exist, or if it contains nothing
            but whitespace.
    """
    secret_file = Path('apikey.secret')

    try:
        raw_contents = secret_file.read_text()
    except FileNotFoundError:
        raise ValueError("Couldn't read API key from file 'apikey.secret'. Does it exist?")

    key = raw_contents.strip()
    if key == '':
        raise ValueError("API key not found. Please provide your API key in the file 'apikey.secret'.")
    return key


class AudioGenerator:
    def __init__(self, parsed_data, output_file, default_silence=650, ai_provider="openai", api_key=None):
        self.parsed_data = parsed_data
        self.output_file = output_file
        self.default_silence = default_silence
        self.sections = {}
        self.current_section = None

        if not api_key:
            api_key = get_api_key()

        match ai_provider:
            case "openai":
                self.client = OpenAI(api_key=api_key)
            case "zuki":
                self.client = OpenAI(base_url="https://zukijourney.xyzbot.net/v1", api_key=api_key)
            case _:
                raise ValueError(f"Unsupported AI provider: {ai_provider}")

    def validate_voices(self):
        """Check if all voices in the parsed data are valid."""
        valid_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
        invalid_voices = set()

        for item in self.parsed_data:
            if item['type'] == 'voice' and item['voice'] not in valid_voices:
                invalid_voices.add(item['voice'])

        if invalid_voices:
            raise ValueError(f"Invalid voice(s) found: {', '.join(invalid_voices)}")
        print("All voices are valid.")

    def validate_sections(self):
        """Check if all sections used are defined beforehand."""
        used_sections = set()
        defined_sections = set()
        section_errors = []

        for item in self.parsed_data:

            if item['type'] == 'section_start':
                defined_sections.add(item['section_id'])
            elif item['type'] == 'insert_section':
                section_id = item['section_id']
                if section_id not in defined_sections:
                    section_errors.append(f"Section {section_id} is used before being defined.")
                used_sections.add(item['section_id'])

        undefined_sections = used_sections - defined_sections

        if undefined_sections or len(section_errors) > 0:
            raise ValueError(f"Section Validation Errors:\n  {'\n  '.join(section_errors)}\n\nUndefined section(s) used: {', '.join(map(str, undefined_sections))}")
        print("All sections are properly defined.")


    def text_to_speech(self, text, voice):
        """Generate speech using OpenAI's voice API with retry logic."""
        print(f"Voice {voice} chosen")
        print(f"TTS: {text[:50]}...")

        temp_path = Path("temp_speech.mp3")
        attempts = 0
        success = False

        while not success:
            try:
                response = self.client.audio.speech.create(
                    model="tts-1",
                    voice=voice,
                    input=text,
                )
                response.write_to_file(str(temp_path))
                success = True
                return AudioSegment.from_mp3(temp_path)
            except Exception as e:
                print(f"Failed to generate TTS: {e}")
                attempts += 1
                if attempts >= 3:
                    user_decision = input("Retry TTS generation? (yes/no): ").strip().lower()
                    if user_decision.lower() in ['y', 'yes']:
                        attempts = 0  # Reset attempts for another round of retries
                    else:
                        print("Exiting due to TTS generation failure.")
                        exit(1)
                else:
                    print("Retrying...")
                    time.sleep(1)  # Wait a bit before retrying to avoid hammering the API too quickly

    def generate_audio(self):
        self.validate_voices()
        self.validate_sections()
        combined_audio = AudioSegment.empty()
        current_voice = None

        for item in self.parsed_data:
            if item['type'] == 'voice':
                current_voice = item['voice']
            elif item['type'] == 'text':
                if not current_voice:
                    raise ValueError("First text segment before voice was selected!")
                audio_segment = self.text_to_speech(item['text'], current_voice)
                combined_audio += audio_segment
                if self.default_silence > 0:
                    combined_audio += AudioSegment.silent(duration=self.default_silence)
                if self.current_section is not None:
                    self.sections[self.current_section] += audio_segment
            elif item['type'] == 'silence':
                combined_audio += AudioSegment.silent(duration=item['duration'])
                if self.current_section is not None:
                    self.sections[self.current_section] += AudioSegment.silent(duration=item['duration'])
            elif item['type'] == 'section_start':
                self.current_section = item['section_id']
                self.sections[self.current_section] = AudioSegment.empty()
            elif item['type'] == 'section_end':
                self.current_section = None
            elif item['type'] == 'insert_section':
                section_id = item['section_id']
                if section_id in self.sections:
                    combined_audio += self.sections[section_id]
                else:
                    raise ValueError(f"Section {section_id} not found!")

        combined_audio.export(self.output_file, format="mp3")
# Example usage