This commit is contained in:
Yandrik 2024-02-09 01:45:49 +01:00
commit 9fa38baa83
16 changed files with 5149 additions and 0 deletions

.gitignore vendored Normal file
View File

@ -0,0 +1,167 @@
### Python template
# Byte-compiled / optimized / DLL files
# C extensions
# Distribution / packaging
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
# Installer logs
# Unit test / coverage reports
# Translations
# Django stuff:
# Flask stuff:
# Scrapy stuff:
# Sphinx documentation
# PyBuilder
# Jupyter Notebook
# IPython
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
# Celery stuff
# SageMath parsed files
# Environments
# Spyder project settings
# Rope project settings
# mkdocs documentation
# mypy
# Pyre type checker
# pytype static type analyzer
# Cython debug symbols
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.

.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
# Editor-based HTTP Client requests
# Datasource local storage ignored files

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />

.idea/misc.xml Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Poetry (ttsthing)" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (ttsthing)" project-jdk-type="Python SDK" />

.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<module fileurl="file://$PROJECT_DIR$/.idea/ttsthing.iml" filepath="$PROJECT_DIR$/.idea/ttsthing.iml" />

.idea/ttsthing.iml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.11 (ttsthing)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />

.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />

.python-version Normal file
View File

@ -0,0 +1 @@

283 Normal file
View File

@ -0,0 +1,283 @@
import os
from typing import Tuple, List
import argparse
import re
from pathlib import Path
from pydub import AudioSegment
import torch
from TTS.api import TTS
import requests
import prompt
OUT_DIR = 'out'
SPEAKER_WAV = 'speaker.wav'
LANG = "de"
tts = None
# Load the API key that is later sent as the "authorization: Bearer ..." header
# of the LLM requests.
try:
    with open('apikey.secret', encoding='utf-8') as f:
        # Strip surrounding whitespace: a trailing newline from the file would
        # otherwise end up inside the HTTP header value.
        api_key = f.read().strip()
except FileNotFoundError:
    print("Couldn't read API key from file 'apikey.secret'. Does it exist?")
    # Without a key every later API request would fail (or `api_key` would be
    # undefined), so stop right away with a non-zero exit status.
    raise SystemExit(1)
def is_float(s: str) -> bool:
    """Return True if *s* can be parsed by ``float()``, otherwise False.

    Anything ``float()`` accepts counts as valid (e.g. ``"2"``, ``"1e3"``);
    strings it rejects with ``ValueError`` (e.g. ``""``, ``"abc"``) do not.
    """
    try:
        float(s)
    except ValueError:
        return False
    return True
def has_word_characters(s: str) -> bool:
    r"""Return True if *s* contains at least one regex word character (``\w``).

    Used to tell real phrases apart from fragments that consist only of
    whitespace or punctuation.
    """
    return re.search(r'\w', s) is not None
def transform_string(input_str: str) -> str:
    """Turn a phrase into a lowercase, filename-friendly slug.

    The text is lowercased, spaces become underscores, and commas, semicolons,
    periods, slashes, backslashes, square brackets and parentheses are removed.

    Example: "Der Gauner, die Gauner" -> "der_gauner_die_gauner"
    """
    # One translation pass: map ' ' -> '_' and delete the listed punctuation.
    table = str.maketrans(' ', '_', ',;./\\][()')
    return input_str.lower().translate(table)
def get_tts_lazy() -> TTS:
    """Return the shared TTS model, creating it on the first call.

    The instance is cached in the module-level ``tts`` variable. On creation
    it is moved to "cuda" when ``torch.cuda.is_available()`` is true, and to
    "cpu" otherwise.
    """
    global tts
    if tts is not None:
        return tts

    target = "cpu"
    if torch.cuda.is_available():
        target = "cuda"
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(target)
    return tts
def do_tts(text: str, speaker_wav: str, language: str, mp3_path: str, multisample: int = 4):
    """Synthesize *text* and export the result to *mp3_path* as an MP3 file.

    The text is synthesized ``multisample`` times into ``.tmp/temp{i}.wav``.
    Only the shortest take (by ``len()``) is kept, which guards against takes
    that contain long extra phrases. The chosen take is converted to 44.1 kHz,
    16-bit stereo and exported at 192k bitrate.

    Args:
        text: The phrase to speak.
        speaker_wav: Path of the reference speaker recording passed to the model.
        language: Language code passed to the model.
        mp3_path: Destination path of the MP3 file.
        multisample: Number of takes to generate; must be at least 1.

    Raises:
        ValueError: If ``multisample`` is smaller than 1.
    """
    if multisample < 1:
        # With zero takes there is nothing to export; fail early with a clear
        # message instead of an AttributeError on ``None`` further down.
        raise ValueError(f"multisample must be at least 1, got {multisample}")

    os.makedirs('.tmp', exist_ok=True)
    wav_paths = [f".tmp/temp{i}.wav" for i in range(multisample)]

    model = get_tts_lazy()
    for wav_path in wav_paths:
        model.tts_to_file(text=text, speaker_wav=speaker_wav, language=language, file_path=wav_path)

    # min() keeps the first of several equally short takes, matching a strict
    # "shorter than the current best" comparison.
    sound = min((AudioSegment.from_file(wav_path) for wav_path in wav_paths), key=len)

    sound = sound.set_frame_rate(44100)  # 44.1 kHz
    sound = sound.set_sample_width(2)  # 2 bytes (16 bit) per sample
    sound = sound.set_channels(2)  # stereo

    sound.export(mp3_path, format="mp3", bitrate="192k")
# Sample string of semicolon-separated German phrases (forms of the same word are
# separated by commas), i.e. the format the LLM response is split on below.
# NOTE(review): no visible code reads this module-level `temp`; the `temp`
# parameter of extract_words_from_cards shadows it. Looks like leftover
# debug/sample data - confirm before removing.
temp = "passieren; der Schaden, die Schäden; der Start, die Starts; die Strecke, die Strecken; der Verkehr; wenden; das Zeichen, die Zeichen; aussteigen; ausweichen; die Autobahn, die Autobahnen; der Bord, die Borde; die Brücke, die Brücken; einholen; einsteigen; entgegenkommen; fort; freigeben; der Hafen, die Häfen; der Halt, die Halte; die Kurve, die Kurven; laden; mobil; der Parkplatz, die Parkplätze; rollen; das Signal, die Signale; sperren; die Station, die Stationen; stoppen; das Tempo, die Tempos; das Ticket, die Tickets; der Transport, die Transporte; transportieren; der Tunnel, die Tunnel; der Unfall, die Unfälle; verkehren; verpassen"
def extract_words_from_cards(cards: [Tuple[str, str]], temp: float = 0.0) -> List[str]:
# Sends the first element of every card to a chat-completion API and returns the
# semicolon-separated phrases from the reply as a list of strings.
#
# cards: sequence of tuples; only index 0 of each tuple is used.
# temp:  value sent as the "temperature" field of the request payload.
#
# NOTE(review): this block looks incomplete in this view - indentation, the
# closing brackets of `payload`/`headers`, the system message content, the call
# that produces `response`, and the statements that leave the retry loop are not
# visible. Restore them from the original file before relying on this function.
# NOTE(review): `url` is an empty string here; the endpoint appears to be missing.
url = ""
left_sides = [card[0] for card in cards]
# All card fronts are joined into one ';'-separated query string.
query_words = ';'.join(left_sides) # German words
payload = {
"model": "mixtral-8x7b-instruct",
# "model": "pplx-70b-chat",
"messages": [
"role": "system",
# "content": prompt.LLAMA_CARDS_TO_WORDS_PROMPT
"role": "user",
"content": "Here are Anki Cards to transform into speakable phrases: \n" + query_words +
"\nMake sure to ONLY output a string of semicolon-separated speakable phrases. DO NOT write anything else!"
"temperature": temp,
"presence_penalty": 0
headers = {
"accept": "application/json",
"content-type": "application/json",
# `api_key` is the module-level value read from 'apikey.secret'.
"authorization": f"Bearer {api_key}",
# Retry loop: on a non-200 status the user is asked whether to retry.
while True:
response =, json=payload, headers=headers)
if response.status_code != 200:
print("Perplexity API error")
user_input = input("Retry? [Y/n] > ")
# NOTE(review): the body of this branch (what happens on "no") is not visible.
if user_input.strip().lower() in ['n', 'no']:
# split response at ;, and then return as list
content = response.json()['choices'][0]['message']['content']
print(f'query: {query_words}')
print(f'content: {content}')
words = content.split(";")
# Keep only the first line of every entry and trim surrounding whitespace.
words = [word.split('\n')[0].strip() for word in words] # remove any trailing lines (e.g. comments)
# Drop a trailing entry that is empty or has no word characters
# (e.g. produced by a trailing semicolon in the reply).
if len(words[-1]) == 0 or not has_word_characters(words[-1]):
words = words[:-1]
return words
def process_text_to_audio(phrases_and_filenames: List[Tuple[str, str]]):
    """Generate one MP3 file per ``(phrase, filename)`` pair inside ``OUT_DIR``.

    Args:
        phrases_and_filenames: Pairs of the text to speak and the file name
            (without extension) to store it under; ".mp3" is appended.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    for phrase, filename in phrases_and_filenames:
        mp3_path = os.path.join(OUT_DIR, f"{filename}.mp3")
        # Use the module-level speaker/language settings so this helper stays
        # in sync with the rest of the script.
        do_tts(phrase, SPEAKER_WAV, LANG, mp3_path)
def main():
# Command-line entry point: reads a markdown note, finds "Q: ... / A: ..." card
# pairs, asks the LLM for a speakable phrase per card (in batches), generates an
# MP3 per phrase via do_tts, and appends an audio embed to every question line
# of the note text.
#
# NOTE(review): this block looks incomplete in this view - indentation and
# several statements (e.g. the bodies/exits of the retry branches, the line that
# fills `filenames`, the write to the output file) are not visible. Restore them
# from the original file before relying on the details below.
parser = argparse.ArgumentParser(description="Parse markdown note for Anki cards.")
parser.add_argument('note_file', type=str, help='The path of the markdown note file.')
parser.add_argument('--out-file', type=str, default='', help='The output file.')
parser.add_argument('--obsidian-format', action='store_true', help='Use Obsidian path format.')
parser.add_argument("--batch-size", type=int, default=64,
help='Number of cards sent to the LLM at one time, and processed in bulk. Default 64')
parser.add_argument('--multisample', type=int, default=4,
help='Number of audio generations per batch. Reduces audio with arbitrary sounds for short cards. Default 4')
parser.add_argument('--multisample-multiply-limit', type=int, default=8,
help='If a phrase is shorter than the multisample multiply limit, significantly more audio generations (generally *3) are done to improve quality. Set to 0 to disable. Default 8')
parser.add_argument('--multisample-multiply', type=int, default=3,
help='Sets the multiplier for additional audio generations when a phrase is shorter than the multisample multiply limit. Default is 3')
args = parser.parse_args()
# Check if the note file exists
if not Path(args.note_file).exists():
raise FileNotFoundError(f"File {args.note_file} does not exist.")
# Read file and parse for Anki cards
with open(args.note_file, 'r') as f:
# NOTE(review): the right-hand side of this assignment is not visible.
content =
# Each match is a (question, answer) tuple taken from consecutive "Q: " / "A: " lines.
matches = re.findall(r"^Q: (.+)\nA: (.+)\n", content, re.MULTILINE)
# truncate for debug
# matches = matches[:129]
batch_size = args.batch_size
# process in blocks of batch_size
out_content = content
# NOTE(review): `len(matches) // batch_size + 1` yields one extra, empty batch
# whenever len(matches) is an exact multiple of batch_size (including 0).
for i in range(len(matches) // batch_size + 1):
to_match = matches[i * batch_size:i * batch_size + batch_size]
correct_words_generated = False
words = None
subbatch_size = batch_size
cur_temp = 0
# process cards
# Repeat until the LLM returned exactly one phrase per card of this batch.
while not correct_words_generated:
# Split the batch into sub-batches of `subbatch_size` cards for the LLM calls.
batches = [to_match[i:i + subbatch_size] for i in range(0, len(to_match), subbatch_size)]
words = []
for batch in batches:
words += extract_words_from_cards(batch, temp=cur_temp)
if len(words) != len(to_match):
print(f'generated words len ({len(words)}) != matches len ({len(to_match)})')
f'Current Batch Size is {subbatch_size}, temp is {cur_temp}. If this happens repeatedly, try reducing the batch size.')
# The user chooses: retry as is, give up, halve the sub-batch size, or change the temperature.
userinput = input('Try again? [Y/n/split/temp] > ')
# NOTE(review): the body of the "no" branch is not visible.
if userinput.strip().lower() in ['n', 'no']:
elif userinput.strip().lower() in ['s', 'split']:
# Halve the sub-batch size, but never go below one card per request.
subbatch_size = max(1, subbatch_size // 2)
# generate batches
elif userinput.strip().lower() in ['t', 'temp']:
# Ask until a number between 0 and 2 (inclusive) is entered.
while True:
new_temp = input(f"cur temp: {cur_temp}. Input new temp (0-2) > ")
if is_float(new_temp) and 0 <= float(new_temp) <= 2:
cur_temp = float(new_temp)
print("Must be numeric (float) between 0 and 2")
print("trying again...")
correct_words_generated = True
# assert len(words) == len(to_match), f'generated words len ({len(words)}) != matches len ({len(to_match)})'
# NOTE(review): no visible line adds to `filenames`, yet it is indexed below -
# presumably each generated `filename` is appended inside the next loop; confirm.
filenames = []
for j, word in enumerate(words):
print(f'speaker-ifying word {word} ({j + i * batch_size} of {len(matches)})')
word = word.strip()
# File name derived from the phrase; note it already carries the ".mp3" suffix.
filename = f"{transform_string(word)}.mp3"
multisample = args.multisample
# Phrases shorter than the limit get `multisample_multiply` times as many takes.
if len(word) < args.multisample_multiply_limit:
multisample *= args.multisample_multiply # generate 3x as many for short words / phrases
do_tts(word, SPEAKER_WAV, LANG, f'./{OUT_DIR}/{filename}', multisample=multisample)
# Append an audio embed after each question: "![[file]]" in Obsidian format,
# otherwise a markdown link "![](./<OUT_DIR>/file)".
# NOTE(review): this loop reuses the name `i` of the outer batch loop, and
# str.replace substitutes every occurrence of "Q: <question>" in the text,
# including questions that merely start with the same characters.
for i, (question, answer) in enumerate(to_match):
out_content = out_content.replace(f"Q: {question}",
f"Q: {question} {'![[' if args.obsidian_format else f'![](./{OUT_DIR}/'}{filenames[i]}{']]' if args.obsidian_format else ')'}")
# open out file for writing
# NOTE(review): `--out-file` defaults to '' and the body of this `with` block is
# not visible; presumably `out_content` is written here - confirm.
out_file = args.out_file
with open(out_file, 'w') as f:
if __name__ == '__main__':
    # Run the command-line interface only when executed as a script.
    main()

poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

84 Normal file
View File

@ -0,0 +1,84 @@
I'll give you a list of Anki cards. Your job is to transform the dictionary entry card into speakable words, card for card, so that it can be passed to a TTS program.
Your job is to transform cards into speakable phrases, separated by semicolons.
ALWAYS follow these instructions:
- MAKE SURE to output NOTHING ELSE but the result
- MAKE SURE to not forget a single word! Start from the top, and continue onwards to the bottom
- MAKE SURE to output NO EXTRA WORDS to the cards
- Remove any brackets containing words (e.g. (ADJ), (NUR PL))
- ALWAYS Remove '|' word separators (e.g aus|steigen -> aussteigen)
- For verbs, include past and past perfect tense (e.g. befinden -> befinden, befand, befunden)
- DO NOT include extra words ANYWHERE EXCEPT FOR IN DECLARED CASES (e.g. DO NOT DO THIS: dahin -> dahin, dorthin. INSTEAD: dahin -> dahin)
- If a plural exists, ALWAYS write down the singular and the plural (e.g. die Werkstatt -en -> die Werkstatt, die Werkstätten)
- Separate plurals (of the same card) ALWAYS with commas, NEVER with semicolons. ALWAYS separate different cards with semicolons.
- If a noun does not change form in the plural, write it with both the singular and plural articles to reflect the change in number (e.g., "das Mädchen, -" -> "das Mädchen, die Mädchen"). If a noun only exists in the plural form, write only the plural (e.g., "die Eltern" remains "die Eltern").
- For words that only have a plural (e.g. Geschwister), just write the plural form (in this case "Die Geschwister")
- For adjectives, adverbs, prepositions and ALL OTHER WORDS, write down ONLY the words themselves (e.g. dahin -> dahin)
For example, given the following input:
das Taxi -s; die U-Bahn -en; befinden; die Werkstatt -en; mobil (ADJ); dahin
Output this:
das Taxi, die Taxis; die U-Bahn, die U-Bahnen; befinden, befand, befunden; die Werkstatt, die Werkstätten; mobil; dahin
Begin with the first card, then the second card, the third, and so on, until you've transcribed the last card.
If there is only one card, output the speakable phrase directly, WITHOUT a following semicolon.
NEVER output ANYTHING ELSE except for the semicolon-delimited string of speakable phrases, EVER! THIS IS THE MOST IMPORTANT!
# Remember, the result should be SPEAKABLE, not READABLE, and needs to include ALL WORDS.
I'll give you a list of Anki cards. Your job is to transform the dictionary entry card into speakable words, card for card, so that it can be passed to a TTS program.
Your job is to transform cards into speakable phrases, separated by semicolons.
ALWAYS follow these instructions:
- MAKE SURE to output NOTHING ELSE but the result
- MAKE SURE to not forget a single word! Start from the top, and continue onwards to the bottom
- MAKE SURE to output NO EXTRA WORDS to the cards
- Remove any brackets containing words (e.g. (ADJ)), or word separators (e.g aus|steigen -> aussteigen)
- For verbs, include past and past perfect tense (e.g. befinden -> befinden, befand, befunden)
- For adjectives, adverbs, prepositions and other words, write down ONLY the words themselves (e.g. dahin -> dahin)
- DO NOT include extra words ANYWHERE EXCEPT FOR IN DECLARED CASES (e.g. DO NOT DO THIS: dahin -> dahin, dorthin. INSTEAD: dahin -> dahin)
- If a plural exists, ALWAYS write down the singular and the plural (e.g. die Werkstatt -en -> die Werkstatt, die Werkstätten)
- Separate plurals (of the same card) ALWAYS with commas, NEVER with semicolons. ALWAYS separate different cards with semicolons.
For example, given the following input:
das Taxi -s; die U-Bahn -en; befinden; die Werkstatt -en; mobil (ADJ); dahin
Output this:
das Taxi, die Taxis; die U-Bahn, die U-Bahnen; befinden, befand, befunden; die Werkstatt, die Werkstätten; mobil; dahin
Remember, the result should be SPEAKABLE, not READABLE, and needs to include ALL WORDS.
MAKE SURE that you only output according to the given content, NOTHING MORE! DO NOT add any extra text, or change the formatting in any way.
I'll give you a list of German words and phrases. Your job is to transform them into speakable phrases, word for word, not leaving any out, separated by semicolons, ALWAYS following these rules:
- Output the singular and plural forms of nouns (e.g., "das Taxi -s" -> "das Taxi, die Taxis").
- For verbs, include the infinitive, past tense, and past participle (e.g., "scheiden" -> "scheiden, schied, geschieden").
- Write adjectives and adverbs in their base form only.
- Remove any brackets containing words (e.g. (ADJ), (NUR PL))
- ALWAYS remove word separators (e.g aus|steigen -> aussteigen)
- Pronouns should be listed in both singular and plural forms where applicable.
- Use commas to separate different forms of the same word, and semicolons to separate different words or phrases.
- Do not add or omit any words; the output should match the input in number and order.
- If a noun does not change form in the plural, write it with both the singular and plural articles to reflect the change in number (e.g., "das Mädchen, -" -> "das Mädchen, die Mädchen"). If a noun only exists in the plural form, write only the plural (e.g., "die Eltern" remains "die Eltern").
- For words that only have a plural (e.g. Geschwister), just write the plural form (in this case "Die Geschwister")
Example Input:
das Taxi -s; die U-Bahn -en; befinden; die Werkstatt -en; mobil (ADJ); dahin; Eltern (NUR PL); der Palästinenser, -
Expected Output:
das Taxi, die Taxis; die U-Bahn, die U-Bahnen; befinden, befand, befunden; die Werkstatt, die Werkstätten; mobil; dahin; die Eltern; der Palästinenser, die Palästinenser
# - MAKE SURE that the number and order of output words EQUALS the number and order of input words. This is CRITICALLY IMPORTANT! So DO NOT LEAVE ANYTHING OUT!!!
# - MAKE SURE to start at the top, and write a place for EVERY word! Do not leave any out.
# - End after the last card has been written out.

pyproject.toml Normal file
View File

@ -0,0 +1,21 @@
name = "ttsthing"
version = "0.1.0"
description = ""
authors = ["Yandrik <>"]
readme = ""
python = "3.11.*"
pathlib = "^1.0.1"
openai = "^1.11.1"
tts = "^0.22.0"
torch = "^2.2.0"
pydub = "^0.25.1"
argparse = "^1.4.0"
requests = "^2.31.0"
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

speaker.wav Normal file

Binary file not shown.

speaker2.wav Normal file

Binary file not shown.

speaker_shorter.wav Normal file

Binary file not shown.

speech.mp3 Normal file

Binary file not shown.