INITIAL COMMIT
commit 9fa38baa83

.gitignore (new file, vendored, 167 lines)
@@ -0,0 +1,167 @@
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

out*/
apikey.secret
out*.md
.tmp/

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.idea/.gitignore (new file, vendored, 8 lines)
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

.idea/inspectionProfiles/profiles_settings.xml (new file, 6 lines)
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

.idea/misc.xml (new file, 7 lines)
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Poetry (ttsthing)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (ttsthing)" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml (new file, 8 lines)
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/ttsthing.iml" filepath="$PROJECT_DIR$/.idea/ttsthing.iml" />
    </modules>
  </component>
</project>

.idea/ttsthing.iml (new file, 8 lines)
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.11 (ttsthing)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

.idea/vcs.xml (new file, 6 lines)
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

.python-version (new file, 1 line)
@@ -0,0 +1 @@
3.11

main.py (new file, 283 lines)
@@ -0,0 +1,283 @@
import os
from typing import Tuple, List

import argparse
import re
from pathlib import Path
from pydub import AudioSegment
import torch
from TTS.api import TTS

import requests

import prompt

OUT_DIR = 'out'
SPEAKER_WAV = 'speaker.wav'
LANG = "de"

tts = None

try:
    with open('apikey.secret') as f:
        api_key = f.read().strip()
except FileNotFoundError:
    print("Couldn't read API key from file 'apikey.secret'. Does it exist?")


def is_float(s: str) -> bool:
    try:
        float(s)
        return True
    except ValueError:
        return False


def has_word_characters(s: str) -> bool:
    if re.search(r'\w', s):
        return True
    else:
        return False


def transform_string(input_str: str) -> str:
    """
    This method transforms strings like "Der Gauner, die Gauner" into der_gauner_die_gauner
    """
    output_str = (input_str
                  .lower()
                  .replace(' ', '_')
                  .replace(',', '')
                  .replace(';', '')
                  .replace('.', '')
                  .replace('/', '')
                  .replace('\\', '')
                  .replace(']', '')
                  .replace('[', '')
                  .replace('(', '')
                  .replace(')', ''))
    return output_str


def get_tts_lazy() -> TTS:
    global tts
    if tts is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    return tts


def do_tts(text: str, speaker_wav: str, language: str, mp3_path: str, multisample: int = 4):
    """Generates an audio file from text."""

    os.makedirs('.tmp', exist_ok=True)

    # to prevent long extra phrases, run `multisample` times, and take the shortest (smallest) sample
    for i in range(multisample):
        get_tts_lazy().tts_to_file(text=text, speaker_wav=speaker_wav, language=language, file_path=f".tmp/temp{i}.wav")

    shortest_sound = None

    # load from file and convert to mp3
    for i in range(multisample):
        sound = AudioSegment.from_file(f".tmp/temp{i}.wav")
        if shortest_sound is None:
            shortest_sound = sound
        elif len(sound) < len(shortest_sound):
            shortest_sound = sound

    sound = shortest_sound

    sound = sound.set_frame_rate(44100)  # Set frame rate to 44.1kHz, high quality
    sound = sound.set_sample_width(2)  # 2 byte (16 bit) samples, high quality
    sound = sound.set_channels(2)  # make it stereo

    # Export as high quality mp3
    sound.export(mp3_path, format="mp3", bitrate="192k")  # Export with high quality bitrate


temp = "passieren; der Schaden, die Schäden; der Start, die Starts; die Strecke, die Strecken; der Verkehr; wenden; das Zeichen, die Zeichen; aussteigen; ausweichen; die Autobahn, die Autobahnen; der Bord, die Borde; die Brücke, die Brücken; einholen; einsteigen; entgegenkommen; fort; freigeben; der Hafen, die Häfen; der Halt, die Halte; die Kurve, die Kurven; laden; mobil; der Parkplatz, die Parkplätze; rollen; das Signal, die Signale; sperren; die Station, die Stationen; stoppen; das Tempo, die Tempos; das Ticket, die Tickets; der Transport, die Transporte; transportieren; der Tunnel, die Tunnel; der Unfall, die Unfälle; verkehren; verpassen"


def extract_words_from_cards(cards: List[Tuple[str, str]], temp: float = 0.0) -> List[str]:
    url = "https://api.perplexity.ai/chat/completions"

    left_sides = [card[0] for card in cards]
    query_words = ';'.join(left_sides)  # german words

    payload = {
        "model": "mixtral-8x7b-instruct",
        # "model": "pplx-70b-chat",
        "messages": [
            {
                "role": "system",
                "content": prompt.CARDS_TO_WORDS_PROMPT
                # "content": prompt.LLAMA_CARDS_TO_WORDS_PROMPT
            },
            {
                "role": "user",
                "content": "Here are Anki Cards to transform into speakable phrases: \n" + query_words +
                           "\nMake sure to ONLY output a string of semicolon-separated speakable phrases. DO NOT write anything else!"
            }
        ],
        "temperature": temp,
        "presence_penalty": 0
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": f"Bearer {api_key}",
    }

    while True:
        response = requests.post(url, json=payload, headers=headers)

        if response.status_code != 200:
            print("Perplexity API error")
            user_input = input("Retry? [Y/n] > ")
            if user_input.strip().lower() in ['n', 'no']:
                print("exiting...")
                exit(0)
            else:
                continue
        else:
            break

    print(response.json())

    # split the response at ';', then return it as a list

    content = response.json()['choices'][0]['message']['content']

    print(f'query: {query_words}')
    print(f'content: {content}')

    words = content.split(";")
    words = [word.split('\n')[0].strip() for word in words]  # remove any trailing newlines (e.g. comments)
    if len(words[-1]) == 0 or not has_word_characters(words[-1]):
        words = words[:-1]
    return words


def process_text_to_audio(phrases_and_filenames: List[Tuple[str, str]]):
    os.makedirs(OUT_DIR, exist_ok=True)
    for idx, phrase_and_filename in enumerate(phrases_and_filenames, 1):
        phrase, filename = phrase_and_filename
        mp3_path = os.path.join(OUT_DIR, f"{filename}.mp3")
        do_tts(phrase, 'speaker.wav', 'de', mp3_path)


def main():
    parser = argparse.ArgumentParser(description="Parse markdown note for Anki cards.")
    parser.add_argument('note_file', type=str, help='The path of the markdown note file.')
    parser.add_argument('--out-file', type=str, default='out.md', help='The output file.')
    parser.add_argument('--obsidian-format', action='store_true', help='Use Obsidian path format.')
    parser.add_argument("--batch-size", type=int, default=64,
                        help='Number of cards sent to the LLM at one time, and processed in bulk. Default 64')
    parser.add_argument('--multisample', type=int, default=4,
                        help='Number of audio generations per batch. Reduces audio with arbitrary sounds for short cards. Default 4')
    parser.add_argument('--multisample-multiply-limit', type=int, default=8,
                        help='If a phrase is shorter than the multisample multiply limit, significantly more audio generations (generally *3) are done to improve quality. Set to 0 to disable. Default 8')
    parser.add_argument('--multisample-multiply', type=int, default=3,
                        help='Sets the multiplier for additional audio generations when a phrase is shorter than the multisample multiply limit. Default is 3')

    args = parser.parse_args()

    # Check if the note file exists
    if not Path(args.note_file).exists():
        raise FileNotFoundError(f"File {args.note_file} does not exist.")

    # Read file and parse for Anki cards
    with open(args.note_file, 'r') as f:
        content = f.read()

    matches = re.findall(r"^Q: (.+)\nA: (.+)\n", content, re.MULTILINE)

    # truncate for debug
    # matches = matches[:129]

    batch_size = args.batch_size

    # process in blocks of batch_size

    out_content = content

    for i in range(len(matches) // batch_size + 1):
        to_match = matches[i * batch_size:i * batch_size + batch_size]

        correct_words_generated = False

        words = None

        subbatch_size = batch_size
        cur_temp = 0

        # process cards
        while not correct_words_generated:
            batches = [to_match[i:i + subbatch_size] for i in range(0, len(to_match), subbatch_size)]
            words = []
            for batch in batches:
                words += extract_words_from_cards(batch, temp=cur_temp)

            if len(words) != len(to_match):
                print(f'generated words len ({len(words)}) != matches len ({len(to_match)})')
                print(
                    f'Current Batch Size is {subbatch_size}, temp is {cur_temp}. If this happens repeatedly, try reducing the batch size.')
                userinput = input('Try again? [Y/n/split/temp] > ')
                if userinput.strip().lower() in ['n', 'no']:
                    print("aborting...")
                    exit(0)
                elif userinput.strip().lower() in ['s', 'split']:
                    subbatch_size = max(1, subbatch_size // 2)
                    # generate batches
                elif userinput.strip().lower() in ['t', 'temp']:
                    while True:
                        new_temp = input(f"cur temp: {cur_temp}. Input new temp (0-2) > ")
                        if is_float(new_temp) and 0 <= float(new_temp) <= 2:
                            cur_temp = float(new_temp)
                            break
                        else:
                            print("Must be numeric (float) between 0 and 2")
                else:
                    print("trying again...")
            else:
                correct_words_generated = True

        # assert len(words) == len(to_match), f'generated words len ({len(words)}) != matches len ({len(to_match)})'

        print(str(words))

        filenames = []

        for j, word in enumerate(words):
            print(f'speaker-ifying word {word} ({j + i * batch_size} of {len(matches)})')

            word = word.strip()

            filename = f"{transform_string(word)}.mp3"
            filenames.append(filename)

            multisample = args.multisample
            if len(word) < args.multisample_multiply_limit:
                multisample *= args.multisample_multiply  # generate 3x as many for short words / phrases

            do_tts(word, SPEAKER_WAV, LANG, f'./{OUT_DIR}/{filename}', multisample=multisample)

        for i, (question, answer) in enumerate(to_match):
            out_content = out_content.replace(f"Q: {question}",
                                              f"Q: {question} {'![[' if args.obsidian_format else f'![](./{OUT_DIR}/'}{filenames[i]}{']]' if args.obsidian_format else ')'}")

    # open out file for writing
    out_file = args.out_file
    with open(out_file, 'w') as f:
        f.write(out_content)


if __name__ == '__main__':
    main()
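
For context, a minimal sketch of the input contract main.py assumes: a markdown note with "Q:"/"A:" card pairs, parsed by the same regex used in main(). The invocation, file name, and note text below are hypothetical.

# Hypothetical invocation: python main.py vocab.md --obsidian-format --batch-size 32
import re

# Made-up note snippet in the "Q:/A:" card format that main() scans for.
sample_note = (
    "Q: die Werkstatt -en\n"
    "A: the workshop\n"
    "\n"
    "Q: mobil (ADJ)\n"
    "A: mobile\n"
)

# Same pattern as in main(): one (question, answer) tuple per card.
matches = re.findall(r"^Q: (.+)\nA: (.+)\n", sample_note, re.MULTILINE)
print(matches)
# [('die Werkstatt -en', 'the workshop'), ('mobil (ADJ)', 'mobile')]
# Each question side is sent to the LLM, the returned phrase is rendered with
# do_tts(), and the resulting MP3 link is spliced back in after the "Q: ..." line.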

poetry.lock (new file, generated, 4550 lines)
File diff suppressed because it is too large.

prompt.py (new file, 84 lines)
@@ -0,0 +1,84 @@
CARDS_TO_WORDS_PROMPT = '''
I'll give you a list of Anki cards. Your job is to transform the dictionary entry card into speakable words, card for card, so that it can be passed to a TTS program.
Your job is to transform cards into speakable phrases, separated by semicolons.

ALWAYS follow these instructions:
- MAKE SURE to output NOTHING ELSE but the result
- MAKE SURE to not forget a single word! Start from the top, and continue onwards to the bottom
- MAKE SURE to output NO EXTRA WORDS to the cards
- Remove any brackets containing words (e.g. (ADJ), (NUR PL))
- ALWAYS remove '|' word separators (e.g. aus|steigen -> aussteigen)
- For verbs, include past and past perfect tense (e.g. befinden -> befinden, befand, befunden)
- DO NOT include extra words ANYWHERE EXCEPT FOR IN DECLARED CASES (e.g. DO NOT DO THIS: dahin -> dahin, dorthin. INSTEAD: dahin -> dahin)
- If a plural exists, ALWAYS write down the singular and the plural (e.g. die Werkstatt -en -> die Werkstatt, die Werkstätten)
- Separate plurals (of the same card) ALWAYS with commas, NEVER with semicolons. ALWAYS separate different cards with semicolons.
- If a noun does not change form in the plural, write it with both the singular and plural articles to reflect the change in number (e.g., "das Mädchen, -" -> "das Mädchen, die Mädchen"). If a noun only exists in the plural form, write only the plural (e.g., "die Eltern" remains "die Eltern").
- For words that only have a plural (e.g. Geschwister), just write the plural form (in this case "Die Geschwister")
- For adjectives, adverbs, prepositions and ALL OTHER WORDS, write down ONLY the words themselves (e.g. dahin -> dahin)

For example, given the following input:

das Taxi -s; die U-Bahn -en; befinden; die Werkstatt -en; mobil (ADJ); dahin

Output this:
das Taxi, die Taxis; die U-Bahn, die U-Bahnen; befinden, befand, befunden; die Werkstatt, die Werkstätten; mobil; dahin

Begin with the first card, then the second card, the third, and so on, until you've transcribed the last card.
If there is only one card, output the speakable phrase directly, WITHOUT a following semicolon.
NEVER output ANYTHING ELSE except for the semicolon-delimited string of speakable phrases, EVER! THIS IS THE MOST IMPORTANT!
'''

# Remember, the result should be SPEAKABLE, not READABLE, and needs to include ALL WORDS.


LLAMA_CARDS_TO_WORDS_PROMPT = '''
I'll give you a list of Anki cards. Your job is to transform the dictionary entry card into speakable words, card for card, so that it can be passed to a TTS program.
Your job is to transform cards into speakable phrases, separated by semicolons.

ALWAYS follow these instructions:
- MAKE SURE to output NOTHING ELSE but the result
- MAKE SURE to not forget a single word! Start from the top, and continue onwards to the bottom
- MAKE SURE to output NO EXTRA WORDS to the cards
- Remove any brackets containing words (e.g. (ADJ)), or word separators (e.g. aus|steigen -> aussteigen)
- For verbs, include past and past perfect tense (e.g. befinden -> befinden, befand, befunden)
- For adjectives, adverbs, prepositions and other words, write down ONLY the words themselves (e.g. dahin -> dahin)
- DO NOT include extra words ANYWHERE EXCEPT FOR IN DECLARED CASES (e.g. DO NOT DO THIS: dahin -> dahin, dorthin. INSTEAD: dahin -> dahin)
- If a plural exists, ALWAYS write down the singular and the plural (e.g. die Werkstatt -en -> die Werkstatt, die Werkstätten)
- Separate plurals (of the same card) ALWAYS with commas, NEVER with semicolons. ALWAYS separate different cards with semicolons.

For example, given the following input:

das Taxi -s; die U-Bahn -en; befinden; die Werkstatt -en; mobil (ADJ); dahin

Output this:
das Taxi, die Taxis; die U-Bahn, die U-Bahnen; befinden, befand, befunden; die Werkstatt, die Werkstätten; mobil; dahin

Remember, the result should be SPEAKABLE, not READABLE, and needs to include ALL WORDS.
MAKE SURE that you only output according to the given content, NOTHING MORE! DO NOT add any extra text, or change the formatting in any way.
'''

IMPROVED_CARDS_TO_WORDS_PROMPT = '''
I'll give you a list of German words and phrases. Your job is to transform them into speakable phrases, word for word, not leaving any out, separated by semicolons, ALWAYS following these rules:

- Output the singular and plural forms of nouns (e.g., "das Taxi -s" -> "das Taxi, die Taxis").
- For verbs, include the infinitive, past tense, and past participle (e.g., "scheiden" -> "scheiden, schied, geschieden").
- Write adjectives and adverbs in their base form only.
- Remove any brackets containing words (e.g. (ADJ), (NUR PL))
- ALWAYS remove word separators (e.g. aus|steigen -> aussteigen)
- Pronouns should be listed in both singular and plural forms where applicable.
- Use commas to separate different forms of the same word, and semicolons to separate different words or phrases.
- Do not add or omit any words; the output should match the input in number and order.
- If a noun does not change form in the plural, write it with both the singular and plural articles to reflect the change in number (e.g., "das Mädchen, -" -> "das Mädchen, die Mädchen"). If a noun only exists in the plural form, write only the plural (e.g., "die Eltern" remains "die Eltern").
- For words that only have a plural (e.g. Geschwister), just write the plural form (in this case "Die Geschwister")

Example Input:
das Taxi -s; die U-Bahn -en; befinden; die Werkstatt -en; mobil (ADJ); dahin; Eltern (NUR PL); der Palästinenser, -

Expected Output:
das Taxi, die Taxis; die U-Bahn, die U-Bahnen; befinden, befand, befunden; die Werkstatt, die Werkstätten; mobil; dahin; die Eltern; der Palästinenser
'''

# - MAKE SURE that the number and order of output words EQUALS the number and order of input words. This is CRITICALLY IMPORTANT! So DO NOT LEAVE ANYTHING OUT!!!

# - MAKE SURE to start at the top, and write a place for EVERY word! Do not leave any out.
# - End after the last card has been written out.
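
A minimal sketch of the contract these prompts set up with main.py: the model reply is one semicolon-separated string, which extract_words_from_cards() splits and checks against the number of input cards. The reply string and card list below are hypothetical.

# Hypothetical model reply for three cards, in the format the prompts demand.
reply = "das Taxi, die Taxis; die U-Bahn, die U-Bahnen; mobil"
cards = [("das Taxi -s", "the taxi"), ("die U-Bahn -en", "the subway"), ("mobil (ADJ)", "mobile")]

# Post-processing mirrors extract_words_from_cards() in main.py:
# split on ';', trim, and drop an empty or non-word trailing entry.
phrases = [p.split('\n')[0].strip() for p in reply.split(';')]
if phrases and not phrases[-1]:
    phrases = phrases[:-1]

# main.py retries (optionally with a smaller sub-batch or a different temperature)
# until the phrase count matches the card count.
assert len(phrases) == len(cards), f"{len(phrases)} phrases for {len(cards)} cards"
print(phrases)  # ['das Taxi, die Taxis', 'die U-Bahn, die U-Bahnen', 'mobil']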

pyproject.toml (new file, 21 lines)
@@ -0,0 +1,21 @@
[tool.poetry]
name = "ttsthing"
version = "0.1.0"
description = ""
authors = ["Yandrik <me@yandrik.dev>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "3.11.*"
pathlib = "^1.0.1"
openai = "^1.11.1"
tts = "^0.22.0"
torch = "^2.2.0"
pydub = "^0.25.1"
argparse = "^1.4.0"
requests = "^2.31.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

speaker.wav (new file, binary, not shown)
speaker2.wav (new file, binary, not shown)
speaker_shorter.wav (new file, binary, not shown)
speech.mp3 (new file, binary, not shown)