feat: fixed direct recording to webservice

2023-12-18 19:33:58 +01:00
parent 67951b4349
commit 1e2f0334f7
5 changed files with 548 additions and 860 deletions
--- a/main.py
+++ b/main.py
@@ -360,14 +360,16 @@ def main(page):

        set_transcribe_ready(False)

-    def paralyze_ui():
+    def paralyze_ui(spinner: bool = True, disable_recording_button: bool = True):
        model_size_select.current.disabled = True
        model_device_select.current.disabled = True
        # model_bits_select.current.disabled = True
        model_load_unload_button.current.disabled = True
-        processing_spinner.current.visible = True
+        processing_spinner.current.visible = spinner
        current_mode_select.current.disabled = True

+        record_button.current.disabled = disable_recording_button
+
        model_load_unload_button.current.icon = ft.icons.CLOSE
        model_load_unload_button.current.disabled = False
        for btn in transcribe_buttons:
@@ -392,6 +394,8 @@ def main(page):
            model_load_unload_button.current.icon = ft.icons.CLOSE
            model_load_unload_button.current.disabled = False

+            record_button.current.disabled = False
+
            if mm.is_model_loaded():
                current_mode_select.current.disabled = True
        else:
@@ -406,6 +410,8 @@ def main(page):
            processing_spinner.current.visible = False
            current_mode_select.current.disabled = False

+            record_button.current.disabled = True
+
        page.update()

    def on_url_input(e):
@@ -442,15 +448,23 @@ def main(page):

            recorded_audio = b"".join(sound_chunks)

+            set_transcribe_ready(False)
+
            transcribe(recorded_audio)

            recording = False

            # sound = pygame.mixer.Sound(buffer=recorded_audio)  # doesn't work because sampling rate is wrong
+            record_button.current.bgcolor = "0x000000FF"

-            print("playing back recorded sound")
+            set_transcribe_ready(True)
+
+            print("done")
            # sound.play()
        else:
+            if not transcribe_ready:
+                print("Can't record, not ready")
+                return
            print("Starting Recording...")
            recording = True

@@ -472,6 +486,9 @@ def main(page):

            rec_stream.start_stream()

+            record_button.current.bgcolor = "0xFFFF4444"
+            paralyze_ui(spinner=False, disable_recording_button=False)
+
    def find_recordingdevice_tuple_by_name(search_name: str) -> typing.Tuple[int, str] | None:
        """Return the first ``(device_id, name)`` pair in ``capture_devices`` named *search_name*.

        The comparison is an exact, case-sensitive string match on the name.

        :param search_name: device name to look for.
        :return: the matching ``(device_id, name)`` tuple, or ``None`` when no
            entry of ``capture_devices`` carries that name.
        """
        matches = ((device_id, name) for device_id, name in capture_devices if name == search_name)
        # Supply an explicit default: without it, next() raises StopIteration on
        # an unknown name instead of returning the None the signature promises.
        return next(matches, None)

@@ -594,5 +611,8 @@ def main(page):
        ], expand=True),
    )

+    # refresh all values, and make sure the right stuff is shown
+    mode_select()
+

 ft.app(target=main)
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,15 +9,13 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = ">=3.11, <3.13"
 flet = "^0.10.3"
-faster-whisper = "^0.9.0"
+faster-whisper = "0.10.0"
 pygame = "^2.5.2"
-torch = "2.0.0"
 requests = "^2.31.0"
 validators = "^0.22.0"
 pyinstaller = "^6.1.0"
-pysdl2 = "^0.9.16"
-pysdl2-dll = "^2.28.4"
 pyaudio = "^0.2.13"
+pydub = "^0.25.1"


 [build-system]
--- a/utils.py
+++ b/utils.py
@@ -3,6 +3,9 @@ import os

 from typing import DefaultDict, Dict, List

+from pydub import AudioSegment
+import io
+

 def tree() -> DefaultDict:
     """Build an auto-vivifying nested mapping.

     Accessing a missing key creates another ``tree()`` in its place, so
     arbitrarily deep paths can be assigned without creating the
     intermediate levels first.
     """
     nested: DefaultDict = defaultdict(tree)
     return nested
@@ -44,3 +47,15 @@ def defaultdict_to_dict(d: defaultdict) -> dict:
    if isinstance(d, defaultdict):
        d = {k: defaultdict_to_dict(v) for k, v in d.items()}
    return d
+
+
+def convert_to_mp3(audio_data: bytes, sample_width: int, frame_rate: int, channels: int) -> bytes:
+    """Encode raw audio bytes as MP3 and return the encoded bytes.
+
+    The input is wrapped in an in-memory stream, loaded with
+    ``AudioSegment.from_raw`` using the given format parameters, and
+    exported to a second in-memory buffer in ``"mp3"`` format.
+
+    :param audio_data: raw audio samples, passed to ``AudioSegment.from_raw``.
+    :param sample_width: forwarded to ``from_raw`` (presumably bytes per sample; confirm against pydub).
+    :param frame_rate: forwarded to ``from_raw`` (presumably samples per second; confirm against pydub).
+    :param channels: forwarded to ``from_raw`` as the channel count.
+    :return: the full contents of the MP3 export buffer.
+    """
+    # NOTE(review): pydub's MP3 export presumably needs an external encoder
+    # (e.g. ffmpeg) on the machine; confirm for packaged builds.
+    raw_stream = io.BytesIO(audio_data)
+    segment = AudioSegment.from_raw(
+        raw_stream,
+        sample_width=sample_width,
+        frame_rate=frame_rate,
+        channels=channels
+    )
+    encoded = io.BytesIO()
+    segment.export(encoded, format="mp3")
+    return encoded.getvalue()
--- a/whisper_webservice_interface.py
+++ b/whisper_webservice_interface.py
@@ -2,6 +2,8 @@ from typing import Optional, Union, Dict, Any

 import requests

+from utils import convert_to_mp3
+

 def send_asr_request(url: str, audio_file_path_or_bytes: str | bytes, task: Optional[str] = None, language: Optional[str] = None,
                     initial_prompt: Optional[str] = None, encode: Optional[bool] = None,
@@ -28,7 +30,7 @@ def send_asr_request(url: str, audio_file_path_or_bytes: str | bytes, task: Opti
        with open(audio_file_path_or_bytes, 'rb') as f:
            audio_file = f.read()
    else:
-        audio_file = audio_file_path_or_bytes
+        audio_file = convert_to_mp3(audio_file_path_or_bytes, sample_width=2, frame_rate=16000, channels=1)

    files = {
        'audio_file': audio_file