feat: fixed direct recording to webservice

2023-12-18 19:33:58 +01:00
parent d721eb3a5b
commit 85c89d5343
5 changed files with 548 additions and 860 deletions
--- a/main.py
+++ b/main.py
@ -360,14 +360,16 @@ def main(page):
        set_transcribe_ready(False)
-    def paralyze_ui():
+    def paralyze_ui(spinner: bool = True, disable_recording_button: bool = True):
        model_size_select.current.disabled = True
        model_device_select.current.disabled = True
        # model_bits_select.current.disabled = True
        model_load_unload_button.current.disabled = True
-        processing_spinner.current.visible = True
+        processing_spinner.current.visible = spinner
        current_mode_select.current.disabled = True
        record_button.current.disabled = disable_recording_button
        model_load_unload_button.current.icon = ft.icons.CLOSE
        model_load_unload_button.current.disabled = False
        for btn in transcribe_buttons:
@ -392,6 +394,8 @@ def main(page):
            model_load_unload_button.current.icon = ft.icons.CLOSE
            model_load_unload_button.current.disabled = False
            record_button.current.disabled = False
            if mm.is_model_loaded():
                current_mode_select.current.disabled = True
        else:
@ -406,6 +410,8 @@ def main(page):
            processing_spinner.current.visible = False
            current_mode_select.current.disabled = False
            record_button.current.disabled = True
        page.update()
    def on_url_input(e):
@ -442,15 +448,23 @@ def main(page):
            recorded_audio = b"".join(sound_chunks)
            set_transcribe_ready(False)
            transcribe(recorded_audio)
            recording = False
            # sound = pygame.mixer.Sound(buffer=recorded_audio)  # doesn't work because sampling rate is wrong
            record_button.current.bgcolor = "0x000000FF"
-            print("playing back recorded sound")
+            set_transcribe_ready(True)
            print("done")
            # sound.play()
        else:
            if not transcribe_ready:
                print("Can't record, not ready")
                return
            print("Starting Recording...")
            recording = True
@ -472,6 +486,9 @@ def main(page):
            rec_stream.start_stream()
            record_button.current.bgcolor = "0xFFFF4444"
            paralyze_ui(spinner=False, disable_recording_button=False)
    def find_recordingdevice_tuple_by_name(search_name: str) -> typing.Tuple[int, str] | None:
        return next(((device_id, name) for device_id, name in capture_devices if name == search_name))
@ -594,5 +611,8 @@ def main(page):
        ], expand=True),
    )
    # refresh all values, and make sure the right stuff is shown
    mode_select()
 ft.app(target=main)
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -9,15 +9,13 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = ">=3.11, <3.13"
 flet = "^0.10.3"
-faster-whisper = "^0.9.0"
+faster-whisper = "0.10.0"
 pygame = "^2.5.2"
 torch = "2.0.0"
 requests = "^2.31.0"
 validators = "^0.22.0"
 pyinstaller = "^6.1.0"
 pysdl2 = "^0.9.16"
 pysdl2-dll = "^2.28.4"
 pyaudio = "^0.2.13"
 pydub = "^0.25.1"
 [build-system]
--- a/utils.py
+++ b/utils.py
@ -3,6 +3,9 @@ import os
 from typing import DefaultDict, Dict, List
 from pydub import AudioSegment
 import io
 def tree() -> DefaultDict:
    """Create an empty, arbitrarily nestable mapping.

    The returned defaultdict uses ``tree`` itself as its factory, so reading a
    missing key creates another mapping of the same kind at that key.
    """
    nested: DefaultDict = defaultdict(tree)
    return nested
@ -44,3 +47,15 @@ def defaultdict_to_dict(d: defaultdict) -> dict:
    if isinstance(d, defaultdict):
        d = {k: defaultdict_to_dict(v) for k, v in d.items()}
    return d
 def convert_to_mp3(audio_data: bytes, sample_width: int, frame_rate: int, channels: int) -> bytes:
    """Encode a raw audio buffer as MP3, entirely in memory.

    :param audio_data: headerless raw audio bytes.
    :param sample_width: forwarded to ``AudioSegment.from_raw`` as ``sample_width``
        (presumably bytes per sample -- confirm against the pydub docs).
    :param frame_rate: forwarded to ``AudioSegment.from_raw`` as ``frame_rate``.
    :param channels: forwarded to ``AudioSegment.from_raw`` as ``channels``.
    :return: the bytes written by exporting the segment with ``format="mp3"``.
    """
    raw_stream = io.BytesIO(audio_data)
    segment = AudioSegment.from_raw(
        raw_stream,
        sample_width=sample_width,
        frame_rate=frame_rate,
        channels=channels,
    )

    # Export into a byte buffer rather than a file so nothing touches disk here.
    encoded = io.BytesIO()
    segment.export(encoded, format="mp3")
    return encoded.getvalue()
--- a/whisper_webservice_interface.py
+++ b/whisper_webservice_interface.py
@ -2,6 +2,8 @@ from typing import Optional, Union, Dict, Any
 import requests
 from utils import convert_to_mp3
 def send_asr_request(url: str, audio_file_path_or_bytes: str | bytes, task: Optional[str] = None, language: Optional[str] = None,
                     initial_prompt: Optional[str] = None, encode: Optional[bool] = None,
@ -28,7 +30,7 @@ def send_asr_request(url: str, audio_file_path_or_bytes: str | bytes, task: Opti
        with open(audio_file_path_or_bytes, 'rb') as f:
            audio_file = f.read()
    else:
-        audio_file = audio_file_path_or_bytes
+        audio_file = convert_to_mp3(audio_file_path_or_bytes, sample_width=2, frame_rate=16000, channels=1)
    files = {
        'audio_file': audio_file