Compare commits

...

4 Commits

18 changed files with 1309 additions and 920 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
/build
/dist
/__pycache__

3
.idea/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

12
.idea/fluesterpost.iml Normal file

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

6
.idea/inspectionProfiles/profiles_settings.xml Normal file

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml Normal file

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Poetry (fluesterpost)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (fluesterpost)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/fluesterpost.iml" filepath="$PROJECT_DIR$/.idea/fluesterpost.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

Binary file not shown.

509
main.py

@@ -1,6 +1,10 @@
import os
import pprint
import traceback
import typing
import requests.exceptions
import validators
import utils
import flet as ft
@@ -10,30 +14,157 @@ from typing import DefaultDict
import pygame
import nn_model_manager as mm
import whisper_webservice_interface
import wave
import sys
import pyaudio
# === TEMP ===
import logging
logging.basicConfig()
logging.getLogger("faster_whisper").setLevel(logging.DEBUG)
# === END ===
# globals
transcribe_ready: bool = False
recording: bool = False
rec_stream: pyaudio.Stream | None = None
sound_chunks = []
recorded_audio = []
# AUDIO stuff
REC_CHUNK = 1024
REC_FORMAT = pyaudio.paInt16
REC_CHANNELS = 1
REC_RATE = 16000
REC_RECORD_SECONDS = 5
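# Sanity check on these settings: paInt16 is 2 bytes per sample, so mono capture
# at 16 kHz yields 32,000 bytes of raw PCM per second, and each REC_CHUNK of
# 1024 frames is roughly 64 ms of audio per callback.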
def main(page):
pygame.mixer.init()
first_name = ft.Ref[ft.TextField]()
last_name = ft.Ref[ft.TextField]()
greetings = ft.Ref[ft.Column]()
# get audio device names
p = pyaudio.PyAudio()
capture_devices = [(i, p.get_device_info_by_index(i)['name']) for i in range(p.get_device_count()) if
p.get_device_info_by_index(i)['maxInputChannels'] > 0]
record_button = ft.Ref[ft.IconButton]()
mic_select = ft.Ref[ft.Dropdown]()
file_tree = ft.Ref[ft.Column]()
file_tree_empty_text = ft.Ref[ft.Text]()
load_model_text = ft.Ref[ft.Text]()
# mode select
current_mode_select = ft.Ref[ft.Dropdown]()
current_mode_info_text = ft.Ref[ft.Text]()
processing_spinner = ft.Ref[ft.ProgressRing]()
# local model mode
model_size_select = ft.Ref[ft.Dropdown]()
model_device_select = ft.Ref[ft.Dropdown]()
# model_bits_select = ft.Ref[ft.Dropdown]()
model_load_unload_button = ft.Ref[ft.IconButton]()
model_loading_spinner = ft.Ref[ft.ProgressRing]()
# docker whisper webservice mode
whisper_webservice_url_input = ft.Ref[ft.TextField]()
transcribe_buttons: list[ft.Ref[ft.IconButton]] = []
output_text_container = ft.Ref[ft.Container]()
output_text_col = ft.Ref[ft.Column]()
# last opened folders
def transcribe(fileOrBytes: str | bytes):
print(f"DEBUG: trying to transcribe audio {fileOrBytes if isinstance(fileOrBytes, str) else f'with len {len(fileOrBytes)}'}")
# === LOCAL MODEL CODE ===
if current_mode_select.current.value == 'local':
if not mm.is_model_loaded() or (isinstance(fileOrBytes, str) and not fileOrBytes.endswith('.mp3')):
print("DEBUG: can't transcribe a non-MP3 file or while no model is loaded")
return
print(f"DEBUG: starting transcription")
output_text_container.current.alignment = ft.alignment.center
output_text_col.current.controls = [ft.ProgressRing()]
# set all transcribe buttons to disabled
for btn in transcribe_buttons:
btn.current.disabled = True
page.update()
try:
if isinstance(fileOrBytes, str):
segments, info = mm.transcribe_from_file(fileOrBytes)
else:
segments, info = mm.transcribe_from_i16_audio(fileOrBytes)
txt = ''
for seg in segments:
txt += seg.text + '\n'
output_text_container.current.alignment = ft.alignment.top_left
output_text_col.current.controls = [ft.Text(txt, selectable=True)] # TODO
except Exception as e:
output_text_container.current.alignment = ft.alignment.center
output_text_col.current.controls = [ft.Text(f"Transcribing failed: {str(e)}")] # TODO
finally:
# set all transcribe buttons to disabled
for btn in transcribe_buttons:
btn.current.disabled = False
page.update()
# === WEBSERVICE MODE CODE ===
elif current_mode_select.current.value == 'webservice':
url = whisper_webservice_url_input.current.value
print(f"DEBUG: starting web transcription")
if validators.url(url, simple_host=True):
output_text_container.current.alignment = ft.alignment.center
output_text_col.current.controls = [ft.ProgressRing()]
# set all transcribe buttons to disabled
for btn in transcribe_buttons:
btn.current.disabled = True
page.update()
try:
print(f'DEBUG: sending web request...')
code, text = whisper_webservice_interface.send_asr_request(url, fileOrBytes, task="transcribe")
except requests.exceptions.RequestException as e:
output_text_container.current.alignment = ft.alignment.center
print(f'web transcription failed: {str(e)}')
output_text_col.current.controls = \
[ft.Text(f"HTTP Request to {url}/asr failed. Reason:\n{str(e)}")]
# set all transcribe buttons to enabled
for btn in transcribe_buttons:
btn.current.disabled = False
page.update()
return
# set all transcribe buttons to enabled
for btn in transcribe_buttons:
btn.current.disabled = False
if code == 200:
output_text_container.current.alignment = ft.alignment.top_left
output_text_col.current.controls = [ft.Text(text, selectable=True)]
else:
output_text_container.current.alignment = ft.alignment.center
output_text_col.current.controls = \
[ft.Text(f"HTTP Request to {url}/asr failed ({code}):\n{text}")]
page.update()
def generate_file_tree(path: str, tree_dict: dict | DefaultDict):
if path[-1] == os.sep:
path = path[:-1]
@@ -101,45 +232,15 @@ def main(page):
control.append(ft.IconButton(icon=ft.icons.PLAY_CIRCLE_OUTLINED, ref=_button_ref,
on_click=lambda _, f=full_file_path, r=_button_ref: start_playing(f, r)))
def transcribe(filepath: str):
print(f"DEBUG: trying to transcribe file {filepath}")
if not mm.is_model_loaded() or not filepath.endswith('.mp3'):
return
print(f"DEBUG: starting transcription")
output_text_container.current.alignment = ft.alignment.center
output_text_col.current.controls = [ft.ProgressRing()]
# set all transcribe buttons to disabled
for btn in transcribe_buttons:
btn.current.disabled = True
page.update()
try:
segments, info = mm.transcribe_from_file(filepath)
txt = ''
for seg in segments:
txt += seg.text + '\n'
output_text_container.current.alignment = ft.alignment.top_left
output_text_col.current.controls = [ft.Text(txt, selectable=True)] # TODO
except Exception as e:
output_text_container.current.alignment = ft.alignment.center
output_text_col.current.controls = [ft.Text(f"Transcribing failed: {str(e)}")] # TODO
finally:
# set all transcribe buttons to disabled
for btn in transcribe_buttons:
btn.current.disabled = False
page.update()
transcribe_button_ref = ft.Ref[ft.IconButton]()
control.append(ft.IconButton(icon=ft.icons.FORMAT_ALIGN_LEFT, disabled=not mm.is_model_loaded(), ref=transcribe_button_ref,
# check enabled
enabled = (current_mode_select.current.value == 'local' and mm.is_model_loaded()) or (
current_mode_select.current.value == 'webservice' and
validators.url(whisper_webservice_url_input.current.value, simple_host=True))
control.append(ft.IconButton(icon=ft.icons.FORMAT_ALIGN_LEFT, disabled=not enabled,
ref=transcribe_button_ref,
on_click=lambda _, f=full_file_path: transcribe(f)))
transcribe_buttons.append(transcribe_button_ref)
@@ -155,17 +256,11 @@ def main(page):
]
)
def btn_click(e):
greetings.current.controls.append(
ft.Text(f"Hello, {first_name.current.value} {last_name.current.value}!")
)
first_name.current.value = ""
last_name.current.value = ""
page.update()
first_name.current.focus()
def on_dialog_result(e: ft.FilePickerResultEvent):
def on_dialog_result(e: ft.FilePickerResultEvent | str):
if isinstance(e, ft.FilePickerResultEvent):
path = e.path
else:
path = e
if path:
print(f"path is {path}")
try:
@@ -179,20 +274,71 @@ def main(page):
)
file_tree_empty_text.current.visible = False
# add to last opened folders
last_opened_folders = page.client_storage.get('last_opened_folders') if page.client_storage.contains_key(
'last_opened_folders') else []
if path not in last_opened_folders:
last_opened_folders.append(path)
last_opened_folders = last_opened_folders[-10:]
page.client_storage.set('last_opened_folders', last_opened_folders)
page.update()
except e:
print("didn't work aaa") # TODO: fix
except Exception as e:
print(f"An error occurred when building the file tree: {str(e)}")
def load_model():
load_model_text.current.value = 'Loading... This may take a while.'
def mode_select():
global transcribe_ready
if mm.is_model_loaded():
print("BUG: cannot change mode while model is loaded!")
return
next_mode = current_mode_select.current.value
if next_mode == 'local':
# enable model selects & loads
model_size_select.current.visible = True
model_device_select.current.visible = True
model_load_unload_button.current.visible = True
model_size_select.current.disabled = False
model_device_select.current.disabled = False
whisper_webservice_url_input.current.visible = False
for btn in transcribe_buttons:
btn.current.disabled = True
set_transcribe_ready(False)
elif next_mode == 'webservice':
# enable model selects & loads
model_size_select.current.visible = False
model_device_select.current.visible = False
model_load_unload_button.current.visible = False
model_size_select.current.disabled = True
model_device_select.current.disabled = True
# model_bits_select.current.disabled = True
model_load_unload_button.current.disabled = True
model_loading_spinner.current.visible = True
current_mode_info_text.current.value = 'Input the URL of the onerahmet/openai-whisper-asr-webservice docker container'
whisper_webservice_url_input.current.visible = True
whisper_webservice_url_input.current.disabled = False
on_url_input(None)
else:
raise Exception(f'BUG: Impossible mode {next_mode} received!')
page.update()
page.client_storage.set('selected_mode', next_mode)
def load_model():
current_mode_info_text.current.value = 'Loading... This may take a while.'
page.update()
paralyze_ui()
try:
mm.set_model(
@@ -203,55 +349,165 @@ def main(page):
except Exception as e:
print(f"loading model failed. Exception: {str(e)}")
print(traceback.format_exc())
load_model_text.current.value = f'Loading failed. Reason:\n{str(e)}'
model_size_select.current.disabled = False
model_device_select.current.disabled = False
# model_bits_select.current.disabled = False
current_mode_info_text.current.value = f'Loading failed. Reason:\n{str(e)}'
set_transcribe_ready(False)
# raise e
model_loading_spinner.current.visible = False
model_load_unload_button.current.disabled = False
processing_spinner.current.visible = False
if mm.is_model_loaded():
load_model_text.current.value = f'Loaded.'
model_load_unload_button.current.icon = ft.icons.CLOSE
model_load_unload_button.current.on_click = lambda _: unload_model()
current_mode_info_text.current.value = f'Loaded.'
# if successful, save to shared preferences
page.client_storage.set('model_size', model_size_select.current.value)
page.client_storage.set('device_select', model_device_select.current.value)
# set all transcribe buttons to enabled
for btn in transcribe_buttons:
btn.current.disabled = False
page.update()
set_transcribe_ready(True)
else:
set_transcribe_ready(False)
def unload_model():
model_load_unload_button.current.disabled = True
# set all transcribe buttons to disabled
for btn in transcribe_buttons:
btn.current.disabled = True
page.update()
paralyze_ui()
if mm.is_model_loaded():
mm.unload_model()
load_model_text.current.value = 'Select parameters, and then load transcription model.'
set_transcribe_ready(False)
def paralyze_ui(spinner: bool = True, disable_recording_button: bool = True):
model_size_select.current.disabled = True
model_device_select.current.disabled = True
# model_bits_select.current.disabled = True
model_load_unload_button.current.disabled = True
processing_spinner.current.visible = spinner
current_mode_select.current.disabled = True
record_button.current.disabled = disable_recording_button
model_load_unload_button.current.icon = ft.icons.CLOSE
model_load_unload_button.current.disabled = False
for btn in transcribe_buttons:
btn.current.disabled = True
model_load_unload_button.current.disabled = True
page.update()
def set_transcribe_ready(rdy: bool):
global transcribe_ready
transcribe_ready = rdy
if transcribe_ready:
for btn in transcribe_buttons:
btn.current.disabled = False
model_size_select.current.disabled = True
model_device_select.current.disabled = True
# model_bits_select.current.disabled = True
model_load_unload_button.current.disabled = True
processing_spinner.current.visible = False
model_load_unload_button.current.on_click = lambda _: unload_model()
model_load_unload_button.current.icon = ft.icons.CLOSE
model_load_unload_button.current.disabled = False
record_button.current.disabled = False
if mm.is_model_loaded():
current_mode_select.current.disabled = True
else:
for btn in transcribe_buttons:
btn.current.disabled = True
model_size_select.current.disabled = False
model_device_select.current.disabled = False
# model_bits_select.current.disabled = False
model_load_unload_button.current.disabled = False
model_load_unload_button.current.icon = ft.icons.START
model_load_unload_button.current.on_click = lambda _: load_model()
model_loading_spinner.current.visible = False
processing_spinner.current.visible = False
current_mode_select.current.disabled = False
record_button.current.disabled = True
page.update()
def on_url_input(e):
url_value = whisper_webservice_url_input.current.value
# print(url_value)
if validators.url(url_value, simple_host=True):
# print('valid')
page.client_storage.set('webservice_url', url_value)
# set all transcribe buttons to enabled
set_transcribe_ready(True)
else:
# print('invalid')
# set all transcribe buttons to disabled
set_transcribe_ready(False)
page.update()
if page.client_storage.contains_key('selected_mic'):
    print(tuple(page.client_storage.get('selected_mic')))
def toggle_recording():
global recording
global rec_stream
global sound_chunks
global recorded_audio
if recording:
print("Stopping recording...")
rec_stream.stop_stream()
while not rec_stream.is_stopped():
pass # wait until stopped
recorded_audio = b"".join(sound_chunks)
set_transcribe_ready(False)
transcribe(recorded_audio)
recording = False
# sound = pygame.mixer.Sound(buffer=recorded_audio) # doesn't work because sampling rate is wrong
record_button.current.bgcolor = "0x000000FF"
set_transcribe_ready(True)
print("done")
# sound.play()
else:
if not transcribe_ready:
print("Can't record, not ready")
return
print("Starting Recording...")
recording = True
sound_chunks = []
def cb(in_data, _frame_count, _time_info, _status):
sound_chunks.append(in_data)
print(_time_info)
return in_data, pyaudio.paContinue
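# PyAudio invokes cb from its own audio thread roughly once per REC_CHUNK
# (about every 64 ms at 16 kHz); returning paContinue keeps the stream
# running until stop_stream() is called in the branch above.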
rec_stream = p.open(
format=REC_FORMAT,
channels=REC_CHANNELS,
rate=REC_RATE,
input=True,
frames_per_buffer=REC_CHUNK,
stream_callback=cb
)
rec_stream.start_stream()
record_button.current.bgcolor = "0xFFFF4444"
paralyze_ui(spinner=False, disable_recording_button=False)
def find_recordingdevice_tuple_by_name(search_name: str) -> typing.Tuple[int, str] | None:
return next(((device_id, name) for device_id, name in capture_devices if name == search_name), None)
# set up file picker
file_picker = ft.FilePicker(on_result=on_dialog_result)
@@ -263,11 +519,55 @@ def main(page):
ft.Divider()
)
mode = page.client_storage.get('selected_mode') if page.client_storage.contains_key('selected_mode') else 'local'
# last opened folders
# build controls list
last_opened_folders = page.client_storage.get('last_opened_folders') if page.client_storage.contains_key(
'last_opened_folders') else []
if not (isinstance(last_opened_folders, list) and all(isinstance(item, str) for item in last_opened_folders)):
last_opened_folders = []
# TODO: rebuild when last_opened_folders changes
last_opened = [
ft.PopupMenuItem(
on_click=lambda _, folder_name=folder_name: on_dialog_result(folder_name),
content=ft.Row([
ft.Icon(ft.icons.FOLDER, color=ft.colors.BLUE),
ft.Text(folder_name, size=14, weight=ft.FontWeight.BOLD),
])
)
for folder_name in last_opened_folders
]
page.add(
ft.ResponsiveRow([
ft.Container(
ft.Column([
ft.Row([
ft.ElevatedButton("Add Folder", on_click=lambda _: file_picker.get_directory_path()),
ft.PopupMenuButton(
items=last_opened,
),
ft.Container(expand=True),
ft.IconButton(ft.icons.RECORD_VOICE_OVER, ref=record_button,
on_click=lambda _: toggle_recording()),
]),
ft.Dropdown(
ref=mic_select,
options=[ft.dropdown.Option(x[1]) for x in capture_devices],
value=page.client_storage.get('selected_mic')[1] if (
page.client_storage.contains_key('selected_mic') and tuple(
page.client_storage.get('selected_mic')) in capture_devices) else capture_devices[0][1],
height=36,
content_padding=2,
on_change=lambda _: page.client_storage.set('selected_mic', find_recordingdevice_tuple_by_name(
mic_select.current.value)) if mic_select.current.value else None
),
ft.Column(ref=file_tree, scroll=ft.ScrollMode.ALWAYS, expand=True),
# ft.ListView(ref=file_tree),
ft.Text("No Folder Open Yet", style=ft.TextTheme.body_small, color="grey",
@@ -275,21 +575,44 @@ def main(page):
], expand=True), expand=True, col=4),
ft.Container(expand=True, content=ft.Column(expand=True, controls=[
ft.Column([
ft.Text('Select parameters, and then load transcription model.', ref=load_model_text),
ft.Text(
'Select parameters, and then load transcription model.'
if mode == 'local'
else 'Input the URL of the onerahmet/openai-whisper-asr-webservice docker container'
, ref=current_mode_info_text),
ft.Row([
ft.Dropdown(
ref=current_mode_select,
width=160,
hint_text='mode',
value=mode,
on_change=lambda _: mode_select(),
options=[
ft.dropdown.Option('local'),
ft.dropdown.Option('webservice'),
],
),
# === LOCAL MODE ===
ft.Dropdown(
ref=model_size_select,
width=100,
hint_text='model size',
value=page.client_storage.get('model_size') if page.client_storage.contains_key('model_size') else 'base',
options=[ft.dropdown.Option(x) for x in mm.ModelSize.__args__], # __args__ is not perfect here. But works.
value=page.client_storage.get('model_size') if page.client_storage.contains_key(
'model_size') else 'base',
options=[ft.dropdown.Option(x) for x in mm.ModelSize.__args__],
# __args__ is not perfect here. But works.
visible=mode == 'local',
),
ft.Dropdown(
ref=model_device_select,
width=100,
hint_text='device',
value=page.client_storage.get('device_select') if page.client_storage.contains_key('device_select') else 'auto',
options=[ft.dropdown.Option(x) for x in mm.Device.__args__] # __args__ is not perfect here. But works.
value=page.client_storage.get('device_select') if page.client_storage.contains_key(
'device_select') else 'auto',
options=[ft.dropdown.Option(x) for x in mm.Device.__args__],
visible=mode == 'local',
# __args__ is not perfect here. But works.
),
# ft.Dropdown(
# ref=model_bits_select,
@@ -302,8 +625,21 @@ def main(page):
icon=ft.icons.START,
ref=model_load_unload_button,
on_click=lambda _: load_model(),
visible=mode == 'local',
),
ft.ProgressRing(ref=model_loading_spinner, visible=False)
# === WEBSERVICE MODE ===
ft.TextField(
ref=whisper_webservice_url_input,
visible=mode == 'webservice',
on_change=on_url_input,
hint_text='e.g. http://localhost:9000',
value=page.client_storage.get('webservice_url') if page.client_storage.contains_key(
'webservice_url') else '',
),
# TODO: question mark hint button about what the web service is
# === GENERAL ===
ft.ProgressRing(ref=processing_spinner, visible=False)
])
]),
ft.Container(expand=True, padding=12, border=ft.border.all(2, 'grey'),
@@ -318,5 +654,8 @@ def main(page):
], expand=True),
)
# refresh all values, and make sure the right stuff is shown
mode_select()
ft.app(target=main)
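The rewritten UI code above leans on flet's Ref indirection throughout: controls are declared with ref=..., then mutated via .current and pushed with page.update(). A minimal standalone sketch of that pattern, using the same APIs as above (control names hypothetical):

import flet as ft

def demo(page: ft.Page):
    status = ft.Ref[ft.Text]()
    page.add(ft.Text("loading...", ref=status))
    # mutate the control through the Ref, then push the change to the UI
    status.current.value = "ready"
    page.update()

ft.app(target=demo)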

37
main.spec Normal file

@@ -0,0 +1,37 @@
# -*- mode: python ; coding: utf-8 -*-
a = Analysis(
['main.py'],
pathex=[],
binaries=[],
datas=[],
hiddenimports=[],
hookspath=[],
hooksconfig={},
runtime_hooks=[],
excludes=[],
noarchive=False,
)
pyz = PYZ(a.pure)
exe = EXE(
pyz,
a.scripts,
a.binaries,
a.datas,
[],
name='main',
debug=False,
bootloader_ignore_signals=False,
strip=False,
upx=True,
upx_exclude=[],
runtime_tmpdir=None,
console=False,
disable_windowed_traceback=False,
argv_emulation=False,
target_arch=None,
codesign_identity=None,
entitlements_file=None,
)
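PyInstaller consumes this spec directly: assuming the Poetry environment from pyproject.toml below, poetry run pyinstaller main.spec should reproduce the windowed (console=False) one-file build, since the EXE bundles a.binaries and a.datas with no separate COLLECT step.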

nn_model_manager.py

@@ -1,5 +1,7 @@
import io
import threading
import numpy as np
from faster_whisper import WhisperModel
import faster_whisper
from typing import Literal, Iterable, Tuple
@@ -47,6 +49,30 @@ def is_model_loaded() -> bool:
return _model is not None
def transcribe_from_i16_audio(audio: bytes) -> Tuple[Iterable[
faster_whisper.transcribe.Segment], faster_whisper.transcribe.TranscriptionInfo] | None:
"""
Transcribe audio from raw signed 16-bit mono PCM bytes (16 kHz), as produced by the recorder.
Note that this can - and will - crash if you don't catch exceptions.
If the model isn't loaded yet, this will return None.
Otherwise, it will return the raw transcription from `faster-whisper`.
"""
if not is_model_loaded():
return None
data = np.frombuffer(audio, dtype=np.int16)
# Convert s16 to f32.
data = data.astype(np.float32) / 32768.0
global _model
segments, info = _model.transcribe(data, beam_size=5)
# transcribe, and throw all exceptions to application to handle
return segments, info
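# A hedged usage sketch (assumes set_model(...) succeeded earlier, as main.py's
# load_model() ensures; the buffer is whatever the PyAudio callback collected):
#
#     result = transcribe_from_i16_audio(b"".join(sound_chunks))
#     if result is not None:
#         segments, info = result
#         print("".join(seg.text for seg in segments))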
def transcribe_from_file(mp3_path: str) -> Tuple[Iterable[faster_whisper.transcribe.Segment], faster_whisper.transcribe.TranscriptionInfo] | None:
"""
Transcribe audio from an MP3 file.

7
openapitools.json Normal file

@@ -0,0 +1,7 @@
{
"$schema": "./node_modules/@openapitools/openapi-generator-cli/config.schema.json",
"spaces": 2,
"generator-cli": {
"version": "7.0.1"
}
}
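This pins @openapitools/openapi-generator-cli to 7.0.1. A typical use, and an assumption since the diff does not show one, would be generating a typed client from the webservice's schema, e.g. npx openapi-generator-cli generate -i http://localhost:9000/openapi.json -g python -o generated/ (assuming the service publishes its OpenAPI document at /openapi.json).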

1418
poetry.lock generated

File diff suppressed because it is too large.

pyproject.toml

@ -7,11 +7,15 @@ license = "MIT"
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
python = ">=3.11, <3.13"
flet = "^0.10.3"
faster-whisper = "^0.9.0"
faster-whisper = "0.10.0"
pygame = "^2.5.2"
torch = "2.0.0"
requests = "^2.31.0"
validators = "^0.22.0"
pyinstaller = "^6.1.0"
pyaudio = "^0.2.13"
pydub = "^0.25.1"
[build-system]

utils.py

@@ -3,6 +3,9 @@ import os
from typing import DefaultDict, Dict, List
from pydub import AudioSegment
import io
def tree() -> DefaultDict:
return defaultdict(tree)
@@ -44,3 +47,15 @@ def defaultdict_to_dict(d: defaultdict) -> dict:
if isinstance(d, defaultdict):
d = {k: defaultdict_to_dict(v) for k, v in d.items()}
return d
def convert_to_mp3(audio_data: bytes, sample_width: int, frame_rate: int, channels: int) -> bytes:
audio = AudioSegment.from_raw(
io.BytesIO(audio_data),
sample_width=sample_width,
frame_rate=frame_rate,
channels=channels
)
mp3_buffer = io.BytesIO()
audio.export(mp3_buffer, format="mp3")
return mp3_buffer.getvalue()
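The sample_width=2, frame_rate=16000, channels=1 arguments used by callers mirror main.py's paInt16 mono capture at 16 kHz. A minimal sketch, assuming pydub can find ffmpeg on PATH for the MP3 export (input buffer hypothetical):

from utils import convert_to_mp3

raw_pcm = b"".join(sound_chunks)  # hypothetical capture buffer: mono s16 PCM at 16 kHz
mp3_bytes = convert_to_mp3(raw_pcm, sample_width=2, frame_rate=16000, channels=1)
with open("capture.mp3", "wb") as f:
    f.write(mp3_bytes)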

86
whisper_webservice_interface.py Normal file

@@ -0,0 +1,86 @@
from typing import Optional, Union, Dict, Any
import requests
from utils import convert_to_mp3
def send_asr_request(url: str, audio_file_path_or_bytes: str | bytes, task: Optional[str] = None, language: Optional[str] = None,
initial_prompt: Optional[str] = None, encode: Optional[bool] = None,
output: Optional[str] = None, word_timestamps: Optional[bool] = None) -> tuple[int, str]:
"""
Send a request to the ASR endpoint.
Returns a tuple of the form (status_code, response_text); on success the
status code is 200 and the response text contains the transcription.
"""
endpoint = f"{url}/asr"
params = {
"task": task,
"language": language,
"initial_prompt": initial_prompt,
"encode": encode,
"output": output,
"word_timestamps": word_timestamps
}
params = {k: v for k, v in params.items() if v is not None}
if isinstance(audio_file_path_or_bytes, str):
with open(audio_file_path_or_bytes, 'rb') as f:
audio_file = f.read()
else:
audio_file = convert_to_mp3(audio_file_path_or_bytes, sample_width=2, frame_rate=16000, channels=1)
files = {
'audio_file': audio_file
}
response = requests.post(endpoint, params=params, files=files)
return response.status_code, response.text
def detect_language(url: str, audio_file_path: str, encode: Optional[bool] = None) -> Dict[str, Any] | tuple[int, str]:
"""
Send a request to the Detect Language endpoint.
Returns either a dictionary of the form {'detected_language': '<LANG>', 'language_code': '<LANG_CODE>'} if the request
was successful, or a tuple of the form (status_code, response_text) otherwise.
"""
endpoint = f"{url}/detect-language"
params = {
"encode": encode
}
params = {k: v for k, v in params.items() if v is not None}
with open(audio_file_path, 'rb') as f:
audio_file = f.read()
files = {
'audio_file': audio_file
}
response = requests.post(endpoint, params=params, files=files)
if response.status_code == 200:
return response.json()
else:
return response.status_code, response.text
# Example usage
def main():
url = "http://127.0.0.1:9000" # Replace with the actual URL of the webservice
audio_file_path = "/run/media/yannik/IC RECORDER/REC_FILE/Interview01/231021_1541.mp3"
response = send_asr_request(url, audio_file_path, task="transcribe", language="en")
print(response)
response = detect_language(url, audio_file_path)
print(response)
if __name__ == "__main__":
main()