Compare commits
4 Commits
8c521b2997
...
7ccfc66e50
Author | SHA1 | Date | |
---|---|---|---|
7ccfc66e50 | |||
07fd885448 | |||
85c89d5343 | |||
d721eb3a5b |
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
/build
|
||||
/dist
|
||||
/__pycache__
|
3
.idea/.gitignore
vendored
Normal file
3
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
12
.idea/fluesterpost.iml
Normal file
12
.idea/fluesterpost.iml
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
<option name="format" value="PLAIN" />
|
||||
<option name="myDocStringFormat" value="Plain" />
|
||||
</component>
|
||||
</module>
|
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
7
.idea/misc.xml
Normal file
7
.idea/misc.xml
Normal file
@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Poetry (fluesterpost)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (fluesterpost)" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/fluesterpost.iml" filepath="$PROJECT_DIR$/.idea/fluesterpost.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
Binary file not shown.
Binary file not shown.
BIN
__pycache__/whisper_webservice_interface.cpython-311.pyc
Normal file
BIN
__pycache__/whisper_webservice_interface.cpython-311.pyc
Normal file
Binary file not shown.
539
main.py
539
main.py
@ -1,6 +1,10 @@
|
||||
import os
|
||||
import pprint
|
||||
import traceback
|
||||
import typing
|
||||
|
||||
import requests.exceptions
|
||||
import validators
|
||||
|
||||
import utils
|
||||
import flet as ft
|
||||
@ -10,30 +14,157 @@ from typing import DefaultDict
|
||||
import pygame
|
||||
|
||||
import nn_model_manager as mm
|
||||
import whisper_webservice_interface
|
||||
|
||||
import wave
|
||||
import sys
|
||||
import pyaudio
|
||||
|
||||
|
||||
# === TEMP ===
|
||||
import logging
|
||||
|
||||
logging.basicConfig()
|
||||
logging.getLogger("faster_whisper").setLevel(logging.DEBUG)
|
||||
|
||||
# === END ===
|
||||
|
||||
|
||||
# globals
|
||||
transcribe_ready: bool = False
|
||||
recording: bool = False
|
||||
rec_stream: pyaudio.Stream | None = None
|
||||
sound_chunks = []
|
||||
recorded_audio = []
|
||||
|
||||
# AUDIO stuff
|
||||
REC_CHUNK = 1024
|
||||
REC_FORMAT = pyaudio.paInt16
|
||||
REC_CHANNELS = 1
|
||||
REC_RATE = 16000
|
||||
REC_RECORD_SECONDS = 5
|
||||
|
||||
|
||||
def main(page):
|
||||
pygame.mixer.init()
|
||||
|
||||
first_name = ft.Ref[ft.TextField]()
|
||||
last_name = ft.Ref[ft.TextField]()
|
||||
greetings = ft.Ref[ft.Column]()
|
||||
# get audio device names
|
||||
p = pyaudio.PyAudio()
|
||||
|
||||
capture_devices = [(i, p.get_device_info_by_index(i)['name']) for i in range(p.get_device_count()) if
|
||||
p.get_device_info_by_index(i)['maxInputChannels'] > 0]
|
||||
|
||||
record_button = ft.Ref[ft.IconButton]()
|
||||
mic_select = ft.Ref[ft.Dropdown]()
|
||||
|
||||
file_tree = ft.Ref[ft.Column]()
|
||||
file_tree_empty_text = ft.Ref[ft.Text]()
|
||||
|
||||
load_model_text = ft.Ref[ft.Text]()
|
||||
# mode select
|
||||
current_mode_select = ft.Ref[ft.Dropdown]()
|
||||
current_mode_info_text = ft.Ref[ft.Text]()
|
||||
processing_spinner = ft.Ref[ft.ProgressRing]()
|
||||
|
||||
# local model mode
|
||||
model_size_select = ft.Ref[ft.Dropdown]()
|
||||
model_device_select = ft.Ref[ft.Dropdown]()
|
||||
# model_bits_select = ft.Ref[ft.Dropdown]()
|
||||
model_load_unload_button = ft.Ref[ft.IconButton]()
|
||||
model_loading_spinner = ft.Ref[ft.ProgressRing]()
|
||||
|
||||
# docker whisper webservice mode
|
||||
whisper_webservice_url_input = ft.Ref[ft.TextField]()
|
||||
|
||||
transcribe_buttons: list[ft.Ref[ft.IconButton]] = []
|
||||
|
||||
output_text_container = ft.Ref[ft.Container]()
|
||||
output_text_col = ft.Ref[ft.Column]()
|
||||
|
||||
# last opened folders
|
||||
|
||||
|
||||
def transcribe(fileOrBytes: str | bytes):
|
||||
print(f"DEBUG: trying to transcribe audio {fileOrBytes if isinstance(fileOrBytes, str) else f'with len {len(fileOrBytes)}'}")
|
||||
|
||||
# === LOCAL MODEL CODE ===
|
||||
if current_mode_select.current.value == 'local':
|
||||
if not mm.is_model_loaded() or (isinstance(fileOrBytes, str) and not fileOrBytes.endswith('.mp3')):
|
||||
print("DEBUG: can't transcribe a non-MP3 file or while no model is loaded")
|
||||
return
|
||||
|
||||
print(f"DEBUG: starting transcription")
|
||||
output_text_container.current.alignment = ft.alignment.center
|
||||
output_text_col.current.controls = [ft.ProgressRing()]
|
||||
|
||||
# set all transcribe buttons to disabled
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = True
|
||||
page.update()
|
||||
|
||||
try:
|
||||
if isinstance(fileOrBytes, str):
|
||||
segments, info = mm.transcribe_from_file(fileOrBytes)
|
||||
else:
|
||||
segments, info = mm.transcribe_from_i16_audio(fileOrBytes)
|
||||
|
||||
txt = ''
|
||||
|
||||
for seg in segments:
|
||||
txt += seg.text + '\n'
|
||||
|
||||
output_text_container.current.alignment = ft.alignment.top_left
|
||||
output_text_col.current.controls = [ft.Text(txt, selectable=True)] # TODO
|
||||
|
||||
except Exception as e:
|
||||
output_text_container.current.alignment = ft.alignment.center
|
||||
output_text_col.current.controls = [ft.Text(f"Transcribing failed: {str(e)}")] # TODO
|
||||
|
||||
finally:
|
||||
# set all transcribe buttons to disabled
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = False
|
||||
page.update()
|
||||
|
||||
# === WEBSERVICE MODE CODE ===
|
||||
elif current_mode_select.current.value == 'webservice':
|
||||
url = whisper_webservice_url_input.current.value
|
||||
print(f"DEBUG: starting web transcription")
|
||||
if validators.url(url, simple_host=True):
|
||||
|
||||
output_text_container.current.alignment = ft.alignment.center
|
||||
output_text_col.current.controls = [ft.ProgressRing()]
|
||||
# set all transcribe buttons to disabled
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = True
|
||||
page.update()
|
||||
|
||||
try:
|
||||
print(f'DEBUG: sending web request...')
|
||||
code, text = whisper_webservice_interface.send_asr_request(url, fileOrBytes, task="transcribe")
|
||||
except requests.exceptions.RequestException as e:
|
||||
output_text_container.current.alignment = ft.alignment.center
|
||||
print(f'web transcription failed: {str(e)}')
|
||||
output_text_col.current.controls = \
|
||||
[ft.Text(f"HTTP Request to {url}/asr failed. Reason:\n{str(e)}")]
|
||||
# set all transcribe buttons to enabled
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = False
|
||||
page.update()
|
||||
return
|
||||
|
||||
# set all transcribe buttons to enabled
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = False
|
||||
|
||||
if code == 200:
|
||||
output_text_container.current.alignment = ft.alignment.top_left
|
||||
output_text_col.current.controls = [ft.Text(text, selectable=True)]
|
||||
else:
|
||||
output_text_container.current.alignment = ft.alignment.center
|
||||
output_text_col.current.controls = \
|
||||
[ft.Text(f"HTTP Request to {url}/asr failed ({code}):\n{text}")]
|
||||
|
||||
page.update()
|
||||
|
||||
def generate_file_tree(path: str, tree_dict: dict | DefaultDict):
|
||||
if path[-1] == os.sep:
|
||||
path = path[:-1]
|
||||
@ -99,47 +230,17 @@ def main(page):
|
||||
_button_ref = ft.Ref[ft.IconButton]()
|
||||
|
||||
control.append(ft.IconButton(icon=ft.icons.PLAY_CIRCLE_OUTLINED, ref=_button_ref,
|
||||
on_click=lambda _, f=full_file_path, r=_button_ref: start_playing(f, r)))
|
||||
|
||||
def transcribe(filepath: str):
|
||||
print(f"DEBUG: trying to transcribe file {filepath}")
|
||||
if not mm.is_model_loaded() or not filepath.endswith('.mp3'):
|
||||
return
|
||||
|
||||
print(f"DEBUG: starting transcription")
|
||||
output_text_container.current.alignment = ft.alignment.center
|
||||
output_text_col.current.controls = [ft.ProgressRing()]
|
||||
|
||||
# set all transcribe buttons to disabled
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = True
|
||||
page.update()
|
||||
|
||||
try:
|
||||
segments, info = mm.transcribe_from_file(filepath)
|
||||
|
||||
txt = ''
|
||||
|
||||
for seg in segments:
|
||||
txt += seg.text + '\n'
|
||||
|
||||
output_text_container.current.alignment = ft.alignment.top_left
|
||||
output_text_col.current.controls = [ft.Text(txt, selectable=True)] # TODO
|
||||
|
||||
except Exception as e:
|
||||
output_text_container.current.alignment = ft.alignment.center
|
||||
output_text_col.current.controls = [ft.Text(f"Transcribing failed: {str(e)}")] # TODO
|
||||
|
||||
finally:
|
||||
# set all transcribe buttons to disabled
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = False
|
||||
page.update()
|
||||
|
||||
on_click=lambda _, f=full_file_path, r=_button_ref: start_playing(f, r)))
|
||||
|
||||
transcribe_button_ref = ft.Ref[ft.IconButton]()
|
||||
|
||||
control.append(ft.IconButton(icon=ft.icons.FORMAT_ALIGN_LEFT, disabled=not mm.is_model_loaded(), ref=transcribe_button_ref,
|
||||
# check enabled
|
||||
enabled = (current_mode_select.current.value == 'local' and mm.is_model_loaded()) or (
|
||||
current_mode_select.current.value == 'webservice' and
|
||||
validators.url(whisper_webservice_url_input.current.value, simple_host=True))
|
||||
|
||||
control.append(ft.IconButton(icon=ft.icons.FORMAT_ALIGN_LEFT, disabled=not enabled,
|
||||
ref=transcribe_button_ref,
|
||||
on_click=lambda _, f=full_file_path: transcribe(f)))
|
||||
|
||||
transcribe_buttons.append(transcribe_button_ref)
|
||||
@ -155,17 +256,11 @@ def main(page):
|
||||
]
|
||||
)
|
||||
|
||||
def btn_click(e):
|
||||
greetings.current.controls.append(
|
||||
ft.Text(f"Hello, {first_name.current.value} {last_name.current.value}!")
|
||||
)
|
||||
first_name.current.value = ""
|
||||
last_name.current.value = ""
|
||||
page.update()
|
||||
first_name.current.focus()
|
||||
|
||||
def on_dialog_result(e: ft.FilePickerResultEvent):
|
||||
path = e.path
|
||||
def on_dialog_result(e: ft.FilePickerResultEvent | str):
|
||||
if isinstance(e, ft.FilePickerResultEvent):
|
||||
path = e.path
|
||||
else:
|
||||
path = e
|
||||
if path:
|
||||
print(f"path is {path}")
|
||||
try:
|
||||
@ -179,21 +274,72 @@ def main(page):
|
||||
)
|
||||
file_tree_empty_text.current.visible = False
|
||||
|
||||
# add to last opened folders
|
||||
|
||||
last_opened_folders = page.client_storage.get('last_opened_folders') if page.client_storage.contains_key(
|
||||
'last_opened_folders') else []
|
||||
|
||||
if path not in last_opened_folders:
|
||||
last_opened_folders.append(path)
|
||||
last_opened_folders = last_opened_folders[-10:]
|
||||
page.client_storage.set('last_opened_folders', last_opened_folders)
|
||||
|
||||
page.update()
|
||||
except e:
|
||||
print("didn't work aaa") # TODO: fix
|
||||
except Exception as e:
|
||||
print(f"An error occurred when building the file tree: {str(e)}")
|
||||
|
||||
|
||||
def mode_select():
|
||||
global transcribe_ready
|
||||
if mm.is_model_loaded():
|
||||
print("BUG: cannot change mode while model is loaded!")
|
||||
return
|
||||
|
||||
next_mode = current_mode_select.current.value
|
||||
if next_mode == 'local':
|
||||
# enable model selects & loads
|
||||
model_size_select.current.visible = True
|
||||
model_device_select.current.visible = True
|
||||
model_load_unload_button.current.visible = True
|
||||
model_size_select.current.disabled = False
|
||||
model_device_select.current.disabled = False
|
||||
|
||||
whisper_webservice_url_input.current.visible = False
|
||||
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = True
|
||||
|
||||
set_transcribe_ready(False)
|
||||
|
||||
elif next_mode == 'webservice':
|
||||
# enable model selects & loads
|
||||
model_size_select.current.visible = False
|
||||
model_device_select.current.visible = False
|
||||
model_load_unload_button.current.visible = False
|
||||
model_size_select.current.disabled = True
|
||||
model_device_select.current.disabled = True
|
||||
model_load_unload_button.current.disabled = True
|
||||
current_mode_info_text.current.value = 'Input the URL of the onerahmet/openai-whisper-asr-webservice docker container'
|
||||
|
||||
whisper_webservice_url_input.current.visible = True
|
||||
whisper_webservice_url_input.current.disabled = False
|
||||
|
||||
on_url_input(None)
|
||||
|
||||
|
||||
else:
|
||||
raise Exception(f'BUG: Impossible mode {next_mode} received!')
|
||||
|
||||
page.update()
|
||||
page.client_storage.set('selected_mode', next_mode)
|
||||
|
||||
def load_model():
|
||||
current_mode_info_text.current.value = 'Loading... This may take a while.'
|
||||
|
||||
load_model_text.current.value = 'Loading... This may take a while.'
|
||||
|
||||
model_size_select.current.disabled = True
|
||||
model_device_select.current.disabled = True
|
||||
# model_bits_select.current.disabled = True
|
||||
model_load_unload_button.current.disabled = True
|
||||
model_loading_spinner.current.visible = True
|
||||
page.update()
|
||||
|
||||
paralyze_ui()
|
||||
|
||||
try:
|
||||
mm.set_model(
|
||||
size=model_size_select.current.value or 'base',
|
||||
@ -203,55 +349,165 @@ def main(page):
|
||||
except Exception as e:
|
||||
print(f"loading model failed. Exception: {str(e)}")
|
||||
print(traceback.format_exc())
|
||||
load_model_text.current.value = f'Loading failed. Reason:\n{str(e)}'
|
||||
model_size_select.current.disabled = False
|
||||
model_device_select.current.disabled = False
|
||||
# model_bits_select.current.disabled = False
|
||||
current_mode_info_text.current.value = f'Loading failed. Reason:\n{str(e)}'
|
||||
set_transcribe_ready(False)
|
||||
|
||||
# raise e
|
||||
|
||||
model_loading_spinner.current.visible = False
|
||||
model_load_unload_button.current.disabled = False
|
||||
processing_spinner.current.visible = False
|
||||
|
||||
if mm.is_model_loaded():
|
||||
load_model_text.current.value = f'Loaded.'
|
||||
model_load_unload_button.current.icon = ft.icons.CLOSE
|
||||
model_load_unload_button.current.on_click = lambda _: unload_model()
|
||||
current_mode_info_text.current.value = f'Loaded.'
|
||||
|
||||
# if successful, save to shared preferences
|
||||
page.client_storage.set('model_size', model_size_select.current.value)
|
||||
page.client_storage.set('device_select', model_device_select.current.value)
|
||||
|
||||
# set all transcribe buttons to enabled
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = False
|
||||
|
||||
page.update()
|
||||
set_transcribe_ready(True)
|
||||
else:
|
||||
set_transcribe_ready(False)
|
||||
|
||||
def unload_model():
|
||||
model_load_unload_button.current.disabled = True
|
||||
|
||||
# set all transcribe buttons to disabled
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = True
|
||||
|
||||
page.update()
|
||||
paralyze_ui()
|
||||
|
||||
if mm.is_model_loaded():
|
||||
mm.unload_model()
|
||||
|
||||
load_model_text.current.value = 'Select parameters, and then load transcription model.'
|
||||
model_size_select.current.disabled = False
|
||||
model_device_select.current.disabled = False
|
||||
# model_bits_select.current.disabled = False
|
||||
set_transcribe_ready(False)
|
||||
|
||||
def paralyze_ui(spinner: bool = True, disable_recording_button: bool = True):
|
||||
model_size_select.current.disabled = True
|
||||
model_device_select.current.disabled = True
|
||||
# model_bits_select.current.disabled = True
|
||||
model_load_unload_button.current.disabled = True
|
||||
processing_spinner.current.visible = spinner
|
||||
current_mode_select.current.disabled = True
|
||||
|
||||
record_button.current.disabled = disable_recording_button
|
||||
|
||||
model_load_unload_button.current.icon = ft.icons.CLOSE
|
||||
model_load_unload_button.current.disabled = False
|
||||
model_load_unload_button.current.icon = ft.icons.START
|
||||
model_load_unload_button.current.on_click = lambda _: load_model()
|
||||
model_loading_spinner.current.visible = False
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = True
|
||||
model_load_unload_button.current.disabled = True
|
||||
page.update()
|
||||
|
||||
def set_transcribe_ready(rdy: bool):
|
||||
global transcribe_ready
|
||||
transcribe_ready = rdy
|
||||
|
||||
if transcribe_ready:
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = False
|
||||
model_size_select.current.disabled = True
|
||||
model_device_select.current.disabled = True
|
||||
# model_bits_select.current.disabled = True
|
||||
model_load_unload_button.current.disabled = True
|
||||
processing_spinner.current.visible = False
|
||||
model_load_unload_button.current.on_click = lambda _: unload_model()
|
||||
|
||||
model_load_unload_button.current.icon = ft.icons.CLOSE
|
||||
model_load_unload_button.current.disabled = False
|
||||
|
||||
record_button.current.disabled = False
|
||||
|
||||
if mm.is_model_loaded():
|
||||
current_mode_select.current.disabled = True
|
||||
else:
|
||||
for btn in transcribe_buttons:
|
||||
btn.current.disabled = True
|
||||
model_size_select.current.disabled = False
|
||||
model_device_select.current.disabled = False
|
||||
# model_bits_select.current.disabled = False
|
||||
model_load_unload_button.current.disabled = False
|
||||
model_load_unload_button.current.icon = ft.icons.START
|
||||
model_load_unload_button.current.on_click = lambda _: load_model()
|
||||
processing_spinner.current.visible = False
|
||||
current_mode_select.current.disabled = False
|
||||
|
||||
record_button.current.disabled = True
|
||||
|
||||
page.update()
|
||||
|
||||
def on_url_input(e):
    """
    React to a change of the webservice URL text field.

    The current field value is validated with ``validators.url(..., simple_host=True)``.
    A valid URL is stored in client storage under ``'webservice_url'`` and
    transcription is marked ready; an invalid one marks it not ready.
    The page is refreshed afterwards.

    :param e: the change event; not used (``mode_select`` calls this with ``None``).
    """
    entered_url = whisper_webservice_url_input.current.value
    url_is_valid = bool(validators.url(entered_url, simple_host=True))

    if url_is_valid:
        # remember the last URL that passed validation
        page.client_storage.set('webservice_url', entered_url)

    # enables (valid URL) or disables (invalid URL) all transcribe buttons
    set_transcribe_ready(url_is_valid)

    page.update()
|
||||
|
||||
print(tuple(page.client_storage.get('selected_mic')))
|
||||
|
||||
def toggle_recording():
    """
    Start a microphone recording, or stop the running one and transcribe it.

    Stopping: halts and closes the input stream, joins the chunks collected by
    the stream callback into ``recorded_audio`` and passes them to
    ``transcribe()``. Transcription is marked not-ready while that call runs
    and ready again afterwards.

    Starting: only possible while ``transcribe_ready`` is set. Opens a
    callback-driven input stream configured from REC_FORMAT, REC_CHANNELS,
    REC_RATE and REC_CHUNK, tints the record button and locks the rest of the
    UI through ``paralyze_ui`` (the record button itself stays enabled).
    """
    global recording
    global rec_stream
    global sound_chunks
    global recorded_audio

    if recording:
        print("Stopping recording...")

        rec_stream.stop_stream()

        while not rec_stream.is_stopped():
            pass  # wait until stopped

        # A fresh stream is opened for every recording, so the finished one
        # must be closed here; otherwise each start/stop cycle leaks a stream.
        rec_stream.close()
        rec_stream = None

        recorded_audio = b"".join(sound_chunks)

        set_transcribe_ready(False)

        transcribe(recorded_audio)

        recording = False

        # sound = pygame.mixer.Sound(buffer=recorded_audio)  # doesn't work because sampling rate is wrong
        record_button.current.bgcolor = "0x000000FF"

        set_transcribe_ready(True)

        print("done")
        # sound.play()
    else:
        if not transcribe_ready:
            print("Can't record, not ready")
            return
        print("Starting Recording...")
        recording = True

        sound_chunks = []

        def cb(in_data, _frame_count, _time_info, _status):
            # Invoked by PyAudio for each captured buffer: keep the raw bytes
            # and ask the stream to continue.
            sound_chunks.append(in_data)
            print(_time_info)
            return in_data, pyaudio.paContinue

        rec_stream = p.open(
            format=REC_FORMAT,
            channels=REC_CHANNELS,
            rate=REC_RATE,
            input=True,
            frames_per_buffer=REC_CHUNK,
            stream_callback=cb
        )

        rec_stream.start_stream()

        record_button.current.bgcolor = "0xFFFF4444"
        paralyze_ui(spinner=False, disable_recording_button=False)
|
||||
|
||||
def find_recordingdevice_tuple_by_name(search_name: str) -> typing.Tuple[int, str] | None:
|
||||
return next(((device_id, name) for device_id, name in capture_devices if name == search_name))
|
||||
|
||||
# set up file picker
|
||||
file_picker = ft.FilePicker(on_result=on_dialog_result)
|
||||
@ -263,11 +519,55 @@ def main(page):
|
||||
ft.Divider()
|
||||
)
|
||||
|
||||
mode = page.client_storage.get('selected_mode') if page.client_storage.contains_key('selected_mode') else 'local'
|
||||
|
||||
# last opened folders
|
||||
|
||||
|
||||
# build controls list
|
||||
last_opened_folders = page.client_storage.get('last_opened_folders') if page.client_storage.contains_key(
|
||||
'last_opened_folders') else []
|
||||
|
||||
if not (isinstance(last_opened_folders, list) and all(isinstance(item, str) for item in last_opened_folders)):
|
||||
last_opened_folders = []
|
||||
|
||||
# TODO: rebuild when last_opened_folders changes
|
||||
last_opened = [
|
||||
ft.PopupMenuItem(
|
||||
on_click=lambda _, folder_name=folder_name: on_dialog_result( folder_name ),
|
||||
content=ft.Row([
|
||||
ft.Icon(ft.icons.FOLDER, color=ft.colors.BLUE),
|
||||
ft.Text(folder_name, size=14, weight=ft.FontWeight.BOLD),
|
||||
])
|
||||
)
|
||||
for folder_name in last_opened_folders
|
||||
]
|
||||
|
||||
|
||||
page.add(
|
||||
ft.ResponsiveRow([
|
||||
ft.Container(
|
||||
ft.Column([
|
||||
ft.ElevatedButton("Add Folder", on_click=lambda _: file_picker.get_directory_path()),
|
||||
ft.Row([
|
||||
ft.ElevatedButton("Add Folder", on_click=lambda _: file_picker.get_directory_path()),
|
||||
ft.PopupMenuButton(
|
||||
items=last_opened,
|
||||
),
|
||||
ft.Container(expand=True),
|
||||
ft.IconButton(ft.icons.RECORD_VOICE_OVER, ref=record_button,
|
||||
on_click=lambda _: toggle_recording()),
|
||||
]),
|
||||
ft.Dropdown(
|
||||
ref=mic_select,
|
||||
options=[ft.dropdown.Option(x[1]) for x in capture_devices],
|
||||
value=page.client_storage.get('selected_mic')[1] if (
|
||||
page.client_storage.contains_key('selected_mic') and tuple(
|
||||
page.client_storage.get('selected_mic')) in capture_devices) else capture_devices[0][1],
|
||||
height=36,
|
||||
content_padding=2,
|
||||
on_change=lambda _: page.client_storage.set('selected_mic', find_recordingdevice_tuple_by_name(
|
||||
mic_select.current.value)) if mic_select.current.value else None
|
||||
),
|
||||
ft.Column(ref=file_tree, scroll=ft.ScrollMode.ALWAYS, expand=True),
|
||||
# ft.ListView(ref=file_tree),
|
||||
ft.Text("No Folder Open Yet", style=ft.TextTheme.body_small, color="grey",
|
||||
@ -275,21 +575,44 @@ def main(page):
|
||||
], expand=True), expand=True, col=4),
|
||||
ft.Container(expand=True, content=ft.Column(expand=True, controls=[
|
||||
ft.Column([
|
||||
ft.Text('Select parameters, and then load transcription model.', ref=load_model_text),
|
||||
ft.Text(
|
||||
'Select parameters, and then load transcription model.'
|
||||
if mode == 'local'
|
||||
else 'Input the URL of the onerahmet/openai-whisper-asr-webservice docker container'
|
||||
, ref=current_mode_info_text),
|
||||
ft.Row([
|
||||
ft.Dropdown(
|
||||
ref=current_mode_select,
|
||||
width=160,
|
||||
hint_text='mode',
|
||||
value=mode,
|
||||
on_change=lambda _: mode_select(),
|
||||
options=[
|
||||
ft.dropdown.Option('local'),
|
||||
ft.dropdown.Option('webservice'),
|
||||
],
|
||||
),
|
||||
|
||||
# === LOCAL MODE ===
|
||||
ft.Dropdown(
|
||||
ref=model_size_select,
|
||||
width=100,
|
||||
hint_text='model size',
|
||||
value=page.client_storage.get('model_size') if page.client_storage.contains_key('model_size') else 'base',
|
||||
options=[ft.dropdown.Option(x) for x in mm.ModelSize.__args__], # __args__ is not perfect here. But works.
|
||||
value=page.client_storage.get('model_size') if page.client_storage.contains_key(
|
||||
'model_size') else 'base',
|
||||
options=[ft.dropdown.Option(x) for x in mm.ModelSize.__args__],
|
||||
# __args__ is not perfect here. But works.
|
||||
visible=mode == 'local',
|
||||
),
|
||||
ft.Dropdown(
|
||||
ref=model_device_select,
|
||||
width=100,
|
||||
hint_text='device',
|
||||
value=page.client_storage.get('device_select') if page.client_storage.contains_key('device_select') else 'auto',
|
||||
options=[ft.dropdown.Option(x) for x in mm.Device.__args__] # __args__ is not perfect here. But works.
|
||||
value=page.client_storage.get('device_select') if page.client_storage.contains_key(
|
||||
'device_select') else 'auto',
|
||||
options=[ft.dropdown.Option(x) for x in mm.Device.__args__],
|
||||
visible=mode == 'local',
|
||||
# __args__ is not perfect here. But works.
|
||||
),
|
||||
# ft.Dropdown(
|
||||
# ref=model_bits_select,
|
||||
@ -297,13 +620,26 @@ def main(page):
|
||||
# hint_text='bits',
|
||||
# value='16bit',
|
||||
# options=[ft.dropdown.Option(x) for x in mm.ComputeType.__args__] # __args__ is not perfect here. But works.
|
||||
#),
|
||||
# ),
|
||||
ft.IconButton(
|
||||
icon=ft.icons.START,
|
||||
ref=model_load_unload_button,
|
||||
on_click=lambda _: load_model(),
|
||||
visible=mode == 'local',
|
||||
),
|
||||
ft.ProgressRing(ref=model_loading_spinner, visible=False)
|
||||
# === WEBSERVICE MODE ===
|
||||
ft.TextField(
|
||||
ref=whisper_webservice_url_input,
|
||||
visible=mode == 'webservice',
|
||||
on_change=on_url_input,
|
||||
hint_text='e.g. http://localhost:9000',
|
||||
value=page.client_storage.get('webservice_url') if page.client_storage.contains_key(
|
||||
'webservice_url') else '',
|
||||
),
|
||||
# TODO: question mark hint button about what the web service is
|
||||
|
||||
# === GENERAL ===
|
||||
ft.ProgressRing(ref=processing_spinner, visible=False)
|
||||
])
|
||||
]),
|
||||
ft.Container(expand=True, padding=12, border=ft.border.all(2, 'grey'),
|
||||
@ -311,12 +647,15 @@ def main(page):
|
||||
ref=output_text_container,
|
||||
content=ft.Column(
|
||||
[ft.Text('Nothing to see here!', text_align=ft.TextAlign.CENTER)],
|
||||
ref=output_text_col,
|
||||
expand=True,
|
||||
scroll=ft.ScrollMode.ADAPTIVE)),
|
||||
ref=output_text_col,
|
||||
expand=True,
|
||||
scroll=ft.ScrollMode.ADAPTIVE)),
|
||||
]), col=8)
|
||||
], expand=True),
|
||||
)
|
||||
|
||||
# refresh all values, and make sure the right stuff is shown
|
||||
mode_select()
|
||||
|
||||
|
||||
ft.app(target=main)
|
||||
|
37
main.spec
Normal file
37
main.spec
Normal file
@ -0,0 +1,37 @@
|
||||
# -*- mode: python ; coding: utf-8 -*-
|
||||
|
||||
|
||||
a = Analysis(
|
||||
['main.py'],
|
||||
pathex=[],
|
||||
binaries=[],
|
||||
datas=[],
|
||||
hiddenimports=[],
|
||||
hookspath=[],
|
||||
hooksconfig={},
|
||||
runtime_hooks=[],
|
||||
excludes=[],
|
||||
noarchive=False,
|
||||
)
|
||||
pyz = PYZ(a.pure)
|
||||
|
||||
exe = EXE(
|
||||
pyz,
|
||||
a.scripts,
|
||||
a.binaries,
|
||||
a.datas,
|
||||
[],
|
||||
name='main',
|
||||
debug=False,
|
||||
bootloader_ignore_signals=False,
|
||||
strip=False,
|
||||
upx=True,
|
||||
upx_exclude=[],
|
||||
runtime_tmpdir=None,
|
||||
console=False,
|
||||
disable_windowed_traceback=False,
|
||||
argv_emulation=False,
|
||||
target_arch=None,
|
||||
codesign_identity=None,
|
||||
entitlements_file=None,
|
||||
)
|
@ -1,5 +1,7 @@
|
||||
import io
|
||||
import threading
|
||||
|
||||
import numpy as np
|
||||
from faster_whisper import WhisperModel
|
||||
import faster_whisper
|
||||
from typing import Literal, Iterable, Tuple
|
||||
@ -47,6 +49,30 @@ def is_model_loaded() -> bool:
|
||||
return _model is not None
|
||||
|
||||
|
||||
def transcribe_from_i16_audio(audio: bytes) -> Tuple[Iterable[
    faster_whisper.transcribe.Segment], faster_whisper.transcribe.TranscriptionInfo] | None:
    """
    Transcribe raw 16-bit integer PCM audio held in memory.

    The bytes are interpreted as int16 samples, scaled to float32 by dividing
    by 32768, and passed to the loaded model's ``transcribe`` with
    ``beam_size=5``.
    Note that this can - and will - crash if you don't catch exceptions.

    If the model isn't loaded yet, this will return None.
    Otherwise, it will return the raw transcription from `faster-whisper`.

    NOTE(review): the sample rate and channel layout are not checked here;
    presumably the audio must match what the model expects - confirm against
    the recorder settings.
    """
    if not is_model_loaded():
        return None

    # Convert s16 to f32.
    samples = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / 32768.0

    # transcribe, and throw all exceptions to application to handle
    # (_model is only read here, so no `global` declaration is needed)
    segments, info = _model.transcribe(samples, beam_size=5)

    return segments, info
|
||||
|
||||
|
||||
def transcribe_from_file(mp3_path: str) -> Tuple[Iterable[faster_whisper.transcribe.Segment], faster_whisper.transcribe.TranscriptionInfo] | None:
|
||||
"""
|
||||
Transcribe audio from an MP3 file.
|
||||
|
7
openapitools.json
Normal file
7
openapitools.json
Normal file
@ -0,0 +1,7 @@
|
||||
{
|
||||
"$schema": "./node_modules/@openapitools/openapi-generator-cli/config.schema.json",
|
||||
"spaces": 2,
|
||||
"generator-cli": {
|
||||
"version": "7.0.1"
|
||||
}
|
||||
}
|
1418
poetry.lock
generated
1418
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -7,11 +7,15 @@ license = "MIT"
|
||||
readme = "README.md"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.11"
|
||||
python = ">=3.11, <3.13"
|
||||
flet = "^0.10.3"
|
||||
faster-whisper = "^0.9.0"
|
||||
faster-whisper = "0.10.0"
|
||||
pygame = "^2.5.2"
|
||||
torch = "2.0.0"
|
||||
requests = "^2.31.0"
|
||||
validators = "^0.22.0"
|
||||
pyinstaller = "^6.1.0"
|
||||
pyaudio = "^0.2.13"
|
||||
pydub = "^0.25.1"
|
||||
|
||||
|
||||
[build-system]
|
||||
|
15
utils.py
15
utils.py
@ -3,6 +3,9 @@ import os
|
||||
|
||||
from typing import DefaultDict, Dict, List
|
||||
|
||||
from pydub import AudioSegment
|
||||
import io
|
||||
|
||||
|
||||
def tree() -> DefaultDict:
|
||||
return defaultdict(tree)
|
||||
@ -44,3 +47,15 @@ def defaultdict_to_dict(d: defaultdict) -> dict:
|
||||
if isinstance(d, defaultdict):
|
||||
d = {k: defaultdict_to_dict(v) for k, v in d.items()}
|
||||
return d
|
||||
|
||||
|
||||
def convert_to_mp3(audio_data: bytes, sample_width: int, frame_rate: int, channels: int) -> bytes:
    """
    Encode raw audio bytes as MP3 and return the encoded bytes.

    ``audio_data`` is wrapped in an in-memory stream and read with pydub's
    ``AudioSegment.from_raw`` using the given ``sample_width``, ``frame_rate``
    and ``channels``; the resulting segment is exported with ``format="mp3"``
    into a second in-memory buffer whose contents are returned.
    """
    raw_stream = io.BytesIO(audio_data)
    segment = AudioSegment.from_raw(
        raw_stream,
        sample_width=sample_width,
        frame_rate=frame_rate,
        channels=channels,
    )

    encoded = io.BytesIO()
    segment.export(encoded, format="mp3")
    return encoded.getvalue()
|
86
whisper_webservice_interface.py
Normal file
86
whisper_webservice_interface.py
Normal file
@ -0,0 +1,86 @@
|
||||
from typing import Optional, Union, Dict, Any
|
||||
|
||||
import requests
|
||||
|
||||
from utils import convert_to_mp3
|
||||
|
||||
|
||||
def send_asr_request(url: str, audio_file_path_or_bytes: str | bytes, task: Optional[str] = None, language: Optional[str] = None,
                     initial_prompt: Optional[str] = None, encode: Optional[bool] = None,
                     output: Optional[str] = None, word_timestamps: Optional[bool] = None,
                     timeout: Optional[float] = None) -> tuple[int, str]:
    """
    Send a request to the ASR endpoint (``<url>/asr``).

    :param url: base URL of the webservice; a trailing slash is tolerated.
    :param audio_file_path_or_bytes: either a path to an audio file, which is
        uploaded as-is, or raw audio bytes, which are first converted to MP3
        via ``convert_to_mp3`` with ``sample_width=2``, ``frame_rate=16000``
        and ``channels=1``.
    :param task, language, initial_prompt, encode, output, word_timestamps:
        forwarded as query parameters; parameters left at ``None`` are omitted.
    :param timeout: passed to ``requests.post``; ``None`` (the default) waits
        without a limit, which is the previous behaviour.
    :return: always a tuple ``(status_code, response_text)``; the caller must
        check the status code to tell a transcription from an error message.
    :raises requests.exceptions.RequestException: if the HTTP request fails.
    """
    # Strip trailing slashes so "http://host:9000/" does not become ".../​/asr".
    endpoint = f"{url.rstrip('/')}/asr"

    params = {
        "task": task,
        "language": language,
        "initial_prompt": initial_prompt,
        "encode": encode,
        "output": output,
        "word_timestamps": word_timestamps
    }

    # only send the options the caller actually set
    params = {k: v for k, v in params.items() if v is not None}

    if isinstance(audio_file_path_or_bytes, str):
        with open(audio_file_path_or_bytes, 'rb') as f:
            audio_file = f.read()
    else:
        audio_file = convert_to_mp3(audio_file_path_or_bytes, sample_width=2, frame_rate=16000, channels=1)

    files = {
        'audio_file': audio_file
    }

    response = requests.post(endpoint, params=params, files=files, timeout=timeout)

    return response.status_code, response.text
|
||||
|
||||
|
||||
def detect_language(url: str, audio_file_path: str, encode: Optional[bool] = None,
                    timeout: Optional[float] = None) -> Dict[str, Any] | tuple[int, str]:
    """
    Send a request to the Detect Language endpoint (``<url>/detect-language``).

    :param url: base URL of the webservice; a trailing slash is tolerated.
    :param audio_file_path: path of the audio file to upload.
    :param encode: forwarded as a query parameter when not ``None``.
    :param timeout: passed to ``requests.post``; ``None`` (the default) waits
        without a limit, which is the previous behaviour.
    :return: either a dictionary of the form
        {'detected_language': '<LANG>', 'language_code': '<LANG_CODE>'} if the request
        was successful, or a tuple of the form (status_code, response_text) otherwise.
    :raises requests.exceptions.RequestException: if the HTTP request fails.
    """
    # Strip trailing slashes so the joined URL never contains "//detect-language".
    endpoint = f"{url.rstrip('/')}/detect-language"

    params = {
        "encode": encode
    }

    # only send the options the caller actually set
    params = {k: v for k, v in params.items() if v is not None}

    with open(audio_file_path, 'rb') as f:
        audio_file = f.read()

    files = {
        'audio_file': audio_file
    }

    response = requests.post(endpoint, params=params, files=files, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    else:
        return response.status_code, response.text
|
||||
|
||||
|
||||
# Example usage
|
||||
def main():
    """
    Example usage: transcribe a sample recording, then detect its language,
    printing the result of each request.
    """
    base_url = "http://127.0.0.1:9000"  # Replace with the actual URL of the webservice
    sample_path = "/run/media/yannik/IC RECORDER/REC_FILE/Interview01/231021_1541.mp3"

    # 1) speech-to-text
    print(send_asr_request(base_url, sample_path, task="transcribe", language="en"))

    # 2) language detection on the same file
    print(detect_language(base_url, sample_path))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in New Issue
Block a user