fluesterpost/nn_model_manager.py


import numpy as np
import faster_whisper
from faster_whisper import WhisperModel
from typing import Literal, Iterable, Tuple

# The currently loaded model, if any. Module-level so all callers share it.
_model: WhisperModel | None = None

ModelSize = Literal["tiny", "base", "small", "medium", "large-v1", "large-v2"]
Device = Literal["cuda", "cpu", "auto"]
ComputeType = Literal["8bit", "16bit", "32bit"]

def set_model(size: ModelSize, device: Device):  # , compute_type: ComputeType):
    """Load a `faster-whisper` model of the given size on the given device."""
    # Compute-type selection is disabled for now; kept below for reference.
    # Note: the original '16bit' CUDA branch selected 'int8', which looks
    # like a typo - 'float16' is the 16-bit compute type.
    # compute = None
    # if compute_type == '8bit':
    #     if device == 'cuda' or device == 'auto':
    #         compute = 'int8_float16'
    #     else:
    #         compute = 'int8'
    # elif compute_type == '16bit':
    #     if device == 'cuda' or device == 'auto':
    #         compute = 'float16'
    #     else:
    #         raise Exception("Cannot do 16 bit computing on CPU")
    # elif compute_type == '32bit':
    #     compute = 'float'
    # else:
    #     raise Exception(f"Invalid Compute / Device configuration (device {device} with {compute_type})")
    global _model
    _model = WhisperModel(size, device=device)

def unload_model():
    if not is_model_loaded():
        return
    global _model
    # Dropping the only reference lets the model be garbage-collected,
    # which should also release its (CPU or GPU) memory.
    _model = None


def is_model_loaded() -> bool:
    return _model is not None

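# Lifecycle sketch: `set_model` replaces the module-level reference, so a
# model can be swapped at runtime; `unload_model` frees it entirely.
#
#     set_model("small", "auto")
#     assert is_model_loaded()
#     unload_model()
#     assert not is_model_loaded()
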
def transcribe_from_i16_audio(audio: bytes) -> Tuple[
        Iterable[faster_whisper.transcribe.Segment],
        faster_whisper.transcribe.TranscriptionInfo] | None:
    """
    Transcribe raw signed 16-bit PCM audio (mono, 16 kHz - the format
    `faster-whisper` expects when given a numpy array).

    Note that this can - and will - crash if you don't catch exceptions.
    If the model isn't loaded yet, this will return None.
    Otherwise, it will return the raw transcription from `faster-whisper`.
    """
    if not is_model_loaded():
        return None
    data = np.frombuffer(audio, dtype=np.int16)
    # Convert s16 samples to f32 in [-1.0, 1.0).
    data = data.astype(np.float32) / 32768.0
    # Transcribe; any exceptions propagate to the caller.
    segments, info = _model.transcribe(data, beam_size=5)
    return segments, info

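# Example usage (sketch): `record_seconds` is a hypothetical helper that
# returns mono 16 kHz s16 bytes, e.g. captured via `sounddevice`; only the
# call into `transcribe_from_i16_audio` is real module API.
#
#     raw: bytes = record_seconds(5)
#     result = transcribe_from_i16_audio(raw)
#     if result is not None:
#         segments, info = result
#         print(info.language, "".join(s.text for s in segments))
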
def transcribe_from_file(mp3_path: str) -> Tuple[
        Iterable[faster_whisper.transcribe.Segment],
        faster_whisper.transcribe.TranscriptionInfo] | None:
    """
    Transcribe audio from an MP3 file.

    Note that this can - and will - crash if you don't catch exceptions.
    If the model isn't loaded yet, this will return None.
    Otherwise, it will return the raw transcription from `faster-whisper`.
    """
    if not is_model_loaded():
        return None
    # Transcribe; any exceptions propagate to the caller.
    segments, info = _model.transcribe(mp3_path, beam_size=5)
    return segments, info
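

if __name__ == "__main__":
    # Minimal smoke test, assuming an MP3 file named "example.mp3" exists
    # next to this script (the path is a placeholder). Note that `segments`
    # is a lazy generator: the actual transcription runs while iterating it,
    # so transcription errors also surface inside the loop below.
    set_model("base", "cpu")
    result = transcribe_from_file("example.mp3")
    if result is None:
        print("No model loaded.")
    else:
        segments, info = result
        print(f"Detected language: {info.language} (p={info.language_probability:.2f})")
        for segment in segments:
            print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")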