fluesterpost/nn_model_manager.py


import numpy as np
import faster_whisper
from faster_whisper import WhisperModel
from typing import Literal, Iterable, Tuple

# The currently loaded model, if any. Module-level so all callers share it.
_model: WhisperModel | None = None

ModelSize = Literal["tiny", "base", "small", "medium", "large-v1", "large-v2"]
Device = Literal["cuda", "cpu", "auto"]
ComputeType = Literal["8bit", "16bit", "32bit"]

def set_model(size: ModelSize, device: Device):  # , compute_type: ComputeType):
    """Load a `faster-whisper` model of the given size on the given device."""
    # Compute-type selection is disabled for now; kept below for reference.
    # Note: the original '16bit' CUDA branch selected 'int8', which looks
    # like a typo - 'float16' is the 16-bit compute type.
    # compute = None
    # if compute_type == '8bit':
    #     if device == 'cuda' or device == 'auto':
    #         compute = 'int8_float16'
    #     else:
    #         compute = 'int8'
    # elif compute_type == '16bit':
    #     if device == 'cuda' or device == 'auto':
    #         compute = 'float16'
    #     else:
    #         raise Exception("Cannot do 16 bit computing on CPU")
    # elif compute_type == '32bit':
    #     compute = 'float'
    # else:
    #     raise Exception(f"Invalid Compute / Device configuration (device {device} with {compute_type})")
    global _model
    _model = WhisperModel(size, device=device)

def unload_model():
    if not is_model_loaded():
        return
    global _model
    # Dropping the only reference lets the model be garbage-collected,
    # which should also release its (CPU or GPU) memory.
    _model = None


def is_model_loaded() -> bool:
    return _model is not None

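# Lifecycle sketch: `set_model` replaces the module-level reference, so a
# model can be swapped at runtime; `unload_model` frees it entirely.
#
#     set_model("small", "auto")
#     assert is_model_loaded()
#     unload_model()
#     assert not is_model_loaded()
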
def transcribe_from_i16_audio(audio: bytes) -> Tuple[
        Iterable[faster_whisper.transcribe.Segment],
        faster_whisper.transcribe.TranscriptionInfo] | None:
    """
    Transcribe raw signed 16-bit PCM audio (mono, 16 kHz - the format
    `faster-whisper` expects when given a numpy array).

    Note that this can - and will - crash if you don't catch exceptions.
    If the model isn't loaded yet, this will return None.
    Otherwise, it will return the raw transcription from `faster-whisper`.
    """
    if not is_model_loaded():
        return None
    data = np.frombuffer(audio, dtype=np.int16)
    # Convert s16 samples to f32 in [-1.0, 1.0).
    data = data.astype(np.float32) / 32768.0
    # Transcribe; any exceptions propagate to the caller.
    segments, info = _model.transcribe(data, beam_size=5)
    return segments, info

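# Example usage (sketch): `record_seconds` is a hypothetical helper that
# returns mono 16 kHz s16 bytes, e.g. captured via `sounddevice`; only the
# call into `transcribe_from_i16_audio` is real module API.
#
#     raw: bytes = record_seconds(5)
#     result = transcribe_from_i16_audio(raw)
#     if result is not None:
#         segments, info = result
#         print(info.language, "".join(s.text for s in segments))
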
def transcribe_from_file(mp3_path: str) -> Tuple[
        Iterable[faster_whisper.transcribe.Segment],
        faster_whisper.transcribe.TranscriptionInfo] | None:
    """
    Transcribe audio from an MP3 file.

    Note that this can - and will - crash if you don't catch exceptions.
    If the model isn't loaded yet, this will return None.
    Otherwise, it will return the raw transcription from `faster-whisper`.
    """
    if not is_model_loaded():
        return None
    # Transcribe; any exceptions propagate to the caller.
    segments, info = _model.transcribe(mp3_path, beam_size=5)
    return segments, info
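

if __name__ == "__main__":
    # Minimal smoke test, assuming an MP3 file named "example.mp3" exists
    # next to this script (the path is a placeholder). Note that `segments`
    # is a lazy generator: the actual transcription runs while iterating it,
    # so transcription errors also surface inside the loop below.
    set_model("base", "cpu")
    result = transcribe_from_file("example.mp3")
    if result is None:
        print("No model loaded.")
    else:
        segments, info = result
        print(f"Detected language: {info.language} (p={info.language_probability:.2f})")
        for segment in segments:
            print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")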