diff --git a/buzz/model_loader.py b/buzz/model_loader.py index 70031a547..c5e2742bb 100644 --- a/buzz/model_loader.py +++ b/buzz/model_loader.py @@ -43,10 +43,17 @@ class WhisperModelSize(str, enum.Enum): SMALL = "small" MEDIUM = "medium" LARGE = "large" + LARGEV2 = "large-v2" + LARGEV3 = "large-v3" def to_faster_whisper_model_size(self) -> str: if self == WhisperModelSize.LARGE: - return "large-v2" + return "large-v1" + return self.value + + def to_whisper_cpp_model_size(self) -> str: + if self == WhisperModelSize.LARGE: + return "large-v1" return self.value def __str__(self): @@ -201,7 +208,9 @@ def get_local_model_path(self) -> Optional[str]: "base": "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe", "small": "1be3a9b2063867b937e64e2ec7483364a79917e157fa98c5d94b5c1fffea987b", "medium": "6c14d5adee5f86394037b4e4e8b59f1673b6cee10e3cf0b11bbdbee79c156208", - "large": "64d182b440b98d5203c4f9bd541544d84c605196c4f7b845dfa11fb23594d1e2", + "large-v1": "7d99f41a10525d0206bddadd86760181fa920438b6b33237e3118ff6c83bb53d", + "large-v2": "9a423fe4d40c82774b6af34115b8b935f34152246eb19e80e376071d3f999487", + "large-v3": "64d182b440b98d5203c4f9bd541544d84c605196c4f7b845dfa11fb23594d1e2", } @@ -318,7 +327,12 @@ def download_faster_whisper_model( % (size, ", ".join(faster_whisper.utils._MODELS)) ) - repo_id = "guillaumekln/faster-whisper-%s" % size + logging.debug("Downloading Faster Whisper model: %s", size) + + if size == WhisperModelSize.LARGEV3: + repo_id = "Systran/faster-whisper-large-v3" + else: + repo_id = "guillaumekln/faster-whisper-%s" % size allow_patterns = [ "model.bin", # largest by size first @@ -357,7 +371,7 @@ def __init__(self, model: TranscriptionModel): def run(self) -> None: if self.model.model_type == ModelType.WHISPER_CPP: - model_name = self.model.whisper_model_size.value + model_name = self.model.whisper_model_size.to_whisper_cpp_model_size() url = huggingface_hub.hf_hub_url( repo_id="ggerganov/whisper.cpp", 
filename=f"ggml-{model_name}.bin", diff --git a/docs/docs/faq.md b/docs/docs/faq.md index 0cb5a12c3..05cb61926 100644 --- a/docs/docs/faq.md +++ b/docs/docs/faq.md @@ -19,4 +19,10 @@ sidebar_position: 5 Relevant tools: - Mac OS - [BlackHole](https://github.com/ExistentialAudio/BlackHole). - Windows - [VB CABLE](https://vb-audio.com/Cable/) - - Linux - [PulseAudio Volume Control](https://wiki.ubuntu.com/record_system_sound) \ No newline at end of file + - Linux - [PulseAudio Volume Control](https://wiki.ubuntu.com/record_system_sound) + +4. **What model should I use?** + + The model size to use will depend on your hardware and use case. Smaller models will work faster but will have more inaccuracies. Larger models will be more accurate but will require more powerful hardware or a longer time to transcribe. + + When choosing among the large models, consider the following. "Large" is the original, first-released model; "Large-V2" is a later, updated model with better accuracy that is considered the most robust and stable for some languages. "Large-V3" is the latest model with the best accuracy in many cases, but it can sometimes hallucinate or invent words that were never in the audio. The only sure way to know which model best suits your needs is to test them all in your language.