deps: use PyPI-provided silero-vad, upgrade to latest

jitsi · Nov 20, 2024 · 4d17ab9 · 4d17ab9
1 parent 67f708c
commit 4d17ab9
Show file tree

Hide file tree

Showing 7 changed files with 24 additions and 254 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,7 @@ langchain-openai = "^0.2.0"
 av = "^12.3.0"
 pybase64 = "^1.4.0"
 vllm = "0.6.2"
+silero-vad = "^5.1.2"
 
 [build-system]
 build-backend = "poetry.core.masonry.api"

diff --git a/requirements.txt b/requirements.txt
@@ -119,6 +119,7 @@ s3transfer==0.10.2 ; python_version >= "3.11" and python_version < "3.12"
 safetensors==0.4.5 ; python_version >= "3.11" and python_version < "3.12"
 sentencepiece==0.2.0 ; python_version >= "3.11" and python_version < "3.12"
 setuptools==75.1.0 ; python_version >= "3.11" and python_version < "3.12"
+silero-vad==5.1.2 ; python_version >= "3.11" and python_version < "3.12"
 six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
 sniffio==1.3.1 ; python_version >= "3.11" and python_version < "3.12"
 sqlalchemy==2.0.35 ; python_version >= "3.11" and python_version < "3.12"

diff --git a/skynet/modules/stt/streaming_whisper/cfg.py b/skynet/modules/stt/streaming_whisper/cfg.py
@@ -1,6 +1,5 @@
-import os
-
 from faster_whisper import WhisperModel
+from silero_vad import load_silero_vad
 
 from skynet.env import (
     device,
@@ -11,12 +10,11 @@
     whisper_model_path,
 )
 from skynet.logs import get_logger
-from skynet.modules.stt.streaming_whisper.utils import vad_utils as vad
 
 log = get_logger(__name__)
 
 
-vad_model = vad.init_jit_model(f'{os.getcwd()}/skynet/modules/stt/streaming_whisper/models/vad/silero_vad.jit')
+vad_model = load_silero_vad(onnx=False)
 
 device = whisper_device if whisper_device != 'auto' else device
 log.info(f'Using {device}')

diff --git a/skynet/modules/stt/streaming_whisper/models/vad/silero_vad.jit b/skynet/modules/stt/streaming_whisper/models/vad/silero_vad.jit
diff --git a/skynet/modules/stt/streaming_whisper/utils/utils.py b/skynet/modules/stt/streaming_whisper/utils/utils.py
@@ -6,6 +6,7 @@
 import numpy as np
 from numpy import ndarray
 from pydantic import BaseModel
+from silero_vad import get_speech_timestamps, read_audio
 from uuid6 import UUID
 
 import skynet.modules.stt.streaming_whisper.cfg as cfg
@@ -188,8 +189,8 @@ def is_silent(audio: bytes) -> Tuple[bool, iter]:
     chunk_duration = convert_bytes_to_seconds(audio)
     wav_header = get_wav_header([audio], chunk_duration_s=chunk_duration)
     stream = wav_header + b'' + audio
-    audio = cfg.vad.read_audio(stream)
-    st = cfg.vad.get_speech_timestamps(audio, model=cfg.vad_model, return_seconds=True)
+    audio = read_audio(stream)
+    st = get_speech_timestamps(audio, model=cfg.vad_model, return_seconds=True)
     log.debug(f'Detected speech timestamps: {st}')
     silent = True if len(st) == 0 else False
     return silent, st

diff --git a/skynet/modules/stt/streaming_whisper/utils/vad_utils.py b/skynet/modules/stt/streaming_whisper/utils/vad_utils.py