From 6196b7d64122b8ca600fdf99dfe59677cad90b49 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Tue, 17 Dec 2024 19:06:10 +0900 Subject: [PATCH 1/7] Install `faster-whisper` directly from repository --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 27145fe..7b62da3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ torch torchaudio git+https://github.com/jhj0517/jhj0517-whisper.git -faster-whisper==1.0.3 +git+https://github.com/SYSTRAN/faster-whisper.git transformers gradio gradio-i18n From 685979d832db563f8278dcc3d1126fcb36d42816 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Tue, 17 Dec 2024 19:29:53 +0900 Subject: [PATCH 2/7] Fix defaults --- configs/default_parameters.yaml | 2 +- modules/whisper/data_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/default_parameters.yaml b/configs/default_parameters.yaml index e756dce..89ea204 100644 --- a/configs/default_parameters.yaml +++ b/configs/default_parameters.yaml @@ -28,7 +28,7 @@ whisper: max_new_tokens: null hallucination_silence_threshold: null hotwords: null - language_detection_threshold: null + language_detection_threshold: 0.5 language_detection_segments: 1 add_timestamp: true diff --git a/modules/whisper/data_classes.py b/modules/whisper/data_classes.py index 705e4b8..ad72ee3 100644 --- a/modules/whisper/data_classes.py +++ b/modules/whisper/data_classes.py @@ -319,7 +319,7 @@ class WhisperParams(BaseParams): ) hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model") language_detection_threshold: Optional[float] = Field( - default=None, + default=0.5, description="Threshold for language detection probability" ) language_detection_segments: int = Field( From c0a2a37c0b8d2dcc0806a3307aa778e59e13f94e Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:23:42 +0900 Subject: [PATCH 3/7] Add `get_chunk_index()` and fix attributes --- modules/vad/silero_vad.py | 40 ++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py index cb6da93..4041ada 100644 --- a/modules/vad/silero_vad.py +++ b/modules/vad/silero_vad.py @@ -4,11 +4,13 @@ import numpy as np from typing import BinaryIO, Union, List, Optional, Tuple import warnings +import bisect import faster_whisper -from modules.whisper.data_classes import * from faster_whisper.transcribe import SpeechTimestampsMap import gradio as gr +from modules.whisper.data_classes import * + class SileroVAD: def __init__(self): @@ -58,6 +60,7 @@ def run(self, vad_options=vad_parameters, progress=progress ) + audio = self.collect_chunks(audio, speech_chunks) duration_after_vad = audio.shape[0] / sampling_rate @@ -94,35 +97,27 @@ def get_speech_timestamps( min_silence_duration_ms = vad_options.min_silence_duration_ms window_size_samples = self.window_size_samples speech_pad_ms = vad_options.speech_pad_ms - sampling_rate = 16000 - min_speech_samples = sampling_rate * min_speech_duration_ms / 1000 - speech_pad_samples = sampling_rate * speech_pad_ms / 1000 + min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000 + speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000 max_speech_samples = ( - sampling_rate * max_speech_duration_s + self.sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples ) - min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 - min_silence_samples_at_max_speech = sampling_rate * 98 / 1000 + min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000 + min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000 audio_length_samples = len(audio) - state, context = self.model.get_initial_states(batch_size=1) - - speech_probs = [] - for current_start_sample in range(0, audio_length_samples, window_size_samples): - progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...") - - chunk = audio[current_start_sample: current_start_sample + window_size_samples] - if len(chunk) < window_size_samples: - chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk)))) - speech_prob, state, context = self.model(chunk, state, context, sampling_rate) - speech_probs.append(speech_prob) + padded_audio = np.pad( + audio, (0, window_size_samples - audio.shape[0] % window_size_samples) + ) + speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0) triggered = False speeches = [] current_speech = {} - neg_threshold = threshold - 0.15 + neg_threshold = vad_options.neg_threshold # to save potential segment end (and tolerate some silence) temp_end = 0 @@ -223,6 +218,13 @@ def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray: return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks]) + def get_chunk_index(self, time: float) -> int: + sample = int(time * self.sampling_rate) + return min( + bisect.bisect(self.chunk_end_sample, sample), + len(self.chunk_end_sample) - 1, + ) + @staticmethod def format_timestamp( seconds: float, From a9d0d4566555a842b1d3d64b176e11fa9c9b4158 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:30:11 +0900 Subject: [PATCH 4/7] Update `restore_speech_timestamps()` --- modules/vad/silero_vad.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py index 4041ada..f0ab758 100644 --- a/modules/vad/silero_vad.py +++ b/modules/vad/silero_vad.py @@ -218,13 +218,6 @@ def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray: return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks]) - def get_chunk_index(self, time: float) -> int: - sample = int(time * self.sampling_rate) - return min( - bisect.bisect(self.chunk_end_sample, sample), - len(self.chunk_end_sample) - 1, - ) - @staticmethod def format_timestamp( seconds: float, @@ -260,8 +253,23 @@ def restore_speech_timestamps( ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate) for segment in segments: - segment.start = ts_map.get_original_time(segment.start) - segment.end = ts_map.get_original_time(segment.end) + if segment.words: + words = [] + for word in segment.words: + # Ensure the word start and end times are resolved to the same chunk. + middle = (word.start + word.end) / 2 + chunk_index = ts_map.get_chunk_index(middle) + word.start = ts_map.get_original_time(word.start, chunk_index) + word.end = ts_map.get_original_time(word.end, chunk_index) + words.append(word) + + segment.start = words[0].start + segment.end = words[-1].end + segment.words = words + + else: + segment.start = ts_map.get_original_time(segment.start) + segment.end = ts_map.get_original_time(segment.end) return segments From 1c73539024b9642c841ab5a580cf8b8ef9b2a9a1 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Wed, 18 Dec 2024 00:14:09 +0900 Subject: [PATCH 5/7] Fix backend requirements as well --- backend/requirements-backend.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/requirements-backend.txt b/backend/requirements-backend.txt index cfa2248..a5b3967 100644 --- a/backend/requirements-backend.txt +++ b/backend/requirements-backend.txt @@ -3,7 +3,7 @@ torch torchaudio git+https://github.com/jhj0517/jhj0517-whisper.git -faster-whisper==1.0.3 +git+https://github.com/SYSTRAN/faster-whisper.git transformers gradio gradio-i18n From 2e3fe64fb39510355f244da879054621e06e7485 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Wed, 18 Dec 2024 00:28:30 +0900 Subject: [PATCH 6/7] Fix colab requirements as well --- notebook/whisper-webui.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebook/whisper-webui.ipynb b/notebook/whisper-webui.ipynb index 52382e4..ba63fcf 100644 --- a/notebook/whisper-webui.ipynb +++ b/notebook/whisper-webui.ipynb @@ -53,7 +53,7 @@ "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n", "%cd Whisper-WebUI\n", "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n", - "!pip install faster-whisper==1.0.3\n", + "!pip install git+https://github.com/SYSTRAN/faster-whisper.git\n", "!pip install ctranslate2==4.4.0\n", "!pip install gradio\n", "!pip install gradio-i18n\n", From bdc4855d3f7e2916381ee19c5cb3d7d8cd77a1cc Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:10:00 +0900 Subject: [PATCH 7/7] Specify requirements file using relative path --- backend/requirements-backend.txt | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/backend/requirements-backend.txt b/backend/requirements-backend.txt index a5b3967..1305f51 100644 --- a/backend/requirements-backend.txt +++ b/backend/requirements-backend.txt @@ -1,17 +1,5 @@ # Whisper-WebUI dependencies ---extra-index-url https://download.pytorch.org/whl/cu124 -torch -torchaudio -git+https://github.com/jhj0517/jhj0517-whisper.git -git+https://github.com/SYSTRAN/faster-whisper.git -transformers -gradio -gradio-i18n -pytubefix -ruamel.yaml==0.18.6 -pyannote.audio==3.3.2 -git+https://github.com/jhj0517/ultimatevocalremover_api.git -git+https://github.com/jhj0517/pyrubberband.git +-r ./../requirements.txt # Backend dependencies python-dotenv