From 0c007044762f4a308f28a6cd884946634b61c377 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Mon, 15 Jul 2024 07:35:25 +0900 Subject: [PATCH 1/6] Revert "rename the variables" This reverts commit 07b632920a9781d62af6ff5ce26c3ccd66354aab. --- modules/vad/silero_vad.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py index c73593c5..d55d7961 100644 --- a/modules/vad/silero_vad.py +++ b/modules/vad/silero_vad.py @@ -240,15 +240,15 @@ def collect_chunks( Returns: Tuple containing: - Processed audio as a numpy array - - Duration of non-speech (silenced or removed) audio in seconds + - Duration of changed (silenced or removed) audio in seconds """ if not chunks: return np.array([], dtype=np.float32), 0.0 total_samples = audio.shape[0] - speech_samples_count = sum(chunk["end"] - chunk["start"] for chunk in chunks) - non_speech_samples_count = total_samples - speech_samples_count - non_speech_duration = non_speech_samples_count / self.sampling_rate + speech_samples = sum(chunk["end"] - chunk["start"] for chunk in chunks) + changed_samples = total_samples - speech_samples + duration_difference = changed_samples / self.sampling_rate if not silence_non_speech: processed_audio = np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks]) @@ -258,7 +258,7 @@ def collect_chunks( start, end = chunk['start'], chunk['end'] processed_audio[start:end] = audio[start:end] - return processed_audio, non_speech_duration + return processed_audio, duration_difference @staticmethod def format_timestamp( From f7c56950e15afb3bbb1e9c99c93ee051d669c243 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Mon, 15 Jul 2024 07:35:37 +0900 Subject: [PATCH 2/6] Revert "add `silence_non_speech` parameter" This reverts commit b678293544dbce3ad7b234752336c86154dfb05a. --- modules/vad/silero_vad.py | 63 ++++++--------------------------- modules/whisper/whisper_base.py | 1 - 2 files changed, 10 insertions(+), 54 deletions(-) diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py index d55d7961..7b927bd1 100644 --- a/modules/vad/silero_vad.py +++ b/modules/vad/silero_vad.py @@ -1,6 +1,6 @@ from faster_whisper.vad import VadOptions, get_vad_model import numpy as np -from typing import BinaryIO, Union, List, Optional, Tuple +from typing import BinaryIO, Union, List, Optional import warnings import faster_whisper import gradio as gr @@ -15,7 +15,6 @@ def __init__(self): def run(self, audio: Union[str, BinaryIO, np.ndarray], vad_parameters: VadOptions, - silence_non_speech: bool = True, progress: gr.Progress = gr.Progress()): """ Run VAD @@ -26,8 +25,6 @@ def run(self, Audio path or file binary or Audio numpy array vad_parameters: Options for VAD processing. - silence_non_speech: bool - If True, non-speech parts will be silenced instead of being removed. progress: gr.Progress Indicator to show progress directly in gradio. @@ -43,32 +40,19 @@ def run(self, audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate) duration = audio.shape[0] / sampling_rate + duration_after_vad = duration if vad_parameters is None: vad_parameters = VadOptions() elif isinstance(vad_parameters, dict): vad_parameters = VadOptions(**vad_parameters) - speech_chunks = self.get_speech_timestamps( audio=audio, vad_options=vad_parameters, progress=progress ) - - audio, duration_diff = self.collect_chunks( - audio=audio, - chunks=speech_chunks, - silence_non_speech=silence_non_speech - ) - - if silence_non_speech: - print( - f"VAD filter silenced {self.format_timestamp(duration_diff)} of audio.", - ) - else: - print( - f"VAD filter removed {self.format_timestamp(duration_diff)} of audio", - ) + audio = self.collect_chunks(audio, speech_chunks) + duration_after_vad = audio.shape[0] / sampling_rate return audio @@ -224,41 +208,13 @@ def get_speech_timestamps( def update_model(self): self.model = get_vad_model() - def collect_chunks( - self, - audio: np.ndarray, - chunks: List[dict], - silence_non_speech: bool = True, - ) -> Tuple[np.ndarray, float]: - """Collects and concatenate audio chunks. - - Args: - audio: One dimensional float array. - chunks: List of dictionaries containing start and end samples of speech chunks - silence_non_speech: If True, non-speech parts will be silenced instead of being removed. - - Returns: - Tuple containing: - - Processed audio as a numpy array - - Duration of changed (silenced or removed) audio in seconds - """ + @staticmethod + def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray: + """Collects and concatenates audio chunks.""" if not chunks: - return np.array([], dtype=np.float32), 0.0 - - total_samples = audio.shape[0] - speech_samples = sum(chunk["end"] - chunk["start"] for chunk in chunks) - changed_samples = total_samples - speech_samples - duration_difference = changed_samples / self.sampling_rate + return np.array([], dtype=np.float32) - if not silence_non_speech: - processed_audio = np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks]) - else: - processed_audio = np.zeros_like(audio) - for chunk in chunks: - start, end = chunk['start'], chunk['end'] - processed_audio[start:end] = audio[start:end] - - return processed_audio, duration_difference + return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks]) @staticmethod def format_timestamp( @@ -282,3 +238,4 @@ def format_timestamp( return ( f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}" ) + diff --git a/modules/whisper/whisper_base.py b/modules/whisper/whisper_base.py index 9912320c..3fa6f422 100644 --- a/modules/whisper/whisper_base.py +++ b/modules/whisper/whisper_base.py @@ -96,7 +96,6 @@ def run(self, audio = self.vad.run( audio=audio, vad_parameters=vad_options, - silence_non_speech=True, progress=progress ) From 4da95457079ce8c134c2101dfc90c12bbba608f5 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Mon, 15 Jul 2024 07:24:59 +0900 Subject: [PATCH 3/6] Fix default value of the beam size --- app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.py b/app.py index 8817c34f..e1b6f8f3 100644 --- a/app.py +++ b/app.py @@ -73,7 +73,7 @@ def create_whisper_parameters(self): cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True) with gr.Accordion("Advanced Parameters", open=False): - nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True, + nb_beam_size = gr.Number(label="Beam Size", value=5, precision=0, interactive=True, info="Beam size to use for decoding.") nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True, info="If the average log probability over sampled tokens is below this value, treat as failed.") From ac480c2b77ee8d0e0e5b10ca6e30204897a13776 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Mon, 15 Jul 2024 07:26:35 +0900 Subject: [PATCH 4/6] Limit Vad to only faster-whisper --- modules/whisper/faster_whisper_inference.py | 18 +++++++++++++++++- modules/whisper/whisper_base.py | 14 -------------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/modules/whisper/faster_whisper_inference.py b/modules/whisper/faster_whisper_inference.py index 9d8bdeb6..006b925a 100644 --- a/modules/whisper/faster_whisper_inference.py +++ b/modules/whisper/faster_whisper_inference.py @@ -71,6 +71,20 @@ def transcribe(self, if not params.hotwords: params.hotwords = None + vad_options = None + if params.vad_filter: + # Explicit value set for float('inf') from gr.Number() + if params.max_speech_duration_s >= 9999: + params.max_speech_duration_s = float('inf') + + vad_options = VadOptions( + threshold=params.threshold, + min_speech_duration_ms=params.min_speech_duration_ms, + max_speech_duration_s=params.max_speech_duration_s, + min_silence_duration_ms=params.min_silence_duration_ms, + speech_pad_ms=params.speech_pad_ms + ) + params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens) segments, info = self.model.transcribe( @@ -100,7 +114,9 @@ def transcribe(self, hotwords=params.hotwords, language_detection_threshold=params.language_detection_threshold, language_detection_segments=params.language_detection_segments, - prompt_reset_on_temperature=params.prompt_reset_on_temperature + prompt_reset_on_temperature=params.prompt_reset_on_temperature, + vad_filter=params.vad_filter, + vad_parameters=vad_options ) progress(0, desc="Loading audio..") diff --git a/modules/whisper/whisper_base.py b/modules/whisper/whisper_base.py index 3fa6f422..c5815f9f 100644 --- a/modules/whisper/whisper_base.py +++ b/modules/whisper/whisper_base.py @@ -85,20 +85,6 @@ def run(self, """ params = WhisperParameters.as_value(*whisper_params) - if params.vad_filter: - vad_options = VadOptions( - threshold=params.threshold, - min_speech_duration_ms=params.min_speech_duration_ms, - max_speech_duration_s=params.max_speech_duration_s, - min_silence_duration_ms=params.min_silence_duration_ms, - speech_pad_ms=params.speech_pad_ms - ) - audio = self.vad.run( - audio=audio, - vad_parameters=vad_options, - progress=progress - ) - if params.lang == "Automatic Detection": params.lang = None else: From 477e9f55a31da412cf5255b508ae2144296a3630 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Mon, 15 Jul 2024 07:29:55 +0900 Subject: [PATCH 5/6] chore: add credit --- modules/diarize/audio_loader.py | 2 ++ modules/diarize/diarize_pipeline.py | 2 ++ modules/vad/silero_vad.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/modules/diarize/audio_loader.py b/modules/diarize/audio_loader.py index 2efd4216..d90e52c3 100644 --- a/modules/diarize/audio_loader.py +++ b/modules/diarize/audio_loader.py @@ -1,3 +1,5 @@ +# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py + import os import subprocess from functools import lru_cache diff --git a/modules/diarize/diarize_pipeline.py b/modules/diarize/diarize_pipeline.py index 766ff54d..6d495f02 100644 --- a/modules/diarize/diarize_pipeline.py +++ b/modules/diarize/diarize_pipeline.py @@ -1,3 +1,5 @@ +# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py + import numpy as np import pandas as pd import os diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py index 7b927bd1..2b3aba0f 100644 --- a/modules/vad/silero_vad.py +++ b/modules/vad/silero_vad.py @@ -1,3 +1,5 @@ +# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py + from faster_whisper.vad import VadOptions, get_vad_model import numpy as np from typing import BinaryIO, Union, List, Optional From 174fcfd40a43def3c057e8754f240f315e30ef49 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Mon, 15 Jul 2024 07:40:46 +0900 Subject: [PATCH 6/6] limit Vad to faster-whisper only --- app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.py b/app.py index e1b6f8f3..f80c1506 100644 --- a/app.py +++ b/app.py @@ -137,7 +137,7 @@ def create_whisper_parameters(self): nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0) nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0) - with gr.Accordion("VAD", open=False): + with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)): cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True) sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")