Merge pull request #214 from jhj0517/fix/limit-vad

Limit VAD feature to faster-whisper
jhj0517 · Jul 14, 2024 · 7386da0
2 parents 6f9cdbb + 174fcfd
commit 7386da0
Show file tree

Hide file tree

Showing 6 changed files with 35 additions and 71 deletions.
diff --git a/app.py b/app.py
@@ -73,7 +73,7 @@ def create_whisper_parameters(self):
             cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                        interactive=True)
         with gr.Accordion("Advanced Parameters", open=False):
-            nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True,
+            nb_beam_size = gr.Number(label="Beam Size", value=5, precision=0, interactive=True,
                                      info="Beam size to use for decoding.")
             nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True,
                                               info="If the average log probability over sampled tokens is below this value, treat as failed.")
@@ -137,7 +137,7 @@ def create_whisper_parameters(self):
                 nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
                 nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
 
-        with gr.Accordion("VAD", open=False):
+        with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
             cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
             sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5,
                                      info="Lower it to be more sensitive to small sounds.")

diff --git a/modules/diarize/audio_loader.py b/modules/diarize/audio_loader.py
@@ -1,3 +1,5 @@
+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py
+
 import os
 import subprocess
 from functools import lru_cache

diff --git a/modules/diarize/diarize_pipeline.py b/modules/diarize/diarize_pipeline.py
@@ -1,3 +1,5 @@
+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py
+
 import numpy as np
 import pandas as pd
 import os

diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py
@@ -1,6 +1,8 @@
+# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
+
 from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
-from typing import BinaryIO, Union, List, Optional, Tuple
+from typing import BinaryIO, Union, List, Optional
 import warnings
 import faster_whisper
 import gradio as gr
@@ -15,7 +17,6 @@ def __init__(self):
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
             vad_parameters: VadOptions,
-            silence_non_speech: bool = True,
             progress: gr.Progress = gr.Progress()):
         """
         Run VAD
@@ -26,8 +27,6 @@ def run(self,
             Audio path or file binary or Audio numpy array
         vad_parameters:
             Options for VAD processing.
-        silence_non_speech: bool
-            If True, non-speech parts will be silenced instead of being removed.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
 
@@ -43,32 +42,19 @@ def run(self,
             audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)
 
         duration = audio.shape[0] / sampling_rate
+        duration_after_vad = duration
 
         if vad_parameters is None:
             vad_parameters = VadOptions()
         elif isinstance(vad_parameters, dict):
             vad_parameters = VadOptions(**vad_parameters)
-
         speech_chunks = self.get_speech_timestamps(
             audio=audio,
             vad_options=vad_parameters,
             progress=progress
         )
-
-        audio, duration_diff = self.collect_chunks(
-            audio=audio,
-            chunks=speech_chunks,
-            silence_non_speech=silence_non_speech
-        )
-
-        if silence_non_speech:
-            print(
-                f"VAD filter silenced {self.format_timestamp(duration_diff)} of audio.",
-            )
-        else:
-            print(
-                f"VAD filter removed {self.format_timestamp(duration_diff)} of audio",
-            )
+        audio = self.collect_chunks(audio, speech_chunks)
+        duration_after_vad = audio.shape[0] / sampling_rate
 
         return audio
 
@@ -224,41 +210,13 @@ def get_speech_timestamps(
     def update_model(self):
         self.model = get_vad_model()
 
-    def collect_chunks(
-        self,
-        audio: np.ndarray,
-        chunks: List[dict],
-        silence_non_speech: bool = True,
-    ) -> Tuple[np.ndarray, float]:
-        """Collects and concatenate audio chunks.
-
-        Args:
-          audio: One dimensional float array.
-          chunks: List of dictionaries containing start and end samples of speech chunks
-          silence_non_speech: If True, non-speech parts will be silenced instead of being removed.
-
-        Returns:
-          Tuple containing:
-            - Processed audio as a numpy array
-            - Duration of non-speech (silenced or removed) audio in seconds
-        """
+    @staticmethod
+    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
+        """Collects and concatenates audio chunks."""
         if not chunks:
-            return np.array([], dtype=np.float32), 0.0
-
-        total_samples = audio.shape[0]
-        speech_samples_count = sum(chunk["end"] - chunk["start"] for chunk in chunks)
-        non_speech_samples_count = total_samples - speech_samples_count
-        non_speech_duration = non_speech_samples_count / self.sampling_rate
+            return np.array([], dtype=np.float32)
 
-        if not silence_non_speech:
-            processed_audio = np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
-        else:
-            processed_audio = np.zeros_like(audio)
-            for chunk in chunks:
-                start, end = chunk['start'], chunk['end']
-                processed_audio[start:end] = audio[start:end]
-
-        return processed_audio, non_speech_duration
+        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
 
     @staticmethod
     def format_timestamp(
@@ -282,3 +240,4 @@ def format_timestamp(
         return (
             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
+
diff --git a/modules/whisper/faster_whisper_inference.py b/modules/whisper/faster_whisper_inference.py
@@ -71,6 +71,20 @@ def transcribe(self,
         if not params.hotwords:
             params.hotwords = None
 
+        vad_options = None
+        if params.vad_filter:
+            # The UI supplies this as a gr.Number(); treat any value >= 9999 as "no limit" (float('inf'))
+            if params.max_speech_duration_s >= 9999:
+                params.max_speech_duration_s = float('inf')
+
+            vad_options = VadOptions(
+                threshold=params.threshold,
+                min_speech_duration_ms=params.min_speech_duration_ms,
+                max_speech_duration_s=params.max_speech_duration_s,
+                min_silence_duration_ms=params.min_silence_duration_ms,
+                speech_pad_ms=params.speech_pad_ms
+            )
+
         params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
 
         segments, info = self.model.transcribe(
@@ -100,7 +114,9 @@ def transcribe(self,
             hotwords=params.hotwords,
             language_detection_threshold=params.language_detection_threshold,
             language_detection_segments=params.language_detection_segments,
-            prompt_reset_on_temperature=params.prompt_reset_on_temperature
+            prompt_reset_on_temperature=params.prompt_reset_on_temperature,
+            vad_filter=params.vad_filter,
+            vad_parameters=vad_options
         )
         progress(0, desc="Loading audio..")
 

diff --git a/modules/whisper/whisper_base.py b/modules/whisper/whisper_base.py
@@ -85,21 +85,6 @@ def run(self,
         """
         params = WhisperParameters.as_value(*whisper_params)
 
-        if params.vad_filter:
-            vad_options = VadOptions(
-                threshold=params.threshold,
-                min_speech_duration_ms=params.min_speech_duration_ms,
-                max_speech_duration_s=params.max_speech_duration_s,
-                min_silence_duration_ms=params.min_silence_duration_ms,
-                speech_pad_ms=params.speech_pad_ms
-            )
-            audio = self.vad.run(
-                audio=audio,
-                vad_parameters=vad_options,
-                silence_non_speech=True,
-                progress=progress
-            )
-
         if params.lang == "Automatic Detection":
             params.lang = None
         else: