From 0c007044762f4a308f28a6cd884946634b61c377 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Mon, 15 Jul 2024 07:35:25 +0900
Subject: [PATCH 1/6] Revert "rename the variables"

This reverts commit 07b632920a9781d62af6ff5ce26c3ccd66354aab.
---
 modules/vad/silero_vad.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py
index c73593c5..d55d7961 100644
--- a/modules/vad/silero_vad.py
+++ b/modules/vad/silero_vad.py
@@ -240,15 +240,15 @@ def collect_chunks(
         Returns:
           Tuple containing:
             - Processed audio as a numpy array
-            - Duration of non-speech (silenced or removed) audio in seconds
+            - Duration of changed (silenced or removed) audio in seconds
         """
         if not chunks:
             return np.array([], dtype=np.float32), 0.0
 
         total_samples = audio.shape[0]
-        speech_samples_count = sum(chunk["end"] - chunk["start"] for chunk in chunks)
-        non_speech_samples_count = total_samples - speech_samples_count
-        non_speech_duration = non_speech_samples_count / self.sampling_rate
+        speech_samples = sum(chunk["end"] - chunk["start"] for chunk in chunks)
+        changed_samples = total_samples - speech_samples
+        duration_difference = changed_samples / self.sampling_rate
 
         if not silence_non_speech:
             processed_audio = np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
@@ -258,7 +258,7 @@ def collect_chunks(
                 start, end = chunk['start'], chunk['end']
                 processed_audio[start:end] = audio[start:end]
 
-        return processed_audio, non_speech_duration
+        return processed_audio, duration_difference
 
     @staticmethod
     def format_timestamp(

From f7c56950e15afb3bbb1e9c99c93ee051d669c243 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Mon, 15 Jul 2024 07:35:37 +0900
Subject: [PATCH 2/6] Revert "add `silence_non_speech` parameter"

This reverts commit b678293544dbce3ad7b234752336c86154dfb05a.
---
 modules/vad/silero_vad.py       | 63 ++++++---------------------------
 modules/whisper/whisper_base.py |  1 -
 2 files changed, 10 insertions(+), 54 deletions(-)

diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py
index d55d7961..7b927bd1 100644
--- a/modules/vad/silero_vad.py
+++ b/modules/vad/silero_vad.py
@@ -1,6 +1,6 @@
 from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
-from typing import BinaryIO, Union, List, Optional, Tuple
+from typing import BinaryIO, Union, List, Optional
 import warnings
 import faster_whisper
 import gradio as gr
@@ -15,7 +15,6 @@ def __init__(self):
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
             vad_parameters: VadOptions,
-            silence_non_speech: bool = True,
             progress: gr.Progress = gr.Progress()):
         """
         Run VAD
@@ -26,8 +25,6 @@ def run(self,
             Audio path or file binary or Audio numpy array
         vad_parameters:
             Options for VAD processing.
-        silence_non_speech: bool
-            If True, non-speech parts will be silenced instead of being removed.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
 
@@ -43,32 +40,19 @@ def run(self,
             audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)
 
         duration = audio.shape[0] / sampling_rate
+        duration_after_vad = duration
 
         if vad_parameters is None:
             vad_parameters = VadOptions()
         elif isinstance(vad_parameters, dict):
             vad_parameters = VadOptions(**vad_parameters)
-
         speech_chunks = self.get_speech_timestamps(
             audio=audio,
             vad_options=vad_parameters,
             progress=progress
         )
-
-        audio, duration_diff = self.collect_chunks(
-            audio=audio,
-            chunks=speech_chunks,
-            silence_non_speech=silence_non_speech
-        )
-
-        if silence_non_speech:
-            print(
-                f"VAD filter silenced {self.format_timestamp(duration_diff)} of audio.",
-            )
-        else:
-            print(
-                f"VAD filter removed {self.format_timestamp(duration_diff)} of audio",
-            )
+        audio = self.collect_chunks(audio, speech_chunks)
+        duration_after_vad = audio.shape[0] / sampling_rate
 
         return audio
 
@@ -224,41 +208,13 @@ def get_speech_timestamps(
     def update_model(self):
         self.model = get_vad_model()
 
-    def collect_chunks(
-        self,
-        audio: np.ndarray,
-        chunks: List[dict],
-        silence_non_speech: bool = True,
-    ) -> Tuple[np.ndarray, float]:
-        """Collects and concatenate audio chunks.
-
-        Args:
-          audio: One dimensional float array.
-          chunks: List of dictionaries containing start and end samples of speech chunks
-          silence_non_speech: If True, non-speech parts will be silenced instead of being removed.
-
-        Returns:
-          Tuple containing:
-            - Processed audio as a numpy array
-            - Duration of changed (silenced or removed) audio in seconds
-        """
+    @staticmethod
+    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
+        """Collects and concatenates audio chunks."""
         if not chunks:
-            return np.array([], dtype=np.float32), 0.0
-
-        total_samples = audio.shape[0]
-        speech_samples = sum(chunk["end"] - chunk["start"] for chunk in chunks)
-        changed_samples = total_samples - speech_samples
-        duration_difference = changed_samples / self.sampling_rate
+            return np.array([], dtype=np.float32)
 
-        if not silence_non_speech:
-            processed_audio = np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
-        else:
-            processed_audio = np.zeros_like(audio)
-            for chunk in chunks:
-                start, end = chunk['start'], chunk['end']
-                processed_audio[start:end] = audio[start:end]
-
-        return processed_audio, duration_difference
+        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
 
     @staticmethod
     def format_timestamp(
@@ -282,3 +238,4 @@ def format_timestamp(
         return (
             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
+
diff --git a/modules/whisper/whisper_base.py b/modules/whisper/whisper_base.py
index 9912320c..3fa6f422 100644
--- a/modules/whisper/whisper_base.py
+++ b/modules/whisper/whisper_base.py
@@ -96,7 +96,6 @@ def run(self,
             audio = self.vad.run(
                 audio=audio,
                 vad_parameters=vad_options,
-                silence_non_speech=True,
                 progress=progress
             )
 

From 4da95457079ce8c134c2101dfc90c12bbba608f5 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Mon, 15 Jul 2024 07:24:59 +0900
Subject: [PATCH 3/6] Fix default value of the beam size

---
 app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app.py b/app.py
index 8817c34f..e1b6f8f3 100644
--- a/app.py
+++ b/app.py
@@ -73,7 +73,7 @@ def create_whisper_parameters(self):
             cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                        interactive=True)
         with gr.Accordion("Advanced Parameters", open=False):
-            nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True,
+            nb_beam_size = gr.Number(label="Beam Size", value=5, precision=0, interactive=True,
                                      info="Beam size to use for decoding.")
             nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True,
                                               info="If the average log probability over sampled tokens is below this value, treat as failed.")

From ac480c2b77ee8d0e0e5b10ca6e30204897a13776 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Mon, 15 Jul 2024 07:26:35 +0900
Subject: [PATCH 4/6] Limit VAD to faster-whisper only

---
 modules/whisper/faster_whisper_inference.py | 18 +++++++++++++++++-
 modules/whisper/whisper_base.py             | 14 --------------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/modules/whisper/faster_whisper_inference.py b/modules/whisper/faster_whisper_inference.py
index 9d8bdeb6..006b925a 100644
--- a/modules/whisper/faster_whisper_inference.py
+++ b/modules/whisper/faster_whisper_inference.py
@@ -71,6 +71,20 @@ def transcribe(self,
         if not params.hotwords:
             params.hotwords = None
 
+        vad_options = None
+        if params.vad_filter:
+            # A value of 9999 or more coming from gr.Number() is a stand-in for "no limit"; convert it to float('inf')
+            if params.max_speech_duration_s >= 9999:
+                params.max_speech_duration_s = float('inf')
+
+            vad_options = VadOptions(
+                threshold=params.threshold,
+                min_speech_duration_ms=params.min_speech_duration_ms,
+                max_speech_duration_s=params.max_speech_duration_s,
+                min_silence_duration_ms=params.min_silence_duration_ms,
+                speech_pad_ms=params.speech_pad_ms
+            )
+
         params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
 
         segments, info = self.model.transcribe(
@@ -100,7 +114,9 @@ def transcribe(self,
             hotwords=params.hotwords,
             language_detection_threshold=params.language_detection_threshold,
             language_detection_segments=params.language_detection_segments,
-            prompt_reset_on_temperature=params.prompt_reset_on_temperature
+            prompt_reset_on_temperature=params.prompt_reset_on_temperature,
+            vad_filter=params.vad_filter,
+            vad_parameters=vad_options
         )
         progress(0, desc="Loading audio..")
 
diff --git a/modules/whisper/whisper_base.py b/modules/whisper/whisper_base.py
index 3fa6f422..c5815f9f 100644
--- a/modules/whisper/whisper_base.py
+++ b/modules/whisper/whisper_base.py
@@ -85,20 +85,6 @@ def run(self,
         """
         params = WhisperParameters.as_value(*whisper_params)
 
-        if params.vad_filter:
-            vad_options = VadOptions(
-                threshold=params.threshold,
-                min_speech_duration_ms=params.min_speech_duration_ms,
-                max_speech_duration_s=params.max_speech_duration_s,
-                min_silence_duration_ms=params.min_silence_duration_ms,
-                speech_pad_ms=params.speech_pad_ms
-            )
-            audio = self.vad.run(
-                audio=audio,
-                vad_parameters=vad_options,
-                progress=progress
-            )
-
         if params.lang == "Automatic Detection":
             params.lang = None
         else:

From 477e9f55a31da412cf5255b508ae2144296a3630 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Mon, 15 Jul 2024 07:29:55 +0900
Subject: [PATCH 5/6] chore: add credit

---
 modules/diarize/audio_loader.py     | 2 ++
 modules/diarize/diarize_pipeline.py | 2 ++
 modules/vad/silero_vad.py           | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/modules/diarize/audio_loader.py b/modules/diarize/audio_loader.py
index 2efd4216..d90e52c3 100644
--- a/modules/diarize/audio_loader.py
+++ b/modules/diarize/audio_loader.py
@@ -1,3 +1,5 @@
+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py
+
 import os
 import subprocess
 from functools import lru_cache
diff --git a/modules/diarize/diarize_pipeline.py b/modules/diarize/diarize_pipeline.py
index 766ff54d..6d495f02 100644
--- a/modules/diarize/diarize_pipeline.py
+++ b/modules/diarize/diarize_pipeline.py
@@ -1,3 +1,5 @@
+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py
+
 import numpy as np
 import pandas as pd
 import os
diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py
index 7b927bd1..2b3aba0f 100644
--- a/modules/vad/silero_vad.py
+++ b/modules/vad/silero_vad.py
@@ -1,3 +1,5 @@
+# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
+
 from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
 from typing import BinaryIO, Union, List, Optional

From 174fcfd40a43def3c057e8754f240f315e30ef49 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Mon, 15 Jul 2024 07:40:46 +0900
Subject: [PATCH 6/6] Show VAD options in the UI only for faster-whisper

---
 app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app.py b/app.py
index e1b6f8f3..f80c1506 100644
--- a/app.py
+++ b/app.py
@@ -137,7 +137,7 @@ def create_whisper_parameters(self):
                 nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
                 nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
 
-        with gr.Accordion("VAD", open=False):
+        with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
             cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
             sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5,
                                      info="Lower it to be more sensitive to small sounds.")