From 6196b7d64122b8ca600fdf99dfe59677cad90b49 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:06:10 +0900
Subject: [PATCH 1/7] Install `faster-whisper` directly from repository

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 27145fe..7b62da3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@
 torch
 torchaudio
 git+https://github.com/jhj0517/jhj0517-whisper.git
-faster-whisper==1.0.3
+git+https://github.com/SYSTRAN/faster-whisper.git
 transformers
 gradio
 gradio-i18n

From 685979d832db563f8278dcc3d1126fcb36d42816 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:29:53 +0900
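Subject: [PATCH 2/7] Fix defaults

Change the default `language_detection_threshold` from null/None to 0.5
in both `configs/default_parameters.yaml` and `WhisperParams`.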

---
 configs/default_parameters.yaml | 2 +-
 modules/whisper/data_classes.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/default_parameters.yaml b/configs/default_parameters.yaml
index e756dce..89ea204 100644
--- a/configs/default_parameters.yaml
+++ b/configs/default_parameters.yaml
@@ -28,7 +28,7 @@ whisper:
   max_new_tokens: null
   hallucination_silence_threshold: null
   hotwords: null
-  language_detection_threshold: null
+  language_detection_threshold: 0.5
   language_detection_segments: 1
   add_timestamp: true
 
diff --git a/modules/whisper/data_classes.py b/modules/whisper/data_classes.py
index 705e4b8..ad72ee3 100644
--- a/modules/whisper/data_classes.py
+++ b/modules/whisper/data_classes.py
@@ -319,7 +319,7 @@ class WhisperParams(BaseParams):
     )
     hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
     language_detection_threshold: Optional[float] = Field(
-        default=None,
+        default=0.5,
         description="Threshold for language detection probability"
     )
     language_detection_segments: int = Field(

From c0a2a37c0b8d2dcc0806a3307aa778e59e13f94e Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Tue, 17 Dec 2024 20:23:42 +0900
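Subject: [PATCH 3/7] Add `get_chunk_index()` and fix attributes

In `SileroVAD.get_speech_timestamps()`, replace the per-window inference
loop with a single model call over the zero-padded audio, use
`self.sampling_rate` instead of a hard-coded 16000, and read the negative
threshold from `vad_options.neg_threshold`. Also add
`SileroVAD.get_chunk_index()`, which looks up `self.chunk_end_sample`
with `bisect`.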

---
 modules/vad/silero_vad.py | 40 ++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py
index cb6da93..4041ada 100644
--- a/modules/vad/silero_vad.py
+++ b/modules/vad/silero_vad.py
@@ -4,11 +4,13 @@
 import numpy as np
 from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
+import bisect
 import faster_whisper
-from modules.whisper.data_classes import *
 from faster_whisper.transcribe import SpeechTimestampsMap
 import gradio as gr
 
+from modules.whisper.data_classes import *
+
 
 class SileroVAD:
     def __init__(self):
@@ -58,6 +60,7 @@ def run(self,
             vad_options=vad_parameters,
             progress=progress
         )
+
         audio = self.collect_chunks(audio, speech_chunks)
         duration_after_vad = audio.shape[0] / sampling_rate
 
@@ -94,35 +97,27 @@ def get_speech_timestamps(
         min_silence_duration_ms = vad_options.min_silence_duration_ms
         window_size_samples = self.window_size_samples
         speech_pad_ms = vad_options.speech_pad_ms
-        sampling_rate = 16000
-        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
-        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000
+        speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000
         max_speech_samples = (
-                sampling_rate * max_speech_duration_s
+                self.sampling_rate * max_speech_duration_s
                 - window_size_samples
                 - 2 * speech_pad_samples
         )
-        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+        min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000
+        min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
 
         audio_length_samples = len(audio)
 
-        state, context = self.model.get_initial_states(batch_size=1)
-
-        speech_probs = []
-        for current_start_sample in range(0, audio_length_samples, window_size_samples):
-            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
-
-            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
-            if len(chunk) < window_size_samples:
-                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
-            speech_probs.append(speech_prob)
+        padded_audio = np.pad(
+            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
+        )
+        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
 
         triggered = False
         speeches = []
         current_speech = {}
-        neg_threshold = threshold - 0.15
+        neg_threshold = vad_options.neg_threshold if vad_options.neg_threshold is not None else threshold - 0.15
 
         # to save potential segment end (and tolerate some silence)
         temp_end = 0
@@ -223,6 +218,13 @@ def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
 
         return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
 
+    def get_chunk_index(self, time: float) -> int:
+        sample = int(time * self.sampling_rate)
+        return min(
+            bisect.bisect(self.chunk_end_sample, sample),
+            len(self.chunk_end_sample) - 1,
+        )
+
     @staticmethod
     def format_timestamp(
         seconds: float,

From a9d0d4566555a842b1d3d64b176e11fa9c9b4158 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Tue, 17 Dec 2024 20:30:11 +0900
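Subject: [PATCH 4/7] Update `restore_speech_timestamps()`

When a segment has word timestamps, map each word's start and end back
to the original timeline using the chunk that contains the word's
midpoint, so both ends resolve to the same chunk, and take the segment
bounds from its first and last word. Segments without words are mapped
as before. Also remove `SileroVAD.get_chunk_index()`; the chunk lookup
now goes through `ts_map.get_chunk_index()`.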

---
 modules/vad/silero_vad.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py
index 4041ada..f0ab758 100644
--- a/modules/vad/silero_vad.py
+++ b/modules/vad/silero_vad.py
@@ -218,13 +218,6 @@ def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
 
         return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
 
-    def get_chunk_index(self, time: float) -> int:
-        sample = int(time * self.sampling_rate)
-        return min(
-            bisect.bisect(self.chunk_end_sample, sample),
-            len(self.chunk_end_sample) - 1,
-        )
-
     @staticmethod
     def format_timestamp(
         seconds: float,
@@ -260,8 +253,23 @@ def restore_speech_timestamps(
         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
 
         for segment in segments:
-            segment.start = ts_map.get_original_time(segment.start)
-            segment.end = ts_map.get_original_time(segment.end)
+            if segment.words:
+                words = []
+                for word in segment.words:
+                    # Ensure the word start and end times are resolved to the same chunk.
+                    middle = (word.start + word.end) / 2
+                    chunk_index = ts_map.get_chunk_index(middle)
+                    word.start = ts_map.get_original_time(word.start, chunk_index)
+                    word.end = ts_map.get_original_time(word.end, chunk_index)
+                    words.append(word)
+
+                segment.start = words[0].start
+                segment.end = words[-1].end
+                segment.words = words
+
+            else:
+                segment.start = ts_map.get_original_time(segment.start)
+                segment.end = ts_map.get_original_time(segment.end)
 
         return segments
 

From 1c73539024b9642c841ab5a580cf8b8ef9b2a9a1 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Wed, 18 Dec 2024 00:14:09 +0900
Subject: [PATCH 5/7] Fix backend requirements as well

---
 backend/requirements-backend.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/requirements-backend.txt b/backend/requirements-backend.txt
index cfa2248..a5b3967 100644
--- a/backend/requirements-backend.txt
+++ b/backend/requirements-backend.txt
@@ -3,7 +3,7 @@
 torch
 torchaudio
 git+https://github.com/jhj0517/jhj0517-whisper.git
-faster-whisper==1.0.3
+git+https://github.com/SYSTRAN/faster-whisper.git
 transformers
 gradio
 gradio-i18n

From 2e3fe64fb39510355f244da879054621e06e7485 Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Wed, 18 Dec 2024 00:28:30 +0900
Subject: [PATCH 6/7] Fix colab requirements as well

---
 notebook/whisper-webui.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebook/whisper-webui.ipynb b/notebook/whisper-webui.ipynb
index 52382e4..ba63fcf 100644
--- a/notebook/whisper-webui.ipynb
+++ b/notebook/whisper-webui.ipynb
@@ -53,7 +53,7 @@
         "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
         "%cd Whisper-WebUI\n",
         "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
-        "!pip install faster-whisper==1.0.3\n",
+        "!pip install git+https://github.com/SYSTRAN/faster-whisper.git\n",
         "!pip install ctranslate2==4.4.0\n",
         "!pip install gradio\n",
         "!pip install gradio-i18n\n",

From bdc4855d3f7e2916381ee19c5cb3d7d8cd77a1cc Mon Sep 17 00:00:00 2001
From: jhj0517 <97279763+jhj0517@users.noreply.github.com>
Date: Wed, 18 Dec 2024 16:10:00 +0900
Subject: [PATCH 7/7] Specify requirements file using relative path

---
 backend/requirements-backend.txt | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/backend/requirements-backend.txt b/backend/requirements-backend.txt
index a5b3967..1305f51 100644
--- a/backend/requirements-backend.txt
+++ b/backend/requirements-backend.txt
@@ -1,17 +1,5 @@
 # Whisper-WebUI dependencies
---extra-index-url https://download.pytorch.org/whl/cu124
-torch
-torchaudio
-git+https://github.com/jhj0517/jhj0517-whisper.git
-git+https://github.com/SYSTRAN/faster-whisper.git
-transformers
-gradio
-gradio-i18n
-pytubefix
-ruamel.yaml==0.18.6
-pyannote.audio==3.3.2
-git+https://github.com/jhj0517/ultimatevocalremover_api.git
-git+https://github.com/jhj0517/pyrubberband.git
+-r ./../requirements.txt
 
 # Backend dependencies
 python-dotenv