Merge pull request #159 from jhj0517/feature/add-vad-parameter

Add Silero VAD Options parameter
jhj0517 · Jun 1, 2024 · 6cba7cb · 6cba7cb
2 parents 14d5cf9 + ed10d53
commit 6cba7cb
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 6 deletions.
diff --git a/app.py b/app.py
@@ -1,6 +1,7 @@
 import gradio as gr
 import os
 import argparse
+import webbrowser
 
 from modules.whisper_Inference import WhisperInference
 from modules.faster_whisper_inference import FasterWhisperInference
@@ -60,8 +61,15 @@ def launch(self):
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
-                    with gr.Accordion("Advanced_Parameters", open=False):
+                    with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
                         cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+                        sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+                        nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+                        nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+                        nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+                        nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+                        nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+                    with gr.Accordion("Advanced_Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -93,7 +101,14 @@ def launch(self):
                                                              initial_prompt=tb_initial_prompt,
                                                              temperature=sd_temperature,
                                                              compression_ratio_threshold=nb_compression_ratio_threshold,
-                                                             vad_filter=cb_vad_filter)
+                                                             vad_filter=cb_vad_filter,
+                                                             threshold=sd_threshold,
+                                                             min_speech_duration_ms=nb_min_speech_duration_ms,
+                                                             max_speech_duration_s=nb_max_speech_duration_s,
+                                                             min_silence_duration_ms=nb_min_silence_duration_ms,
+                                                             window_size_sample=nb_window_size_sample,
+                                                             speech_pad_ms=nb_speech_pad_ms)
+
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
                                   inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
@@ -120,8 +135,15 @@ def launch(self):
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                                    interactive=True)
-                    with gr.Accordion("Advanced_Parameters", open=False):
+                    with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
                         cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+                        sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+                        nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+                        nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+                        nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+                        nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+                        nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+                    with gr.Accordion("Advanced_Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -153,7 +175,13 @@ def launch(self):
                                                              initial_prompt=tb_initial_prompt,
                                                              temperature=sd_temperature,
                                                              compression_ratio_threshold=nb_compression_ratio_threshold,
-                                                             vad_filter=cb_vad_filter)
+                                                             vad_filter=cb_vad_filter,
+                                                             threshold=sd_threshold,
+                                                             min_speech_duration_ms=nb_min_speech_duration_ms,
+                                                             max_speech_duration_s=nb_max_speech_duration_s,
+                                                             min_silence_duration_ms=nb_min_silence_duration_ms,
+                                                             window_size_sample=nb_window_size_sample,
+                                                             speech_pad_ms=nb_speech_pad_ms)
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                                   inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
@@ -173,8 +201,15 @@ def launch(self):
                         dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
                     with gr.Row():
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
-                    with gr.Accordion("Advanced_Parameters", open=False):
+                    with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
                         cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+                        sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+                        nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+                        nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+                        nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+                        nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+                        nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+                    with gr.Accordion("Advanced_Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -205,7 +240,13 @@ def launch(self):
                                                              initial_prompt=tb_initial_prompt,
                                                              temperature=sd_temperature,
                                                              compression_ratio_threshold=nb_compression_ratio_threshold,
-                                                             vad_filter=cb_vad_filter)
+                                                             vad_filter=cb_vad_filter,
+                                                             threshold=sd_threshold,
+                                                             min_speech_duration_ms=nb_min_speech_duration_ms,
+                                                             max_speech_duration_s=nb_max_speech_duration_s,
+                                                             min_silence_duration_ms=nb_min_silence_duration_ms,
+                                                             window_size_sample=nb_window_size_sample,
+                                                             speech_pad_ms=nb_speech_pad_ms)
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
@@ -284,6 +325,7 @@ def launch(self):
             launch_args['server_port'] = self.args.server_port
         if self.args.username and self.args.password:
             launch_args['auth'] = (self.args.username, self.args.password)
+
         self.app.queue(api_open=False).launch(**launch_args)
 
 

diff --git a/modules/faster_whisper_inference.py b/modules/faster_whisper_inference.py
@@ -6,6 +6,7 @@
 from datetime import datetime
 
 import faster_whisper
+from faster_whisper.vad import VadOptions
 import ctranslate2
 import whisper
 import torch
@@ -260,6 +261,15 @@ def transcribe(self,
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
             params.lang = language_code_dict[params.lang]
 
+        vad_options = VadOptions(
+            threshold=params.threshold,
+            min_speech_duration_ms=params.min_speech_duration_ms,
+            max_speech_duration_s=params.max_speech_duration_s,
+            min_silence_duration_ms=params.min_silence_duration_ms,
+            window_size_samples=params.window_size_samples,
+            speech_pad_ms=params.speech_pad_ms
+        )
+
         segments, info = self.model.transcribe(
             audio=audio,
             language=params.lang,
@@ -272,6 +282,7 @@ def transcribe(self,
             temperature=params.temperature,
             compression_ratio_threshold=params.compression_ratio_threshold,
             vad_filter=params.vad_filter,
+            vad_parameters=vad_options
         )
         progress(0, desc="Loading audio..")
 

diff --git a/modules/whisper_parameter.py b/modules/whisper_parameter.py
@@ -19,6 +19,12 @@ class WhisperGradioComponents:
     temperature: gr.Slider
     compression_ratio_threshold: gr.Number
     vad_filter: gr.Checkbox
+    threshold: gr.Slider
+    min_speech_duration_ms: gr.Number
+    max_speech_duration_s: gr.Number
+    min_silence_duration_ms: gr.Number
+    window_size_sample: gr.Number
+    speech_pad_ms: gr.Number
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     See more about Gradio pre-processing: https://www.gradio.app/docs/components
@@ -78,6 +84,33 @@ class WhisperGradioComponents:
         Enable the voice activity detection (VAD) to filter out parts of the audio
         without speech. This step is using the Silero VAD model
         https://github.com/snakers4/silero-vad.
+        
+    threshold: gr.Slider
+        This parameter is related to Silero VAD. Speech threshold.
+        Silero VAD outputs speech probabilities for each audio chunk,
+        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
+        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
+        
+    min_speech_duration_ms: gr.Number
+        This parameter is related to Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.
+        
+    max_speech_duration_s: gr.Number
+        This parameter is related to Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
+        than max_speech_duration_s will be split at the timestamp of the last silence that
+        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
+        split aggressively just before max_speech_duration_s.
+    
+    min_silence_duration_ms: gr.Number
+        This parameter is related to Silero VAD. At the end of each speech chunk, wait for min_silence_duration_ms
+        before separating it.
+        
+    window_size_sample: gr.Number
+        This parameter is related to Silero VAD (its window_size_samples option). Audio chunks of this many samples are fed to the Silero VAD model.
+        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
+        Values other than these may affect model performance!!
+        
+    speech_pad_ms: gr.Number
+        This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side.
     """
 
     def to_list(self) -> list:
@@ -108,6 +141,12 @@ class WhisperValues:
     temperature: float
     compression_ratio_threshold: float
     vad_filter: bool
+    threshold: float
+    min_speech_duration_ms: int
+    max_speech_duration_s: float
+    min_silence_duration_ms: int
+    window_size_samples: int
+    speech_pad_ms: int
     """
     A data class to use Whisper parameters. Use "after" Gradio pre-processing.
     See more about Gradio pre-processing: https://www.gradio.app/docs/components