Skip to content

Commit

Permalink
Merge pull request #159 from jhj0517/feature/add-vad-parameter
Browse files Browse the repository at this point in the history
Add Silero VAD Options parameter
  • Loading branch information
jhj0517 authored Jun 1, 2024
2 parents 14d5cf9 + ed10d53 commit 6cba7cb
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 6 deletions.
54 changes: 48 additions & 6 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import gradio as gr
import os
import argparse
import webbrowser

from modules.whisper_Inference import WhisperInference
from modules.faster_whisper_inference import FasterWhisperInference
Expand Down Expand Up @@ -60,8 +61,15 @@ def launch(self):
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
with gr.Row():
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
with gr.Accordion("Advanced_Parameters", open=False):
with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
with gr.Accordion("Advanced_Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
Expand Down Expand Up @@ -93,7 +101,14 @@ def launch(self):
initial_prompt=tb_initial_prompt,
temperature=sd_temperature,
compression_ratio_threshold=nb_compression_ratio_threshold,
vad_filter=cb_vad_filter)
vad_filter=cb_vad_filter,
threshold=sd_threshold,
min_speech_duration_ms=nb_min_speech_duration_ms,
max_speech_duration_s=nb_max_speech_duration_s,
min_silence_duration_ms=nb_min_silence_duration_ms,
window_size_sample=nb_window_size_sample,
speech_pad_ms=nb_speech_pad_ms)

btn_run.click(fn=self.whisper_inf.transcribe_file,
inputs=params + whisper_params.to_list(),
outputs=[tb_indicator, files_subtitles])
Expand All @@ -120,8 +135,15 @@ def launch(self):
with gr.Row():
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
interactive=True)
with gr.Accordion("Advanced_Parameters", open=False):
with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
with gr.Accordion("Advanced_Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
Expand Down Expand Up @@ -153,7 +175,13 @@ def launch(self):
initial_prompt=tb_initial_prompt,
temperature=sd_temperature,
compression_ratio_threshold=nb_compression_ratio_threshold,
vad_filter=cb_vad_filter)
vad_filter=cb_vad_filter,
threshold=sd_threshold,
min_speech_duration_ms=nb_min_speech_duration_ms,
max_speech_duration_s=nb_max_speech_duration_s,
min_silence_duration_ms=nb_min_silence_duration_ms,
window_size_sample=nb_window_size_sample,
speech_pad_ms=nb_speech_pad_ms)
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
inputs=params + whisper_params.to_list(),
outputs=[tb_indicator, files_subtitles])
Expand All @@ -173,8 +201,15 @@ def launch(self):
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
with gr.Row():
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
with gr.Accordion("Advanced_Parameters", open=False):
with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
with gr.Accordion("Advanced_Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
Expand Down Expand Up @@ -205,7 +240,13 @@ def launch(self):
initial_prompt=tb_initial_prompt,
temperature=sd_temperature,
compression_ratio_threshold=nb_compression_ratio_threshold,
vad_filter=cb_vad_filter)
vad_filter=cb_vad_filter,
threshold=sd_threshold,
min_speech_duration_ms=nb_min_speech_duration_ms,
max_speech_duration_s=nb_max_speech_duration_s,
min_silence_duration_ms=nb_min_silence_duration_ms,
window_size_sample=nb_window_size_sample,
speech_pad_ms=nb_speech_pad_ms)
btn_run.click(fn=self.whisper_inf.transcribe_mic,
inputs=params + whisper_params.to_list(),
outputs=[tb_indicator, files_subtitles])
Expand Down Expand Up @@ -284,6 +325,7 @@ def launch(self):
launch_args['server_port'] = self.args.server_port
if self.args.username and self.args.password:
launch_args['auth'] = (self.args.username, self.args.password)

self.app.queue(api_open=False).launch(**launch_args)


Expand Down
11 changes: 11 additions & 0 deletions modules/faster_whisper_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from datetime import datetime

import faster_whisper
from faster_whisper.vad import VadOptions
import ctranslate2
import whisper
import torch
Expand Down Expand Up @@ -260,6 +261,15 @@ def transcribe(self,
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
params.lang = language_code_dict[params.lang]

vad_options = VadOptions(
threshold=params.threshold,
min_speech_duration_ms=params.min_speech_duration_ms,
max_speech_duration_s=params.max_speech_duration_s,
min_silence_duration_ms=params.min_silence_duration_ms,
window_size_samples=params.window_size_samples,
speech_pad_ms=params.speech_pad_ms
)

segments, info = self.model.transcribe(
audio=audio,
language=params.lang,
Expand All @@ -272,6 +282,7 @@ def transcribe(self,
temperature=params.temperature,
compression_ratio_threshold=params.compression_ratio_threshold,
vad_filter=params.vad_filter,
vad_parameters=vad_options
)
progress(0, desc="Loading audio..")

Expand Down
39 changes: 39 additions & 0 deletions modules/whisper_parameter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ class WhisperGradioComponents:
temperature: gr.Slider
compression_ratio_threshold: gr.Number
vad_filter: gr.Checkbox
threshold: gr.Slider
min_speech_duration_ms: gr.Number
max_speech_duration_s: gr.Number
min_silence_duration_ms: gr.Number
window_size_sample: gr.Number
speech_pad_ms: gr.Number
"""
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
See more about Gradio pre-processing: https://www.gradio.app/docs/components
Expand Down Expand Up @@ -78,6 +84,33 @@ class WhisperGradioComponents:
Enable the voice activity detection (VAD) to filter out parts of the audio
without speech. This step is using the Silero VAD model
https://github.com/snakers4/silero-vad.
threshold: gr.Slider
This parameter is related to Silero VAD. Speech threshold.
Silero VAD outputs speech probabilities for each audio chunk,
probabilities ABOVE this value are considered as SPEECH. It is better to tune this
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
min_speech_duration_ms: gr.Number
This parameter is related to Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.
max_speech_duration_s: gr.Number
This parameter is related to Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
than max_speech_duration_s will be split at the timestamp of the last silence that
lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
split aggressively just before max_speech_duration_s.
min_silence_duration_ms: gr.Number
This parameter is related to Silero VAD. At the end of each speech chunk, wait for min_silence_duration_ms
before separating it.
window_size_sample: gr.Number
This parameter is related to Silero VAD. Audio chunks of window_size_samples size are fed to the Silero VAD model.
WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
Values other than these may affect model performance!!
speech_pad_ms: gr.Number
This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side.
"""

def to_list(self) -> list:
Expand Down Expand Up @@ -108,6 +141,12 @@ class WhisperValues:
temperature: float
compression_ratio_threshold: float
vad_filter: bool
threshold: float
min_speech_duration_ms: int
max_speech_duration_s: float
min_silence_duration_ms: int
window_size_samples: int
speech_pad_ms: int
"""
A data class to use Whisper parameters. Use "after" Gradio pre-processing.
See more about Gradio pre-processing: https://www.gradio.app/docs/components
Expand Down

0 comments on commit 6cba7cb

Please sign in to comment.