Skip to content

Commit

Permalink
Merge pull request #214 from jhj0517/fix/limit-vad
Browse files Browse the repository at this point in the history
Limit VAD feature to faster-whisper
  • Loading branch information
jhj0517 authored Jul 14, 2024
2 parents 6f9cdbb + 174fcfd commit 7386da0
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 71 deletions.
4 changes: 2 additions & 2 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def create_whisper_parameters(self):
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
interactive=True)
with gr.Accordion("Advanced Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True,
nb_beam_size = gr.Number(label="Beam Size", value=5, precision=0, interactive=True,
info="Beam size to use for decoding.")
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True,
info="If the average log probability over sampled tokens is below this value, treat as failed.")
Expand Down Expand Up @@ -137,7 +137,7 @@ def create_whisper_parameters(self):
nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)

with gr.Accordion("VAD", open=False):
with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5,
info="Lower it to be more sensitive to small sounds.")
Expand Down
2 changes: 2 additions & 0 deletions modules/diarize/audio_loader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py

import os
import subprocess
from functools import lru_cache
Expand Down
2 changes: 2 additions & 0 deletions modules/diarize/diarize_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py

import numpy as np
import pandas as pd
import os
Expand Down
65 changes: 12 additions & 53 deletions modules/vad/silero_vad.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py

from faster_whisper.vad import VadOptions, get_vad_model
import numpy as np
from typing import BinaryIO, Union, List, Optional, Tuple
from typing import BinaryIO, Union, List, Optional
import warnings
import faster_whisper
import gradio as gr
Expand All @@ -15,7 +17,6 @@ def __init__(self):
def run(self,
audio: Union[str, BinaryIO, np.ndarray],
vad_parameters: VadOptions,
silence_non_speech: bool = True,
progress: gr.Progress = gr.Progress()):
"""
Run VAD
Expand All @@ -26,8 +27,6 @@ def run(self,
Audio path or file binary or Audio numpy array
vad_parameters:
Options for VAD processing.
silence_non_speech: bool
If True, non-speech parts will be silenced instead of being removed.
progress: gr.Progress
Indicator to show progress directly in gradio.
Expand All @@ -43,32 +42,19 @@ def run(self,
audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)

duration = audio.shape[0] / sampling_rate
duration_after_vad = duration

if vad_parameters is None:
vad_parameters = VadOptions()
elif isinstance(vad_parameters, dict):
vad_parameters = VadOptions(**vad_parameters)

speech_chunks = self.get_speech_timestamps(
audio=audio,
vad_options=vad_parameters,
progress=progress
)

audio, duration_diff = self.collect_chunks(
audio=audio,
chunks=speech_chunks,
silence_non_speech=silence_non_speech
)

if silence_non_speech:
print(
f"VAD filter silenced {self.format_timestamp(duration_diff)} of audio.",
)
else:
print(
f"VAD filter removed {self.format_timestamp(duration_diff)} of audio",
)
audio = self.collect_chunks(audio, speech_chunks)
duration_after_vad = audio.shape[0] / sampling_rate

return audio

Expand Down Expand Up @@ -224,41 +210,13 @@ def get_speech_timestamps(
def update_model(self):
self.model = get_vad_model()

def collect_chunks(
self,
audio: np.ndarray,
chunks: List[dict],
silence_non_speech: bool = True,
) -> Tuple[np.ndarray, float]:
"""Collects and concatenate audio chunks.
Args:
audio: One dimensional float array.
chunks: List of dictionaries containing start and end samples of speech chunks
silence_non_speech: If True, non-speech parts will be silenced instead of being removed.
Returns:
Tuple containing:
- Processed audio as a numpy array
- Duration of non-speech (silenced or removed) audio in seconds
"""
@staticmethod
def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
"""Collects and concatenates audio chunks."""
if not chunks:
return np.array([], dtype=np.float32), 0.0

total_samples = audio.shape[0]
speech_samples_count = sum(chunk["end"] - chunk["start"] for chunk in chunks)
non_speech_samples_count = total_samples - speech_samples_count
non_speech_duration = non_speech_samples_count / self.sampling_rate
return np.array([], dtype=np.float32)

if not silence_non_speech:
processed_audio = np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
else:
processed_audio = np.zeros_like(audio)
for chunk in chunks:
start, end = chunk['start'], chunk['end']
processed_audio[start:end] = audio[start:end]

return processed_audio, non_speech_duration
return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])

@staticmethod
def format_timestamp(
Expand All @@ -282,3 +240,4 @@ def format_timestamp(
return (
f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
)

18 changes: 17 additions & 1 deletion modules/whisper/faster_whisper_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,20 @@ def transcribe(self,
if not params.hotwords:
params.hotwords = None

vad_options = None
if params.vad_filter:
# Treat a value of 9999 or more from gr.Number() as float('inf')
if params.max_speech_duration_s >= 9999:
params.max_speech_duration_s = float('inf')

vad_options = VadOptions(
threshold=params.threshold,
min_speech_duration_ms=params.min_speech_duration_ms,
max_speech_duration_s=params.max_speech_duration_s,
min_silence_duration_ms=params.min_silence_duration_ms,
speech_pad_ms=params.speech_pad_ms
)

params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)

segments, info = self.model.transcribe(
Expand Down Expand Up @@ -100,7 +114,9 @@ def transcribe(self,
hotwords=params.hotwords,
language_detection_threshold=params.language_detection_threshold,
language_detection_segments=params.language_detection_segments,
prompt_reset_on_temperature=params.prompt_reset_on_temperature
prompt_reset_on_temperature=params.prompt_reset_on_temperature,
vad_filter=params.vad_filter,
vad_parameters=vad_options
)
progress(0, desc="Loading audio..")

Expand Down
15 changes: 0 additions & 15 deletions modules/whisper/whisper_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,21 +85,6 @@ def run(self,
"""
params = WhisperParameters.as_value(*whisper_params)

if params.vad_filter:
vad_options = VadOptions(
threshold=params.threshold,
min_speech_duration_ms=params.min_speech_duration_ms,
max_speech_duration_s=params.max_speech_duration_s,
min_silence_duration_ms=params.min_silence_duration_ms,
speech_pad_ms=params.speech_pad_ms
)
audio = self.vad.run(
audio=audio,
vad_parameters=vad_options,
silence_non_speech=True,
progress=progress
)

if params.lang == "Automatic Detection":
params.lang = None
else:
Expand Down

0 comments on commit 7386da0

Please sign in to comment.