Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Install faster-whisper directly from repository #428

Merged
merged 7 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 1 addition & 13 deletions backend/requirements-backend.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,5 @@
# Whisper-WebUI dependencies
--extra-index-url https://download.pytorch.org/whl/cu124
torch
torchaudio
git+https://github.com/jhj0517/jhj0517-whisper.git
faster-whisper==1.0.3
transformers
gradio
gradio-i18n
pytubefix
ruamel.yaml==0.18.6
pyannote.audio==3.3.2
git+https://github.com/jhj0517/ultimatevocalremover_api.git
git+https://github.com/jhj0517/pyrubberband.git
-r ./../requirements.txt

# Backend dependencies
python-dotenv
Expand Down
2 changes: 1 addition & 1 deletion configs/default_parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ whisper:
max_new_tokens: null
hallucination_silence_threshold: null
hotwords: null
language_detection_threshold: null
language_detection_threshold: 0.5
language_detection_segments: 1
add_timestamp: true

Expand Down
52 changes: 31 additions & 21 deletions modules/vad/silero_vad.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import numpy as np
from typing import BinaryIO, Union, List, Optional, Tuple
import warnings
import bisect
import faster_whisper
from modules.whisper.data_classes import *
from faster_whisper.transcribe import SpeechTimestampsMap
import gradio as gr

from modules.whisper.data_classes import *


class SileroVAD:
def __init__(self):
Expand Down Expand Up @@ -58,6 +60,7 @@ def run(self,
vad_options=vad_parameters,
progress=progress
)

audio = self.collect_chunks(audio, speech_chunks)
duration_after_vad = audio.shape[0] / sampling_rate

Expand Down Expand Up @@ -94,35 +97,27 @@ def get_speech_timestamps(
min_silence_duration_ms = vad_options.min_silence_duration_ms
window_size_samples = self.window_size_samples
speech_pad_ms = vad_options.speech_pad_ms
sampling_rate = 16000
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
speech_pad_samples = sampling_rate * speech_pad_ms / 1000
min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000
speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000
max_speech_samples = (
sampling_rate * max_speech_duration_s
self.sampling_rate * max_speech_duration_s
- window_size_samples
- 2 * speech_pad_samples
)
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000
min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000

audio_length_samples = len(audio)

state, context = self.model.get_initial_states(batch_size=1)

speech_probs = []
for current_start_sample in range(0, audio_length_samples, window_size_samples):
progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")

chunk = audio[current_start_sample: current_start_sample + window_size_samples]
if len(chunk) < window_size_samples:
chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
speech_probs.append(speech_prob)
padded_audio = np.pad(
audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
)
speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)

triggered = False
speeches = []
current_speech = {}
neg_threshold = threshold - 0.15
neg_threshold = vad_options.neg_threshold

# to save potential segment end (and tolerate some silence)
temp_end = 0
Expand Down Expand Up @@ -258,8 +253,23 @@ def restore_speech_timestamps(
ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

for segment in segments:
segment.start = ts_map.get_original_time(segment.start)
segment.end = ts_map.get_original_time(segment.end)
if segment.words:
words = []
for word in segment.words:
# Ensure the word start and end times are resolved to the same chunk.
middle = (word.start + word.end) / 2
chunk_index = ts_map.get_chunk_index(middle)
word.start = ts_map.get_original_time(word.start, chunk_index)
word.end = ts_map.get_original_time(word.end, chunk_index)
words.append(word)

segment.start = words[0].start
segment.end = words[-1].end
segment.words = words

else:
segment.start = ts_map.get_original_time(segment.start)
segment.end = ts_map.get_original_time(segment.end)

return segments

2 changes: 1 addition & 1 deletion modules/whisper/data_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ class WhisperParams(BaseParams):
)
hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
language_detection_threshold: Optional[float] = Field(
default=None,
default=0.5,
description="Threshold for language detection probability"
)
language_detection_segments: int = Field(
Expand Down
2 changes: 1 addition & 1 deletion notebook/whisper-webui.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
"!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
"%cd Whisper-WebUI\n",
"!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
"!pip install faster-whisper==1.0.3\n",
"!pip install git+https://github.com/SYSTRAN/faster-whisper.git\n",
"!pip install ctranslate2==4.4.0\n",
"!pip install gradio\n",
"!pip install gradio-i18n\n",
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
torch
torchaudio
git+https://github.com/jhj0517/jhj0517-whisper.git
faster-whisper==1.0.3
git+https://github.com/SYSTRAN/faster-whisper.git
transformers
gradio
gradio-i18n
Expand Down
Loading