Skip to content

Commit

Permalink
add example for silero vad
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Sep 15, 2023
1 parent f45e836 commit c3a50f2
Show file tree
Hide file tree
Showing 8 changed files with 298 additions and 24 deletions.
4 changes: 4 additions & 0 deletions sherpa-onnx/csrc/session.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,8 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) {
return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider);
}

// Creates the ONNX Runtime session options for a VAD model.
//
// Forwards the thread count and the execution provider stored in `config`
// to GetSessionOptionsImpl() and returns whatever it produces.
Ort::SessionOptions GetSessionOptions(const VadModelConfig &config) {
  const auto &num_threads = config.num_threads;
  const auto &provider = config.provider;

  return GetSessionOptionsImpl(num_threads, provider);
}

} // namespace sherpa_onnx
3 changes: 3 additions & 0 deletions sherpa-onnx/csrc/session.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "sherpa-onnx/csrc/offline-model-config.h"
#include "sherpa-onnx/csrc/online-lm-config.h"
#include "sherpa-onnx/csrc/online-model-config.h"
#include "sherpa-onnx/csrc/vad-model-config.h"

namespace sherpa_onnx {

Expand All @@ -20,6 +21,8 @@ Ort::SessionOptions GetSessionOptions(const OfflineModelConfig &config);
Ort::SessionOptions GetSessionOptions(const OfflineLMConfig &config);

Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config);

Ort::SessionOptions GetSessionOptions(const VadModelConfig &config);
} // namespace sherpa_onnx

#endif // SHERPA_ONNX_CSRC_SESSION_H_
48 changes: 40 additions & 8 deletions sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@
#include <queue>

#include "portaudio.h" // NOLINT
#include "sherpa-onnx/csrc/display.h"
#include "sherpa-onnx/csrc/microphone.h"
#include "sherpa-onnx/csrc/parse-options.h"
#include "sherpa-onnx/csrc/vad-model-config.h"
#include "sherpa-onnx/csrc/vad-model.h"

bool stop = false;
Expand All @@ -28,12 +25,20 @@ static int32_t RecordCallback(const void *input_buffer,
const PaStreamCallbackTimeInfo * /*time_info*/,
PaStreamCallbackFlags /*status_flags*/,
void *user_data) {
int32_t window_size = *reinterpret_cast<int32_t *>(user_data);

std::lock_guard<std::mutex> lock(mutex);

queue.emplace(
std::vector<float> samples(
reinterpret_cast<const float *>(input_buffer),
reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);

if (!queue.empty() && queue.back().size() < window_size) {
queue.back().insert(queue.back().end(), samples.begin(), samples.end());
} else {
queue.push(std::move(samples));
}

return stop ? paComplete : paContinue;
}

Expand Down Expand Up @@ -109,30 +114,57 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
0, // frames per buffer
paClipOff, // we won't output out of range samples
// so don't bother clipping them
RecordCallback, nullptr);
RecordCallback, &config.silero_vad.window_size);
if (err != paNoError) {
fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
exit(EXIT_FAILURE);
}

err = Pa_StartStream(stream);

auto vad_model = sherpa_onnx::VadModel::Create(config);

fprintf(stderr, "Started\n");

if (err != paNoError) {
fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
exit(EXIT_FAILURE);
}

int32_t speech_count = 0;
int32_t non_speech_count = 0;
while (!stop) {
{
std::lock_guard<std::mutex> lock(mutex);
while (!queue.empty()) {
fprintf(stderr, "%d\n", (int)queue.size());
while (!queue.empty() &&
queue.front().size() >= config.silero_vad.window_size) {
bool is_speech =
vad_model->IsSpeech(queue.front().data(), queue.front().size());

queue.pop();

if (is_speech) {
speech_count += 1;
non_speech_count = 0;
} else {
speech_count = 0;
non_speech_count += 1;
}

if (speech_count == 1) {
static int32_t k = 0;
++k;
fprintf(stderr, "Detected speech: %d\n", k);
}

if (non_speech_count == 1) {
static int32_t k = 0;
++k;
fprintf(stderr, "Detected non-speech: %d\n", k);
}
}
}
Pa_Sleep(100); // sleep for 100ms
stop = true;
}

err = Pa_CloseStream(stream);
Expand Down
20 changes: 14 additions & 6 deletions sherpa-onnx/csrc/silero-vad-model-config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,22 @@ namespace sherpa_onnx {
void SileroVadModelConfig::Register(ParseOptions *po) {
po->Register("silero-vad-model", &model, "Path to silero VAD ONNX model.");

po->Register("silero-vad-prob", &prob,
po->Register("silero-vad-threshold", &threshold,
"Speech threshold. Silero VAD outputs speech probabilities for "
"each audio chunk, probabilities ABOVE this value are "
"considered as SPEECH. It is better to tune this parameter for "
"each dataset separately, but lazy "
"0.5 is pretty good for most datasets.");

po->Register(
"silero-vad-min-silence-duration", &min_silence_duration,
"In seconds. In the end of each speech chunk wait for "
"--silero-vad-min-silence-duration seconds before separating it");

po->Register("silero-vad-min-speech-duration", &min_speech_duration,
"In seconds. In the end of each silence chunk wait for "
"--silero-vad-min-speech-duration seconds before separating it");

po->Register(
"silero-vad-window-size", &window_size,
"In samples. Audio chunks of --silero-vad-window-size samples are fed "
Expand All @@ -43,15 +48,17 @@ bool SileroVadModelConfig::Validate() const {
return false;
}

if (prob < 0.01) {
if (threshold < 0.01) {
SHERPA_ONNX_LOGE(
"Please use a larger value for --silero-vad-prob. Given: %f", prob);
"Please use a larger value for --silero-vad-threshold. Given: %f",
threshold);
return false;
}

if (prob >= 1) {
if (threshold >= 1) {
SHERPA_ONNX_LOGE(
"Please use a smaller value for --silero-vad-prob. Given: %f", prob);
"Please use a smaller value for --silero-vad-threshold. Given: %f",
threshold);
return false;
}

Expand All @@ -63,8 +70,9 @@ std::string SileroVadModelConfig::ToString() const {

os << "SilerVadModelConfig(";
os << "model=\"" << model << "\", ";
os << "prob=" << prob << ", ";
os << "threshold=" << threshold << ", ";
os << "min_silence_duration=" << min_silence_duration << ", ";
os << "min_speech_duration=" << min_speech_duration << ", ";
os << "window_size=" << window_size << ")";

return os.str();
Expand Down
10 changes: 6 additions & 4 deletions sherpa-onnx/csrc/silero-vad-model-config.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,18 @@ struct SileroVadModelConfig {
//
// If the predicted probability of a segment is larger than this
// value, it is classified as speech.
float prob = 0.5;
float threshold = 0.5;

float min_silence_duration = 0.1; // in seconds
float min_silence_duration = 0.5; // in seconds

float min_speech_duration = 0.25; // in seconds

// 512, 1024, 1536 samples for 16000 Hz
// 256, 512, 768 samples for 8000 Hz
int window_size = 1536; // in samples
int window_size = 512; // in samples

// support only 16000 and 8000
int32_t sample_rate = 16000;
int32_t sample_rate = 16000; // not exposed to users

SileroVadModelConfig() = default;

Expand Down
Loading

0 comments on commit c3a50f2

Please sign in to comment.