add example for silero vad

k2-fsa · Sep 15, 2023 · c3a50f2 · c3a50f2
1 parent f45e836
commit c3a50f2
Show file tree

Hide file tree

Showing 8 changed files with 298 additions and 24 deletions.
diff --git a/sherpa-onnx/csrc/session.cc b/sherpa-onnx/csrc/session.cc
@@ -76,4 +76,8 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) {
   return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider);
 }
 
+Ort::SessionOptions GetSessionOptions(const VadModelConfig &config) {
+  return GetSessionOptionsImpl(config.num_threads, config.provider);
+}
+
 }  // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/session.h b/sherpa-onnx/csrc/session.h
@@ -10,6 +10,7 @@
 #include "sherpa-onnx/csrc/offline-model-config.h"
 #include "sherpa-onnx/csrc/online-lm-config.h"
 #include "sherpa-onnx/csrc/online-model-config.h"
+#include "sherpa-onnx/csrc/vad-model-config.h"
 
 namespace sherpa_onnx {
 
@@ -20,6 +21,8 @@ Ort::SessionOptions GetSessionOptions(const OfflineModelConfig &config);
 Ort::SessionOptions GetSessionOptions(const OfflineLMConfig &config);
 
 Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config);
+
+Ort::SessionOptions GetSessionOptions(const VadModelConfig &config);
 }  // namespace sherpa_onnx
 
 #endif  // SHERPA_ONNX_CSRC_SESSION_H_
diff --git a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc
@@ -12,10 +12,7 @@
 #include <queue>
 
 #include "portaudio.h"  // NOLINT
-#include "sherpa-onnx/csrc/display.h"
 #include "sherpa-onnx/csrc/microphone.h"
-#include "sherpa-onnx/csrc/parse-options.h"
-#include "sherpa-onnx/csrc/vad-model-config.h"
 #include "sherpa-onnx/csrc/vad-model.h"
 
 bool stop = false;
@@ -28,12 +25,20 @@ static int32_t RecordCallback(const void *input_buffer,
                               const PaStreamCallbackTimeInfo * /*time_info*/,
                               PaStreamCallbackFlags /*status_flags*/,
                               void *user_data) {
+  int32_t window_size = *reinterpret_cast<int32_t *>(user_data);
+
   std::lock_guard<std::mutex> lock(mutex);
 
-  queue.emplace(
+  std::vector<float> samples(
       reinterpret_cast<const float *>(input_buffer),
       reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
 
+  if (!queue.empty() && queue.back().size() < window_size) {
+    queue.back().insert(queue.back().end(), samples.begin(), samples.end());
+  } else {
+    queue.push(std::move(samples));
+  }
+
   return stop ? paComplete : paContinue;
 }
 
@@ -109,30 +114,57 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
                     0,          // frames per buffer
                     paClipOff,  // we won't output out of range samples
                                 // so don't bother clipping them
-                    RecordCallback, nullptr);
+                    RecordCallback, &config.silero_vad.window_size);
   if (err != paNoError) {
     fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
     exit(EXIT_FAILURE);
   }
 
   err = Pa_StartStream(stream);
+
+  auto vad_model = sherpa_onnx::VadModel::Create(config);
+
   fprintf(stderr, "Started\n");
 
   if (err != paNoError) {
     fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
     exit(EXIT_FAILURE);
   }
 
+  int32_t speech_count = 0;
+  int32_t non_speech_count = 0;
   while (!stop) {
     {
       std::lock_guard<std::mutex> lock(mutex);
-      while (!queue.empty()) {
-        fprintf(stderr, "%d\n", (int)queue.size());
+      while (!queue.empty() &&
+             queue.front().size() >= config.silero_vad.window_size) {
+        bool is_speech =
+            vad_model->IsSpeech(queue.front().data(), queue.front().size());
+
         queue.pop();
+
+        if (is_speech) {
+          speech_count += 1;
+          non_speech_count = 0;
+        } else {
+          speech_count = 0;
+          non_speech_count += 1;
+        }
+
+        if (speech_count == 1) {
+          static int32_t k = 0;
+          ++k;
+          fprintf(stderr, "Detected speech: %d\n", k);
+        }
+
+        if (non_speech_count == 1) {
+          static int32_t k = 0;
+          ++k;
+          fprintf(stderr, "Detected non-speech: %d\n", k);
+        }
       }
     }
     Pa_Sleep(100);  // sleep for 100ms
-    stop = true;
   }
 
   err = Pa_CloseStream(stream);

diff --git a/sherpa-onnx/csrc/silero-vad-model-config.cc b/sherpa-onnx/csrc/silero-vad-model-config.cc
@@ -12,17 +12,22 @@ namespace sherpa_onnx {
 void SileroVadModelConfig::Register(ParseOptions *po) {
   po->Register("silero-vad-model", &model, "Path to silero VAD ONNX model.");
 
-  po->Register("silero-vad-prob", &prob,
+  po->Register("silero-vad-threshold", &threshold,
                "Speech threshold. Silero VAD outputs speech probabilities for "
                "each audio chunk, probabilities ABOVE this value are "
                "considered as SPEECH. It is better to tune this parameter for "
                "each dataset separately, but lazy "
                "0.5 is pretty good for most datasets.");
+
   po->Register(
       "silero-vad-min-silence-duration", &min_silence_duration,
       "In seconds.  In the end of each speech chunk wait for "
       "--silero-vad-min-silence-duration seconds before separating it");
 
+  po->Register("silero-vad-min-speech-duration", &min_speech_duration,
+               "In seconds.  In the end of each silence chunk wait for "
+               "--silero-vad-min-speech-duration seconds before separating it");
+
   po->Register(
       "silero-vad-window-size", &window_size,
       "In samples. Audio chunks of --silero-vad-window-size samples are fed "
@@ -43,15 +48,17 @@ bool SileroVadModelConfig::Validate() const {
     return false;
   }
 
-  if (prob < 0.01) {
+  if (threshold < 0.01) {
     SHERPA_ONNX_LOGE(
-        "Please use a larger value for --silero-vad-prob. Given: %f", prob);
+        "Please use a larger value for --silero-vad-threshold. Given: %f",
+        threshold);
     return false;
   }
 
-  if (prob >= 1) {
+  if (threshold >= 1) {
     SHERPA_ONNX_LOGE(
-        "Please use a smaller value for --silero-vad-prob. Given: %f", prob);
+        "Please use a smaller value for --silero-vad-threshold. Given: %f",
+        threshold);
     return false;
   }
 
@@ -63,8 +70,9 @@ std::string SileroVadModelConfig::ToString() const {
 
   os << "SilerVadModelConfig(";
   os << "model=\"" << model << "\", ";
-  os << "prob=" << prob << ", ";
+  os << "threshold=" << threshold << ", ";
   os << "min_silence_duration=" << min_silence_duration << ", ";
+  os << "min_speech_duration=" << min_speech_duration << ", ";
   os << "window_size=" << window_size << ")";
 
   return os.str();

diff --git a/sherpa-onnx/csrc/silero-vad-model-config.h b/sherpa-onnx/csrc/silero-vad-model-config.h
@@ -17,16 +17,18 @@ struct SileroVadModelConfig {
   //
   // If the predicted probability of a segment is larger than this
   // value, then it is classified as speech.
-  float prob = 0.5;
+  float threshold = 0.5;
 
-  float min_silence_duration = 0.1;  // in seconds
+  float min_silence_duration = 0.5;  // in seconds
+
+  float min_speech_duration = 0.25;  // in seconds
 
   // 512, 1024, 1536 samples for 16000 Hz
   // 256, 512, 768 samples for 8000 Hz
-  int window_size = 1536;  // in samples
+  int window_size = 512;  // in samples
 
   // support only 16000 and 8000
-  int32_t sample_rate = 16000;
+  int32_t sample_rate = 16000;  // not exposed to users
 
   SileroVadModelConfig() = default;