From d162ca9eb8495ac1c37060c166ca77e6bccfad71 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 10 Apr 2024 21:00:35 +0800 Subject: [PATCH] Add C++ microphone examples for audio tagging (#749) --- .github/workflows/test-build-wheel.yaml | 2 +- .github/workflows/test-pip-install.yaml | 2 +- README.md | 33 ++- android/README.md | 16 +- .../asr-microphone-example/c-api-alsa.cc | 2 +- cmake/cmake_extension.py | 2 + ...microphone-with-endpoint-detection-alsa.py | 2 +- python-api-examples/vad-alsa.py | 2 +- .../vad-remove-non-speech-segments-alsa.py | 2 +- sherpa-onnx/csrc/CMakeLists.txt | 8 + sherpa-onnx/csrc/alsa.cc | 2 +- .../sherpa-onnx-alsa-offline-audio-tagging.cc | 190 ++++++++++++++ ...nnx-alsa-offline-speaker-identification.cc | 4 +- sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc | 6 +- sherpa-onnx/csrc/sherpa-onnx-alsa.cc | 2 +- .../csrc/sherpa-onnx-keyword-spotter-alsa.cc | 4 +- .../sherpa-onnx-keyword-spotter-microphone.cc | 40 ++- ...a-onnx-microphone-offline-audio-tagging.cc | 238 ++++++++++++++++++ ...crophone-offline-speaker-identification.cc | 37 ++- .../csrc/sherpa-onnx-microphone-offline.cc | 35 ++- sherpa-onnx/csrc/sherpa-onnx-microphone.cc | 34 ++- sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc | 2 +- .../sherpa-onnx-vad-microphone-offline-asr.cc | 48 +++- .../csrc/sherpa-onnx-vad-microphone.cc | 53 +++- 24 files changed, 706 insertions(+), 60 deletions(-) create mode 100644 sherpa-onnx/csrc/sherpa-onnx-alsa-offline-audio-tagging.cc create mode 100644 sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc diff --git a/.github/workflows/test-build-wheel.yaml b/.github/workflows/test-build-wheel.yaml index 54c265bbda..c7c36f871b 100644 --- a/.github/workflows/test-build-wheel.yaml +++ b/.github/workflows/test-build-wheel.yaml @@ -89,7 +89,7 @@ jobs: export PATH=/c/hostedtoolcache/windows/Python/3.8.10/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export 
PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH - export PATH=/c/hostedtoolcache/windows/Python/3.11.8/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.12.2/x64/bin:$PATH which sherpa-onnx diff --git a/.github/workflows/test-pip-install.yaml b/.github/workflows/test-pip-install.yaml index c79f3a8b34..381df814f1 100644 --- a/.github/workflows/test-pip-install.yaml +++ b/.github/workflows/test-pip-install.yaml @@ -67,7 +67,7 @@ jobs: export PATH=/c/hostedtoolcache/windows/Python/3.8.10/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH - export PATH=/c/hostedtoolcache/windows/Python/3.11.8/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.12.2/x64/bin:$PATH sherpa-onnx --help diff --git a/README.md b/README.md index 7ea1b638e6..4f4246ebb8 100644 --- a/README.md +++ b/README.md @@ -2,23 +2,48 @@ This repository supports running the following functions **locally** - - Speech-to-text (i.e., ASR) + - Speech-to-text (i.e., ASR); both streaming and non-streaming are supported - Text-to-speech (i.e., TTS) - Speaker identification + - Speaker verification + - Spoken language identification + - Audio tagging + - VAD (e.g., [silero-vad](https://github.com/snakers4/silero-vad)) on the following platforms and operating systems: - - Linux, macOS, Windows - - Android + - x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64) + - Linux, macOS, Windows, openKylin + - Android, WearOS - iOS - - Raspberry Pi + - NodeJS + - WebAssembly + - [Raspberry Pi](https://www.raspberrypi.com/) + - [RV1126](https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf) + - [LicheePi4A](https://sipeed.com/licheepi4a) + - [VisionFive 2](https://www.starfivetech.com/en/site/boards) + - 
[旭日X3派](https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html) - etc +with the following APIs + + - C++ + - C + - Python + - Go + - ``C#`` + - Javascript + - Java + - Kotlin + - Swift + # Useful links - Documentation: https://k2-fsa.github.io/sherpa/onnx/ - APK for the text-to-speech engine: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html - APK for speaker identification: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html +- APK for speech recognition: https://github.com/k2-fsa/sherpa-onnx/releases/ +- Bilibili 演示视频: https://search.bilibili.com/all?keyword=%E6%96%B0%E4%B8%80%E4%BB%A3Kaldi # How to reach us diff --git a/android/README.md b/android/README.md index 053ad66e00..705049f8f2 100644 --- a/android/README.md +++ b/android/README.md @@ -7,14 +7,22 @@ for usage. - [SherpaOnnx](./SherpaOnnx) It uses a streaming ASR model. - [SherpaOnnx2Pass](./SherpaOnnx2Pass) It uses a streaming ASR model - for the first pass and use a non-streaming ASR model for the second pass. + for the first pass and use a non-streaming ASR model for the second pass -- [SherpaOnnxVad](./SherpaOnnxVad) It demonstrates how to use a VAD +- [SherpaOnnxKws](./SherpaOnnxKws) It demonstrates how to use keyword spotting -- [SherpaOnnxVadAsr](./SherpaOnnxVadAsr) It uses a VAD with a non-streaming - ASR model. +- [SherpaOnnxSpeakerIdentification](./SherpaOnnxSpeakerIdentification) It demonstrates + how to use speaker identification - [SherpaOnnxTts](./SherpaOnnxTts) It is for standalone text-to-speech. - [SherpaOnnxTtsEngine](./SherpaOnnxTtsEngine) It is for text-to-speech engine; you can use it to replace the system TTS engine. + +- [SherpaOnnxVad](./SherpaOnnxVad) It demonstrates how to use a VAD + +- [SherpaOnnxVadAsr](./SherpaOnnxVadAsr) It uses a VAD with a non-streaming + ASR model. + +- [SherpaOnnxWebSocket](./SherpaOnnxWebSocket) It shows how to write a websocket + client for the Python streaming websocket server. 
diff --git a/c-api-examples/asr-microphone-example/c-api-alsa.cc b/c-api-examples/asr-microphone-example/c-api-alsa.cc index caa5d8c6bc..a1df63ad44 100644 --- a/c-api-examples/asr-microphone-example/c-api-alsa.cc +++ b/c-api-examples/asr-microphone-example/c-api-alsa.cc @@ -99,7 +99,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py index b78129b21f..ca57ff3039 100644 --- a/cmake/cmake_extension.py +++ b/cmake/cmake_extension.py @@ -50,6 +50,7 @@ def get_binaries(): "sherpa-onnx-keyword-spotter", "sherpa-onnx-microphone", "sherpa-onnx-microphone-offline", + "sherpa-onnx-microphone-offline-audio-tagging", "sherpa-onnx-microphone-offline-speaker-identification", "sherpa-onnx-offline", "sherpa-onnx-offline-language-identification", @@ -69,6 +70,7 @@ def get_binaries(): "sherpa-onnx-alsa-offline-speaker-identification", "sherpa-onnx-offline-tts-play-alsa", "sherpa-onnx-vad-alsa", + "sherpa-onnx-alsa-offline-audio-tagging", ] if is_windows(): diff --git a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py index 45962755fa..81d5ae9b56 100755 --- a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py +++ b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py @@ -123,7 +123,7 @@ def get_args(): Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 diff --git a/python-api-examples/vad-alsa.py b/python-api-examples/vad-alsa.py index 8f23d477e4..259869c016 
100755 --- a/python-api-examples/vad-alsa.py +++ b/python-api-examples/vad-alsa.py @@ -39,7 +39,7 @@ def get_args(): Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 diff --git a/python-api-examples/vad-remove-non-speech-segments-alsa.py b/python-api-examples/vad-remove-non-speech-segments-alsa.py index 34f88e40fe..6d93bb1e90 100755 --- a/python-api-examples/vad-remove-non-speech-segments-alsa.py +++ b/python-api-examples/vad-remove-non-speech-segments-alsa.py @@ -68,7 +68,7 @@ def get_args(): Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 5b2e5941c0..fe2a1a939a 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -264,6 +264,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc) add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc) add_executable(sherpa-onnx-vad-alsa sherpa-onnx-vad-alsa.cc alsa.cc) + add_executable(sherpa-onnx-alsa-offline-audio-tagging sherpa-onnx-alsa-offline-audio-tagging.cc alsa.cc) if(SHERPA_ONNX_ENABLE_TTS) @@ -276,6 +277,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-keyword-spotter-alsa sherpa-onnx-vad-alsa + sherpa-onnx-alsa-offline-audio-tagging ) if(SHERPA_ONNX_ENABLE_TTS) @@ -354,6 +356,11 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY) microphone.cc ) + add_executable(sherpa-onnx-microphone-offline-audio-tagging + sherpa-onnx-microphone-offline-audio-tagging.cc + 
microphone.cc + ) + if(BUILD_SHARED_LIBS) set(PA_LIB portaudio) else() @@ -365,6 +372,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY) sherpa-onnx-keyword-spotter-microphone sherpa-onnx-microphone-offline sherpa-onnx-microphone-offline-speaker-identification + sherpa-onnx-microphone-offline-audio-tagging sherpa-onnx-vad-microphone sherpa-onnx-vad-microphone-offline-asr ) diff --git a/sherpa-onnx/csrc/alsa.cc b/sherpa-onnx/csrc/alsa.cc index 3c883331ab..a65761099e 100644 --- a/sherpa-onnx/csrc/alsa.cc +++ b/sherpa-onnx/csrc/alsa.cc @@ -35,7 +35,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 diff --git a/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-audio-tagging.cc b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-audio-tagging.cc new file mode 100644 index 0000000000..6a5b701eac --- /dev/null +++ b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-audio-tagging.cc @@ -0,0 +1,190 @@ +// sherpa-onnx/csrc/sherpa-onnx-alsa-offline-audio-tagging.cc +// +// Copyright (c) 2022-2024 Xiaomi Corporation + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> + +#include <algorithm> +#include <mutex> // NOLINT +#include <thread> // NOLINT + +#include "sherpa-onnx/csrc/alsa.h" +#include "sherpa-onnx/csrc/audio-tagging.h" +#include "sherpa-onnx/csrc/macros.h" + +enum class State { + kIdle, + kRecording, + kDecoding, +}; + +State state = State::kIdle; + +// true to stop the program and exit +bool stop = false; + +std::vector<float> samples; // captured audio; guarded by samples_mutex +std::mutex samples_mutex; + +static void DetectKeyPress() { // each Enter advances kIdle to kRecording to kDecoding + SHERPA_ONNX_LOGE("Press Enter to start"); + int32_t key; + while (!stop && (key = getchar()) != EOF) { // leave on EOF instead of spinning on a closed stdin + if (key != 0x0a) { // react only to Enter (line feed) + continue; + } + + switch (state) { + case State::kIdle: + SHERPA_ONNX_LOGE("Start recording.
Press Enter to stop recording"); + state = State::kRecording; + { + std::lock_guard<std::mutex> lock(samples_mutex); + samples.clear(); // drop whatever was captured before this recording started + } + break; + case State::kRecording: + SHERPA_ONNX_LOGE("Stop recording. Decoding ..."); + state = State::kDecoding; + break; + case State::kDecoding: + break; + } + } +} + +static void Record(const char *device_name, int32_t expected_sample_rate) { + sherpa_onnx::Alsa alsa(device_name); + + if (alsa.GetExpectedSampleRate() != expected_sample_rate) { + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), + expected_sample_rate); + exit(-1); + } + + int32_t chunk = 0.1 * alsa.GetActualSampleRate(); + while (!stop) { + const std::vector<float> &s = alsa.Read(chunk); // take the lock only after Read returns + std::lock_guard<std::mutex> lock(samples_mutex); + samples.insert(samples.end(), s.begin(), s.end()); + } +} + +static void Handler(int32_t sig) { + stop = true; + fprintf(stderr, "\nCaught Ctrl + C. Press Enter to exit\n"); +} + +int32_t main(int32_t argc, char *argv[]) { + signal(SIGINT, Handler); + + const char *kUsageMessage = R"usage( +Audio tagging from microphone (Linux only). +Usage: + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 +tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 +rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 + +./bin/sherpa-onnx-alsa-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.onnx \ + --labels=./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv \ + device_name + +Please refer to +https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models +for a list of pre-trained models to download. + +The device name specifies which microphone to use in case there are several +on your system. You can use + + arecord -l + +to find all available microphones on your computer.
For instance, if it outputs + +**** List of CAPTURE Hardware Devices **** +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + +and if you want to select card 3 and device 0 on that card, please use: + + plughw:3,0 + +as the device_name. +)usage"; + + sherpa_onnx::ParseOptions po(kUsageMessage); + sherpa_onnx::AudioTaggingConfig config; + config.Register(&po); + + po.Read(argc, argv); + if (po.NumArgs() != 1) { + fprintf(stderr, "Please provide only 1 argument: the device name\n"); + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + fprintf(stderr, "%s\n", config.ToString().c_str()); + + if (!config.Validate()) { + fprintf(stderr, "Errors in config!\n"); + return -1; + } + + SHERPA_ONNX_LOGE("Creating audio tagger ..."); + sherpa_onnx::AudioTagging tagger(config); + SHERPA_ONNX_LOGE("Audio tagger created!"); + + std::string device_name = po.GetArg(1); + fprintf(stderr, "Use recording device: %s\n", device_name.c_str()); + + int32_t sample_rate = 16000; // fixed to 16000Hz for all models from icefall + + std::thread t2(Record, device_name.c_str(), sample_rate); + using namespace std::chrono_literals; // NOLINT + std::this_thread::sleep_for(100ms); // sleep for 100ms + std::thread t(DetectKeyPress); + + while (!stop) { + switch (state) { + case State::kIdle: + break; + case State::kRecording: + break; + case State::kDecoding: { + std::vector<float> buf; + { + std::lock_guard<std::mutex> lock(samples_mutex); + buf.swap(samples); // buf is empty, so this hands over the recording and leaves samples empty + } + SHERPA_ONNX_LOGE("Computing..."); + auto s = tagger.CreateStream(); + s->AcceptWaveform(sample_rate, buf.data(), buf.size()); + auto results = tagger.Compute(s.get()); + SHERPA_ONNX_LOGE("Result is:"); + + int32_t i = 0; + std::ostringstream os; + for (const auto &event : results) { + os << i << ": " << event.ToString() << "\n"; + i += 1; + } + + SHERPA_ONNX_LOGE("\n%s\n", os.str().c_str()); + + state = State::kIdle; + SHERPA_ONNX_LOGE("Press Enter to start"); + break; + } + } + +
std::this_thread::sleep_for(20ms); // sleep for 20ms + } + t.join(); + t2.join(); + + return 0; +} diff --git a/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-speaker-identification.cc b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-speaker-identification.cc index 76695d5cf7..f4702a836f 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-speaker-identification.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline-speaker-identification.cc @@ -71,8 +71,8 @@ static void Record(const char *device_name, int32_t expected_sample_rate) { int32_t chunk = 0.1 * alsa.GetActualSampleRate(); while (!stop) { - std::lock_guard lock(samples_mutex); const std::vector &s = alsa.Read(chunk); + std::lock_guard lock(samples_mutex); samples.insert(samples.end(), s.begin(), s.end()); } } @@ -193,7 +193,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 as the device_name. diff --git a/sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc index 2f24a21a6e..b69ec6cd18 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc @@ -68,8 +68,8 @@ static void Record(const char *device_name, int32_t expected_sample_rate) { int32_t chunk = 0.1 * alsa.GetActualSampleRate(); while (!stop) { - std::lock_guard lock(samples_mutex); const std::vector &s = alsa.Read(chunk); + std::lock_guard lock(samples_mutex); samples.insert(samples.end(), s.begin(), s.end()); } } @@ -119,7 +119,7 @@ Please refer to for a list of pre-trained models to download. The device name specifies which microphone to use in case there are several -on you system. You can use +on your system.
You can use arecord -l @@ -130,7 +130,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 diff --git a/sherpa-onnx/csrc/sherpa-onnx-alsa.cc b/sherpa-onnx/csrc/sherpa-onnx-alsa.cc index ccd909bb3f..a0c4e3d64b 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-alsa.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-alsa.cc @@ -52,7 +52,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 diff --git a/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-alsa.cc b/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-alsa.cc index 2e784ebb81..a909ff250b 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-alsa.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-alsa.cc @@ -40,7 +40,7 @@ Please refer to for a list of pre-trained models to download. The device name specifies which microphone to use in case there are several -on you system. You can use +on your system. 
You can use arecord -l @@ -51,7 +51,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 diff --git a/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-microphone.cc b/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-microphone.cc index 1f42da40a8..6100ba4515 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-microphone.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-keyword-spotter-microphone.cc @@ -10,10 +10,11 @@ #include "portaudio.h" // NOLINT #include "sherpa-onnx/csrc/display.h" -#include "sherpa-onnx/csrc/microphone.h" #include "sherpa-onnx/csrc/keyword-spotter.h" +#include "sherpa-onnx/csrc/microphone.h" bool stop = false; +float mic_sample_rate = 16000; static int32_t RecordCallback(const void *input_buffer, void * /*output_buffer*/, @@ -23,7 +24,8 @@ static int32_t RecordCallback(const void *input_buffer, void *user_data) { auto stream = reinterpret_cast(user_data); - stream->AcceptWaveform(16000, reinterpret_cast(input_buffer), + stream->AcceptWaveform(mic_sample_rate, + reinterpret_cast(input_buffer), frames_per_buffer); return stop ? paComplete : paContinue; @@ -80,14 +82,31 @@ for a list of pre-trained models to download. 
PaDeviceIndex num_devices = Pa_GetDeviceCount(); fprintf(stderr, "Num devices: %d\n", num_devices); - PaStreamParameters param; + int32_t device_index = Pa_GetDefaultInputDevice(); - param.device = Pa_GetDefaultInputDevice(); - if (param.device == paNoDevice) { + if (device_index == paNoDevice) { fprintf(stderr, "No default input device found\n"); + fprintf(stderr, "If you are using Linux, please switch to \n"); + fprintf(stderr, " ./bin/sherpa-onnx-keyword-spotter-alsa \n"); exit(EXIT_FAILURE); } - fprintf(stderr, "Use default device: %d\n", param.device); + + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); + if (pDeviceIndex) { + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); + device_index = atoi(pDeviceIndex); + } + + for (int32_t i = 0; i != num_devices; ++i) { + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, + info->name); + } + + PaStreamParameters param; + param.device = device_index; + + fprintf(stderr, "Use device: %d\n", param.device); const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); fprintf(stderr, " Name: %s\n", info->name); @@ -98,12 +117,19 @@ for a list of pre-trained models to download. 
param.suggestedLatency = info->defaultLowInputLatency; param.hostApiSpecificStreamInfo = nullptr; + + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); + if (pSampleRateStr) { + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); + mic_sample_rate = atof(pSampleRateStr); + } + float sample_rate = 16000; PaStream *stream; PaError err = Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ - sample_rate, + mic_sample_rate, 0, // frames per buffer paClipOff, // we won't output out of range samples // so don't bother clipping them diff --git a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc new file mode 100644 index 0000000000..169b995f82 --- /dev/null +++ b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc @@ -0,0 +1,238 @@ +// sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc +// +// Copyright (c) 2024 Xiaomi Corporation + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> + +#include <algorithm> +#include <cctype> // std::tolower +#include <mutex> // NOLINT +#include <thread> // NOLINT + +#include "portaudio.h" // NOLINT +#include "sherpa-onnx/csrc/audio-tagging.h" +#include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/microphone.h" + +enum class State { + kIdle, + kRecording, + kDecoding, +}; + +State state = State::kIdle; + +// true to stop the program and exit +bool stop = false; + +std::vector<float> samples; // captured audio; guarded by samples_mutex +std::mutex samples_mutex; + +static void DetectKeyPress() { // each Enter advances kIdle to kRecording to kDecoding + SHERPA_ONNX_LOGE("Press Enter to start"); + int32_t key; + while (!stop && (key = getchar()) != EOF) { // leave on EOF instead of spinning on a closed stdin + if (key != 0x0a) { // react only to Enter (line feed) + continue; + } + + switch (state) { + case State::kIdle: + SHERPA_ONNX_LOGE("Start recording. Press Enter to stop recording"); + state = State::kRecording; + { + std::lock_guard<std::mutex> lock(samples_mutex); + samples.clear(); // drop whatever was captured before this recording started + } + break; + case State::kRecording: + SHERPA_ONNX_LOGE("Stop recording.
Decoding ..."); + state = State::kDecoding; + break; + case State::kDecoding: + break; + } + } +} + +static int32_t RecordCallback(const void *input_buffer, + void * /*output_buffer*/, + unsigned long frames_per_buffer, // NOLINT + const PaStreamCallbackTimeInfo * /*time_info*/, + PaStreamCallbackFlags /*status_flags*/, + void *user_data) { + std::lock_guard<std::mutex> lock(samples_mutex); + + auto p = reinterpret_cast<const float *>(input_buffer); // the stream is opened as mono paFloat32 + samples.insert(samples.end(), p, p + frames_per_buffer); + + return stop ? paComplete : paContinue; +} + +static void Handler(int32_t sig) { + stop = true; + fprintf(stderr, "\nCaught Ctrl + C. Press Enter to exit\n"); +} + +int32_t main(int32_t argc, char *argv[]) { + signal(SIGINT, Handler); + + const char *kUsageMessage = R"usage( +Audio tagging from microphone. +Usage: + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 +tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 +rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 + +./bin/sherpa-onnx-microphone-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.onnx \ + --labels=./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv + +Please see +https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models +for more models.
+)usage"; + + sherpa_onnx::ParseOptions po(kUsageMessage); + sherpa_onnx::AudioTaggingConfig config; + config.Register(&po); + + po.Read(argc, argv); + if (po.NumArgs() != 0) { + fprintf(stderr, "\nThis program does not support positional arguments\n\n"); + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + fprintf(stderr, "%s\n", config.ToString().c_str()); + + if (!config.Validate()) { + fprintf(stderr, "Errors in config!\n"); + return -1; + } + + SHERPA_ONNX_LOGE("Creating audio tagger ..."); + sherpa_onnx::AudioTagging tagger(config); + SHERPA_ONNX_LOGE("Audio tagger created!"); + + sherpa_onnx::Microphone mic; + + PaDeviceIndex num_devices = Pa_GetDeviceCount(); + fprintf(stderr, "Num devices: %d\n", num_devices); + + int32_t device_index = Pa_GetDefaultInputDevice(); + + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); // read the override before validating + if (pDeviceIndex) { + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); + device_index = atoi(pDeviceIndex); + } + + if (device_index < 0 || device_index >= num_devices) { // paNoDevice or an out-of-range override + fprintf(stderr, "No usable input device found\n"); + fprintf(stderr, "If you are using Linux, please switch to \n"); + fprintf(stderr, " ./bin/sherpa-onnx-alsa-offline-audio-tagging \n"); + exit(EXIT_FAILURE); + } + + for (int32_t i = 0; i != num_devices; ++i) { + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); + fprintf(stderr, " %s %d %s\n", (i == device_index) ?
"*" : " ", i, + info->name); + } + + PaStreamParameters param; + param.device = device_index; + + fprintf(stderr, "Use device: %d\n", param.device); + + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); + fprintf(stderr, " Name: %s\n", info->name); + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels); + + param.channelCount = 1; + param.sampleFormat = paFloat32; + + param.suggestedLatency = info->defaultLowInputLatency; + param.hostApiSpecificStreamInfo = nullptr; + float mic_sample_rate = 16000; + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); + if (pSampleRateStr) { + mic_sample_rate = atof(pSampleRateStr); // parse first so the log reports the rate actually used + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); + } + + float sample_rate = 16000; + + PaStream *stream; + PaError err = + Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */ + mic_sample_rate, + 0, // frames per buffer + paClipOff, // we won't output out of range samples + // so don't bother clipping them + RecordCallback, nullptr); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + err = Pa_StartStream(stream); + fprintf(stderr, "Started\n"); + + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + std::thread t(DetectKeyPress); + while (!stop) { + switch (state) { + case State::kIdle: + break; + case State::kRecording: + break; + case State::kDecoding: { + std::vector<float> buf; + { + std::lock_guard<std::mutex> lock(samples_mutex); + buf.swap(samples); // buf is empty, so this hands over the recording and leaves samples empty + } + + SHERPA_ONNX_LOGE("Computing..."); + auto s = tagger.CreateStream(); + s->AcceptWaveform(mic_sample_rate, buf.data(), buf.size()); + auto results = tagger.Compute(s.get()); + + SHERPA_ONNX_LOGE("Result is:"); + + int32_t i = 0; + std::ostringstream os; + for (const auto &event : results) { + os << i << ": " << event.ToString() << "\n"; + i += 1; + } + + SHERPA_ONNX_LOGE("\n%s\n",
os.str().c_str()); + + state = State::kIdle; + SHERPA_ONNX_LOGE("Press Enter to start"); + break; + } + } + + Pa_Sleep(20); // sleep for 20ms + } + t.join(); + + err = Pa_CloseStream(stream); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-speaker-identification.cc b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-speaker-identification.cc index f525f5223d..769e8b5ff9 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-speaker-identification.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline-speaker-identification.cc @@ -223,14 +223,31 @@ Note that `zh` means Chinese, while `en` means English. PaDeviceIndex num_devices = Pa_GetDeviceCount(); fprintf(stderr, "Num devices: %d\n", num_devices); - PaStreamParameters param; - - param.device = Pa_GetDefaultInputDevice(); - if (param.device == paNoDevice) { + int32_t device_index = Pa_GetDefaultInputDevice(); + if (device_index == paNoDevice) { fprintf(stderr, "No default input device found\n"); + fprintf(stderr, "If you are using Linux, please switch to \n"); + fprintf(stderr, + " ./bin/sherpa-onnx-alsa-offline-speaker-identification \n"); exit(EXIT_FAILURE); } - fprintf(stderr, "Use default device: %d\n", param.device); + + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); + if (pDeviceIndex) { + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); + device_index = atoi(pDeviceIndex); + } + + for (int32_t i = 0; i != num_devices; ++i) { + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); + fprintf(stderr, " %s %d %s\n", (i == device_index) ?
"*" : " ", i, + info->name); + } + + PaStreamParameters param; + param.device = device_index; + + fprintf(stderr, "Use device: %d\n", param.device); const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); fprintf(stderr, " Name: %s\n", info->name); @@ -241,12 +258,18 @@ Note that `zh` means Chinese, while `en` means English. param.suggestedLatency = info->defaultLowInputLatency; param.hostApiSpecificStreamInfo = nullptr; + float mic_sample_rate = 16000; + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); + if (pSampleRateStr) { + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); + mic_sample_rate = atof(pSampleRateStr); + } float sample_rate = 16000; PaStream *stream; PaError err = Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ - sample_rate, + mic_sample_rate, 0, // frames per buffer paClipOff, // we won't output out of range samples // so don't bother clipping them @@ -279,7 +302,7 @@ Note that `zh` means Chinese, while `en` means English. } auto s = extractor.CreateStream(); - s->AcceptWaveform(sample_rate, buf.data(), buf.size()); + s->AcceptWaveform(mic_sample_rate, buf.data(), buf.size()); s->InputFinished(); auto embedding = extractor.Compute(s.get()); auto name = manager.Search(embedding.data(), threshold); diff --git a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline.cc b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline.cc index a587ffa44b..75ffa97f09 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-microphone-offline.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-microphone-offline.cc @@ -139,14 +139,31 @@ for a list of pre-trained models to download. 
PaDeviceIndex num_devices = Pa_GetDeviceCount(); fprintf(stderr, "Num devices: %d\n", num_devices); - PaStreamParameters param; + int32_t device_index = Pa_GetDefaultInputDevice(); - param.device = Pa_GetDefaultInputDevice(); - if (param.device == paNoDevice) { + if (device_index == paNoDevice) { fprintf(stderr, "No default input device found\n"); + fprintf(stderr, "If you are using Linux, please switch to \n"); + fprintf(stderr, " ./bin/sherpa-onnx-alsa-offline \n"); exit(EXIT_FAILURE); } - fprintf(stderr, "Use default device: %d\n", param.device); + + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); + if (pDeviceIndex) { + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); + device_index = atoi(pDeviceIndex); + } + + for (int32_t i = 0; i != num_devices; ++i) { + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, + info->name); + } + + PaStreamParameters param; + param.device = device_index; + + fprintf(stderr, "Use device: %d\n", param.device); const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); fprintf(stderr, " Name: %s\n", info->name); @@ -157,12 +174,18 @@ for a list of pre-trained models to download. param.suggestedLatency = info->defaultLowInputLatency; param.hostApiSpecificStreamInfo = nullptr; + float mic_sample_rate = 16000; + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); + if (pSampleRateStr) { + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); + mic_sample_rate = atof(pSampleRateStr); + } float sample_rate = 16000; PaStream *stream; PaError err = Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ - sample_rate, + mic_sample_rate, 0, // frames per buffer paClipOff, // we won't output out of range samples // so don't bother clipping them @@ -195,7 +218,7 @@ for a list of pre-trained models to download. 
} auto s = recognizer.CreateStream(); - s->AcceptWaveform(sample_rate, buf.data(), buf.size()); + s->AcceptWaveform(mic_sample_rate, buf.data(), buf.size()); recognizer.DecodeStream(s.get()); SHERPA_ONNX_LOGE("Decoding Done! Result is:"); SHERPA_ONNX_LOGE("%s", s->GetResult().text.c_str()); diff --git a/sherpa-onnx/csrc/sherpa-onnx-microphone.cc b/sherpa-onnx/csrc/sherpa-onnx-microphone.cc index bdb43a2046..cb8e4d8d93 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-microphone.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-microphone.cc @@ -15,6 +15,7 @@ #include "sherpa-onnx/csrc/online-recognizer.h" bool stop = false; +float mic_sample_rate = 16000; static int32_t RecordCallback(const void *input_buffer, void * /*output_buffer*/, @@ -24,7 +25,8 @@ static int32_t RecordCallback(const void *input_buffer, void *user_data) { auto stream = reinterpret_cast<sherpa_onnx::OnlineStream *>(user_data); - stream->AcceptWaveform(16000, reinterpret_cast<const float *>(input_buffer), + stream->AcceptWaveform(mic_sample_rate, + reinterpret_cast<const float *>(input_buffer), frames_per_buffer); return stop ? paComplete : paContinue; @@ -81,14 +83,31 @@ for a list of pre-trained models to download. 
PaDeviceIndex num_devices = Pa_GetDeviceCount(); fprintf(stderr, "Num devices: %d\n", num_devices); - PaStreamParameters param; + int32_t device_index = Pa_GetDefaultInputDevice(); - param.device = Pa_GetDefaultInputDevice(); - if (param.device == paNoDevice) { + if (device_index == paNoDevice) { fprintf(stderr, "No default input device found\n"); + fprintf(stderr, "If you are using Linux, please switch to \n"); + fprintf(stderr, " ./bin/sherpa-onnx-alsa \n"); exit(EXIT_FAILURE); } - fprintf(stderr, "Use default device: %d\n", param.device); + + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); + if (pDeviceIndex) { + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); + device_index = atoi(pDeviceIndex); + } + + for (int32_t i = 0; i != num_devices; ++i) { + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, + info->name); + } + + PaStreamParameters param; + param.device = device_index; + + fprintf(stderr, "Use device: %d\n", param.device); const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); fprintf(stderr, " Name: %s\n", info->name); @@ -99,6 +118,11 @@ for a list of pre-trained models to download. 
param.suggestedLatency = info->defaultLowInputLatency; param.hostApiSpecificStreamInfo = nullptr; + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); + if (pSampleRateStr) { + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); + mic_sample_rate = atof(pSampleRateStr); + } float sample_rate = 16000; PaStream *stream; diff --git a/sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc b/sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc index 31a3f39b05..47fa6119d7 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc @@ -47,7 +47,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] Subdevices: 1/1 Subdevice #0: subdevice #0 -and if you want to select card 3 and the device 0 on that card, please use: +and if you want to select card 3 and device 0 on that card, please use: plughw:3,0 diff --git a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc index e7d8c0349e..0632e81ae7 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc @@ -13,6 +13,7 @@ #include "sherpa-onnx/csrc/circular-buffer.h" #include "sherpa-onnx/csrc/microphone.h" #include "sherpa-onnx/csrc/offline-recognizer.h" +#include "sherpa-onnx/csrc/resample.h" #include "sherpa-onnx/csrc/voice-activity-detector.h" bool stop = false; @@ -115,14 +116,29 @@ to download models for offline ASR. 
PaDeviceIndex num_devices = Pa_GetDeviceCount(); fprintf(stderr, "Num devices: %d\n", num_devices); - PaStreamParameters param; + int32_t device_index = Pa_GetDefaultInputDevice(); - param.device = Pa_GetDefaultInputDevice(); - if (param.device == paNoDevice) { + if (device_index == paNoDevice) { fprintf(stderr, "No default input device found\n"); exit(EXIT_FAILURE); } - fprintf(stderr, "Use default device: %d\n", param.device); + + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); + if (pDeviceIndex) { + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); + device_index = atoi(pDeviceIndex); + } + + for (int32_t i = 0; i != num_devices; ++i) { + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, + info->name); + } + + PaStreamParameters param; + param.device = device_index; + + fprintf(stderr, "Use device: %d\n", param.device); const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); fprintf(stderr, " Name: %s\n", info->name); @@ -133,12 +149,27 @@ to download models for offline ASR. 
param.suggestedLatency = info->defaultLowInputLatency; param.hostApiSpecificStreamInfo = nullptr; + float mic_sample_rate = 16000; + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); + if (pSampleRateStr) { + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); + mic_sample_rate = atof(pSampleRateStr); + } float sample_rate = 16000; + std::unique_ptr<sherpa_onnx::LinearResample> resampler; + if (mic_sample_rate != sample_rate) { + float min_freq = std::min(mic_sample_rate, sample_rate); + float lowpass_cutoff = 0.99 * 0.5 * min_freq; + + int32_t lowpass_filter_width = 6; + resampler = std::make_unique<sherpa_onnx::LinearResample>( + mic_sample_rate, sample_rate, lowpass_cutoff, lowpass_filter_width); + } PaStream *stream; PaError err = Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */ - sample_rate, + mic_sample_rate, 0, // frames per buffer paClipOff, // we won't output out of range samples // so don't bother clipping them @@ -168,6 +199,13 @@ to download models for offline ASR. while (buffer.Size() >= window_size) { std::vector<float> samples = buffer.Get(buffer.Head(), window_size); buffer.Pop(window_size); + + if (resampler) { + std::vector<float> tmp; + resampler->Resample(samples.data(), samples.size(), true, &tmp); + samples = std::move(tmp); + } + vad->AcceptWaveform(samples.data(), samples.size()); } } diff --git a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc index da013b9e85..bf22f16939 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc @@ -12,6 +12,7 @@ #include "portaudio.h" // NOLINT #include "sherpa-onnx/csrc/circular-buffer.h" #include "sherpa-onnx/csrc/microphone.h" +#include "sherpa-onnx/csrc/resample.h" #include "sherpa-onnx/csrc/voice-activity-detector.h" #include "sherpa-onnx/csrc/wave-writer.h" @@ -76,14 +77,31 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx PaDeviceIndex num_devices = Pa_GetDeviceCount(); fprintf(stderr, "Num 
devices: %d\n", num_devices); - PaStreamParameters param; + int32_t device_index = Pa_GetDefaultInputDevice(); - param.device = Pa_GetDefaultInputDevice(); - if (param.device == paNoDevice) { + if (device_index == paNoDevice) { fprintf(stderr, "No default input device found\n"); + fprintf(stderr, "If you are using Linux, please switch to \n"); + fprintf(stderr, " ./bin/sherpa-onnx-vad-alsa \n"); exit(EXIT_FAILURE); } - fprintf(stderr, "Use default device: %d\n", param.device); + + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); + if (pDeviceIndex) { + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); + device_index = atoi(pDeviceIndex); + } + + for (int32_t i = 0; i != num_devices; ++i) { + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, + info->name); + } + + PaStreamParameters param; + param.device = device_index; + + fprintf(stderr, "Use device: %d\n", param.device); const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); fprintf(stderr, " Name: %s\n", info->name); @@ -94,12 +112,28 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx param.suggestedLatency = info->defaultLowInputLatency; param.hostApiSpecificStreamInfo = nullptr; + float mic_sample_rate = 16000; + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); + if (pSampleRateStr) { + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); + mic_sample_rate = atof(pSampleRateStr); + } float sample_rate = 16000; + std::unique_ptr<sherpa_onnx::LinearResample> resampler; + if (mic_sample_rate != sample_rate) { + float min_freq = std::min(mic_sample_rate, sample_rate); + float lowpass_cutoff = 0.99 * 0.5 * min_freq; + + int32_t lowpass_filter_width = 6; + resampler = std::make_unique<sherpa_onnx::LinearResample>( + mic_sample_rate, sample_rate, lowpass_cutoff, lowpass_filter_width); + } + PaStream *stream; PaError err = Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */ - sample_rate, + 
mic_sample_rate, 0, // frames per buffer paClipOff, // we won't output out of range samples // so don't bother clipping them @@ -131,6 +165,13 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx while (buffer.Size() >= window_size) { std::vector<float> samples = buffer.Get(buffer.Head(), window_size); buffer.Pop(window_size); + + if (resampler) { + std::vector<float> tmp; + resampler->Resample(samples.data(), samples.size(), true, &tmp); + samples = std::move(tmp); + } + vad->AcceptWaveform(samples.data(), samples.size()); if (vad->IsSpeechDetected() && !printed) { @@ -149,7 +190,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx char filename[128]; snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration); k += 1; - sherpa_onnx::WriteWave(filename, 16000, segment.samples.data(), + sherpa_onnx::WriteWave(filename, sample_rate, segment.samples.data(), segment.samples.size()); fprintf(stderr, "Saved to %s\n", filename); fprintf(stderr, "----------\n");