From 728dcea047fbcd313f766d8f907657817096ac6f Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Mon, 13 May 2024 16:03:34 +0800
Subject: [PATCH] Add non-streaming ASR APIs for node-addon-api (#868)

---
 .github/scripts/test-nodejs-addon-npm.sh      |  36 +-
 nodejs-addon-examples/README.md               |  75 ++-
 .../test_asr_non_streaming_nemo_ctc.js        |  50 ++
 .../test_asr_non_streaming_paraformer.js      |  48 ++
 .../test_asr_non_streaming_transducer.js      |  52 ++
 .../test_asr_non_streaming_whisper.js         |  48 ++
 .../test_asr_streaming_paraformer.js          |  56 ++
 ...est_asr_streaming_paraformer_microphone.js | 104 ++++
 ...d_asr_non_streaming_nemo_ctc_microphone.js | 110 ++++
 ...asr_non_streaming_paraformer_microphone.js | 108 ++++
 ...asr_non_streaming_transducer_microphone.js | 113 ++++
 ...ad_asr_non_streaming_whisper_microphone.js | 109 ++++
 scripts/node-addon-api/CMakeLists.txt         |   1 +
 .../node-addon-api/lib/non-streaming-asr.js   |  40 ++
 scripts/node-addon-api/lib/sherpa-onnx.js     |   2 +
 .../node-addon-api/src/non-streaming-asr.cc   | 586 ++++++++++++++++++
 .../src/sherpa-onnx-node-addon-api.cc         |  10 +-
 scripts/node-addon-api/src/streaming-asr.cc   |  76 ++-
 18 files changed, 1588 insertions(+), 36 deletions(-)
 create mode 100644 nodejs-addon-examples/test_asr_non_streaming_nemo_ctc.js
 create mode 100644 nodejs-addon-examples/test_asr_non_streaming_paraformer.js
 create mode 100644 nodejs-addon-examples/test_asr_non_streaming_transducer.js
 create mode 100644 nodejs-addon-examples/test_asr_non_streaming_whisper.js
 create mode 100644 nodejs-addon-examples/test_asr_streaming_paraformer.js
 create mode 100644 nodejs-addon-examples/test_asr_streaming_paraformer_microphone.js
 create mode 100644 nodejs-addon-examples/test_vad_asr_non_streaming_nemo_ctc_microphone.js
 create mode 100644 nodejs-addon-examples/test_vad_asr_non_streaming_paraformer_microphone.js
 create mode 100644 nodejs-addon-examples/test_vad_asr_non_streaming_transducer_microphone.js
 create mode 100644 nodejs-addon-examples/test_vad_asr_non_streaming_whisper_microphone.js
 create mode 100644 scripts/node-addon-api/lib/non-streaming-asr.js
 create mode 100644 scripts/node-addon-api/src/non-streaming-asr.cc

diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh
index c941d7843..d4264296b 100755
--- a/.github/scripts/test-nodejs-addon-npm.sh
+++ b/.github/scripts/test-nodejs-addon-npm.sh
@@ -22,5 +22,39 @@ node ./test_asr_streaming_ctc.js
 
 # To decode with HLG.fst
 node ./test_asr_streaming_ctc_hlg.js
-
 rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+
+node ./test_asr_streaming_paraformer.js
+rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
+tar xvf sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
+rm sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
+
+node ./test_asr_non_streaming_transducer.js
+rm -rf sherpa-onnx-zipformer-en-2023-04-01
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+rm sherpa-onnx-whisper-tiny.en.tar.bz2
+
+node ./test_asr_non_streaming_whisper.js
+rm -rf sherpa-onnx-whisper-tiny.en
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+tar xvf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+rm sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+
+node ./test_asr_non_streaming_nemo_ctc.js
+rm -rf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+
+node ./test_asr_non_streaming_paraformer.js
+rm -rf sherpa-onnx-paraformer-zh-2023-03-28
diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md
index 86208513f..9cc352e76 100644
--- a/nodejs-addon-examples/README.md
+++ b/nodejs-addon-examples/README.md
@@ -39,7 +39,7 @@ npm install naudiodon2
 node ./test_vad_microphone.js
 ```
 
-## Streaming speech recognition with zipformer transducer
+## Streaming speech recognition with Zipformer transducer
 
 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
@@ -54,7 +54,7 @@ npm install naudiodon2
 node ./test_asr_streaming_transducer_microphone.js
 ```
 
-## Streaming speech recognition with zipformer CTC
+## Streaming speech recognition with Zipformer CTC
 
 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
@@ -72,3 +72,74 @@ npm install naudiodon2
 node ./test_asr_streaming_ctc_microphone.js
 node ./test_asr_streaming_ctc_hlg_microphone.js
 ```
+
+## Streaming speech recognition with Paraformer
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+
+node ./test_asr_streaming_paraformer.js
+
+# To run the test with a microphone, you need to install the package naudiodon2
+npm install naudiodon2
+
+node ./test_asr_streaming_paraformer_microphone.js
+```
+
+## Non-streaming speech recognition with Zipformer transducer
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
+tar xvf sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
+rm sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
+
+node ./test_asr_non_streaming_transducer.js
+
+# To run VAD + non-streaming ASR with Zipformer transducer using a microphone
+npm install naudiodon2
+node ./test_vad_asr_non_streaming_transducer_microphone.js
+```
+
+## Non-streaming speech recognition with Whisper
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+rm sherpa-onnx-whisper-tiny.en.tar.bz2
+
+node ./test_asr_non_streaming_whisper.js
+
+# To run VAD + non-streaming ASR with Whisper using a microphone
+npm install naudiodon2
+node ./test_vad_asr_non_streaming_whisper_microphone.js
+```
+
+## Non-streaming speech recognition with NeMo CTC models
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+tar xvf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+rm sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+
+node ./test_asr_non_streaming_nemo_ctc.js
+
+# To run VAD + non-streaming ASR with NeMo CTC using a microphone
+npm install naudiodon2
+node ./test_vad_asr_non_streaming_nemo_ctc_microphone.js
+```
+
+## Non-streaming speech recognition with Paraformer
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+
+node ./test_asr_non_streaming_paraformer.js
+
+# To run VAD + non-streaming ASR with Paraformer using a microphone
+npm install naudiodon2
+node ./test_vad_asr_non_streaming_paraformer_microphone.js
+```
diff --git a/nodejs-addon-examples/test_asr_non_streaming_nemo_ctc.js b/nodejs-addon-examples/test_asr_non_streaming_nemo_ctc.js
new file mode 100644
index 000000000..89991ee51
--- /dev/null
+++ b/nodejs-addon-examples/test_asr_non_streaming_nemo_ctc.js
@@ -0,0 +1,50 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'nemoCtc': {
+      'model':
+          './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx',
+    },
+    'tokens':
+        './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename =
+    './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/de-german.wav';
+
+const recognizer = new sherpa_onnx.OfflineRecognizer(config);
+console.log('Started')
+let start = performance.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+recognizer.decode(stream);
+result = recognizer.getResult(stream)
+let stop = performance.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
diff --git a/nodejs-addon-examples/test_asr_non_streaming_paraformer.js b/nodejs-addon-examples/test_asr_non_streaming_paraformer.js
new file mode 100644
index 000000000..46505b720
--- /dev/null
+++ b/nodejs-addon-examples/test_asr_non_streaming_paraformer.js
@@ -0,0 +1,48 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'paraformer': {
+      'model': './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx',
+    },
+    'tokens': './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename =
+    './sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/5-henan.wav';
+
+const recognizer = new sherpa_onnx.OfflineRecognizer(config);
+console.log('Started')
+let start = performance.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+recognizer.decode(stream);
+result = recognizer.getResult(stream)
+let stop = performance.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
diff --git a/nodejs-addon-examples/test_asr_non_streaming_transducer.js b/nodejs-addon-examples/test_asr_non_streaming_transducer.js
new file mode 100644
index 000000000..d44586238
--- /dev/null
+++ b/nodejs-addon-examples/test_asr_non_streaming_transducer.js
@@ -0,0 +1,52 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'transducer': {
+      'encoder':
+          './sherpa-onnx-zipformer-en-2023-04-01/encoder-epoch-99-avg-1.int8.onnx',
+      'decoder':
+          './sherpa-onnx-zipformer-en-2023-04-01/decoder-epoch-99-avg-1.onnx',
+      'joiner':
+          './sherpa-onnx-zipformer-en-2023-04-01/joiner-epoch-99-avg-1.int8.onnx',
+    },
+    'tokens': './sherpa-onnx-zipformer-en-2023-04-01/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename = './sherpa-onnx-zipformer-en-2023-04-01/test_wavs/1.wav';
+
+const recognizer = new sherpa_onnx.OfflineRecognizer(config);
+console.log('Started')
+let start = performance.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+recognizer.decode(stream);
+result = recognizer.getResult(stream)
+let stop = performance.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
diff --git a/nodejs-addon-examples/test_asr_non_streaming_whisper.js b/nodejs-addon-examples/test_asr_non_streaming_whisper.js
new file mode 100644
index 000000000..559bd2140
--- /dev/null
+++ b/nodejs-addon-examples/test_asr_non_streaming_whisper.js
@@ -0,0 +1,48 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'whisper': {
+      'encoder': './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
+      'decoder': './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
+    },
+    'tokens': './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename = './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav';
+
+const recognizer = new sherpa_onnx.OfflineRecognizer(config);
+console.log('Started')
+let start = performance.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+recognizer.decode(stream);
+result = recognizer.getResult(stream)
+let stop = performance.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
diff --git a/nodejs-addon-examples/test_asr_streaming_paraformer.js b/nodejs-addon-examples/test_asr_streaming_paraformer.js
new file mode 100644
index 000000000..5c57a8487
--- /dev/null
+++ b/nodejs-addon-examples/test_asr_streaming_paraformer.js
@@ -0,0 +1,56 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'paraformer': {
+      'encoder':
+          './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
+      'decoder':
+          './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
+    },
+    'tokens': './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename =
+    './sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav';
+
+const recognizer = new sherpa_onnx.OnlineRecognizer(config);
+console.log('Started')
+let start = performance.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+const tailPadding = new Float32Array(wave.sampleRate * 0.4);
+stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});
+
+while (recognizer.isReady(stream)) {
+  recognizer.decode(stream);
+}
+result = recognizer.getResult(stream)
+let stop = performance.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
diff --git a/nodejs-addon-examples/test_asr_streaming_paraformer_microphone.js b/nodejs-addon-examples/test_asr_streaming_paraformer_microphone.js
new file mode 100644
index 000000000..0c87ce9b2
--- /dev/null
+++ b/nodejs-addon-examples/test_asr_streaming_paraformer_microphone.js
@@ -0,0 +1,104 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createOnlineRecognizer() {
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'paraformer': {
+        'encoder':
+            './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
+        'decoder':
+            './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    },
+    'decodingMethod': 'greedy_search',
+    'maxActivePaths': 4,
+    'enableEndpoint': true,
+    'rule1MinTrailingSilence': 2.4,
+    'rule2MinTrailingSilence': 1.2,
+    'rule3MinUtteranceLength': 20
+  };
+
+  return new sherpa_onnx.OnlineRecognizer(config);
+}
+
+const recognizer = createOnlineRecognizer();
+const stream = recognizer.createStream();
+
+let lastText = '';
+let segmentIndex = 0;
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected, if
+                         // set false then just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: recognizer.config.featConfig.sampleRate
+  }
+});
+
+const display = new sherpa_onnx.Display(50);
+
+ai.on('data', data => {
+  const samples = new Float32Array(data.buffer);
+
+  stream.acceptWaveform(
+      {sampleRate: recognizer.config.featConfig.sampleRate, samples: samples});
+
+  while (recognizer.isReady(stream)) {
+    recognizer.decode(stream);
+  }
+
+  const isEndpoint = recognizer.isEndpoint(stream);
+  let text = recognizer.getResult(stream).text.toLowerCase();
+
+  if (isEndpoint) {
+    // For online paraformer models, we have to manually add tail padding
+    // at each endpoint so that the last word can be recognized
+    const tailPadding =
+        new Float32Array(recognizer.config.featConfig.sampleRate * 0.4);
+    stream.acceptWaveform({
+      samples: tailPadding,
+      sampleRate: recognizer.config.featConfig.sampleRate
+    });
+    while (recognizer.isReady(stream)) {
+      recognizer.decode(stream);
+    }
+    text = recognizer.getResult(stream).text.toLowerCase();
+  }
+
+  if (text.length > 0 && lastText != text) {
+    lastText = text;
+    display.print(segmentIndex, lastText);
+  }
+  if (isEndpoint) {
+    if (text.length > 0) {
+      lastText = text;
+      segmentIndex += 1;
+    }
+    recognizer.reset(stream)
+  }
+});
+
+ai.on('close', () => {
+  console.log('Free resources');
+  stream.free();
+  recognizer.free();
+});
+
+ai.start();
+console.log('Started! Please speak')
diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_nemo_ctc_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_nemo_ctc_microphone.js
new file mode 100644
index 000000000..4567994c4
--- /dev/null
+++ b/nodejs-addon-examples/test_vad_asr_non_streaming_nemo_ctc_microphone.js
@@ -0,0 +1,110 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'nemoCtc': {
+        'model':
+            './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx',
+      },
+      'tokens':
+          './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+const bufferSizeInSeconds = 30;
+const buffer =
+    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected, if
+                         // set false then just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: vad.config.sampleRate
+  }
+});
+
+let printed = false;
+let index = 0;
+ai.on('data', data => {
+  const windowSize = vad.config.sileroVad.windowSize;
+  buffer.push(new Float32Array(data.buffer));
+  while (buffer.size() > windowSize) {
+    const samples = buffer.get(buffer.head(), windowSize);
+    buffer.pop(windowSize);
+    vad.acceptWaveform(samples);
+  }
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+    const stream = recognizer.createStream();
+    stream.acceptWaveform({
+      samples: segment.samples,
+      sampleRate: recognizer.config.featConfig.sampleRate
+    });
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${index}: ${text}`);
+
+      const filename = `${index}-${text}-${
+          new Date()
+              .toLocaleTimeString('en-US', {hour12: false})
+              .split(' ')[0]}.wav`;
+      sherpa_onnx.writeWave(
+          filename,
+          {samples: segment.samples, sampleRate: vad.config.sampleRate})
+
+      index += 1;
+    }
+  }
+});
+
+ai.start();
+console.log('Started! Please speak')
diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_paraformer_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_paraformer_microphone.js
new file mode 100644
index 000000000..7e133b4fd
--- /dev/null
+++ b/nodejs-addon-examples/test_vad_asr_non_streaming_paraformer_microphone.js
@@ -0,0 +1,108 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'paraformer': {
+        'model': './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+const bufferSizeInSeconds = 30;
+const buffer =
+    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected, if
+                         // set false then just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: vad.config.sampleRate
+  }
+});
+
+let printed = false;
+let index = 0;
+ai.on('data', data => {
+  const windowSize = vad.config.sileroVad.windowSize;
+  buffer.push(new Float32Array(data.buffer));
+  while (buffer.size() > windowSize) {
+    const samples = buffer.get(buffer.head(), windowSize);
+    buffer.pop(windowSize);
+    vad.acceptWaveform(samples);
+  }
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+    const stream = recognizer.createStream();
+    stream.acceptWaveform({
+      samples: segment.samples,
+      sampleRate: recognizer.config.featConfig.sampleRate
+    });
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${index}: ${text}`);
+
+      const filename = `${index}-${text}-${
+          new Date()
+              .toLocaleTimeString('en-US', {hour12: false})
+              .split(' ')[0]}.wav`;
+      sherpa_onnx.writeWave(
+          filename,
+          {samples: segment.samples, sampleRate: vad.config.sampleRate})
+
+      index += 1;
+    }
+  }
+});
+
+ai.start();
+console.log('Started! Please speak')
diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_transducer_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_transducer_microphone.js
new file mode 100644
index 000000000..c554e1e90
--- /dev/null
+++ b/nodejs-addon-examples/test_vad_asr_non_streaming_transducer_microphone.js
@@ -0,0 +1,113 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'transducer': {
+        'encoder':
+            './sherpa-onnx-zipformer-en-2023-04-01/encoder-epoch-99-avg-1.int8.onnx',
+        'decoder':
+            './sherpa-onnx-zipformer-en-2023-04-01/decoder-epoch-99-avg-1.onnx',
+        'joiner':
+            './sherpa-onnx-zipformer-en-2023-04-01/joiner-epoch-99-avg-1.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-zipformer-en-2023-04-01/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+const bufferSizeInSeconds = 30;
+const buffer =
+    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected, if
+                         // set false then just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: vad.config.sampleRate
+  }
+});
+
+let printed = false;
+let index = 0;
+ai.on('data', data => {
+  const windowSize = vad.config.sileroVad.windowSize;
+  buffer.push(new Float32Array(data.buffer));
+  while (buffer.size() > windowSize) {
+    const samples = buffer.get(buffer.head(), windowSize);
+    buffer.pop(windowSize);
+    vad.acceptWaveform(samples);
+  }
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+    const stream = recognizer.createStream();
+    stream.acceptWaveform({
+      samples: segment.samples,
+      sampleRate: recognizer.config.featConfig.sampleRate
+    });
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${index}: ${text}`);
+
+      const filename = `${index}-${text}-${
+          new Date()
+              .toLocaleTimeString('en-US', {hour12: false})
+              .split(' ')[0]}.wav`;
+      sherpa_onnx.writeWave(
+          filename,
+          {samples: segment.samples, sampleRate: vad.config.sampleRate})
+
+      index += 1;
+    }
+  }
+});
+
+ai.start();
+console.log('Started! Please speak')
diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_whisper_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_whisper_microphone.js
new file mode 100644
index 000000000..0261c5cbd
--- /dev/null
+++ b/nodejs-addon-examples/test_vad_asr_non_streaming_whisper_microphone.js
@@ -0,0 +1,109 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'whisper': {
+        'encoder': './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
+        'decoder': './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+const bufferSizeInSeconds = 30;
+const buffer =
+    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected, if
+                         // set false then just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: vad.config.sampleRate
+  }
+});
+
+let printed = false;
+let index = 0;
+ai.on('data', data => {
+  const windowSize = vad.config.sileroVad.windowSize;
+  buffer.push(new Float32Array(data.buffer));
+  while (buffer.size() > windowSize) {
+    const samples = buffer.get(buffer.head(), windowSize);
+    buffer.pop(windowSize);
+    vad.acceptWaveform(samples);
+  }
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+    const stream = recognizer.createStream();
+    stream.acceptWaveform({
+      samples: segment.samples,
+      sampleRate: recognizer.config.featConfig.sampleRate
+    });
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${index}: ${text}`);
+
+      const filename = `${index}-${text}-${
+          new Date()
+              .toLocaleTimeString('en-US', {hour12: false})
+              .split(' ')[0]}.wav`;
+      sherpa_onnx.writeWave(
+          filename,
+          {samples: segment.samples, sampleRate: vad.config.sampleRate})
+
+      index += 1;
+    }
+  }
+});
+
+ai.start();
+console.log('Started! Please speak')
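Note: the four VAD + microphone examples above all open the default capture device (deviceId: -1). If you need a specific microphone, naudiodon2 can enumerate devices. A minimal sketch follows; the exact fields returned by getDevices() can vary by platform and naudiodon2 version, and the 'USB' name filter is purely illustrative, so treat this as an assumption to verify rather than part of the patch:

```js
const portAudio = require('naudiodon2');

// List capture-capable devices; each entry typically carries an id,
// a name, and channel counts.
const devices = portAudio.getDevices();
devices.forEach((d) => console.log(d.id, d.name, 'in:', d.maxInputChannels));

// Pick the first input device whose name matches (hypothetical filter).
const mic = devices.find(
    (d) => d.maxInputChannels > 0 && d.name.includes('USB'));

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,
    deviceId: mic ? mic.id : -1,  // fall back to the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: 16000
  }
});
```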
diff --git a/scripts/node-addon-api/CMakeLists.txt b/scripts/node-addon-api/CMakeLists.txt
index d8656081a..3668f9558 100644
--- a/scripts/node-addon-api/CMakeLists.txt
+++ b/scripts/node-addon-api/CMakeLists.txt
@@ -18,6 +18,7 @@ add_definitions(-DNAPI_VERSION=3)
 include_directories(${CMAKE_JS_INC})
 
 set(srcs
+  src/non-streaming-asr.cc
   src/sherpa-onnx-node-addon-api.cc
   src/streaming-asr.cc
   src/vad.cc
diff --git a/scripts/node-addon-api/lib/non-streaming-asr.js b/scripts/node-addon-api/lib/non-streaming-asr.js
new file mode 100644
index 000000000..852343e64
--- /dev/null
+++ b/scripts/node-addon-api/lib/non-streaming-asr.js
@@ -0,0 +1,40 @@
+const addon = require('./addon.js');
+
+class OfflineStream {
+  constructor(handle) {
+    this.handle = handle;
+  }
+
+  // obj is {samples: samples, sampleRate: sampleRate}
+  // samples is a float32 array containing samples in the range [-1, 1]
+  // sampleRate is a number
+  acceptWaveform(obj) {
+    addon.acceptWaveformOffline(this.handle, obj)
+  }
+}
+
+class OfflineRecognizer {
+  constructor(config) {
+    this.handle = addon.createOfflineRecognizer(config);
+    this.config = config
+  }
+
+  createStream() {
+    const handle = addon.createOfflineStream(this.handle);
+    return new OfflineStream(handle);
+  }
+
+  decode(stream) {
+    addon.decodeOfflineStream(this.handle, stream.handle);
+  }
+
+  getResult(stream) {
+    const jsonStr = addon.getOfflineStreamResultAsJson(stream.handle);
+
+    return JSON.parse(jsonStr);
+  }
+}
+
+module.exports = {
+  OfflineRecognizer,
+}
diff --git a/scripts/node-addon-api/lib/sherpa-onnx.js b/scripts/node-addon-api/lib/sherpa-onnx.js
index 7619b26ff..a8840a298 100644
--- a/scripts/node-addon-api/lib/sherpa-onnx.js
+++ b/scripts/node-addon-api/lib/sherpa-onnx.js
@@ -1,9 +1,11 @@
 const addon = require('./addon.js')
 const streaming_asr = require('./streaming-asr.js');
+const non_streaming_asr = require('./non-streaming-asr.js');
 const vad = require('./vad.js');
 
 module.exports = {
   OnlineRecognizer: streaming_asr.OnlineRecognizer,
+  OfflineRecognizer: non_streaming_asr.OfflineRecognizer,
   readWave: addon.readWave,
   writeWave: addon.writeWave,
   Display: streaming_asr.Display,
diff --git a/scripts/node-addon-api/src/non-streaming-asr.cc b/scripts/node-addon-api/src/non-streaming-asr.cc
new file mode 100644
index 000000000..a1fc2f37c
--- /dev/null
+++ b/scripts/node-addon-api/src/non-streaming-asr.cc
@@ -0,0 +1,586 @@
+// scripts/node-addon-api/src/non-streaming-asr.cc
+//
+// Copyright (c) 2024 Xiaomi Corporation
+#include <sstream>
+
+#include "napi.h"  // NOLINT
+#include "sherpa-onnx/c-api/c-api.h"
+
+// defined in ./streaming-asr.cc
+SherpaOnnxFeatureConfig GetFeatureConfig(Napi::Object obj);
+
+static SherpaOnnxOfflineTransducerModelConfig GetOfflineTransducerModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineTransducerModelConfig config;
+  memset(&config, 0, sizeof(config));
+
+  if (!obj.Has("transducer") || !obj.Get("transducer").IsObject()) {
+    return config;
+  }
+
+  Napi::Object o = obj.Get("transducer").As<Napi::Object>();
+
+  if (o.Has("encoder") && o.Get("encoder").IsString()) {
+    Napi::String encoder = o.Get("encoder").As<Napi::String>();
+    std::string s = encoder.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.encoder = p;
+  }
+
+  if (o.Has("decoder") && o.Get("decoder").IsString()) {
+    Napi::String decoder = o.Get("decoder").As<Napi::String>();
+    std::string s = decoder.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.decoder = p;
+  }
+
+  if (o.Has("joiner") && o.Get("joiner").IsString()) {
+    Napi::String joiner = o.Get("joiner").As<Napi::String>();
+    std::string s = joiner.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.joiner = p;
+  }
+
+  return config;
+}
+
+static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineParaformerModelConfig config;
+  memset(&config, 0, sizeof(config));
+
+  if (!obj.Has("paraformer") || !obj.Get("paraformer").IsObject()) {
+    return config;
+  }
+
+  Napi::Object o = obj.Get("paraformer").As<Napi::Object>();
+
+  if (o.Has("model") && o.Get("model").IsString()) {
+    Napi::String model = o.Get("model").As<Napi::String>();
+    std::string s = model.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.model = p;
+  }
+
+  return config;
+}
+
+static SherpaOnnxOfflineNemoEncDecCtcModelConfig GetOfflineNeMoCtcModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineNemoEncDecCtcModelConfig config;
+  memset(&config, 0, sizeof(config));
+
+  if (!obj.Has("nemoCtc") || !obj.Get("nemoCtc").IsObject()) {
+    return config;
+  }
+
+  Napi::Object o = obj.Get("nemoCtc").As<Napi::Object>();
+
+  if (o.Has("model") && o.Get("model").IsString()) {
+    Napi::String model = o.Get("model").As<Napi::String>();
+    std::string s = model.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.model = p;
+  }
+
+  return config;
+}
+
+static SherpaOnnxOfflineWhisperModelConfig GetOfflineWhisperModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineWhisperModelConfig config;
+  memset(&config, 0, sizeof(config));
+
+  if (!obj.Has("whisper") || !obj.Get("whisper").IsObject()) {
+    return config;
+  }
+
+  Napi::Object o = obj.Get("whisper").As<Napi::Object>();
+
+  if (o.Has("encoder") && o.Get("encoder").IsString()) {
+    Napi::String encoder = o.Get("encoder").As<Napi::String>();
+    std::string s = encoder.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.encoder = p;
+  }
+
+  if (o.Has("decoder") && o.Get("decoder").IsString()) {
+    Napi::String decoder = o.Get("decoder").As<Napi::String>();
+    std::string s = decoder.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.decoder = p;
+  }
+
+  if (o.Has("language") && o.Get("language").IsString()) {
+    Napi::String language = o.Get("language").As<Napi::String>();
+    std::string s = language.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.language = p;
+  }
+
+  if (o.Has("task") && o.Get("task").IsString()) {
+    Napi::String task = o.Get("task").As<Napi::String>();
+    std::string s = task.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.task = p;
+  }
+
+  return config;
+}
+
+static SherpaOnnxOfflineTdnnModelConfig GetOfflineTdnnModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineTdnnModelConfig config;
+  memset(&config, 0, sizeof(config));
+
+  if (!obj.Has("tdnn") || !obj.Get("tdnn").IsObject()) {
+    return config;
+  }
+
+  Napi::Object o = obj.Get("tdnn").As<Napi::Object>();
+
+  if (o.Has("model") && o.Get("model").IsString()) {
+    Napi::String model = o.Get("model").As<Napi::String>();
+    std::string s = model.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.model = p;
+  }
+
+  return config;
+}
+
+static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
+  SherpaOnnxOfflineModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("modelConfig") || !obj.Get("modelConfig").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("modelConfig").As<Napi::Object>();
+
+  c.transducer = GetOfflineTransducerModelConfig(o);
+  c.paraformer = GetOfflineParaformerModelConfig(o);
+  c.nemo_ctc = GetOfflineNeMoCtcModelConfig(o);
+  c.whisper = GetOfflineWhisperModelConfig(o);
+  c.tdnn = GetOfflineTdnnModelConfig(o);
+
+  if (o.Has("tokens") && o.Get("tokens").IsString()) {
+    Napi::String tokens = o.Get("tokens").As<Napi::String>();
+    std::string s = tokens.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    c.tokens = p;
+  }
+
+  if (o.Has("numThreads") && o.Get("numThreads").IsNumber()) {
+    c.num_threads = o.Get("numThreads").As<Napi::Number>().Int32Value();
+  }
+
+  if (o.Has("debug") &&
+      (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
+    if (o.Get("debug").IsBoolean()) {
+      c.debug = o.Get("debug").As<Napi::Boolean>().Value();
+    } else {
+      c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
+    }
+  }
+
+  if (o.Has("provider") && o.Get("provider").IsString()) {
+    Napi::String provider = o.Get("provider").As<Napi::String>();
+    std::string s = provider.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    c.provider = p;
+  }
+
+  if (o.Has("modelType") && o.Get("modelType").IsString()) {
+    Napi::String model_type = o.Get("modelType").As<Napi::String>();
+    std::string s = model_type.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    c.model_type = p;
+  }
+
+  return c;
+}
+
+static SherpaOnnxOfflineLMConfig GetOfflineLMConfig(Napi::Object obj) {
+  SherpaOnnxOfflineLMConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("lmConfig") || !obj.Get("lmConfig").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("lmConfig").As<Napi::Object>();
+
+  if (o.Has("model") && o.Get("model").IsString()) {
+    Napi::String model = o.Get("model").As<Napi::String>();
+    std::string s = model.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    c.model = p;
+  }
+
+  if (o.Has("scale") && o.Get("scale").IsNumber()) {
+    c.scale = o.Get("scale").As<Napi::Number>().FloatValue();
+  }
+
+  return c;
+}
+
+static Napi::External<SherpaOnnxOfflineRecognizer>
+CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 1) {
+    std::ostringstream os;
+    os << "Expect only 1 argument. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[0].IsObject()) {
+    Napi::TypeError::New(env, "Expect an object as the argument")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  Napi::Object o = info[0].As<Napi::Object>();
+
+  SherpaOnnxOfflineRecognizerConfig c;
+  memset(&c, 0, sizeof(c));
+  c.feat_config = GetFeatureConfig(o);
+  c.model_config = GetOfflineModelConfig(o);
+  c.lm_config = GetOfflineLMConfig(o);
+
+  if (o.Has("decodingMethod") && o.Get("decodingMethod").IsString()) {
+    Napi::String decoding_method = o.Get("decodingMethod").As<Napi::String>();
+    std::string s = decoding_method.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    c.decoding_method = p;
+  }
+
+  if (o.Has("maxActivePaths") && o.Get("maxActivePaths").IsNumber()) {
+    c.max_active_paths =
+        o.Get("maxActivePaths").As<Napi::Number>().Int32Value();
+  }
+
+  if (o.Has("hotwordsFile") && o.Get("hotwordsFile").IsString()) {
+    Napi::String hotwords_file = o.Get("hotwordsFile").As<Napi::String>();
+    std::string s = hotwords_file.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    c.hotwords_file = p;
+  }
+
+  if (o.Has("hotwordsScore") && o.Get("hotwordsScore").IsNumber()) {
+    c.hotwords_score = o.Get("hotwordsScore").As<Napi::Number>().FloatValue();
+  }
+
+  SherpaOnnxOfflineRecognizer *recognizer = CreateOfflineRecognizer(&c);
+
+  if (c.model_config.transducer.encoder) {
+    delete[] c.model_config.transducer.encoder;
+  }
+
+  if (c.model_config.transducer.decoder) {
+    delete[] c.model_config.transducer.decoder;
+  }
+
+  if (c.model_config.transducer.joiner) {
+    delete[] c.model_config.transducer.joiner;
+  }
+
+  if (c.model_config.paraformer.model) {
+    delete[] c.model_config.paraformer.model;
+  }
+
+  if (c.model_config.nemo_ctc.model) {
+    delete[] c.model_config.nemo_ctc.model;
+  }
+
+  if (c.model_config.whisper.encoder) {
+    delete[] c.model_config.whisper.encoder;
+  }
+
+  if (c.model_config.whisper.decoder) {
+    delete[] c.model_config.whisper.decoder;
+  }
+
+  if (c.model_config.whisper.language) {
+    delete[] c.model_config.whisper.language;
+  }
+
+  if (c.model_config.whisper.task) {
+    delete[] c.model_config.whisper.task;
+  }
+
+  if (c.model_config.tdnn.model) {
+    delete[] c.model_config.tdnn.model;
+  }
+
+  if (c.model_config.tokens) {
+    delete[] c.model_config.tokens;
+  }
+
+  if (c.model_config.provider) {
+    delete[] c.model_config.provider;
+  }
+
+  if (c.model_config.model_type) {
+    delete[] c.model_config.model_type;
+  }
+
+  if (c.lm_config.model) {
+    delete[] c.lm_config.model;
+  }
+
+  if (c.decoding_method) {
+    delete[] c.decoding_method;
+  }
+
+  if (c.hotwords_file) {
+    delete[] c.hotwords_file;
+  }
+
+  if (!recognizer) {
+    Napi::TypeError::New(env, "Please check your config!")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  return Napi::External<SherpaOnnxOfflineRecognizer>::New(
+      env, recognizer,
+      [](Napi::Env env, SherpaOnnxOfflineRecognizer *recognizer) {
+        DestroyOfflineRecognizer(recognizer);
+      });
+}
+
+static Napi::External<SherpaOnnxOfflineStream> CreateOfflineStreamWrapper(
+    const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 1) {
+    std::ostringstream os;
+    os << "Expect only 1 argument. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[0].IsExternal()) {
+    Napi::TypeError::New(
+        env,
+        "You should pass an offline recognizer pointer as the only argument")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  SherpaOnnxOfflineRecognizer *recognizer =
+      info[0].As<Napi::External<SherpaOnnxOfflineRecognizer>>().Data();
+
+  SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer);
+
+  return Napi::External<SherpaOnnxOfflineStream>::New(
+      env, stream, [](Napi::Env env, SherpaOnnxOfflineStream *stream) {
+        DestroyOfflineStream(stream);
+      });
+}
+
+static void AcceptWaveformOfflineWrapper(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+
+  if (info.Length() != 2) {
+    std::ostringstream os;
+    os << "Expect only 2 arguments. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  if (!info[0].IsExternal()) {
+    Napi::TypeError::New(env,
+                         "Argument 0 should be an offline stream pointer.")
+        .ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  SherpaOnnxOfflineStream *stream =
+      info[0].As<Napi::External<SherpaOnnxOfflineStream>>().Data();
+
+  if (!info[1].IsObject()) {
+    Napi::TypeError::New(env, "Argument 1 should be an object")
+        .ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  Napi::Object obj = info[1].As<Napi::Object>();
+
+  if (!obj.Has("samples")) {
+    Napi::TypeError::New(env,
+                         "The argument object should have a field samples")
+        .ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  if (!obj.Get("samples").IsTypedArray()) {
+    Napi::TypeError::New(env, "The object['samples'] should be a typed array")
+        .ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  if (!obj.Has("sampleRate")) {
+    Napi::TypeError::New(env,
+                         "The argument object should have a field sampleRate")
+        .ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  if (!obj.Get("sampleRate").IsNumber()) {
+    Napi::TypeError::New(env, "The object['sampleRate'] should be a number")
+        .ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  Napi::Float32Array samples = obj.Get("samples").As<Napi::Float32Array>();
+  int32_t sample_rate = obj.Get("sampleRate").As<Napi::Number>().Int32Value();
+
+  AcceptWaveformOffline(stream, sample_rate, samples.Data(),
+                        samples.ElementLength());
+}
+
+static void DecodeOfflineStreamWrapper(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 2) {
+    std::ostringstream os;
+    os << "Expect only 2 arguments. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  if (!info[0].IsExternal()) {
+    Napi::TypeError::New(env,
+                         "Argument 0 should be an offline recognizer pointer.")
+        .ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  if (!info[1].IsExternal()) {
+    Napi::TypeError::New(env,
+                         "Argument 1 should be an offline stream pointer.")
+        .ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  SherpaOnnxOfflineRecognizer *recognizer =
+      info[0].As<Napi::External<SherpaOnnxOfflineRecognizer>>().Data();
+
+  SherpaOnnxOfflineStream *stream =
+      info[1].As<Napi::External<SherpaOnnxOfflineStream>>().Data();
+
+  DecodeOfflineStream(recognizer, stream);
+}
+
+static Napi::String GetOfflineStreamResultAsJsonWrapper(
+    const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 1) {
+    std::ostringstream os;
+    os << "Expect only 1 argument. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[0].IsExternal()) {
+    Napi::TypeError::New(env,
+                         "Argument 0 should be an offline stream pointer.")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  SherpaOnnxOfflineStream *stream =
+      info[0].As<Napi::External<SherpaOnnxOfflineStream>>().Data();
+
+  const char *json = GetOfflineStreamResultAsJson(stream);
+  Napi::String s = Napi::String::New(env, json);
+
+  DestroyOfflineStreamResultJson(json);
+
+  return s;
+}
+
+void InitNonStreamingAsr(Napi::Env env, Napi::Object exports) {
+  exports.Set(Napi::String::New(env, "createOfflineRecognizer"),
+              Napi::Function::New(env, CreateOfflineRecognizerWrapper));
+
+  exports.Set(Napi::String::New(env, "createOfflineStream"),
+              Napi::Function::New(env, CreateOfflineStreamWrapper));
+
+  exports.Set(Napi::String::New(env, "acceptWaveformOffline"),
+              Napi::Function::New(env, AcceptWaveformOfflineWrapper));
+
+  exports.Set(Napi::String::New(env, "decodeOfflineStream"),
+              Napi::Function::New(env, DecodeOfflineStreamWrapper));
+
+  exports.Set(Napi::String::New(env, "getOfflineStreamResultAsJson"),
+              Napi::Function::New(env, GetOfflineStreamResultAsJsonWrapper));
+}
diff --git a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc
index 97910fcd6..5e0211dd2 100644
--- a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc
+++ b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc
@@ -4,15 +4,21 @@
 #include "napi.h"  // NOLINT
 
 void InitStreamingAsr(Napi::Env env, Napi::Object exports);
+
+void InitNonStreamingAsr(Napi::Env env, Napi::Object exports);
+
+void InitVad(Napi::Env env, Napi::Object exports);
+
 void InitWaveReader(Napi::Env env, Napi::Object exports);
+
 void InitWaveWriter(Napi::Env env, Napi::Object exports);
-void InitVad(Napi::Env env, Napi::Object exports);
 
 Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitStreamingAsr(env, exports);
+  InitNonStreamingAsr(env, exports);
+  InitVad(env, exports);
   InitWaveReader(env, exports);
   InitWaveWriter(env, exports);
-  InitVad(env, exports);
 
   return exports;
 }
diff --git a/scripts/node-addon-api/src/streaming-asr.cc b/scripts/node-addon-api/src/streaming-asr.cc
index 9314c3a5f..fb17e62e3 100644
--- a/scripts/node-addon-api/src/streaming-asr.cc
+++ b/scripts/node-addon-api/src/streaming-asr.cc
@@ -13,7 +13,7 @@
     }
   };
  */
-static SherpaOnnxFeatureConfig GetFeatureConfig(Napi::Object obj) {
+SherpaOnnxFeatureConfig GetFeatureConfig(Napi::Object obj) {
   SherpaOnnxFeatureConfig config;
   memset(&config, 0, sizeof(config));
 
@@ -113,6 +113,39 @@ GetOnlineZipformer2CtcModelConfig(Napi::Object obj) {
   return config;
 }
 
+static SherpaOnnxOnlineParaformerModelConfig GetOnlineParaformerModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOnlineParaformerModelConfig config;
+  memset(&config, 0, sizeof(config));
+
+  if (!obj.Has("paraformer") || !obj.Get("paraformer").IsObject()) {
+    return config;
+  }
+
+  Napi::Object o = obj.Get("paraformer").As<Napi::Object>();
+
+  if (o.Has("encoder") && o.Get("encoder").IsString()) {
+    Napi::String encoder = o.Get("encoder").As<Napi::String>();
+    std::string s = encoder.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.encoder = p;
+  }
+
+  if (o.Has("decoder") && o.Get("decoder").IsString()) {
+    Napi::String decoder = o.Get("decoder").As<Napi::String>();
+    std::string s = decoder.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.decoder = p;
+  }
+
+  return config;
+}
+
 static SherpaOnnxOnlineModelConfig GetOnlineModelConfig(Napi::Object obj) {
   SherpaOnnxOnlineModelConfig config;
   memset(&config, 0, sizeof(config));
@@ -124,6 +157,7 @@ static SherpaOnnxOnlineModelConfig GetOnlineModelConfig(Napi::Object obj) {
   Napi::Object o = obj.Get("modelConfig").As<Napi::Object>();
 
   config.transducer = GetOnlineTransducerModelConfig(o);
+  config.paraformer = GetOnlineParaformerModelConfig(o);
   config.zipformer2_ctc = GetOnlineZipformer2CtcModelConfig(o);
 
   if (o.Has("tokens") && o.Get("tokens").IsString()) {
@@ -290,35 +324,6 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
 
   c.ctc_fst_decoder_config = GetCtcFstDecoderConfig(config);
 
-#if 0
-  printf("encoder: %s\n", c.model_config.transducer.encoder
-                              ? c.model_config.transducer.encoder
-                              : "no");
-  printf("decoder: %s\n", c.model_config.transducer.decoder
-                              ? c.model_config.transducer.decoder
-                              : "no");
-  printf("joiner: %s\n", c.model_config.transducer.joiner
-                             ? c.model_config.transducer.joiner
-                             : "no");
-
-  printf("tokens: %s\n", c.model_config.tokens ? c.model_config.tokens : "no");
-  printf("num_threads: %d\n", c.model_config.num_threads);
-  printf("provider: %s\n",
-         c.model_config.provider ? c.model_config.provider : "no");
-  printf("debug: %d\n", c.model_config.debug);
-  printf("model_type: %s\n",
-         c.model_config.model_type ? c.model_config.model_type : "no");
-
-  printf("decoding_method: %s\n", c.decoding_method ? c.decoding_method : "no");
-  printf("max_active_paths: %d\n", c.max_active_paths);
-  printf("enable_endpoint: %d\n", c.enable_endpoint);
-  printf("rule1_min_trailing_silence: %.3f\n", c.rule1_min_trailing_silence);
-  printf("rule2_min_trailing_silence: %.3f\n", c.rule2_min_trailing_silence);
-  printf("rule3_min_utterance_length: %.3f\n", c.rule3_min_utterance_length);
-  printf("hotwords_file: %s\n", c.hotwords_file ? c.hotwords_file : "no");
-  printf("hotwords_score: %.3f\n", c.hotwords_score);
-#endif
-
   SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&c);
 
   if (c.model_config.transducer.encoder) {
@@ -333,6 +338,14 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
     delete[] c.model_config.transducer.joiner;
   }
 
+  if (c.model_config.paraformer.encoder) {
+    delete[] c.model_config.paraformer.encoder;
+  }
+
+  if (c.model_config.paraformer.decoder) {
+    delete[] c.model_config.paraformer.decoder;
+  }
+
   if (c.model_config.zipformer2_ctc.model) {
     delete[] c.model_config.zipformer2_ctc.model;
   }
@@ -389,7 +402,8 @@ static Napi::External<SherpaOnnxOnlineStream> CreateOnlineStreamWrapper(
 
   if (!info[0].IsExternal()) {
     Napi::TypeError::New(
-        env, "You should pass a recognizer pointer as the only argument")
+        env,
+        "You should pass an online recognizer pointer as the only argument")
        .ThrowAsJavaScriptException();
 
     return {};
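For reference, here is the complete decode flow the new bindings enable from JavaScript, distilled from the test files added in this patch. All paths are placeholders for whichever offline model you downloaded; any one of the offline model sections (transducer, paraformer, nemoCtc, whisper, tdnn) may be filled in, and the others stay unset:

```js
const sherpa_onnx = require('sherpa-onnx-node');

// Build an offline (non-streaming) recognizer. The config keys mirror
// lib/non-streaming-asr.js and src/non-streaming-asr.cc above.
const recognizer = new sherpa_onnx.OfflineRecognizer({
  featConfig: {sampleRate: 16000, featureDim: 80},
  modelConfig: {
    whisper: {
      encoder: './path/to/encoder.onnx',  // placeholder path
      decoder: './path/to/decoder.onnx',  // placeholder path
    },
    tokens: './path/to/tokens.txt',  // placeholder path
    numThreads: 2,
    provider: 'cpu',
    debug: 1,
  }
});

const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave('./test.wav');  // placeholder path
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

// Unlike the online API, decoding is one-shot: no isReady() loop and
// no endpointing; getResult() parses the JSON produced by the C API.
recognizer.decode(stream);
console.log(recognizer.getResult(stream).text);
```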