From 70d14353bbfb4068c2e86951c194d1c79d319c3d Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 21 Jul 2024 15:39:55 +0800 Subject: [PATCH] Add WebAssembly for SenseVoice (#1158) --- .github/scripts/test-nodejs-npm.sh | 7 + .github/workflows/test-nodejs.yaml | 9 + nodejs-examples/README.md | 15 + nodejs-examples/test-offline-nemo-ctc.js | 26 -- .../test-offline-paraformer-itn.js | 26 -- nodejs-examples/test-offline-paraformer.js | 27 -- nodejs-examples/test-offline-sense-voice.js | 101 ++++++ nodejs-examples/test-offline-transducer.js | 22 -- nodejs-examples/test-offline-tts-en.js | 4 - nodejs-examples/test-offline-tts-zh.js | 3 - nodejs-examples/test-offline-whisper.js | 23 -- .../test-online-paraformer-microphone.js | 18 -- nodejs-examples/test-online-paraformer.js | 18 -- nodejs-examples/test-online-transducer-itn.js | 17 - .../test-online-transducer-microphone.js | 17 - nodejs-examples/test-online-transducer.js | 17 - .../test-online-zipformer2-ctc-hlg.js | 15 - nodejs-examples/test-online-zipformer2-ctc.js | 20 -- wasm/asr/sherpa-onnx-asr.js | 295 +++++++++++++----- wasm/nodejs/sherpa-onnx-wasm-nodejs.cc | 10 +- wasm/tts/sherpa-onnx-tts.js | 44 ++- 21 files changed, 383 insertions(+), 351 deletions(-) create mode 100644 nodejs-examples/test-offline-sense-voice.js diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index b5822dd8e..006be8b0d 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -10,6 +10,13 @@ ls -lh ls -lh node_modules # offline asr +# +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + +node ./test-offline-sense-voice.js +rm -rf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 curl -LS -O 
https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 ls -lh diff --git a/.github/workflows/test-nodejs.yaml b/.github/workflows/test-nodejs.yaml index 4ae3014c4..21a0d0a5c 100644 --- a/.github/workflows/test-nodejs.yaml +++ b/.github/workflows/test-nodejs.yaml @@ -48,6 +48,11 @@ jobs: with: fetch-depth: 0 + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ matrix.os }}-${{ matrix.build_type }}-wasm-nodejs + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 @@ -77,6 +82,10 @@ jobs: env: NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} run: | + export CMAKE_CXX_COMPILER_LAUNCHER=ccache + export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" + cmake --version + ./build-wasm-simd-nodejs.sh cp -v build-wasm-simd-nodejs/install/bin/wasm/nodejs/*.js ./scripts/nodejs/ cp -v build-wasm-simd-nodejs/install/bin/wasm/nodejs/*.wasm ./scripts/nodejs/ diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 3399b8255..73a85de77 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -88,6 +88,21 @@ tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 node ./test-offline-paraformer.js ``` +## ./test-offline-sense-voice.js + +[./test-offline-sense-voice.js](./test-offline-sense-voice.js) demonstrates +how to decode a file with a non-streaming SenseVoice model. 
+ +You can use the following command to run it: + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + +node ./test-offline-sense-voice.js +``` + ## ./test-offline-transducer.js [./test-offline-transducer.js](./test-offline-transducer.js) demonstrates diff --git a/nodejs-examples/test-offline-nemo-ctc.js b/nodejs-examples/test-offline-nemo-ctc.js index 7ed15b009..fc18d4194 100644 --- a/nodejs-examples/test-offline-nemo-ctc.js +++ b/nodejs-examples/test-offline-nemo-ctc.js @@ -13,27 +13,9 @@ function createOfflineRecognizer() { }; let modelConfig = { - transducer: { - encoder: '', - decoder: '', - joiner: '', - }, - paraformer: { - model: '', - }, nemoCtc: { model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx', }, - whisper: { - encoder: '', - decoder: '', - language: '', - task: '', - tailPaddings: -1, - }, - tdnn: { - model: '', - }, tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt', numThreads: 1, debug: 0, @@ -41,19 +23,11 @@ function createOfflineRecognizer() { modelType: 'nemo_ctc', }; - let lmConfig = { - model: '', - scale: 1.0, - }; - let config = { featConfig: featConfig, modelConfig: modelConfig, - lmConfig: lmConfig, decodingMethod: 'greedy_search', maxActivePaths: 4, - hotwordsFile: '', - hotwordsScore: 1.5, }; return sherpa_onnx.createOfflineRecognizer(config); diff --git a/nodejs-examples/test-offline-paraformer-itn.js b/nodejs-examples/test-offline-paraformer-itn.js index 3acb69744..58856cc82 100644 --- a/nodejs-examples/test-offline-paraformer-itn.js +++ b/nodejs-examples/test-offline-paraformer-itn.js @@ -13,27 +13,9 @@ function createOfflineRecognizer() { }; let modelConfig = { - transducer: { - encoder: '', - decoder: '', - joiner: '', - }, paraformer: { model: 
'./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx', }, - nemoCtc: { - model: '', - }, - whisper: { - encoder: '', - decoder: '', - language: '', - task: '', - tailPaddings: -1, - }, - tdnn: { - model: '', - }, tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt', numThreads: 1, debug: 0, @@ -41,19 +23,11 @@ function createOfflineRecognizer() { modelType: 'paraformer', }; - let lmConfig = { - model: '', - scale: 1.0, - }; let config = { featConfig: featConfig, modelConfig: modelConfig, - lmConfig: lmConfig, decodingMethod: 'greedy_search', - maxActivePaths: 4, - hotwordsFile: '', - hotwordsScore: 1.5, // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst ruleFsts: './itn_zh_number.fst', }; diff --git a/nodejs-examples/test-offline-paraformer.js b/nodejs-examples/test-offline-paraformer.js index 800ebcbd8..d9286aaab 100644 --- a/nodejs-examples/test-offline-paraformer.js +++ b/nodejs-examples/test-offline-paraformer.js @@ -13,27 +13,9 @@ function createOfflineRecognizer() { }; let modelConfig = { - transducer: { - encoder: '', - decoder: '', - joiner: '', - }, paraformer: { model: './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx', }, - nemoCtc: { - model: '', - }, - whisper: { - encoder: '', - decoder: '', - language: '', - task: '', - tailPaddings: -1, - }, - tdnn: { - model: '', - }, tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt', numThreads: 1, debug: 0, @@ -41,19 +23,10 @@ function createOfflineRecognizer() { modelType: 'paraformer', }; - let lmConfig = { - model: '', - scale: 1.0, - }; - let config = { featConfig: featConfig, modelConfig: modelConfig, - lmConfig: lmConfig, decodingMethod: 'greedy_search', - maxActivePaths: 4, - hotwordsFile: '', - hotwordsScore: 1.5, }; return sherpa_onnx.createOfflineRecognizer(config); diff --git a/nodejs-examples/test-offline-sense-voice.js b/nodejs-examples/test-offline-sense-voice.js new file mode 100644 index 000000000..1c0c8bd01 --- /dev/null +++ 
b/nodejs-examples/test-offline-sense-voice.js @@ -0,0 +1,101 @@ +// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) + +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineRecognizer() { + let featConfig = { + sampleRate: 16000, + featureDim: 80, + }; + + let modelConfig = { + senseVoice: { + model: + './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx', + language: '', + useInverseTextNormalization: 1, + }, + tokens: './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt', + numThreads: 1, + debug: 0, + provider: 'cpu', + }; + + let config = { + featConfig: featConfig, + modelConfig: modelConfig, + decodingMethod: 'greedy_search', + }; + + return sherpa_onnx.createOfflineRecognizer(config); +} + + +const recognizer = createOfflineRecognizer(); +const stream = recognizer.createStream(); + +const waveFilename = + './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); +const buf = []; + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + if (sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error(`Only support sampleRate ${ + recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); + } + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channels}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {'highWaterMark': 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + + buf.push(floatSamples); + const flattened = + Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); + + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); + recognizer.decode(stream); + const text = recognizer.getResult(stream).text; + console.log(text); + + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + buf.push(floatSamples); + } +}); diff --git a/nodejs-examples/test-offline-transducer.js b/nodejs-examples/test-offline-transducer.js index 9ae6daabe..fddfa5890 100644 --- a/nodejs-examples/test-offline-transducer.js +++ b/nodejs-examples/test-offline-transducer.js @@ -21,22 +21,6 @@ function createOfflineRecognizer() { joiner: './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx', }, - paraformer: { - model: '', - }, - nemoCtc: { - model: '', - }, - whisper: { - encoder: '', - decoder: '', - language: '', - task: '', - tailPaddings: -1, - }, - tdnn: { - model: '', - }, tokens: './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt', numThreads: 1, debug: 0, @@ -44,15 +28,9 @@ function createOfflineRecognizer() { modelType: 'transducer', }; - let lmConfig = { - model: '', - scale: 1.0, - }; - let config = { featConfig: featConfig, modelConfig: modelConfig, - lmConfig: lmConfig, decodingMethod: 'greedy_search', maxActivePaths: 4, hotwordsFile: '', diff --git a/nodejs-examples/test-offline-tts-en.js 
b/nodejs-examples/test-offline-tts-en.js index b38b4aa03..61c23f5eb 100644 --- a/nodejs-examples/test-offline-tts-en.js +++ b/nodejs-examples/test-offline-tts-en.js @@ -5,10 +5,8 @@ const sherpa_onnx = require('sherpa-onnx'); function createOfflineTts() { let offlineTtsVitsModelConfig = { model: './vits-piper-en_US-amy-low/en_US-amy-low.onnx', - lexicon: '', tokens: './vits-piper-en_US-amy-low/tokens.txt', dataDir: './vits-piper-en_US-amy-low/espeak-ng-data', - dictDir: '', noiseScale: 0.667, noiseScaleW: 0.8, lengthScale: 1.0, @@ -22,8 +20,6 @@ function createOfflineTts() { let offlineTtsConfig = { offlineTtsModelConfig: offlineTtsModelConfig, - ruleFsts: '', - ruleFars: '', maxNumSentences: 1, }; diff --git a/nodejs-examples/test-offline-tts-zh.js b/nodejs-examples/test-offline-tts-zh.js index dd93fb070..7be148862 100644 --- a/nodejs-examples/test-offline-tts-zh.js +++ b/nodejs-examples/test-offline-tts-zh.js @@ -7,8 +7,6 @@ function createOfflineTts() { model: './vits-icefall-zh-aishell3/model.onnx', lexicon: './vits-icefall-zh-aishell3/lexicon.txt', tokens: './vits-icefall-zh-aishell3/tokens.txt', - dataDir: '', - dictDir: '', noiseScale: 0.667, noiseScaleW: 0.8, lengthScale: 1.0, @@ -31,7 +29,6 @@ function createOfflineTts() { return sherpa_onnx.createOfflineTts(offlineTtsConfig); } - const tts = createOfflineTts(); const speakerId = 66; const speed = 1.0; diff --git a/nodejs-examples/test-offline-whisper.js b/nodejs-examples/test-offline-whisper.js index 5a2147e3e..a8a90fb72 100644 --- a/nodejs-examples/test-offline-whisper.js +++ b/nodejs-examples/test-offline-whisper.js @@ -13,17 +13,6 @@ function createOfflineRecognizer() { }; let modelConfig = { - transducer: { - encoder: '', - decoder: '', - joiner: '', - }, - paraformer: { - model: '', - }, - nemoCtc: { - model: '', - }, whisper: { encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx', decoder: './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx', @@ -31,9 +20,6 @@ function 
createOfflineRecognizer() { task: 'transcribe', tailPaddings: -1, }, - tdnn: { - model: '', - }, tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt', numThreads: 1, debug: 0, @@ -41,19 +27,10 @@ function createOfflineRecognizer() { modelType: 'whisper', }; - let lmConfig = { - model: '', - scale: 1.0, - }; - let config = { featConfig: featConfig, modelConfig: modelConfig, - lmConfig: lmConfig, decodingMethod: 'greedy_search', - maxActivePaths: 4, - hotwordsFile: '', - hotwordsScore: 1.5, }; return sherpa_onnx.createOfflineRecognizer(config); diff --git a/nodejs-examples/test-online-paraformer-microphone.js b/nodejs-examples/test-online-paraformer-microphone.js index 072276468..a8fb596f6 100644 --- a/nodejs-examples/test-online-paraformer-microphone.js +++ b/nodejs-examples/test-online-paraformer-microphone.js @@ -6,12 +6,6 @@ console.log(portAudio.getDevices()); const sherpa_onnx = require('sherpa-onnx'); function createOnlineRecognizer() { - let onlineTransducerModelConfig = { - encoder: '', - decoder: '', - joiner: '', - }; - let onlineParaformerModelConfig = { encoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx', @@ -19,14 +13,8 @@ function createOnlineRecognizer() { './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx', }; - let onlineZipformer2CtcModelConfig = { - model: '', - }; - let onlineModelConfig = { - transducer: onlineTransducerModelConfig, paraformer: onlineParaformerModelConfig, - zipformer2Ctc: onlineZipformer2CtcModelConfig, tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt', numThreads: 1, provider: 'cpu', @@ -48,12 +36,6 @@ function createOnlineRecognizer() { rule1MinTrailingSilence: 2.4, rule2MinTrailingSilence: 1.2, rule3MinUtteranceLength: 20, - hotwordsFile: '', - hotwordsScore: 1.5, - ctcFstDecoderConfig: { - graph: '', - maxActive: 3000, - } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/nodejs-examples/test-online-paraformer.js 
b/nodejs-examples/test-online-paraformer.js index 5d1eae166..ff6cdc0f2 100644 --- a/nodejs-examples/test-online-paraformer.js +++ b/nodejs-examples/test-online-paraformer.js @@ -7,12 +7,6 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); function createOnlineRecognizer() { - let onlineTransducerModelConfig = { - encoder: '', - decoder: '', - joiner: '', - }; - let onlineParaformerModelConfig = { encoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx', @@ -20,14 +14,8 @@ function createOnlineRecognizer() { './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx', }; - let onlineZipformer2CtcModelConfig = { - model: '', - }; - let onlineModelConfig = { - transducer: onlineTransducerModelConfig, paraformer: onlineParaformerModelConfig, - zipformer2Ctc: onlineZipformer2CtcModelConfig, tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt', numThreads: 1, provider: 'cpu', @@ -49,12 +37,6 @@ function createOnlineRecognizer() { rule1MinTrailingSilence: 2.4, rule2MinTrailingSilence: 1.2, rule3MinUtteranceLength: 20, - hotwordsFile: '', - hotwordsScore: 1.5, - ctcFstDecoderConfig: { - graph: '', - maxActive: 3000, - } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/nodejs-examples/test-online-transducer-itn.js b/nodejs-examples/test-online-transducer-itn.js index 9bc5360a2..399ae89e3 100644 --- a/nodejs-examples/test-online-transducer-itn.js +++ b/nodejs-examples/test-online-transducer-itn.js @@ -16,19 +16,8 @@ function createOnlineRecognizer() { './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx', }; - let onlineParaformerModelConfig = { - encoder: '', - decoder: '', - }; - - let onlineZipformer2CtcModelConfig = { - model: '', - }; - let onlineModelConfig = { transducer: onlineTransducerModelConfig, - paraformer: onlineParaformerModelConfig, - zipformer2Ctc: onlineZipformer2CtcModelConfig, tokens: 
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', numThreads: 1, @@ -51,12 +40,6 @@ function createOnlineRecognizer() { rule1MinTrailingSilence: 2.4, rule2MinTrailingSilence: 1.2, rule3MinUtteranceLength: 20, - hotwordsFile: '', - hotwordsScore: 1.5, - ctcFstDecoderConfig: { - graph: '', - maxActive: 3000, - }, // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst ruleFsts: './itn_zh_number.fst', }; diff --git a/nodejs-examples/test-online-transducer-microphone.js b/nodejs-examples/test-online-transducer-microphone.js index 52eba8a99..f718eb61d 100644 --- a/nodejs-examples/test-online-transducer-microphone.js +++ b/nodejs-examples/test-online-transducer-microphone.js @@ -15,19 +15,8 @@ function createOnlineRecognizer() { './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx', }; - let onlineParaformerModelConfig = { - encoder: '', - decoder: '', - }; - - let onlineZipformer2CtcModelConfig = { - model: '', - }; - let onlineModelConfig = { transducer: onlineTransducerModelConfig, - paraformer: onlineParaformerModelConfig, - zipformer2Ctc: onlineZipformer2CtcModelConfig, tokens: './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', numThreads: 1, @@ -50,12 +39,6 @@ function createOnlineRecognizer() { rule1MinTrailingSilence: 2.4, rule2MinTrailingSilence: 1.2, rule3MinUtteranceLength: 20, - hotwordsFile: '', - hotwordsScore: 1.5, - ctcFstDecoderConfig: { - graph: '', - maxActive: 3000, - } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/nodejs-examples/test-online-transducer.js b/nodejs-examples/test-online-transducer.js index 2ca30ee2b..96d66a840 100644 --- a/nodejs-examples/test-online-transducer.js +++ b/nodejs-examples/test-online-transducer.js @@ -16,19 +16,8 @@ function createOnlineRecognizer() { './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx', }; - let 
onlineParaformerModelConfig = { - encoder: '', - decoder: '', - }; - - let onlineZipformer2CtcModelConfig = { - model: '', - }; - let onlineModelConfig = { transducer: onlineTransducerModelConfig, - paraformer: onlineParaformerModelConfig, - zipformer2Ctc: onlineZipformer2CtcModelConfig, tokens: './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', numThreads: 1, @@ -51,12 +40,6 @@ function createOnlineRecognizer() { rule1MinTrailingSilence: 2.4, rule2MinTrailingSilence: 1.2, rule3MinUtteranceLength: 20, - hotwordsFile: '', - hotwordsScore: 1.5, - ctcFstDecoderConfig: { - graph: '', - maxActive: 3000, - } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/nodejs-examples/test-online-zipformer2-ctc-hlg.js b/nodejs-examples/test-online-zipformer2-ctc-hlg.js index fed7b4e5c..66e8c6240 100644 --- a/nodejs-examples/test-online-zipformer2-ctc-hlg.js +++ b/nodejs-examples/test-online-zipformer2-ctc-hlg.js @@ -7,25 +7,12 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); function createOnlineRecognizer() { - let onlineTransducerModelConfig = { - encoder: '', - decoder: '', - joiner: '', - }; - - let onlineParaformerModelConfig = { - encoder: '', - decoder: '', - }; - let onlineZipformer2CtcModelConfig = { model: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx', }; let onlineModelConfig = { - transducer: onlineTransducerModelConfig, - paraformer: onlineParaformerModelConfig, zipformer2Ctc: onlineZipformer2CtcModelConfig, tokens: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt', numThreads: 1, @@ -48,8 +35,6 @@ function createOnlineRecognizer() { rule1MinTrailingSilence: 2.4, rule2MinTrailingSilence: 1.2, rule3MinUtteranceLength: 20, - hotwordsFile: '', - hotwordsScore: 1.5, ctcFstDecoderConfig: { graph: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst', maxActive: 3000, diff --git 
a/nodejs-examples/test-online-zipformer2-ctc.js b/nodejs-examples/test-online-zipformer2-ctc.js index ad239accd..3fddbcc89 100644 --- a/nodejs-examples/test-online-zipformer2-ctc.js +++ b/nodejs-examples/test-online-zipformer2-ctc.js @@ -7,32 +7,18 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); function createOnlineRecognizer() { - let onlineTransducerModelConfig = { - encoder: '', - decoder: '', - joiner: '', - }; - - let onlineParaformerModelConfig = { - encoder: '', - decoder: '', - }; - let onlineZipformer2CtcModelConfig = { model: './sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx', }; let onlineModelConfig = { - transducer: onlineTransducerModelConfig, - paraformer: onlineParaformerModelConfig, zipformer2Ctc: onlineZipformer2CtcModelConfig, tokens: './sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt', numThreads: 1, provider: 'cpu', debug: 1, - modelType: '', }; let featureConfig = { @@ -49,12 +35,6 @@ function createOnlineRecognizer() { rule1MinTrailingSilence: 2.4, rule2MinTrailingSilence: 1.2, rule3MinUtteranceLength: 20, - hotwordsFile: '', - hotwordsScore: 1.5, - ctcFstDecoderConfig: { - graph: '', - maxActive: 3000, - } }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index 3341a093c..1a70ba952 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -39,6 +39,10 @@ function freeConfig(config, Module) { freeConfig(config.tdnn, Module) } + if ('senseVoice' in config) { + freeConfig(config.senseVoice, Module) + } + if ('lm' in config) { freeConfig(config.lm, Module) } @@ -52,9 +56,9 @@ function freeConfig(config, Module) { // The user should free the returned pointers function initSherpaOnnxOnlineTransducerModelConfig(config, Module) { - const encoderLen = Module.lengthBytesUTF8(config.encoder) + 1; - const decoderLen = 
Module.lengthBytesUTF8(config.decoder) + 1; - const joinerLen = Module.lengthBytesUTF8(config.joiner) + 1; + const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; + const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; + const joinerLen = Module.lengthBytesUTF8(config.joiner || '') + 1; const n = encoderLen + decoderLen + joinerLen; @@ -64,13 +68,13 @@ function initSherpaOnnxOnlineTransducerModelConfig(config, Module) { const ptr = Module._malloc(len); let offset = 0; - Module.stringToUTF8(config.encoder, buffer + offset, encoderLen); + Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen); offset += encoderLen; - Module.stringToUTF8(config.decoder, buffer + offset, decoderLen); + Module.stringToUTF8(config.decoder || '', buffer + offset, decoderLen); offset += decoderLen; - Module.stringToUTF8(config.joiner, buffer + offset, joinerLen); + Module.stringToUTF8(config.joiner || '', buffer + offset, joinerLen); offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); @@ -87,8 +91,8 @@ function initSherpaOnnxOnlineTransducerModelConfig(config, Module) { } function initSherpaOnnxOnlineParaformerModelConfig(config, Module) { - const encoderLen = Module.lengthBytesUTF8(config.encoder) + 1; - const decoderLen = Module.lengthBytesUTF8(config.decoder) + 1; + const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; + const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; const n = encoderLen + decoderLen; const buffer = Module._malloc(n); @@ -97,10 +101,10 @@ function initSherpaOnnxOnlineParaformerModelConfig(config, Module) { const ptr = Module._malloc(len); let offset = 0; - Module.stringToUTF8(config.encoder, buffer + offset, encoderLen); + Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen); offset += encoderLen; - Module.stringToUTF8(config.decoder, buffer + offset, decoderLen); + Module.stringToUTF8(config.decoder || '', buffer + offset, decoderLen); offset = 0; 
Module.setValue(ptr, buffer + offset, 'i8*'); @@ -114,13 +118,13 @@ function initSherpaOnnxOnlineParaformerModelConfig(config, Module) { } function initSherpaOnnxOnlineZipformer2CtcModelConfig(config, Module) { - const n = Module.lengthBytesUTF8(config.model) + 1; + const n = Module.lengthBytesUTF8(config.model || '') + 1; const buffer = Module._malloc(n); const len = 1 * 4; // 1 pointer const ptr = Module._malloc(len); - Module.stringToUTF8(config.model, buffer, n); + Module.stringToUTF8(config.model || '', buffer, n); Module.setValue(ptr, buffer, 'i8*'); @@ -130,10 +134,33 @@ function initSherpaOnnxOnlineZipformer2CtcModelConfig(config, Module) { } function initSherpaOnnxOnlineModelConfig(config, Module) { + if (!('transducer' in config)) { + config.transducer = { + encoder: '', + decoder: '', + joiner: '', + }; + } + + if (!('paraformer' in config)) { + config.paraformer = { + encoder: '', + decoder: '', + }; + } + + if (!('zipformer2Ctc' in config)) { + config.zipformer2Ctc = { + model: '', + }; + } + const transducer = initSherpaOnnxOnlineTransducerModelConfig(config.transducer, Module); + const paraformer = initSherpaOnnxOnlineParaformerModelConfig(config.paraformer, Module); + const ctc = initSherpaOnnxOnlineZipformer2CtcModelConfig( config.zipformer2Ctc, Module); @@ -150,9 +177,9 @@ function initSherpaOnnxOnlineModelConfig(config, Module) { Module._CopyHeap(ctc.ptr, ctc.len, ptr + offset); offset += ctc.len; - const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; - const providerLen = Module.lengthBytesUTF8(config.provider) + 1; - const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; + const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; + const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; + const modelTypeLen = Module.lengthBytesUTF8(config.modelType || '') + 1; const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || 
'') + 1; @@ -161,13 +188,13 @@ function initSherpaOnnxOnlineModelConfig(config, Module) { const buffer = Module._malloc(bufferLen); offset = 0; - Module.stringToUTF8(config.tokens, buffer, tokensLen); + Module.stringToUTF8(config.tokens || '', buffer, tokensLen); offset += tokensLen; - Module.stringToUTF8(config.provider, buffer + offset, providerLen); + Module.stringToUTF8(config.provider || 'cpu', buffer + offset, providerLen); offset += providerLen; - Module.stringToUTF8(config.modelType, buffer + offset, modelTypeLen); + Module.stringToUTF8(config.modelType || '', buffer + offset, modelTypeLen); offset += modelTypeLen; Module.stringToUTF8( @@ -181,13 +208,13 @@ function initSherpaOnnxOnlineModelConfig(config, Module) { Module.setValue(ptr + offset, buffer, 'i8*'); // tokens offset += 4; - Module.setValue(ptr + offset, config.numThreads, 'i32'); + Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider offset += 4; - Module.setValue(ptr + offset, config.debug, 'i32'); + Module.setValue(ptr + offset, config.debug || 0, 'i32'); offset += 4; Module.setValue( @@ -215,8 +242,8 @@ function initSherpaOnnxFeatureConfig(config, Module) { const len = 2 * 4; // 2 pointers const ptr = Module._malloc(len); - Module.setValue(ptr, config.sampleRate, 'i32'); - Module.setValue(ptr + 4, config.featureDim, 'i32'); + Module.setValue(ptr, config.sampleRate || 16000, 'i32'); + Module.setValue(ptr + 4, config.featureDim || 80, 'i32'); return {ptr: ptr, len: len}; } @@ -224,16 +251,30 @@ function initSherpaOnnxOnlineCtcFstDecoderConfig(config, Module) { const len = 2 * 4; const ptr = Module._malloc(len); - const graphLen = Module.lengthBytesUTF8(config.graph) + 1; + const graphLen = Module.lengthBytesUTF8(config.graph || '') + 1; const buffer = Module._malloc(graphLen); Module.stringToUTF8(config.graph, buffer, graphLen); Module.setValue(ptr, buffer, 'i8*'); - Module.setValue(ptr + 4, 
config.maxActive, 'i32'); + Module.setValue(ptr + 4, config.maxActive || 3000, 'i32'); return {ptr: ptr, len: len, buffer: buffer}; } function initSherpaOnnxOnlineRecognizerConfig(config, Module) { + if (!('featConfig' in config)) { + config.featConfig = { + sampleRate: 16000, + featureDim: 80, + }; + } + + if (!('ctcFstDecoderConfig' in config)) { + config.ctcFstDecoderConfig = { + graph: '', + maxActive: 3000, + }; + } + const feat = initSherpaOnnxFeatureConfig(config.featConfig, Module); const model = initSherpaOnnxOnlineModelConfig(config.modelConfig, Module); const ctcFstDecoder = initSherpaOnnxOnlineCtcFstDecoderConfig( @@ -249,8 +290,9 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { Module._CopyHeap(model.ptr, model.len, ptr + offset); offset += model.len; - const decodingMethodLen = Module.lengthBytesUTF8(config.decodingMethod) + 1; - const hotwordsFileLen = Module.lengthBytesUTF8(config.hotwordsFile) + 1; + const decodingMethodLen = + Module.lengthBytesUTF8(config.decodingMethod || 'greedy_search') + 1; + const hotwordsFileLen = Module.lengthBytesUTF8(config.hotwordsFile || '') + 1; const ruleFstsFileLen = Module.lengthBytesUTF8(config.ruleFsts || '') + 1; const ruleFarsFileLen = Module.lengthBytesUTF8(config.ruleFars || '') + 1; const bufferLen = @@ -258,10 +300,12 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { const buffer = Module._malloc(bufferLen); offset = 0; - Module.stringToUTF8(config.decodingMethod, buffer, decodingMethodLen); + Module.stringToUTF8( + config.decodingMethod || 'greedy_search', buffer, decodingMethodLen); offset += decodingMethodLen; - Module.stringToUTF8(config.hotwordsFile, buffer + offset, hotwordsFileLen); + Module.stringToUTF8( + config.hotwordsFile || '', buffer + offset, hotwordsFileLen); offset += hotwordsFileLen; Module.stringToUTF8(config.ruleFsts || '', buffer + offset, ruleFstsFileLen); @@ -274,25 +318,25 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { 
Module.setValue(ptr + offset, buffer, 'i8*'); // decoding method offset += 4; - Module.setValue(ptr + offset, config.maxActivePaths, 'i32'); + Module.setValue(ptr + offset, config.maxActivePaths || 4, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.enableEndpoint, 'i32'); + Module.setValue(ptr + offset, config.enableEndpoint || 0, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.rule1MinTrailingSilence, 'float'); + Module.setValue(ptr + offset, config.rule1MinTrailingSilence || 2.4, 'float'); offset += 4; - Module.setValue(ptr + offset, config.rule2MinTrailingSilence, 'float'); + Module.setValue(ptr + offset, config.rule2MinTrailingSilence || 1.2, 'float'); offset += 4; - Module.setValue(ptr + offset, config.rule3MinUtteranceLength, 'float'); + Module.setValue(ptr + offset, config.rule3MinUtteranceLength || 20, 'float'); offset += 4; Module.setValue(ptr + offset, buffer + decodingMethodLen, 'i8*'); offset += 4; - Module.setValue(ptr + offset, config.hotwordsScore, 'float'); + Module.setValue(ptr + offset, config.hotwordsScore || 1.5, 'float'); offset += 4; Module._CopyHeap(ctcFstDecoder.ptr, ctcFstDecoder.len, ptr + offset); @@ -313,7 +357,6 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { } } - function createOnlineRecognizer(Module, myConfig) { const onlineTransducerModelConfig = { encoder: '', @@ -395,9 +438,9 @@ function createOnlineRecognizer(Module, myConfig) { } function initSherpaOnnxOfflineTransducerModelConfig(config, Module) { - const encoderLen = Module.lengthBytesUTF8(config.encoder) + 1; - const decoderLen = Module.lengthBytesUTF8(config.decoder) + 1; - const joinerLen = Module.lengthBytesUTF8(config.joiner) + 1; + const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; + const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; + const joinerLen = Module.lengthBytesUTF8(config.joiner || '') + 1; const n = encoderLen + decoderLen + joinerLen; @@ -407,13 +450,13 @@ function 
initSherpaOnnxOfflineTransducerModelConfig(config, Module) { const ptr = Module._malloc(len); let offset = 0; - Module.stringToUTF8(config.encoder, buffer + offset, encoderLen); + Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen); offset += encoderLen; - Module.stringToUTF8(config.decoder, buffer + offset, decoderLen); + Module.stringToUTF8(config.decoder || '', buffer + offset, decoderLen); offset += decoderLen; - Module.stringToUTF8(config.joiner, buffer + offset, joinerLen); + Module.stringToUTF8(config.joiner || '', buffer + offset, joinerLen); offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); @@ -430,14 +473,14 @@ function initSherpaOnnxOfflineTransducerModelConfig(config, Module) { } function initSherpaOnnxOfflineParaformerModelConfig(config, Module) { - const n = Module.lengthBytesUTF8(config.model) + 1; + const n = Module.lengthBytesUTF8(config.model || '') + 1; const buffer = Module._malloc(n); const len = 1 * 4; // 1 pointer const ptr = Module._malloc(len); - Module.stringToUTF8(config.model, buffer, n); + Module.stringToUTF8(config.model || '', buffer, n); Module.setValue(ptr, buffer, 'i8*'); @@ -447,14 +490,14 @@ function initSherpaOnnxOfflineParaformerModelConfig(config, Module) { } function initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config, Module) { - const n = Module.lengthBytesUTF8(config.model) + 1; + const n = Module.lengthBytesUTF8(config.model || '') + 1; const buffer = Module._malloc(n); const len = 1 * 4; // 1 pointer const ptr = Module._malloc(len); - Module.stringToUTF8(config.model, buffer, n); + Module.stringToUTF8(config.model || '', buffer, n); Module.setValue(ptr, buffer, 'i8*'); @@ -464,10 +507,10 @@ function initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config, Module) { } function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { - const encoderLen = Module.lengthBytesUTF8(config.encoder) + 1; - const decoderLen = Module.lengthBytesUTF8(config.decoder) + 1; - const languageLen = 
Module.lengthBytesUTF8(config.language) + 1; - const taskLen = Module.lengthBytesUTF8(config.task) + 1; + const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; + const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; + const languageLen = Module.lengthBytesUTF8(config.language || '') + 1; + const taskLen = Module.lengthBytesUTF8(config.task || '') + 1; const n = encoderLen + decoderLen + languageLen + taskLen; const buffer = Module._malloc(n); @@ -476,16 +519,16 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { const ptr = Module._malloc(len); let offset = 0; - Module.stringToUTF8(config.encoder, buffer + offset, encoderLen); + Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen); offset += encoderLen; - Module.stringToUTF8(config.decoder, buffer + offset, decoderLen); + Module.stringToUTF8(config.decoder || '', buffer + offset, decoderLen); offset += decoderLen; - Module.stringToUTF8(config.language, buffer + offset, languageLen); + Module.stringToUTF8(config.language || '', buffer + offset, languageLen); offset += languageLen; - Module.stringToUTF8(config.task, buffer + offset, taskLen); + Module.stringToUTF8(config.task || '', buffer + offset, taskLen); offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); @@ -508,13 +551,13 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { } function initSherpaOnnxOfflineTdnnModelConfig(config, Module) { - const n = Module.lengthBytesUTF8(config.model) + 1; + const n = Module.lengthBytesUTF8(config.model || '') + 1; const buffer = Module._malloc(n); const len = 1 * 4; // 1 pointer const ptr = Module._malloc(len); - Module.stringToUTF8(config.model, buffer, n); + Module.stringToUTF8(config.model || '', buffer, n); Module.setValue(ptr, buffer, 'i8*'); @@ -523,16 +566,48 @@ function initSherpaOnnxOfflineTdnnModelConfig(config, Module) { } } +function initSherpaOnnxOfflineSenseVoiceModelConfig(config, Module) { + const modelLen = 
Module.lengthBytesUTF8(config.model || '') + 1; + const languageLen = Module.lengthBytesUTF8(config.language || '') + 1; + + // useItn is a integer with 4 bytes + const n = modelLen + languageLen; + const buffer = Module._malloc(n); + + const len = 3 * 4; // 2 pointers + 1 int + const ptr = Module._malloc(len); + + let offset = 0; + Module.stringToUTF8(config.model || '', buffer + offset, modelLen); + offset += modelLen; + + Module.stringToUTF8(config.language || '', buffer + offset, languageLen); + offset += languageLen; + + offset = 0; + Module.setValue(ptr, buffer + offset, 'i8*'); + offset += modelLen; + + Module.setValue(ptr + 4, buffer + offset, 'i8*'); + offset += languageLen; + + Module.setValue(ptr + 8, config.useInverseTextNormalization || 0, 'i32'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + function initSherpaOnnxOfflineLMConfig(config, Module) { - const n = Module.lengthBytesUTF8(config.model) + 1; + const n = Module.lengthBytesUTF8(config.model || '') + 1; const buffer = Module._malloc(n); const len = 2 * 4; const ptr = Module._malloc(len); - Module.stringToUTF8(config.model, buffer, n); + Module.stringToUTF8(config.model || '', buffer, n); Module.setValue(ptr, buffer, 'i8*'); - Module.setValue(ptr + 4, config.scale, 'float'); + Module.setValue(ptr + 4, config.scale || 1, 'float'); return { buffer: buffer, ptr: ptr, len: len, @@ -540,18 +615,70 @@ function initSherpaOnnxOfflineLMConfig(config, Module) { } function initSherpaOnnxOfflineModelConfig(config, Module) { + if (!('transducer' in config)) { + config.transducer = { + encoder: '', + decoder: '', + joiner: '', + }; + } + + if (!('paraformer' in config)) { + config.paraformer = { + model: '', + }; + } + + if (!('nemoCtc' in config)) { + config.nemoCtc = { + model: '', + }; + } + + if (!('whisper' in config)) { + config.whisper = { + encoder: '', + decoder: '', + language: '', + task: '', + tailPaddings: -1, + }; + } + + if (!('tdnn' in config)) { + config.tdnn = { + model: '', + 
}; + } + + if (!('senseVoice' in config)) { + config.senseVoice = { + model: '', + language: '', + useInverseTextNormalization: 0, + }; + } + const transducer = initSherpaOnnxOfflineTransducerModelConfig(config.transducer, Module); + const paraformer = initSherpaOnnxOfflineParaformerModelConfig(config.paraformer, Module); + const nemoCtc = initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config.nemoCtc, Module); + const whisper = initSherpaOnnxOfflineWhisperModelConfig(config.whisper, Module); + const tdnn = initSherpaOnnxOfflineTdnnModelConfig(config.tdnn, Module); + const senseVoice = + initSherpaOnnxOfflineSenseVoiceModelConfig(config.senseVoice, Module); + const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + - tdnn.len + 8 * 4; + tdnn.len + 8 * 4 + senseVoice.len; + const ptr = Module._malloc(len); let offset = 0; @@ -570,9 +697,10 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { Module._CopyHeap(tdnn.ptr, tdnn.len, ptr + offset); offset += tdnn.len; - const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; - const providerLen = Module.lengthBytesUTF8(config.provider) + 1; - const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; + + const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; + const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; + const modelTypeLen = Module.lengthBytesUTF8(config.modelType || '') + 1; const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; const teleSpeechCtcLen = @@ -580,16 +708,17 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const bufferLen = tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen + teleSpeechCtcLen; + const buffer = Module._malloc(bufferLen); offset = 0; Module.stringToUTF8(config.tokens, buffer, tokensLen); offset += tokensLen; - Module.stringToUTF8(config.provider, buffer + offset, providerLen); + 
Module.stringToUTF8(config.provider || 'cpu', buffer + offset, providerLen); offset += providerLen; - Module.stringToUTF8(config.modelType, buffer + offset, modelTypeLen); + Module.stringToUTF8(config.modelType || '', buffer + offset, modelTypeLen); offset += modelTypeLen; Module.stringToUTF8( @@ -608,10 +737,10 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { Module.setValue(ptr + offset, buffer, 'i8*'); // tokens offset += 4; - Module.setValue(ptr + offset, config.numThreads, 'i32'); + Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.debug, 'i32'); + Module.setValue(ptr + offset, config.debug || 0, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider @@ -639,13 +768,30 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { 'i8*'); // teleSpeechCtc offset += 4; + Module._CopyHeap(senseVoice.ptr, senseVoice.len, ptr + offset); + return { buffer: buffer, ptr: ptr, len: len, transducer: transducer, - paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn + paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn, + senseVoice: senseVoice, } } function initSherpaOnnxOfflineRecognizerConfig(config, Module) { + if (!('featConfig' in config)) { + config.featConfig = { + sampleRate: 16000, + featureDim: 80, + }; + } + + if (!('lmConfig' in config)) { + config.lmConfig = { + model: '', + scale: 1.0, + }; + } + const feat = initSherpaOnnxFeatureConfig(config.featConfig, Module); const model = initSherpaOnnxOfflineModelConfig(config.modelConfig, Module); const lm = initSherpaOnnxOfflineLMConfig(config.lmConfig, Module); @@ -663,8 +809,9 @@ function initSherpaOnnxOfflineRecognizerConfig(config, Module) { Module._CopyHeap(lm.ptr, lm.len, ptr + offset); offset += lm.len; - const decodingMethodLen = Module.lengthBytesUTF8(config.decodingMethod) + 1; - const hotwordsFileLen = Module.lengthBytesUTF8(config.hotwordsFile) + 1; + 
const decodingMethodLen = + Module.lengthBytesUTF8(config.decodingMethod || 'greedy_search') + 1; + const hotwordsFileLen = Module.lengthBytesUTF8(config.hotwordsFile || '') + 1; const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts || '') + 1; const ruleFarsLen = Module.lengthBytesUTF8(config.ruleFars || '') + 1; const bufferLen = @@ -672,10 +819,12 @@ function initSherpaOnnxOfflineRecognizerConfig(config, Module) { const buffer = Module._malloc(bufferLen); offset = 0; - Module.stringToUTF8(config.decodingMethod, buffer, decodingMethodLen); + Module.stringToUTF8( + config.decodingMethod || 'greedy_search', buffer, decodingMethodLen); offset += decodingMethodLen; - Module.stringToUTF8(config.hotwordsFile, buffer + offset, hotwordsFileLen); + Module.stringToUTF8( + config.hotwordsFile || '', buffer + offset, hotwordsFileLen); offset += hotwordsFileLen; Module.stringToUTF8(config.ruleFsts || '', buffer + offset, ruleFstsLen); @@ -689,13 +838,13 @@ function initSherpaOnnxOfflineRecognizerConfig(config, Module) { Module.setValue(ptr + offset, buffer, 'i8*'); // decoding method offset += 4; - Module.setValue(ptr + offset, config.maxActivePaths, 'i32'); + Module.setValue(ptr + offset, config.maxActivePaths || 4, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + decodingMethodLen, 'i8*'); offset += 4; - Module.setValue(ptr + offset, config.hotwordsScore, 'float'); + Module.setValue(ptr + offset, config.hotwordsScore || 1.5, 'float'); offset += 4; Module.setValue( diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc index 6e138c76f..b54cd0883 100644 --- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc +++ b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -16,6 +16,7 @@ static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, ""); 
static_assert(sizeof(SherpaOnnxOfflineTdnnModelConfig) == 4, ""); +static_assert(sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) == 3 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineLMConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineModelConfig) == @@ -23,7 +24,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == sizeof(SherpaOnnxOfflineParaformerModelConfig) + sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) + sizeof(SherpaOnnxOfflineWhisperModelConfig) + - sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4, + sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4 + + sizeof(SherpaOnnxOfflineSenseVoiceModelConfig), ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == @@ -63,6 +65,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { auto nemo_ctc = &model_config->nemo_ctc; auto whisper = &model_config->whisper; auto tdnn = &model_config->tdnn; + auto sense_voice = &model_config->sense_voice; fprintf(stdout, "----------offline transducer model config----------\n"); fprintf(stdout, "encoder: %s\n", transducer->encoder); @@ -85,6 +88,11 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { fprintf(stdout, "----------offline tdnn model config----------\n"); fprintf(stdout, "model: %s\n", tdnn->model); + fprintf(stdout, "----------offline sense_voice model config----------\n"); + fprintf(stdout, "model: %s\n", sense_voice->model); + fprintf(stdout, "language: %s\n", sense_voice->language); + fprintf(stdout, "use_itn: %d\n", sense_voice->use_itn); + fprintf(stdout, "tokens: %s\n", model_config->tokens); fprintf(stdout, "num_threads: %d\n", model_config->num_threads); fprintf(stdout, "provider: %s\n", model_config->provider); diff --git a/wasm/tts/sherpa-onnx-tts.js b/wasm/tts/sherpa-onnx-tts.js index 0b6d5e009..4d68b854f 100644 --- a/wasm/tts/sherpa-onnx-tts.js +++ b/wasm/tts/sherpa-onnx-tts.js @@ -14,14 +14,10 @@ function 
freeConfig(config, Module) { // The user should free the returned pointers function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { const modelLen = Module.lengthBytesUTF8(config.model) + 1; - const lexiconLen = Module.lengthBytesUTF8(config.lexicon) + 1; - const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; - const dataDirLen = Module.lengthBytesUTF8(config.dataDir) + 1; - - if (!('dictDir' in config)) { - config.dictDir = '' - } - const dictDirLen = Module.lengthBytesUTF8(config.dictDir) + 1; + const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; + const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; + const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; + const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1; const n = modelLen + lexiconLen + tokensLen + dataDirLen + dictDirLen; @@ -31,19 +27,19 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { const ptr = Module._malloc(len); let offset = 0; - Module.stringToUTF8(config.model, buffer + offset, modelLen); + Module.stringToUTF8(config.model || '', buffer + offset, modelLen); offset += modelLen; - Module.stringToUTF8(config.lexicon, buffer + offset, lexiconLen); + Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen); offset += lexiconLen; - Module.stringToUTF8(config.tokens, buffer + offset, tokensLen); + Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen); offset += tokensLen; - Module.stringToUTF8(config.dataDir, buffer + offset, dataDirLen); + Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); offset += dataDirLen; - Module.stringToUTF8(config.dictDir, buffer + offset, dictDirLen); + Module.stringToUTF8(config.dictDir || '', buffer + offset, dictDirLen); offset += dictDirLen; offset = 0; @@ -59,9 +55,9 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { Module.setValue(ptr + 12, buffer + offset, 'i8*'); offset += dataDirLen; - 
Module.setValue(ptr + 16, config.noiseScale, 'float'); - Module.setValue(ptr + 20, config.noiseScaleW, 'float'); - Module.setValue(ptr + 24, config.lengthScale, 'float'); + Module.setValue(ptr + 16, config.noiseScale || 0.667, 'float'); + Module.setValue(ptr + 20, config.noiseScaleW || 0.8, 'float'); + Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float'); Module.setValue(ptr + 28, buffer + offset, 'i8*'); offset += dictDirLen; @@ -81,13 +77,13 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { Module._CopyHeap(vitsModelConfig.ptr, vitsModelConfig.len, ptr + offset); offset += vitsModelConfig.len; - Module.setValue(ptr + offset, config.numThreads, 'i32'); + Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.debug, 'i32'); + Module.setValue(ptr + offset, config.debug || 0, 'i32'); offset += 4; - const providerLen = Module.lengthBytesUTF8(config.provider) + 1; + const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; const buffer = Module._malloc(providerLen); Module.stringToUTF8(config.provider, buffer, providerLen); Module.setValue(ptr + offset, buffer, 'i8*'); @@ -107,17 +103,17 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) { Module._CopyHeap(modelConfig.ptr, modelConfig.len, ptr + offset); offset += modelConfig.len; - const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts) + 1; - const ruleFarsLen = Module.lengthBytesUTF8(config.ruleFars) + 1; + const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts || '') + 1; + const ruleFarsLen = Module.lengthBytesUTF8(config.ruleFars || '') + 1; const buffer = Module._malloc(ruleFstsLen + ruleFarsLen); - Module.stringToUTF8(config.ruleFsts, buffer, ruleFstsLen); - Module.stringToUTF8(config.ruleFars, buffer + ruleFstsLen, ruleFarsLen); + Module.stringToUTF8(config.ruleFsts || '', buffer, ruleFstsLen); + Module.stringToUTF8(config.ruleFars || '', buffer + ruleFstsLen, ruleFarsLen); Module.setValue(ptr + 
offset, buffer, 'i8*'); offset += 4; - Module.setValue(ptr + offset, config.maxNumSentences, 'i32'); + Module.setValue(ptr + offset, config.maxNumSentences || 1, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*');