From fd5a0d1e00ffa11b2a3ded064fc6f9a0910dc845 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 5 Jun 2024 00:26:40 +0800 Subject: [PATCH] Add C++ runtime for Tele-AI/TeleSpeech-ASR (#970) --- .github/scripts/test-dot-net.sh | 19 +-- .github/scripts/test-offline-ctc.sh | 33 ++++ .github/scripts/test-python.sh | 12 ++ .../workflows/build-wheels-macos-arm64.yaml | 4 +- .../build-wheels-macos-universal2.yaml | 90 +++++++++++ .github/workflows/build-wheels-macos-x64.yaml | 4 +- .github/workflows/export-telespeech-ctc.yaml | 46 ++++++ .github/workflows/linux.yaml | 16 +- .github/workflows/macos.yaml | 14 +- .github/workflows/swift.yaml | 2 +- .github/workflows/test-go.yaml | 18 ++- .github/workflows/test-piper-phonemize.yaml | 2 +- .gitignore | 1 + CMakeLists.txt | 2 +- cmake/espeak-ng-for-piper.cmake | 4 +- cmake/kaldi-native-fbank.cmake | 16 +- .../offline-decode-files/Program.cs | 11 ++ .../run-telespeech-ctc.sh | 15 ++ .../non-streaming-decode-files/main.go | 3 + .../run-telespeech-ctc.sh | 19 +++ .../NonStreamingDecodeFileParaformer.java | 2 +- .../NonStreamingDecodeFileTeleSpeechCtc.java | 47 ++++++ ...n-streaming-decode-file-tele-speech-ctc.sh | 37 +++++ .../offline-telespeech-ctc-decode-files.py | 60 ++++++++ scripts/apk/generate-vad-asr-apk-script.py | 16 ++ scripts/dotnet/OfflineModelConfig.cs | 4 + scripts/dotnet/run.sh | 4 +- .../run-telespeech-ctc.sh | 1 + scripts/go/sherpa_onnx.go | 8 +- .../node-addon-api/src/non-streaming-asr.cc | 5 + sherpa-onnx/c-api/c-api.cc | 3 + sherpa-onnx/c-api/c-api.h | 1 + sherpa-onnx/csrc/CMakeLists.txt | 1 + sherpa-onnx/csrc/features.cc | 101 ++++++++---- sherpa-onnx/csrc/features.h | 11 +- sherpa-onnx/csrc/offline-ctc-model.cc | 17 +++ sherpa-onnx/csrc/offline-model-config.cc | 18 ++- sherpa-onnx/csrc/offline-model-config.h | 3 + .../csrc/offline-recognizer-ctc-impl.h | 11 ++ sherpa-onnx/csrc/offline-recognizer-impl.cc | 30 +++- sherpa-onnx/csrc/offline-stream.cc | 90 +++++++---- .../csrc/offline-telespeech-ctc-model.cc 
| 144 ++++++++++++++++++ .../csrc/offline-telespeech-ctc-model.h | 81 ++++++++++ sherpa-onnx/csrc/online-model-config.cc | 2 +- .../k2fsa/sherpa/onnx/OfflineModelConfig.java | 13 ++ sherpa-onnx/jni/offline-recognizer.cc | 6 + sherpa-onnx/kotlin-api/OfflineRecognizer.kt | 10 ++ .../python/csrc/offline-model-config.cc | 41 ++--- .../python/sherpa_onnx/offline_recognizer.py | 65 ++++++++ swift-api-examples/SherpaOnnx.swift | 8 +- wasm/asr/sherpa-onnx-asr.js | 19 ++- wasm/nodejs/sherpa-onnx-wasm-nodejs.cc | 3 +- 52 files changed, 1050 insertions(+), 143 deletions(-) create mode 100644 .github/workflows/build-wheels-macos-universal2.yaml create mode 100755 dotnet-examples/offline-decode-files/run-telespeech-ctc.sh create mode 100755 go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh create mode 100644 java-api-examples/NonStreamingDecodeFileTeleSpeechCtc.java create mode 100755 java-api-examples/run-non-streaming-decode-file-tele-speech-ctc.sh create mode 100755 python-api-examples/offline-telespeech-ctc-decode-files.py create mode 120000 scripts/go/_internal/non-streaming-decode-files/run-telespeech-ctc.sh create mode 100644 sherpa-onnx/csrc/offline-telespeech-ctc-model.cc create mode 100644 sherpa-onnx/csrc/offline-telespeech-ctc-model.h diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index 1843cdf42..6ae126037 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,16 @@ cd dotnet-examples/ -cd vad-non-streaming-asr-paraformer +cd ./offline-decode-files +./run-telespeech-ctc.sh +./run-nemo-ctc.sh +./run-paraformer.sh +./run-zipformer.sh +./run-hotwords.sh +./run-whisper.sh +./run-tdnn-yesno.sh + +cd ../vad-non-streaming-asr-paraformer ./run.sh cd ../offline-punctuation @@ -22,14 +31,6 @@ cd ../online-decode-files ./run-transducer.sh ./run-paraformer.sh -cd ../offline-decode-files -./run-nemo-ctc.sh -./run-paraformer.sh -./run-zipformer.sh -./run-hotwords.sh -./run-whisper.sh 
-./run-tdnn-yesno.sh - cd ../offline-tts ./run-aishell3.sh ./run-piper.sh diff --git a/.github/scripts/test-offline-ctc.sh b/.github/scripts/test-offline-ctc.sh index 4a5955464..7fc5e4501 100755 --- a/.github/scripts/test-offline-ctc.sh +++ b/.github/scripts/test-offline-ctc.sh @@ -15,6 +15,39 @@ echo "PATH: $PATH" which $EXE +log "test offline TeleSpeech CTC" +url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +name=$(basename $url) +repo=$(basename -s .tar.bz2 $name) + +curl -SL -O $url +tar xvf $name +rm $name +ls -lh $repo + +test_wavs=( +3-sichuan.wav +4-tianjin.wav +5-henan.wav +) +for w in ${test_wavs[@]}; do + time $EXE \ + --tokens=$repo/tokens.txt \ + --telespeech-ctc=$repo/model.int8.onnx \ + --debug=1 \ + $repo/test_wavs/$w +done + +time $EXE \ + --tokens=$repo/tokens.txt \ + --telespeech-ctc=$repo/model.int8.onnx \ + --debug=1 \ + $repo/test_wavs/3-sichuan.wav \ + $repo/test_wavs/4-tianjin.wav \ + $repo/test_wavs/5-henan.wav + +rm -rf $repo + log "-----------------------------------------------------------------" log "Run Nemo fast conformer hybrid transducer ctc models (CTC branch)" log "-----------------------------------------------------------------" diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh index e908f62b2..a52b5b910 100755 --- a/.github/scripts/test-python.sh +++ b/.github/scripts/test-python.sh @@ -10,6 +10,18 @@ log() { export GIT_CLONE_PROTECTION_ACTIVE=false +log "test offline TeleSpeech CTC" +url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +name=$(basename $url) +repo=$(basename -s .tar.bz2 $name) + +curl -SL -O $url +tar xvf $name +rm $name +ls -lh $repo +python3 ./python-api-examples/offline-telespeech-ctc-decode-files.py +rm -rf $repo + log "test online NeMo CTC" 
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms.tar.bz2 diff --git a/.github/workflows/build-wheels-macos-arm64.yaml b/.github/workflows/build-wheels-macos-arm64.yaml index 2cdea3f78..9a8edd504 100644 --- a/.github/workflows/build-wheels-macos-arm64.yaml +++ b/.github/workflows/build-wheels-macos-arm64.yaml @@ -82,7 +82,7 @@ jobs: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - python3 -m pip install --upgrade pip - python3 -m pip install wheel twine setuptools + python3 -m pip install --break-system-packages --upgrade pip + python3 -m pip install --break-system-packages wheel twine setuptools twine upload ./wheelhouse/*.whl diff --git a/.github/workflows/build-wheels-macos-universal2.yaml b/.github/workflows/build-wheels-macos-universal2.yaml new file mode 100644 index 000000000..4d52110ee --- /dev/null +++ b/.github/workflows/build-wheels-macos-universal2.yaml @@ -0,0 +1,90 @@ +name: build-wheels-macos-universal2 + +on: + push: + branches: + - wheel + tags: + - '*' + workflow_dispatch: + +env: + SHERPA_ONNX_IS_IN_GITHUB_ACTIONS: 1 + +concurrency: + group: build-wheels-macos-universal2-${{ github.ref }} + cancel-in-progress: true + +jobs: + build_wheels_macos_universal2: + name: ${{ matrix.python-version }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + python-version: ["cp38", "cp39", "cp310", "cp311", "cp312"] + + steps: + - uses: actions/checkout@v4 + + - name: Build wheels + uses: pypa/cibuildwheel@v2.15.0 + env: + CIBW_BUILD: "${{ matrix.python-version}}-* " + CIBW_ENVIRONMENT: SHERPA_ONNX_CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES='arm64;x86_64'" + CIBW_ARCHS: "universal2" + CIBW_BUILD_VERBOSITY: 3 + + # Don't repair macOS wheels + CIBW_REPAIR_WHEEL_COMMAND_MACOS: "" + + - name: Display wheels + shell: bash + run: | + ls -lh ./wheelhouse/ + + - uses: actions/upload-artifact@v4 + with: + name: 
wheel-${{ matrix.python-version }} + path: ./wheelhouse/*.whl + + - name: Publish to huggingface + if: matrix.python-version == 'cp38' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-wheels huggingface + cd huggingface + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../wheelhouse/*.whl . + + git status + git add . + git commit -m "add more wheels" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-wheels main + + - name: Publish wheels to PyPI + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python3 -m pip install --break-system-packages --upgrade pip + python3 -m pip install --break-system-packages wheel twine setuptools + + twine upload ./wheelhouse/*.whl diff --git a/.github/workflows/build-wheels-macos-x64.yaml b/.github/workflows/build-wheels-macos-x64.yaml index 13dc292dc..fbd7781b5 100644 --- a/.github/workflows/build-wheels-macos-x64.yaml +++ b/.github/workflows/build-wheels-macos-x64.yaml @@ -99,7 +99,7 @@ jobs: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - python3 -m pip install --upgrade pip - python3 -m pip install wheel twine setuptools + python3 -m pip install --break-system-packages --upgrade pip + python3 -m pip install --break-system-packages wheel twine setuptools twine upload ./wheelhouse/*.whl diff --git a/.github/workflows/export-telespeech-ctc.yaml b/.github/workflows/export-telespeech-ctc.yaml index 38b3ce68f..102c3884e 100644 --- a/.github/workflows/export-telespeech-ctc.yaml +++ 
b/.github/workflows/export-telespeech-ctc.yaml @@ -48,3 +48,49 @@ jobs: repo_name: k2-fsa/sherpa-onnx repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} tag: asr-models + + - name: Publish float32 model to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + src=scripts/tele-speech/sherpa-onnx-telespeech-ctc-zh-2024-06-04 + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-zh-2024-06-04 hf + cp -a $src/* hf/ + cd hf + git lfs track "*.pdf" + git lfs track "*.onnx" + git add . + git commit -m 'add model files' || true + git status + ls -lh + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-zh-2024-06-04 main || true + rm -rf hf + + - name: Publish int8 model to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + src=scripts/tele-speech/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + rm -rf hf + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 hf + cp -a $src/* hf/ + cd hf + git lfs track "*.pdf" + git lfs track "*.onnx" + git add . 
+ git commit -m 'add model files' || true + git status + ls -lh + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 main || true diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 92f2b02f0..861a2df0e 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -130,34 +130,34 @@ jobs: name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} path: install/* - - name: Test online transducer + - name: Test offline CTC shell: bash run: | du -h -d1 . export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx + export EXE=sherpa-onnx-offline - .github/scripts/test-online-transducer.sh + .github/scripts/test-offline-ctc.sh du -h -d1 . - - name: Test online transducer (C API) + - name: Test online transducer shell: bash run: | du -h -d1 . export PATH=$PWD/build/bin:$PATH - export EXE=decode-file-c-api + export EXE=sherpa-onnx .github/scripts/test-online-transducer.sh du -h -d1 . - - name: Test offline CTC + - name: Test online transducer (C API) shell: bash run: | du -h -d1 . export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx-offline + export EXE=decode-file-c-api - .github/scripts/test-offline-ctc.sh + .github/scripts/test-online-transducer.sh du -h -d1 . 
- name: Test spoken language identification (C++ API) diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index 95030e573..1c7fe2ee1 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -107,6 +107,14 @@ jobs: otool -L build/bin/sherpa-onnx otool -l build/bin/sherpa-onnx + - name: Test offline CTC + shell: bash + run: | + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline + + .github/scripts/test-offline-ctc.sh + - name: Test offline transducer shell: bash run: | @@ -192,13 +200,7 @@ jobs: .github/scripts/test-offline-whisper.sh - - name: Test offline CTC - shell: bash - run: | - export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx-offline - .github/scripts/test-offline-ctc.sh - name: Test online transducer shell: bash diff --git a/.github/workflows/swift.yaml b/.github/workflows/swift.yaml index 5d3a01252..6b4ef10df 100644 --- a/.github/workflows/swift.yaml +++ b/.github/workflows/swift.yaml @@ -39,7 +39,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-13] + os: [macos-latest, macos-14] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index fa88d794a..5724d9cb9 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -30,14 +30,12 @@ concurrency: jobs: test-go: - name: ${{ matrix.os }} ${{matrix.arch }} + name: ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - include: - - os: macos-latest - arch: amd64 + os: [macos-latest, macos-14] steps: - uses: actions/checkout@v4 @@ -47,7 +45,7 @@ jobs: - name: ccache uses: hendrikmuhs/ccache-action@v1.2 with: - key: ${{ matrix.os }}-${{ matrix.arch }} + key: ${{ matrix.os }}-go - uses: actions/setup-go@v5 with: @@ -109,8 +107,6 @@ jobs: go build ls -lh - git lfs install - echo "Test vits-ljs" ./run-vits-ljs.sh rm -rf vits-ljs @@ -144,7 +140,13 @@ jobs: go build ls -lh - git lfs install + echo "Test telespeech ctc" + 
./run-telespeech-ctc.sh + rm -rf sherpa-onnx-telespeech-ctc-* + + echo "Test transducer" + ./run-transducer.sh + rm -rf sherpa-onnx-zipformer-en-2023-06-26 echo "Test transducer" ./run-transducer.sh diff --git a/.github/workflows/test-piper-phonemize.yaml b/.github/workflows/test-piper-phonemize.yaml index dd00a4901..1edbae6d2 100644 --- a/.github/workflows/test-piper-phonemize.yaml +++ b/.github/workflows/test-piper-phonemize.yaml @@ -57,7 +57,7 @@ jobs: mkdir build cd build - cmake -DCMAKE_VERBOSE_MAKEFILE=ON -D SHERPA_ONNX_ENABLE_TESTS=ON -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} -D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} -DCMAKE_INSTALL_PREFIX=./install .. + cmake -DSHERPA_ONNX_ENABLE_EPSEAK_NG_EXE=ON -DBUILD_ESPEAK_NG_EXE=ON -DCMAKE_VERBOSE_MAKEFILE=ON -D SHERPA_ONNX_ENABLE_TESTS=ON -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} -D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} -DCMAKE_INSTALL_PREFIX=./install .. - name: Build shell: bash diff --git a/.gitignore b/.gitignore index 282da268f..1eb26e5c2 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,4 @@ node_modules package-lock.json sherpa-onnx-nemo-* sherpa-onnx-vits-* +sherpa-onnx-telespeech-ctc-* diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b5136510..584583ba1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "Minimum OS X deployment ve project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.9.26") +set(SHERPA_ONNX_VERSION "1.9.27") # Disable warning about # diff --git a/cmake/espeak-ng-for-piper.cmake b/cmake/espeak-ng-for-piper.cmake index 42b6ce75f..8601ab2d4 100644 --- a/cmake/espeak-ng-for-piper.cmake +++ b/cmake/espeak-ng-for-piper.cmake @@ -14,7 +14,9 @@ function(download_espeak_ng_for_piper) set(USE_SPEECHPLAYER OFF CACHE BOOL "" FORCE) set(EXTRA_cmn ON CACHE BOOL "" FORCE) set(EXTRA_ru ON CACHE BOOL "" FORCE) - set(BUILD_ESPEAK_NG_EXE OFF CACHE BOOL "" FORCE) + if (NOT SHERPA_ONNX_ENABLE_EPSEAK_NG_EXE) + 
set(BUILD_ESPEAK_NG_EXE OFF CACHE BOOL "" FORCE) + endif() # If you don't have access to the Internet, # please pre-download kaldi-decoder diff --git a/cmake/kaldi-native-fbank.cmake b/cmake/kaldi-native-fbank.cmake index ce76745ed..1d10a01e6 100644 --- a/cmake/kaldi-native-fbank.cmake +++ b/cmake/kaldi-native-fbank.cmake @@ -1,9 +1,9 @@ function(download_kaldi_native_fbank) include(FetchContent) - set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.1.tar.gz") - set(kaldi_native_fbank_URL2 "https://hub.nuaa.cf/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.1.tar.gz") - set(kaldi_native_fbank_HASH "SHA256=0cae8cbb9ea42916b214e088912f9e8f2f648f54756b305f93f552382f31f904") + set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.3.tar.gz") + set(kaldi_native_fbank_URL2 "https://hub.nuaa.cf/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.3.tar.gz") + set(kaldi_native_fbank_HASH "SHA256=335fe1daf1b9bfb2a7b6bf03b64c4c4686c39077c57fb8058c02611981676638") set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) # If you don't have access to the Internet, # please pre-download kaldi-native-fbank set(possible_file_locations - $ENV{HOME}/Downloads/kaldi-native-fbank-1.19.1.tar.gz - ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.19.1.tar.gz - ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.19.1.tar.gz - /tmp/kaldi-native-fbank-1.19.1.tar.gz - /star-fj/fangjun/download/github/kaldi-native-fbank-1.19.1.tar.gz + $ENV{HOME}/Downloads/kaldi-native-fbank-1.19.3.tar.gz + ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.19.3.tar.gz + ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.19.3.tar.gz + /tmp/kaldi-native-fbank-1.19.3.tar.gz + /star-fj/fangjun/download/github/kaldi-native-fbank-1.19.3.tar.gz ) foreach(f IN LISTS possible_file_locations) diff --git 
a/dotnet-examples/offline-decode-files/Program.cs b/dotnet-examples/offline-decode-files/Program.cs index f88c634d7..ea30a14e2 100644 --- a/dotnet-examples/offline-decode-files/Program.cs +++ b/dotnet-examples/offline-decode-files/Program.cs @@ -34,6 +34,9 @@ class Options [Option(Required = false, Default = "",HelpText = "Path to transducer joiner.onnx. Used only for transducer models")] public string Joiner { get; set; } + [Option("model-type", Required = false, Default = "", HelpText = "model type")] + public string ModelType { get; set; } + [Option("whisper-encoder", Required = false, Default = "", HelpText = "Path to whisper encoder.onnx. Used only for whisper models")] public string WhisperEncoder { get; set; } @@ -56,6 +59,9 @@ class Options [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")] public string NeMoCtc { get; set; } + [Option("telespeech-ctc", Required = false, HelpText = "Path to model.onnx. Used only for TeleSpeech CTC models")] + public string TeleSpeechCtc { get; set; } + [Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")] public int NumThreads { get; set; } @@ -201,6 +207,10 @@ private static void Run(Options options) { config.ModelConfig.NeMoCtc.Model = options.NeMoCtc; } + else if (!String.IsNullOrEmpty(options.TeleSpeechCtc)) + { + config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc; + } else if (!String.IsNullOrEmpty(options.WhisperEncoder)) { config.ModelConfig.Whisper.Encoder = options.WhisperEncoder; @@ -218,6 +228,7 @@ private static void Run(Options options) return; } + config.ModelConfig.ModelType = options.ModelType; config.DecodingMethod = options.DecodingMethod; config.MaxActivePaths = options.MaxActivePaths; config.HotwordsFile = options.HotwordsFile; diff --git a/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh b/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh new file mode 100755 index 
000000000..d678026d0 --- /dev/null +++ b/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -d sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +fi + +dotnet run \ + --telespeech-ctc=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ + --tokens=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ + --model-type=telespeech-ctc \ + --files ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav diff --git a/go-api-examples/non-streaming-decode-files/main.go b/go-api-examples/non-streaming-decode-files/main.go index d38c43c26..48ad35302 100644 --- a/go-api-examples/non-streaming-decode-files/main.go +++ b/go-api-examples/non-streaming-decode-files/main.go @@ -40,6 +40,9 @@ func main() { flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") flag.StringVar(&config.ModelConfig.ModelType, "model-type", "", "Optional. Used for loading the model in a faster way") flag.StringVar(&config.ModelConfig.Provider, "provider", "cpu", "Provider to use") + flag.StringVar(&config.ModelConfig.ModelingUnit, "modeling-unit", "cjkchar", "cjkchar, bpe, cjkchar+bpe, or leave it to empty") + flag.StringVar(&config.ModelConfig.BpeVocab, "bpe-vocab", "", "") + flag.StringVar(&config.ModelConfig.TeleSpeechCtc, "telespeech-ctc", "", "Used for TeleSpeechCtc model") flag.StringVar(&config.LmConfig.Model, "lm-model", "", "Optional. Path to the LM model") flag.Float32Var(&config.LmConfig.Scale, "lm-scale", 1.0, "Optional. 
Scale for the LM model") diff --git a/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh b/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh new file mode 100755 index 000000000..d9785b2aa --- /dev/null +++ b/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -d sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +fi + +go mod tidy +go build + +./non-streaming-decode-files \ + --telespeech-ctc ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ + --tokens ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ + --model-type telespeech-ctc \ + --debug 0 \ + ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav diff --git a/java-api-examples/NonStreamingDecodeFileParaformer.java b/java-api-examples/NonStreamingDecodeFileParaformer.java index 4a49c8d66..81121f6a1 100644 --- a/java-api-examples/NonStreamingDecodeFileParaformer.java +++ b/java-api-examples/NonStreamingDecodeFileParaformer.java @@ -4,7 +4,7 @@ // to decode files. 
import com.k2fsa.sherpa.onnx.*; -public class NonStreamingDecodeFileTransducer { +public class NonStreamingDecodeFileParaformer { public static void main(String[] args) { // please refer to // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english diff --git a/java-api-examples/NonStreamingDecodeFileTeleSpeechCtc.java b/java-api-examples/NonStreamingDecodeFileTeleSpeechCtc.java new file mode 100644 index 000000000..ccb2bde04 --- /dev/null +++ b/java-api-examples/NonStreamingDecodeFileTeleSpeechCtc.java @@ -0,0 +1,47 @@ +// Copyright 2024 Xiaomi Corporation + +// This file shows how to use an offline TeleSpeech CTC model +// to decode files. +import com.k2fsa.sherpa.onnx.*; + +public class NonStreamingDecodeFileTeleSpeechCtc { + public static void main(String[] args) { + // please refer to + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english + // to download model files + String model = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx"; + String tokens = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt"; + + String waveFilename = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav"; + + WaveReader reader = new WaveReader(waveFilename); + + OfflineModelConfig modelConfig = + OfflineModelConfig.builder() + .setTeleSpeech(model) + .setTokens(tokens) + .setNumThreads(1) + .setDebug(true) + .setModelType("telespeech_ctc") + .build(); + + OfflineRecognizerConfig config = + OfflineRecognizerConfig.builder() + .setOfflineModelConfig(modelConfig) + .setDecodingMethod("greedy_search") + .build(); + + OfflineRecognizer recognizer = new OfflineRecognizer(config); + OfflineStream stream = recognizer.createStream(); + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate()); + + recognizer.decode(stream); + 
+ String text = recognizer.getResult(stream).getText(); + + System.out.printf("filename:%s\nresult:%s\n", waveFilename, text); + + stream.release(); + recognizer.release(); + } +} diff --git a/java-api-examples/run-non-streaming-decode-file-tele-speech-ctc.sh b/java-api-examples/run-non-streaming-decode-file-tele-speech-ctc.sh new file mode 100755 index 000000000..075f1e2e3 --- /dev/null +++ b/java-api-examples/run-non-streaming-decode-file-tele-speech-ctc.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +if [ ! -f ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + ./NonStreamingDecodeFileTeleSpeechCtc.java diff --git a/python-api-examples/offline-telespeech-ctc-decode-files.py b/python-api-examples/offline-telespeech-ctc-decode-files.py new file mode 100755 index 000000000..17b7f4853 --- /dev/null +++ b/python-api-examples/offline-telespeech-ctc-decode-files.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +""" +This file shows how to use a non-streaming CTC model from +https://github.com/Tele-AI/TeleSpeech-ASR +to decode files. 
+ +Please download model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + + +""" + +from pathlib import Path + +import sherpa_onnx +import soundfile as sf + + +def create_recognizer(): + model = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx" + tokens = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt" + test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav" + # test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/4-tianjin.wav" + # test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/5-henan.wav" + + if not Path(model).is_file() or not Path(test_wav).is_file(): + raise ValueError( + """Please download model files from + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + """ + ) + return ( + sherpa_onnx.OfflineRecognizer.from_telespeech_ctc( + model=model, + tokens=tokens, + debug=True, + ), + test_wav, + ) + + +def main(): + recognizer, wave_filename = create_recognizer() + + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True) + audio = audio[:, 0] # only use the first channel + + # audio is a 1-D float32 numpy array normalized to the range [-1, 1] + # sample_rate does not need to be 16000 Hz + + stream = recognizer.create_stream() + stream.accept_waveform(sample_rate, audio) + recognizer.decode_stream(stream) + print(wave_filename) + print(stream.result) + + +if __name__ == "__main__": + main() diff --git a/scripts/apk/generate-vad-asr-apk-script.py b/scripts/apk/generate-vad-asr-apk-script.py index 009c565a7..ca38fa3fb 100755 --- a/scripts/apk/generate-vad-asr-apk-script.py +++ b/scripts/apk/generate-vad-asr-apk-script.py @@ -163,6 +163,22 @@ def get_models(): ls -lh + popd + """, + ), + Model( + model_name="sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04", + idx=11, + lang="zh", + short_name="telespeech", + cmd=""" + pushd $model_name + + rm -rfv test_wavs + rm test.py + + ls -lh + popd """, ), diff --git 
a/scripts/dotnet/OfflineModelConfig.cs b/scripts/dotnet/OfflineModelConfig.cs index 2dc2347c1..f5620944f 100644 --- a/scripts/dotnet/OfflineModelConfig.cs +++ b/scripts/dotnet/OfflineModelConfig.cs @@ -25,6 +25,7 @@ public OfflineModelConfig() ModelType = ""; ModelingUnit = "cjkchar"; BpeVocab = ""; + TeleSpeechCtc = ""; } public OfflineTransducerModelConfig Transducer; public OfflineParaformerModelConfig Paraformer; @@ -50,5 +51,8 @@ public OfflineModelConfig() [MarshalAs(UnmanagedType.LPStr)] public string BpeVocab; + + [MarshalAs(UnmanagedType.LPStr)] + public string TeleSpeechCtc; } } diff --git a/scripts/dotnet/run.sh b/scripts/dotnet/run.sh index 3ce1a0fb5..7aa3ae5e4 100755 --- a/scripts/dotnet/run.sh +++ b/scripts/dotnet/run.sh @@ -30,7 +30,7 @@ mkdir -p linux macos windows-x64 windows-x86 linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl linux_wheel=$src_dir/$linux_wheel_filename -macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_x86_64.whl +macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_universal2.whl macos_wheel=$src_dir/$macos_wheel_filename windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl @@ -61,7 +61,7 @@ if [ ! -f $src_dir/linux/libsherpa-onnx-core.so ]; then fi if [ ! 
-f $src_dir/macos/libsherpa-onnx-core.dylib ]; then - echo "---macOS x86_64---" + echo "--- macOS x86_64/arm64 universal2---" cd macos mkdir -p wheel cd wheel diff --git a/scripts/go/_internal/non-streaming-decode-files/run-telespeech-ctc.sh b/scripts/go/_internal/non-streaming-decode-files/run-telespeech-ctc.sh new file mode 120000 index 000000000..1625a0708 --- /dev/null +++ b/scripts/go/_internal/non-streaming-decode-files/run-telespeech-ctc.sh @@ -0,0 +1 @@ +../../../../go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh \ No newline at end of file diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index e89787da9..a03031866 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -381,8 +381,9 @@ type OfflineModelConfig struct { // Optional. Specify it for faster model initialization. ModelType string - ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe - BpeVocab string // Optional. + ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe + BpeVocab string // Optional. + TeleSpeechCtc string // Optional. } // Configuration for the offline/non-streaming recognizer. 
@@ -477,6 +478,9 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer { c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab) defer C.free(unsafe.Pointer(c.model_config.bpe_vocab)) + c.model_config.telespeech_ctc = C.CString(config.ModelConfig.TeleSpeechCtc) + defer C.free(unsafe.Pointer(c.model_config.telespeech_ctc)) + c.lm_config.model = C.CString(config.LmConfig.Model) defer C.free(unsafe.Pointer(c.lm_config.model)) diff --git a/scripts/node-addon-api/src/non-streaming-asr.cc b/scripts/node-addon-api/src/non-streaming-asr.cc index d101c7eb6..671528200 100644 --- a/scripts/node-addon-api/src/non-streaming-asr.cc +++ b/scripts/node-addon-api/src/non-streaming-asr.cc @@ -128,6 +128,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { SHERPA_ONNX_ASSIGN_ATTR_STR(model_type, modelType); SHERPA_ONNX_ASSIGN_ATTR_STR(modeling_unit, modelingUnit); SHERPA_ONNX_ASSIGN_ATTR_STR(bpe_vocab, bpeVocab); + SHERPA_ONNX_ASSIGN_ATTR_STR(telespeech_ctc, teleSpeechCtc); return c; } @@ -242,6 +243,10 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { delete[] c.model_config.bpe_vocab; } + if (c.model_config.telespeech_ctc) { + delete[] c.model_config.telespeech_ctc; + } + if (c.lm_config.model) { delete[] c.lm_config.model; } diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index eaf782b92..eb4e293d1 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -366,6 +366,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( recognizer_config.model_config.bpe_vocab = SHERPA_ONNX_OR(config->model_config.bpe_vocab, ""); + recognizer_config.model_config.telespeech_ctc = + SHERPA_ONNX_OR(config->model_config.telespeech_ctc, ""); + recognizer_config.lm_config.model = SHERPA_ONNX_OR(config->lm_config.model, ""); recognizer_config.lm_config.scale = diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index bd9b6a4d4..e75d1955f 100644 --- 
a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -395,6 +395,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { // - cjkchar+bpe const char *modeling_unit; const char *bpe_vocab; + const char *telespeech_ctc; } SherpaOnnxOfflineModelConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 232412338..6edb82402 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -39,6 +39,7 @@ set(sources offline-stream.cc offline-tdnn-ctc-model.cc offline-tdnn-model-config.cc + offline-telespeech-ctc-model.cc offline-transducer-greedy-search-decoder.cc offline-transducer-greedy-search-nemo-decoder.cc offline-transducer-model-config.cc diff --git a/sherpa-onnx/csrc/features.cc b/sherpa-onnx/csrc/features.cc index 2eab92ede..ed806f392 100644 --- a/sherpa-onnx/csrc/features.cc +++ b/sherpa-onnx/csrc/features.cc @@ -56,22 +56,11 @@ std::string FeatureExtractorConfig::ToString() const { class FeatureExtractor::Impl { public: explicit Impl(const FeatureExtractorConfig &config) : config_(config) { - opts_.frame_opts.dither = config.dither; - opts_.frame_opts.snip_edges = config.snip_edges; - opts_.frame_opts.samp_freq = config.sampling_rate; - opts_.frame_opts.frame_shift_ms = config.frame_shift_ms; - opts_.frame_opts.frame_length_ms = config.frame_length_ms; - opts_.frame_opts.remove_dc_offset = config.remove_dc_offset; - opts_.frame_opts.window_type = config.window_type; - - opts_.mel_opts.num_bins = config.feature_dim; - - opts_.mel_opts.high_freq = config.high_freq; - opts_.mel_opts.low_freq = config.low_freq; - - opts_.mel_opts.is_librosa = config.is_librosa; - - fbank_ = std::make_unique(opts_); + if (config_.is_mfcc) { + InitMfcc(); + } else { + InitFbank(); + } } void AcceptWaveform(int32_t sampling_rate, const float *waveform, int32_t n) { @@ -101,35 +90,48 @@ class FeatureExtractor::Impl { std::vector samples; 
resampler_->Resample(waveform, n, false, &samples); - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), - samples.size()); + if (fbank_) { + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + } else { + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + } return; } - if (sampling_rate != opts_.frame_opts.samp_freq) { + if (sampling_rate != config_.sampling_rate) { SHERPA_ONNX_LOGE( "Creating a resampler:\n" " in_sample_rate: %d\n" " output_sample_rate: %d\n", - sampling_rate, static_cast(opts_.frame_opts.samp_freq)); + sampling_rate, static_cast(config_.sampling_rate)); - float min_freq = - std::min(sampling_rate, opts_.frame_opts.samp_freq); + float min_freq = std::min(sampling_rate, config_.sampling_rate); float lowpass_cutoff = 0.99 * 0.5 * min_freq; int32_t lowpass_filter_width = 6; resampler_ = std::make_unique( - sampling_rate, opts_.frame_opts.samp_freq, lowpass_cutoff, + sampling_rate, config_.sampling_rate, lowpass_cutoff, lowpass_filter_width); std::vector samples; resampler_->Resample(waveform, n, false, &samples); - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), - samples.size()); + if (fbank_) { + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + } else { + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + } return; } - fbank_->AcceptWaveform(sampling_rate, waveform, n); + if (fbank_) { + fbank_->AcceptWaveform(sampling_rate, waveform, n); + } else { + mfcc_->AcceptWaveform(sampling_rate, waveform, n); + } } void InputFinished() const { @@ -179,11 +181,56 @@ class FeatureExtractor::Impl { return features; } - int32_t FeatureDim() const { return opts_.mel_opts.num_bins; } + int32_t FeatureDim() const { + return mfcc_ ? 
mfcc_opts_.num_ceps : opts_.mel_opts.num_bins; + } + + private: + void InitFbank() { + opts_.frame_opts.dither = config_.dither; + opts_.frame_opts.snip_edges = config_.snip_edges; + opts_.frame_opts.samp_freq = config_.sampling_rate; + opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms; + opts_.frame_opts.frame_length_ms = config_.frame_length_ms; + opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; + opts_.frame_opts.window_type = config_.window_type; + + opts_.mel_opts.num_bins = config_.feature_dim; + + opts_.mel_opts.high_freq = config_.high_freq; + opts_.mel_opts.low_freq = config_.low_freq; + + opts_.mel_opts.is_librosa = config_.is_librosa; + + fbank_ = std::make_unique(opts_); + } + void InitMfcc() { + mfcc_opts_.frame_opts.dither = config_.dither; + mfcc_opts_.frame_opts.snip_edges = config_.snip_edges; + mfcc_opts_.frame_opts.samp_freq = config_.sampling_rate; + mfcc_opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms; + mfcc_opts_.frame_opts.frame_length_ms = config_.frame_length_ms; + mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; + mfcc_opts_.frame_opts.window_type = config_.window_type; + + mfcc_opts_.mel_opts.num_bins = config_.feature_dim; + + mfcc_opts_.mel_opts.high_freq = config_.high_freq; + mfcc_opts_.mel_opts.low_freq = config_.low_freq; + + mfcc_opts_.mel_opts.is_librosa = config_.is_librosa; + + mfcc_opts_.num_ceps = config_.num_ceps; + mfcc_opts_.use_energy = config_.use_energy; + + mfcc_ = std::make_unique(mfcc_opts_); + } private: std::unique_ptr fbank_; + std::unique_ptr mfcc_; knf::FbankOptions opts_; + knf::MfccOptions mfcc_opts_; FeatureExtractorConfig config_; mutable std::mutex mutex_; std::unique_ptr resampler_; diff --git a/sherpa-onnx/csrc/features.h b/sherpa-onnx/csrc/features.h index c3bc02d5e..2e4596a57 100644 --- a/sherpa-onnx/csrc/features.h +++ b/sherpa-onnx/csrc/features.h @@ -18,7 +18,10 @@ struct FeatureExtractorConfig { // the sampling rate of the input waveform, we will do 
resampling inside. int32_t sampling_rate = 16000; - // Feature dimension + // num_mel_bins + // + // Note: for mfcc, this value is also for num_mel_bins. + // The actual feature dimension is actuall num_ceps int32_t feature_dim = 80; // minimal frequency for Mel-filterbank, in Hz @@ -69,6 +72,12 @@ struct FeatureExtractorConfig { // for details std::string nemo_normalize_type; + // for MFCC + int32_t num_ceps = 13; + bool use_energy = true; + + bool is_mfcc = false; + std::string ToString() const; void Register(ParseOptions *po); diff --git a/sherpa-onnx/csrc/offline-ctc-model.cc b/sherpa-onnx/csrc/offline-ctc-model.cc index ed18720a3..cfa8ab45c 100644 --- a/sherpa-onnx/csrc/offline-ctc-model.cc +++ b/sherpa-onnx/csrc/offline-ctc-model.cc @@ -12,6 +12,7 @@ #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h" #include "sherpa-onnx/csrc/offline-tdnn-ctc-model.h" +#include "sherpa-onnx/csrc/offline-telespeech-ctc-model.h" #include "sherpa-onnx/csrc/offline-wenet-ctc-model.h" #include "sherpa-onnx/csrc/offline-zipformer-ctc-model.h" #include "sherpa-onnx/csrc/onnx-utils.h" @@ -24,6 +25,7 @@ enum class ModelType { kTdnn, kZipformerCtc, kWenetCtc, + kTeleSpeechCtc, kUnknown, }; @@ -63,6 +65,9 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, "If you are using models from WeNet, please refer to\n" "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/" "run.sh\n" + "If you are using models from TeleSpeech, please refer to\n" + "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/tele-speech/" + "add-metadata.py" "\n" "for how to add metadta to model.onnx\n"); return ModelType::kUnknown; @@ -78,6 +83,8 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, return ModelType::kZipformerCtc; } else if (model_type.get() == std::string("wenet_ctc")) { return ModelType::kWenetCtc; + } else if (model_type.get() == std::string("telespeech_ctc")) { + return 
ModelType::kTeleSpeechCtc; } else { SHERPA_ONNX_LOGE("Unsupported model_type: %s", model_type.get()); return ModelType::kUnknown; @@ -97,6 +104,8 @@ std::unique_ptr OfflineCtcModel::Create( filename = config.zipformer_ctc.model; } else if (!config.wenet_ctc.model.empty()) { filename = config.wenet_ctc.model; + } else if (!config.telespeech_ctc.empty()) { + filename = config.telespeech_ctc; } else { SHERPA_ONNX_LOGE("Please specify a CTC model"); exit(-1); @@ -124,6 +133,9 @@ std::unique_ptr OfflineCtcModel::Create( case ModelType::kWenetCtc: return std::make_unique(config); break; + case ModelType::kTeleSpeechCtc: + return std::make_unique(config); + break; case ModelType::kUnknown: SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); return nullptr; @@ -147,6 +159,8 @@ std::unique_ptr OfflineCtcModel::Create( filename = config.zipformer_ctc.model; } else if (!config.wenet_ctc.model.empty()) { filename = config.wenet_ctc.model; + } else if (!config.telespeech_ctc.empty()) { + filename = config.telespeech_ctc; } else { SHERPA_ONNX_LOGE("Please specify a CTC model"); exit(-1); @@ -175,6 +189,9 @@ std::unique_ptr OfflineCtcModel::Create( case ModelType::kWenetCtc: return std::make_unique(mgr, config); break; + case ModelType::kTeleSpeechCtc: + return std::make_unique(mgr, config); + break; case ModelType::kUnknown: SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); return nullptr; diff --git a/sherpa-onnx/csrc/offline-model-config.cc b/sherpa-onnx/csrc/offline-model-config.cc index b85a0a9f2..24a5a2141 100644 --- a/sherpa-onnx/csrc/offline-model-config.cc +++ b/sherpa-onnx/csrc/offline-model-config.cc @@ -19,6 +19,9 @@ void OfflineModelConfig::Register(ParseOptions *po) { zipformer_ctc.Register(po); wenet_ctc.Register(po); + po->Register("telespeech-ctc", &telespeech_ctc, + "Path to model.onnx for telespeech ctc"); + po->Register("tokens", &tokens, "Path to tokens.txt"); po->Register("num-threads", &num_threads, @@ -33,7 +36,7 @@ void 
OfflineModelConfig::Register(ParseOptions *po) { po->Register("model-type", &model_type, "Specify it to reduce model initialization time. " "Valid values are: transducer, paraformer, nemo_ctc, whisper, " - "tdnn, zipformer2_ctc" + "tdnn, zipformer2_ctc, telespeech_ctc." "All other values lead to loading the model twice."); po->Register("modeling-unit", &modeling_unit, "The modeling unit of the model, commonly used units are bpe, " @@ -55,14 +58,14 @@ bool OfflineModelConfig::Validate() const { } if (!FileExists(tokens)) { - SHERPA_ONNX_LOGE("tokens: %s does not exist", tokens.c_str()); + SHERPA_ONNX_LOGE("tokens: '%s' does not exist", tokens.c_str()); return false; } if (!modeling_unit.empty() && (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) { if (!FileExists(bpe_vocab)) { - SHERPA_ONNX_LOGE("bpe_vocab: %s does not exist", bpe_vocab.c_str()); + SHERPA_ONNX_LOGE("bpe_vocab: '%s' does not exist", bpe_vocab.c_str()); return false; } } @@ -91,6 +94,14 @@ bool OfflineModelConfig::Validate() const { return wenet_ctc.Validate(); } + if (!telespeech_ctc.empty() && !FileExists(telespeech_ctc)) { + SHERPA_ONNX_LOGE("telespeech_ctc: '%s' does not exist", + telespeech_ctc.c_str()); + return false; + } else { + return true; + } + return transducer.Validate(); } @@ -105,6 +116,7 @@ std::string OfflineModelConfig::ToString() const { os << "tdnn=" << tdnn.ToString() << ", "; os << "zipformer_ctc=" << zipformer_ctc.ToString() << ", "; os << "wenet_ctc=" << wenet_ctc.ToString() << ", "; + os << "telespeech_ctc=\"" << telespeech_ctc << "\", "; os << "tokens=\"" << tokens << "\", "; os << "num_threads=" << num_threads << ", "; os << "debug=" << (debug ? 
"True" : "False") << ", "; diff --git a/sherpa-onnx/csrc/offline-model-config.h b/sherpa-onnx/csrc/offline-model-config.h index 93ea7fd0e..856a6f35d 100644 --- a/sherpa-onnx/csrc/offline-model-config.h +++ b/sherpa-onnx/csrc/offline-model-config.h @@ -24,6 +24,7 @@ struct OfflineModelConfig { OfflineTdnnModelConfig tdnn; OfflineZipformerCtcModelConfig zipformer_ctc; OfflineWenetCtcModelConfig wenet_ctc; + std::string telespeech_ctc; std::string tokens; int32_t num_threads = 2; @@ -52,6 +53,7 @@ struct OfflineModelConfig { const OfflineTdnnModelConfig &tdnn, const OfflineZipformerCtcModelConfig &zipformer_ctc, const OfflineWenetCtcModelConfig &wenet_ctc, + const std::string &telespeech_ctc, const std::string &tokens, int32_t num_threads, bool debug, const std::string &provider, const std::string &model_type, const std::string &modeling_unit, @@ -63,6 +65,7 @@ struct OfflineModelConfig { tdnn(tdnn), zipformer_ctc(zipformer_ctc), wenet_ctc(wenet_ctc), + telespeech_ctc(telespeech_ctc), tokens(tokens), num_threads(num_threads), debug(debug), diff --git a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h index 988a487b6..2c83dac28 100644 --- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h @@ -88,6 +88,17 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { #endif void Init() { + if (!config_.model_config.telespeech_ctc.empty()) { + config_.feat_config.snip_edges = true; + config_.feat_config.num_ceps = 40; + config_.feat_config.feature_dim = 40; + config_.feat_config.low_freq = 40; + config_.feat_config.high_freq = -200; + config_.feat_config.use_energy = false; + config_.feat_config.normalize_samples = false; + config_.feat_config.is_mfcc = true; + } + if (!config_.model_config.wenet_ctc.model.empty()) { // WeNet CTC models assume input samples are in the range // [-32768, 32767], so we set normalize_samples to false diff --git 
a/sherpa-onnx/csrc/offline-recognizer-impl.cc b/sherpa-onnx/csrc/offline-recognizer-impl.cc index c23acf126..656425778 100644 --- a/sherpa-onnx/csrc/offline-recognizer-impl.cc +++ b/sherpa-onnx/csrc/offline-recognizer-impl.cc @@ -29,7 +29,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( } else if (model_type == "paraformer") { return std::make_unique(config); } else if (model_type == "nemo_ctc" || model_type == "tdnn" || - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || + model_type == "telespeech_ctc") { return std::make_unique(config); } else if (model_type == "whisper") { return std::make_unique(config); @@ -53,6 +54,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( model_filename = config.model_config.paraformer.model; } else if (!config.model_config.nemo_ctc.model.empty()) { model_filename = config.model_config.nemo_ctc.model; + } else if (!config.model_config.telespeech_ctc.empty()) { + model_filename = config.model_config.telespeech_ctc; } else if (!config.model_config.tdnn.model.empty()) { model_filename = config.model_config.tdnn.model; } else if (!config.model_config.zipformer_ctc.model.empty()) { @@ -111,6 +114,10 @@ std::unique_ptr OfflineRecognizerImpl::Create( "\n " "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" "\n" + "(7) CTC models from TeleSpeech" + "\n " + "https://github.com/Tele-AI/TeleSpeech-ASR" + "\n" "\n"); exit(-1); } @@ -133,7 +140,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( if (model_type == "EncDecCTCModelBPE" || model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || + model_type == "telespeech_ctc") { return std::make_unique(config); } @@ -151,7 +159,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( " - Whisper models\n" " - Tdnn models\n" " - Zipformer CTC models\n" - " - 
WeNet CTC models\n", + " - WeNet CTC models\n" + " - TeleSpeech CTC models\n", model_type.c_str()); exit(-1); @@ -169,7 +178,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( } else if (model_type == "paraformer") { return std::make_unique(mgr, config); } else if (model_type == "nemo_ctc" || model_type == "tdnn" || - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || + model_type == "telespeech_ctc") { return std::make_unique(mgr, config); } else if (model_type == "whisper") { return std::make_unique(mgr, config); @@ -199,6 +209,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( model_filename = config.model_config.zipformer_ctc.model; } else if (!config.model_config.wenet_ctc.model.empty()) { model_filename = config.model_config.wenet_ctc.model; + } else if (!config.model_config.telespeech_ctc.empty()) { + model_filename = config.model_config.telespeech_ctc; } else if (!config.model_config.whisper.encoder.empty()) { model_filename = config.model_config.whisper.encoder; } else { @@ -251,6 +263,10 @@ std::unique_ptr OfflineRecognizerImpl::Create( "\n " "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" "\n" + "(7) CTC models from TeleSpeech" + "\n " + "https://github.com/Tele-AI/TeleSpeech-ASR" + "\n" "\n"); exit(-1); } @@ -273,7 +289,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( if (model_type == "EncDecCTCModelBPE" || model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || + model_type == "telespeech_ctc") { return std::make_unique(mgr, config); } @@ -291,7 +308,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( " - Whisper models\n" " - Tdnn models\n" " - Zipformer CTC models\n" - " - WeNet CTC models\n", + " - WeNet CTC models\n" + " - TeleSpeech CTC models\n", model_type.c_str()); exit(-1); diff --git 
a/sherpa-onnx/csrc/offline-stream.cc b/sherpa-onnx/csrc/offline-stream.cc index 206b36003..4321a62dd 100644 --- a/sherpa-onnx/csrc/offline-stream.cc +++ b/sherpa-onnx/csrc/offline-stream.cc @@ -57,22 +57,44 @@ class OfflineStream::Impl { explicit Impl(const FeatureExtractorConfig &config, ContextGraphPtr context_graph) : config_(config), context_graph_(context_graph) { - opts_.frame_opts.dither = config.dither; - opts_.frame_opts.snip_edges = config.snip_edges; - opts_.frame_opts.samp_freq = config.sampling_rate; - opts_.frame_opts.frame_shift_ms = config.frame_shift_ms; - opts_.frame_opts.frame_length_ms = config.frame_length_ms; - opts_.frame_opts.remove_dc_offset = config.remove_dc_offset; - opts_.frame_opts.window_type = config.window_type; + if (config.is_mfcc) { + mfcc_opts_.frame_opts.dither = config_.dither; + mfcc_opts_.frame_opts.snip_edges = config_.snip_edges; + mfcc_opts_.frame_opts.samp_freq = config_.sampling_rate; + mfcc_opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms; + mfcc_opts_.frame_opts.frame_length_ms = config_.frame_length_ms; + mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; + mfcc_opts_.frame_opts.window_type = config_.window_type; - opts_.mel_opts.num_bins = config.feature_dim; + mfcc_opts_.mel_opts.num_bins = config_.feature_dim; - opts_.mel_opts.high_freq = config.high_freq; - opts_.mel_opts.low_freq = config.low_freq; + mfcc_opts_.mel_opts.high_freq = config_.high_freq; + mfcc_opts_.mel_opts.low_freq = config_.low_freq; - opts_.mel_opts.is_librosa = config.is_librosa; + mfcc_opts_.mel_opts.is_librosa = config_.is_librosa; - fbank_ = std::make_unique(opts_); + mfcc_opts_.num_ceps = config_.num_ceps; + mfcc_opts_.use_energy = config_.use_energy; + + mfcc_ = std::make_unique(mfcc_opts_); + } else { + opts_.frame_opts.dither = config.dither; + opts_.frame_opts.snip_edges = config.snip_edges; + opts_.frame_opts.samp_freq = config.sampling_rate; + opts_.frame_opts.frame_shift_ms = config.frame_shift_ms; + 
opts_.frame_opts.frame_length_ms = config.frame_length_ms; + opts_.frame_opts.remove_dc_offset = config.remove_dc_offset; + opts_.frame_opts.window_type = config.window_type; + + opts_.mel_opts.num_bins = config.feature_dim; + + opts_.mel_opts.high_freq = config.high_freq; + opts_.mel_opts.low_freq = config.low_freq; + + opts_.mel_opts.is_librosa = config.is_librosa; + + fbank_ = std::make_unique(opts_); + } } explicit Impl(WhisperTag /*tag*/) { @@ -81,6 +103,7 @@ class OfflineStream::Impl { opts_.mel_opts.num_bins = 80; // not used whisper_fbank_ = std::make_unique(opts_.frame_opts); + config_.sampling_rate = opts_.frame_opts.samp_freq; } explicit Impl(CEDTag /*tag*/) { @@ -98,6 +121,8 @@ class OfflineStream::Impl { opts_.mel_opts.num_bins = 64; opts_.mel_opts.high_freq = 8000; + config_.sampling_rate = opts_.frame_opts.samp_freq; + fbank_ = std::make_unique(opts_); } @@ -115,52 +140,60 @@ class OfflineStream::Impl { void AcceptWaveformImpl(int32_t sampling_rate, const float *waveform, int32_t n) { - if (sampling_rate != opts_.frame_opts.samp_freq) { + if (sampling_rate != config_.sampling_rate) { SHERPA_ONNX_LOGE( "Creating a resampler:\n" " in_sample_rate: %d\n" " output_sample_rate: %d\n", - sampling_rate, static_cast(opts_.frame_opts.samp_freq)); + sampling_rate, static_cast(config_.sampling_rate)); - float min_freq = - std::min(sampling_rate, opts_.frame_opts.samp_freq); + float min_freq = std::min(sampling_rate, config_.sampling_rate); float lowpass_cutoff = 0.99 * 0.5 * min_freq; int32_t lowpass_filter_width = 6; auto resampler = std::make_unique( - sampling_rate, opts_.frame_opts.samp_freq, lowpass_cutoff, + sampling_rate, config_.sampling_rate, lowpass_cutoff, lowpass_filter_width); std::vector samples; resampler->Resample(waveform, n, true, &samples); if (fbank_) { - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), samples.size()); fbank_->InputFinished(); + } else if 
(mfcc_) { + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + mfcc_->InputFinished(); } else { - whisper_fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, - samples.data(), samples.size()); + whisper_fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); whisper_fbank_->InputFinished(); } return; - } // if (sampling_rate != opts_.frame_opts.samp_freq) + } // if (sampling_rate != config_.sampling_rate) if (fbank_) { fbank_->AcceptWaveform(sampling_rate, waveform, n); fbank_->InputFinished(); + } else if (mfcc_) { + mfcc_->AcceptWaveform(sampling_rate, waveform, n); + mfcc_->InputFinished(); } else { whisper_fbank_->AcceptWaveform(sampling_rate, waveform, n); whisper_fbank_->InputFinished(); } } - int32_t FeatureDim() const { return opts_.mel_opts.num_bins; } + int32_t FeatureDim() const { + return mfcc_ ? mfcc_opts_.num_ceps : opts_.mel_opts.num_bins; + } std::vector GetFrames() const { - int32_t n = - fbank_ ? fbank_->NumFramesReady() : whisper_fbank_->NumFramesReady(); - + int32_t n = fbank_ ? fbank_->NumFramesReady() + : mfcc_ ? mfcc_->NumFramesReady() + : whisper_fbank_->NumFramesReady(); assert(n > 0 && "Please first call AcceptWaveform()"); int32_t feature_dim = FeatureDim(); @@ -170,8 +203,9 @@ class OfflineStream::Impl { float *p = features.data(); for (int32_t i = 0; i != n; ++i) { - const float *f = - fbank_ ? fbank_->GetFrame(i) : whisper_fbank_->GetFrame(i); + const float *f = fbank_ ? fbank_->GetFrame(i) + : mfcc_ ? 
mfcc_->GetFrame(i) + : whisper_fbank_->GetFrame(i); std::copy(f, f + feature_dim, p); p += feature_dim; } @@ -222,8 +256,10 @@ class OfflineStream::Impl { private: FeatureExtractorConfig config_; std::unique_ptr fbank_; + std::unique_ptr mfcc_; std::unique_ptr whisper_fbank_; knf::FbankOptions opts_; + knf::MfccOptions mfcc_opts_; OfflineRecognitionResult r_; ContextGraphPtr context_graph_; }; diff --git a/sherpa-onnx/csrc/offline-telespeech-ctc-model.cc b/sherpa-onnx/csrc/offline-telespeech-ctc-model.cc new file mode 100644 index 000000000..68c0afbe8 --- /dev/null +++ b/sherpa-onnx/csrc/offline-telespeech-ctc-model.cc @@ -0,0 +1,144 @@ +// sherpa-onnx/csrc/offline-telespeech-ctc-model.cc +// +// Copyright (c) 2023-2024 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-telespeech-ctc-model.h" + +#include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/session.h" +#include "sherpa-onnx/csrc/text-utils.h" +#include "sherpa-onnx/csrc/transpose.h" + +namespace sherpa_onnx { + +class OfflineTeleSpeechCtcModel::Impl { + public: + explicit Impl(const OfflineModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto buf = ReadFile(config_.telespeech_ctc); + Init(buf.data(), buf.size()); + } + +#if __ANDROID_API__ >= 9 + Impl(AAssetManager *mgr, const OfflineModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto buf = ReadFile(mgr, config_.telespeech_ctc); + Init(buf.data(), buf.size()); + } +#endif + + std::vector Forward(Ort::Value features, + Ort::Value /*features_length*/) { + std::vector shape = + features.GetTensorTypeAndShapeInfo().GetShape(); + + if (static_cast(shape[0]) != 1) { + SHERPA_ONNX_LOGE("This model supports only batch size 1. 
Given %d", + static_cast(shape[0])); + } + + auto out = sess_->Run({}, input_names_ptr_.data(), &features, 1, + output_names_ptr_.data(), output_names_ptr_.size()); + + std::vector logits_shape = {1}; + Ort::Value logits_length = Ort::Value::CreateTensor( + allocator_, logits_shape.data(), logits_shape.size()); + + int64_t *dst = logits_length.GetTensorMutableData(); + dst[0] = out[0].GetTensorTypeAndShapeInfo().GetShape()[0]; + + // (T, B, C) -> (B, T, C) + Ort::Value logits = Transpose01(allocator_, &out[0]); + + std::vector ans; + ans.reserve(2); + ans.push_back(std::move(logits)); + ans.push_back(std::move(logits_length)); + + return ans; + } + + int32_t VocabSize() const { return vocab_size_; } + + int32_t SubsamplingFactor() const { return subsampling_factor_; } + + OrtAllocator *Allocator() const { return allocator_; } + + private: + void Init(void *model_data, size_t model_data_length) { + sess_ = std::make_unique(env_, model_data, model_data_length, + sess_opts_); + + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); + + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); + + // get meta data + Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); + if (config_.debug) { + std::ostringstream os; + PrintModelMetadata(os, meta_data); + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); + } + + { + auto shape = + sess_->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); + vocab_size_ = shape[2]; + } + } + + private: + OfflineModelConfig config_; + Ort::Env env_; + Ort::SessionOptions sess_opts_; + Ort::AllocatorWithDefaultOptions allocator_; + + std::unique_ptr sess_; + + std::vector input_names_; + std::vector input_names_ptr_; + + std::vector output_names_; + std::vector output_names_ptr_; + + int32_t vocab_size_ = 0; + int32_t subsampling_factor_ = 4; +}; + +OfflineTeleSpeechCtcModel::OfflineTeleSpeechCtcModel( + const OfflineModelConfig &config) + : impl_(std::make_unique(config)) {} + +#if __ANDROID_API__ >= 9 
+OfflineTeleSpeechCtcModel::OfflineTeleSpeechCtcModel( + AAssetManager *mgr, const OfflineModelConfig &config) + : impl_(std::make_unique(mgr, config)) {} +#endif + +OfflineTeleSpeechCtcModel::~OfflineTeleSpeechCtcModel() = default; + +std::vector OfflineTeleSpeechCtcModel::Forward( + Ort::Value features, Ort::Value features_length) { + return impl_->Forward(std::move(features), std::move(features_length)); +} + +int32_t OfflineTeleSpeechCtcModel::VocabSize() const { + return impl_->VocabSize(); +} +int32_t OfflineTeleSpeechCtcModel::SubsamplingFactor() const { + return impl_->SubsamplingFactor(); +} + +OrtAllocator *OfflineTeleSpeechCtcModel::Allocator() const { + return impl_->Allocator(); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-telespeech-ctc-model.h b/sherpa-onnx/csrc/offline-telespeech-ctc-model.h new file mode 100644 index 000000000..42ef300ff --- /dev/null +++ b/sherpa-onnx/csrc/offline-telespeech-ctc-model.h @@ -0,0 +1,81 @@ +// sherpa-onnx/csrc/offline-telespeech-ctc-model.h +// +// Copyright (c) 2024 Xiaomi Corporation +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_ +#include +#include +#include +#include + +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#include "onnxruntime_cxx_api.h" // NOLINT +#include "sherpa-onnx/csrc/offline-ctc-model.h" +#include "sherpa-onnx/csrc/offline-model-config.h" + +namespace sherpa_onnx { + +/** This class implements the CTC model from + * https://github.com/Tele-AI/TeleSpeech-ASR. 
+ * + * See + * https://github.com/lovemefan/telespeech-asr-python/blob/main/telespeechasr/onnx/onnx_infer.py + * and + * https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/tele-speech/test.py + */ +class OfflineTeleSpeechCtcModel : public OfflineCtcModel { + public: + explicit OfflineTeleSpeechCtcModel(const OfflineModelConfig &config); + +#if __ANDROID_API__ >= 9 + OfflineTeleSpeechCtcModel(AAssetManager *mgr, + const OfflineModelConfig &config); +#endif + + ~OfflineTeleSpeechCtcModel() override; + + /** Run the forward method of the model. + * + * @param features A tensor of shape (N, T, C). + * @param features_length A 1-D tensor of shape (N,) containing number of + * valid frames in `features` before padding. + * Its dtype is int64_t. + * + * @return Return a vector containing: + * - log_probs: A 3-D tensor of shape (N, T', vocab_size). + * - log_probs_length A 1-D tensor of shape (N,). Its dtype is int64_t + */ + std::vector Forward(Ort::Value features, + Ort::Value features_length) override; + + /** Return the vocabulary size of the model + */ + int32_t VocabSize() const override; + + /** SubsamplingFactor of the model + */ + int32_t SubsamplingFactor() const override; + + /** Return an allocator for allocating memory + */ + OrtAllocator *Allocator() const override; + + // TeleSpeech CTC models do not support batch size > 1 + bool SupportBatchProcessing() const override { return false; } + + std::string FeatureNormalizationMethod() const override { + return "per_feature"; + } + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_ diff --git a/sherpa-onnx/csrc/online-model-config.cc b/sherpa-onnx/csrc/online-model-config.cc index 5ea24babe..a8efa870d 100644 --- a/sherpa-onnx/csrc/online-model-config.cc +++ b/sherpa-onnx/csrc/online-model-config.cc @@ -66,7 +66,7 @@ bool OnlineModelConfig::Validate() const { if (!modeling_unit.empty() && (modeling_unit == "bpe" 
|| modeling_unit == "cjkchar+bpe")) { if (!FileExists(bpe_vocab)) { - SHERPA_ONNX_LOGE("bpe_vocab: %s does not exist", bpe_vocab.c_str()); + SHERPA_ONNX_LOGE("bpe_vocab: '%s' does not exist", bpe_vocab.c_str()); return false; } } diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java index c7eba0237..6b44a84ba 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java @@ -7,6 +7,7 @@ public class OfflineModelConfig { private final OfflineParaformerModelConfig paraformer; private final OfflineWhisperModelConfig whisper; private final OfflineNemoEncDecCtcModelConfig nemo; + private final String teleSpeech; private final String tokens; private final int numThreads; private final boolean debug; @@ -21,6 +22,7 @@ private OfflineModelConfig(Builder builder) { this.paraformer = builder.paraformer; this.whisper = builder.whisper; this.nemo = builder.nemo; + this.teleSpeech = builder.teleSpeech; this.tokens = builder.tokens; this.numThreads = builder.numThreads; this.debug = builder.debug; @@ -74,11 +76,16 @@ public String getBpeVocab() { return bpeVocab; } + public String getTeleSpeech() { + return teleSpeech; + } + public static class Builder { private OfflineParaformerModelConfig paraformer = OfflineParaformerModelConfig.builder().build(); private OfflineTransducerModelConfig transducer = OfflineTransducerModelConfig.builder().build(); private OfflineWhisperModelConfig whisper = OfflineWhisperModelConfig.builder().build(); private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build(); + private String teleSpeech = ""; private String tokens = ""; private int numThreads = 1; private boolean debug = true; @@ -106,6 +113,12 @@ public Builder setNemo(OfflineNemoEncDecCtcModelConfig nemo) { return this; } + + public Builder 
setTeleSpeech(String teleSpeech) { + this.teleSpeech = teleSpeech; + return this; + } + public Builder setWhisper(OfflineWhisperModelConfig whisper) { this.whisper = whisper; return this; diff --git a/sherpa-onnx/jni/offline-recognizer.cc b/sherpa-onnx/jni/offline-recognizer.cc index c067451a4..cf69389a3 100644 --- a/sherpa-onnx/jni/offline-recognizer.cc +++ b/sherpa-onnx/jni/offline-recognizer.cc @@ -172,6 +172,12 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) { ans.model_config.nemo_ctc.model = p; env->ReleaseStringUTFChars(s, p); + fid = env->GetFieldID(model_config_cls, "teleSpeech", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(model_config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.model_config.telespeech_ctc = p; + env->ReleaseStringUTFChars(s, p); + return ans; } diff --git a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt index e7f72884e..151ac73d5 100644 --- a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt +++ b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt @@ -35,6 +35,7 @@ data class OfflineModelConfig( var paraformer: OfflineParaformerModelConfig = OfflineParaformerModelConfig(), var whisper: OfflineWhisperModelConfig = OfflineWhisperModelConfig(), var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(), + var teleSpeech: String = "", var numThreads: Int = 1, var debug: Boolean = false, var provider: String = "cpu", @@ -272,6 +273,15 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? 
{ tokens = "$modelDir/tokens.txt", ) } + + 11 -> { + val modelDir = "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04" + return OfflineModelConfig( + teleSpeech = "$modelDir/model.int8.onnx", + tokens = "$modelDir/tokens.txt", + modelType = "tele_speech", + ) + } } return null } diff --git a/sherpa-onnx/python/csrc/offline-model-config.cc b/sherpa-onnx/python/csrc/offline-model-config.cc index 3fc3b34c2..a72c182ea 100644 --- a/sherpa-onnx/python/csrc/offline-model-config.cc +++ b/sherpa-onnx/python/csrc/offline-model-config.cc @@ -29,25 +29,27 @@ void PybindOfflineModelConfig(py::module *m) { using PyClass = OfflineModelConfig; py::class_(*m, "OfflineModelConfig") - .def(py::init(), - py::arg("transducer") = OfflineTransducerModelConfig(), - py::arg("paraformer") = OfflineParaformerModelConfig(), - py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(), - py::arg("whisper") = OfflineWhisperModelConfig(), - py::arg("tdnn") = OfflineTdnnModelConfig(), - py::arg("zipformer_ctc") = OfflineZipformerCtcModelConfig(), - py::arg("wenet_ctc") = OfflineWenetCtcModelConfig(), - py::arg("tokens"), py::arg("num_threads"), py::arg("debug") = false, - py::arg("provider") = "cpu", py::arg("model_type") = "", - py::arg("modeling_unit") = "cjkchar", py::arg("bpe_vocab") = "") + .def( + py::init< + const OfflineTransducerModelConfig &, + const OfflineParaformerModelConfig &, + const OfflineNemoEncDecCtcModelConfig &, + const OfflineWhisperModelConfig &, const OfflineTdnnModelConfig &, + const OfflineZipformerCtcModelConfig &, + const OfflineWenetCtcModelConfig &, const std::string &, + const std::string &, int32_t, bool, const std::string &, + const std::string &, const std::string &, const std::string &>(), + py::arg("transducer") = OfflineTransducerModelConfig(), + py::arg("paraformer") = OfflineParaformerModelConfig(), + py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(), + py::arg("whisper") = OfflineWhisperModelConfig(), + py::arg("tdnn") = OfflineTdnnModelConfig(), + 
py::arg("zipformer_ctc") = OfflineZipformerCtcModelConfig(), + py::arg("wenet_ctc") = OfflineWenetCtcModelConfig(), + py::arg("telespeech_ctc") = "", py::arg("tokens"), + py::arg("num_threads"), py::arg("debug") = false, + py::arg("provider") = "cpu", py::arg("model_type") = "", + py::arg("modeling_unit") = "cjkchar", py::arg("bpe_vocab") = "") .def_readwrite("transducer", &PyClass::transducer) .def_readwrite("paraformer", &PyClass::paraformer) .def_readwrite("nemo_ctc", &PyClass::nemo_ctc) @@ -55,6 +57,7 @@ void PybindOfflineModelConfig(py::module *m) { .def_readwrite("tdnn", &PyClass::tdnn) .def_readwrite("zipformer_ctc", &PyClass::zipformer_ctc) .def_readwrite("wenet_ctc", &PyClass::wenet_ctc) + .def_readwrite("telespeech_ctc", &PyClass::telespeech_ctc) .def_readwrite("tokens", &PyClass::tokens) .def_readwrite("num_threads", &PyClass::num_threads) .def_readwrite("debug", &PyClass::debug) diff --git a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py index 87c5132d2..480ea23ce 100644 --- a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py +++ b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py @@ -211,6 +211,71 @@ def from_paraformer( self.config = recognizer_config return self + @classmethod + def from_telespeech_ctc( + cls, + model: str, + tokens: str, + num_threads: int = 1, + sample_rate: int = 16000, + feature_dim: int = 40, + decoding_method: str = "greedy_search", + debug: bool = False, + provider: str = "cpu", + ): + """ + Please refer to + ``_ + to download pre-trained models. + + Args: + model: + Path to ``model.onnx``. + tokens: + Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two + columns:: + + symbol integer_id + + num_threads: + Number of threads for neural network computation. + sample_rate: + Sample rate of the training data used to train the model. It is + ignored; the C++ runtime uses its own hard-coded sample rate. + feature_dim: + Dimension of the feature used to train the model.
It is ignored + and is hard-coded in C++ to 40. + decoding_method: + Valid values are greedy_search. + debug: + True to show debug messages. + provider: + onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + """ + self = cls.__new__(cls) + model_config = OfflineModelConfig( + telespeech_ctc=model, + tokens=tokens, + num_threads=num_threads, + debug=debug, + provider=provider, + model_type="nemo_ctc", + ) + + feat_config = FeatureExtractorConfig( + sampling_rate=sample_rate, + feature_dim=feature_dim, + ) + + recognizer_config = OfflineRecognizerConfig( + feat_config=feat_config, + model_config=model_config, + decoding_method=decoding_method, + ) + self.recognizer = _Recognizer(recognizer_config) + self.config = recognizer_config + return self + @classmethod def from_nemo_ctc( cls, diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index 0c8d22f3d..7346ac4b8 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -102,7 +102,7 @@ func sherpaOnnxOnlineModelConfig( debug: Int32(debug), model_type: toCPointer(modelType), modeling_unit: toCPointer(modelingUnit), - bpeVocab: toCPointer(bpeVocab) + bpe_vocab: toCPointer(bpeVocab) ) } @@ -360,7 +360,8 @@ func sherpaOnnxOfflineModelConfig( debug: Int = 0, modelType: String = "", modelingUnit: String = "cjkchar", - bpeVocab: String = "" + bpeVocab: String = "", + teleSpeechCtc: String = "" ) -> SherpaOnnxOfflineModelConfig { return SherpaOnnxOfflineModelConfig( transducer: transducer, @@ -374,7 +375,8 @@ func sherpaOnnxOfflineModelConfig( provider: toCPointer(provider), model_type: toCPointer(modelType), modeling_unit: toCPointer(modelingUnit), - bpeVocab: toCPointer(bpeVocab) + bpe_vocab: toCPointer(bpeVocab), + telespeech_ctc: toCPointer(teleSpeechCtc) ) } diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index c77794a68..53afe1875 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ 
-529,7 +529,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const tdnn = initSherpaOnnxOfflineTdnnModelConfig(config.tdnn, Module); const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + - tdnn.len + 7 * 4; + tdnn.len + 8 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -553,9 +553,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; + const teleSpeechCtcLen = + Module.lengthBytesUTF8(config.teleSpeechCtc || '') + 1; - const bufferLen = - tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen; + const bufferLen = tokensLen + providerLen + modelTypeLen + modelingUnitLen + + bpeVocabLen + teleSpeechCtcLen; const buffer = Module._malloc(bufferLen); offset = 0; @@ -575,6 +577,10 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen); offset += bpeVocabLen; + Module.stringToUTF8( + config.teleSpeechCtc || '', buffer + offset, teleSpeechCtcLen); + offset += teleSpeechCtcLen; + offset = transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len; Module.setValue(ptr + offset, buffer, 'i8*'); // tokens @@ -604,6 +610,13 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { 'i8*'); // bpeVocab offset += 4; + Module.setValue( + ptr + offset, + buffer + tokensLen + providerLen + modelTypeLen + modelingUnitLen + + bpeVocabLen, + 'i8*'); // teleSpeechCtc + offset += 4; + return { buffer: buffer, ptr: ptr, len: len, transducer: transducer, paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc index ceb5a2442..177fb6f04 100644 --- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc +++ 
b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -23,7 +23,7 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == sizeof(SherpaOnnxOfflineParaformerModelConfig) + sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) + sizeof(SherpaOnnxOfflineWhisperModelConfig) + - sizeof(SherpaOnnxOfflineTdnnModelConfig) + 7 * 4, + sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4, ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == @@ -92,6 +92,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { fprintf(stdout, "model type: %s\n", model_config->model_type); fprintf(stdout, "modeling unit: %s\n", model_config->modeling_unit); fprintf(stdout, "bpe vocab: %s\n", model_config->bpe_vocab); + fprintf(stdout, "telespeech_ctc: %s\n", model_config->telespeech_ctc); fprintf(stdout, "----------feat config----------\n"); fprintf(stdout, "sample rate: %d\n", feat->sample_rate);