diff --git a/.github/workflows/wasm-simd-hf-space-de-tts.yaml b/.github/workflows/wasm-simd-hf-space-de-tts.yaml index 2c1d978aa..f51535379 100644 --- a/.github/workflows/wasm-simd-hf-space-de-tts.yaml +++ b/.github/workflows/wasm-simd-hf-space-de-tts.yaml @@ -1,9 +1,11 @@ name: wasm-simd-hf-space-de-tts on: - release: - types: - - published + push: + branches: + - wasm + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: @@ -71,6 +73,14 @@ jobs: name: sherpa-onnx-wasm-simd-de-tts path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + - name: Release + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: ./*.tar.bz2 + - name: Publish to ModelScope # if: false env: diff --git a/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml b/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml index e123e65f1..975266917 100644 --- a/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml @@ -1,9 +1,11 @@ name: wasm-simd-hf-space-en-asr-zipformer on: - release: - types: - - published + push: + branches: + - wasm + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: @@ -73,6 +75,14 @@ jobs: name: sherpa-onnx-wasm-simd-en-asr-zipformer path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + - name: Release + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: ./*.tar.bz2 + - name: Publish to ModelScope # if: false env: diff --git a/.github/workflows/wasm-simd-hf-space-en-tts.yaml b/.github/workflows/wasm-simd-hf-space-en-tts.yaml index 09d42911b..f5f950c3c 100644 --- a/.github/workflows/wasm-simd-hf-space-en-tts.yaml +++ 
b/.github/workflows/wasm-simd-hf-space-en-tts.yaml @@ -1,9 +1,11 @@ name: wasm-simd-hf-space-en-tts on: - release: - types: - - published + push: + branches: + - wasm + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: @@ -69,6 +71,14 @@ jobs: name: sherpa-onnx-wasm-simd-en-tts path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + - name: Release + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: ./*.tar.bz2 + - name: Publish to ModelScope # if: false env: diff --git a/.github/workflows/wasm-simd-hf-space-silero-vad.yaml b/.github/workflows/wasm-simd-hf-space-silero-vad.yaml new file mode 100644 index 000000000..e384af3fb --- /dev/null +++ b/.github/workflows/wasm-simd-hf-space-silero-vad.yaml @@ -0,0 +1,152 @@ +name: wasm-simd-hf-space-silero-vad + +on: + push: + branches: + - wasm + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' + + workflow_dispatch: + +concurrency: + group: wasm-simd-hf-space-silero-vad-${{ github.ref }} + cancel-in-progress: true + +jobs: + wasm-simd-hf-space-silero-vad: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install emsdk + uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' + + - name: View emsdk version + shell: bash + run: | + emcc -v + echo "--------------------" + emcc --check + + - name: Download model files + shell: bash + run: | + cd wasm/vad/assets + ls -lh + echo "----------" + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + ls -lh + + - name: Build sherpa-onnx for WebAssembly + shell: bash + run: | + ./build-wasm-simd-vad.sh + + - name: collect files + shell: bash + run: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" 
./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-vad + mv build-wasm-simd-vad/install/bin/wasm/vad $dst + ls -lh $dst + tar cjfv $dst.tar.bz2 ./$dst + + - name: Upload wasm files + uses: actions/upload-artifact@v4 + with: + name: sherpa-onnx-wasm-simd-vad + path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + + - name: Release + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: ./*.tar.bz2 + + - name: Publish to ModelScope + # if: false + env: + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf ms + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx.git ms + cd ms + rm -fv *.js + rm -fv *.data + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-vad/* . + + git status + git lfs track "*.data" + git lfs track "*.wasm" + ls -lh + + git add . 
+ git commit -m "update model" + git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx.git + + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx huggingface + cd huggingface + rm -fv *.js + rm -fv *.data + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-vad/* . + + git status + git lfs track "*.data" + git lfs track "*.wasm" + ls -lh + + git add . + git commit -m "update model" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx main diff --git a/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml index 1c7625655..e0c665737 100644 --- a/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml @@ -1,9 +1,11 @@ name: wasm-simd-hf-space-zh-cantonese-en-asr-paraformer on: - release: - types: - - published + push: + branches: + - wasm + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: @@ -80,6 +82,14 @@ jobs: name: sherpa-onnx-wasm-simd-zh-cantonese-en-asr-paraformer path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + - name: Release + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + uses: 
svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: ./*.tar.bz2 + - name: Publish to huggingface env: HF_TOKEN: ${{ secrets.HF_TOKEN }} diff --git a/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml index a67385b12..500305420 100644 --- a/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml @@ -1,9 +1,11 @@ name: wasm-simd-hf-space-zh-en-asr-paraformer on: - release: - types: - - published + push: + branches: + - wasm + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: @@ -80,6 +82,14 @@ jobs: name: sherpa-onnx-wasm-simd-zh-en-asr-paraformer path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + - name: Release + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: ./*.tar.bz2 + - name: Publish to ModelScope # if: false env: diff --git a/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml index 4dea90f49..dfa0e1614 100644 --- a/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml @@ -1,9 +1,11 @@ name: wasm-simd-hf-space-zh-en-asr-zipformer on: - release: - types: - - published + push: + branches: + - wasm + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: @@ -71,6 +73,14 @@ jobs: name: sherpa-onnx-wasm-simd-zh-en-asr-zipformer path: ./sherpa-onnx-wasm-simd-*.tar.bz2 + - name: Release + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: 
./*.tar.bz2 + - name: Publish to ModelScope # if: false env: diff --git a/CMakeLists.txt b/CMakeLists.txt index 415ab418b..7408f8d69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF) option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF) +option(SHERPA_ONNX_ENABLE_WASM_VAD "Whether to enable WASM for VAD" OFF) option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF) option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) option(SHERPA_ONNX_ENABLE_TTS "Whether to build TTS related code" ON) @@ -135,6 +136,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}") +message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD ${SHERPA_ONNX_ENABLE_WASM_VAD}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}") message(STATUS "SHERPA_ONNX_ENABLE_BINARY ${SHERPA_ONNX_ENABLE_BINARY}") message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}") @@ -212,6 +214,10 @@ if(SHERPA_ONNX_ENABLE_WASM_KWS) add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1) endif() +if(SHERPA_ONNX_ENABLE_WASM_VAD) + add_definitions(-DSHERPA_ONNX_ENABLE_WASM_VAD=1) +endif() + if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ version to be used.") endif() diff --git a/README.md b/README.md index 3590cc3aa..dcdaec2f2 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,32 @@ with the following APIs - Swift, Rust - Dart, Object Pascal +### Links for Huggingface Spaces + +You can visit the following Huggingface spaces to try `sherpa-onnx` without 
+installing anything. All you need is a browser. + +| Description | URL | +|---|---| +| Speech recognition | [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition)| +| Speech recognition with [Whisper](https://github.com/openai/whisper)| [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper)| +| Speech synthesis | [Click me](https://huggingface.co/spaces/k2-fsa/text-to-speech)| +| Generate subtitles| [Click me](https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos)| +|Audio tagging| [Click me](https://huggingface.co/spaces/k2-fsa/audio-tagging)| +|Spoken language identification with [Whisper](https://github.com/openai/whisper)|[Click me](https://huggingface.co/spaces/k2-fsa/spoken-language-identification)| + +We also have spaces built using WebAssembly. They are listed below: + +| Description | URL| Chinese users| +|---|---|---| +|Voice activity detection with [silero-vad](https://github.com/snakers4/silero-vad)| [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx)|[地址](https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx)| +|Real-time speech recognition (Chinese + English) with Zipformer | [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)| +|Real-time speech recognition (Chinese + English) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| +|Real-time speech recognition (Chinese + English + Cantonese) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| +|Real-time speech recognition (English) |[Click 
me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en)| +|Speech synthesis (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en)| +|Speech synthesis (German)|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de)| + ### Links for pre-built Android APKs | Description | URL | 中国用户 | @@ -130,7 +156,7 @@ with the following APIs | Keyword spotting |[Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models)| | Audio tagging | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models)| | Speaker identification (Speaker ID) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models)| -| Spoken language identification (Language ID) | See multi-lingual Whisper ASR models from [Speech recognition](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | +| Spoken language identification (Language ID) | See multi-lingual [Whisper](https://github.com/openai/whisper) ASR models from [Speech recognition](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | | Punctuation| [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models)| ### Useful links diff --git a/build-wasm-simd-asr.sh b/build-wasm-simd-asr.sh index b894087ac..eda18f74d 100755 --- a/build-wasm-simd-asr.sh +++ b/build-wasm-simd-asr.sh @@ -48,6 +48,7 @@ cmake \ -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ -DSHERPA_ONNX_ENABLE_JNI=OFF \ -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DSHERPA_ONNX_ENABLE_TTS=OFF \ -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ -DSHERPA_ONNX_ENABLE_GPU=OFF \ -DSHERPA_ONNX_ENABLE_WASM=ON \ diff --git a/build-wasm-simd-kws.sh b/build-wasm-simd-kws.sh index 8310c2098..6fdf8218f 100755 --- 
a/build-wasm-simd-kws.sh +++ b/build-wasm-simd-kws.sh @@ -43,6 +43,7 @@ cmake \ -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ -DSHERPA_ONNX_ENABLE_JNI=OFF \ -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DSHERPA_ONNX_ENABLE_TTS=OFF \ -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ -DSHERPA_ONNX_ENABLE_GPU=OFF \ -DSHERPA_ONNX_ENABLE_WASM=ON \ diff --git a/build-wasm-simd-vad.sh b/build-wasm-simd-vad.sh new file mode 100755 index 000000000..c74f57d37 --- /dev/null +++ b/build-wasm-simd-vad.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# Copyright (c) 2024 Xiaomi Corporation +# +# This script is to build sherpa-onnx for WebAssembly (VAD) + +set -ex + +if [ x"$EMSCRIPTEN" == x"" ]; then + if ! command -v emcc &> /dev/null; then + echo "Please install emscripten first" + echo "" + echo "You can use the following commands to install it:" + echo "" + echo "git clone https://github.com/emscripten-core/emsdk.git" + echo "cd emsdk" + echo "git pull" + echo "./emsdk install latest" + echo "./emsdk activate latest" + echo "source ./emsdk_env.sh" + exit 1 + else + EMSCRIPTEN=$(dirname $(realpath $(which emcc))) + fi +fi + +export EMSCRIPTEN=$EMSCRIPTEN +echo "EMSCRIPTEN: $EMSCRIPTEN" +if [ ! 
-f $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake ]; then + echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" + echo "Please make sure you have installed emsdk correctly" + exit 1 +fi + +mkdir -p build-wasm-simd-vad +pushd build-wasm-simd-vad + +export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON + +cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \ + \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=OFF \ + -DSHERPA_ONNX_ENABLE_TTS=OFF \ + -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ + -DSHERPA_ONNX_ENABLE_GPU=OFF \ + -DSHERPA_ONNX_ENABLE_WASM=ON \ + -DSHERPA_ONNX_ENABLE_WASM_VAD=ON \ + -DSHERPA_ONNX_ENABLE_BINARY=OFF \ + -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \ + .. +make -j2 +make install + +ls -lh install/bin/wasm/vad diff --git a/scripts/node-addon-api/lib/vad.js b/scripts/node-addon-api/lib/vad.js index a7f7daa24..3ef7b6cad 100644 --- a/scripts/node-addon-api/lib/vad.js +++ b/scripts/node-addon-api/lib/vad.js @@ -71,7 +71,7 @@ config = { /* { samples: a 1-d float32 array, - start: a int32 + start: an int32 } */ front(enableExternalBuffer = true) { diff --git a/wasm/CMakeLists.txt b/wasm/CMakeLists.txt index d7c7a1a17..075dfbf8d 100644 --- a/wasm/CMakeLists.txt +++ b/wasm/CMakeLists.txt @@ -10,6 +10,10 @@ if(SHERPA_ONNX_ENABLE_WASM_KWS) add_subdirectory(kws) endif() +if(SHERPA_ONNX_ENABLE_WASM_VAD) + add_subdirectory(vad) +endif() + if(SHERPA_ONNX_ENABLE_WASM_NODEJS) add_subdirectory(nodejs) endif() diff --git a/wasm/vad/CMakeLists.txt b/wasm/vad/CMakeLists.txt new file mode 100644 index 000000000..30b09bf59 --- /dev/null +++ b/wasm/vad/CMakeLists.txt @@ -0,0 +1,72 @@ +if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH}) + message(FATAL_ERROR "Please use 
./build-wasm-simd-vad.sh to build for wasm VAD") +endif() + +if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/silero_vad.onnx") + message(FATAL_ERROR "Please read ${CMAKE_CURRENT_SOURCE_DIR}/assets/README.md before you continue") +endif() + +set(exported_functions + MyPrint + # VAD + SherpaOnnxCreateCircularBuffer + SherpaOnnxDestroyCircularBuffer + SherpaOnnxCircularBufferPush + SherpaOnnxCircularBufferGet + SherpaOnnxCircularBufferFree + SherpaOnnxCircularBufferPop + SherpaOnnxCircularBufferSize + SherpaOnnxCircularBufferHead + SherpaOnnxCircularBufferReset + SherpaOnnxCreateVoiceActivityDetector + SherpaOnnxDestroyVoiceActivityDetector + SherpaOnnxVoiceActivityDetectorAcceptWaveform + SherpaOnnxVoiceActivityDetectorEmpty + SherpaOnnxVoiceActivityDetectorDetected + SherpaOnnxVoiceActivityDetectorPop + SherpaOnnxVoiceActivityDetectorClear + SherpaOnnxVoiceActivityDetectorFront + SherpaOnnxDestroySpeechSegment + SherpaOnnxVoiceActivityDetectorReset + SherpaOnnxVoiceActivityDetectorFlush + # +) +set(mangled_exported_functions) +foreach(x IN LISTS exported_functions) + list(APPEND mangled_exported_functions "_${x}") +endforeach() +list(JOIN mangled_exported_functions "," all_exported_functions) + +include_directories(${CMAKE_SOURCE_DIR}) +set(MY_FLAGS " -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=64MB -s ALLOW_MEMORY_GROWTH=1") +string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ") # 10MB +string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ") +string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. 
") +string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ") + +message(STATUS "MY_FLAGS: ${MY_FLAGS}") + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}") +set(CMAKE_EXECUTBLE_LINKER_FLAGS "${CMAKE_EXECUTBLE_LINKER_FLAGS} ${MY_FLAGS}") + +if (NOT CMAKE_EXECUTABLE_SUFFIX STREQUAL ".js") + message(FATAL_ERROR "The default suffix for building executables should be .js!") +endif() +# set(CMAKE_EXECUTABLE_SUFFIX ".html") + +add_executable(sherpa-onnx-wasm-main-vad sherpa-onnx-wasm-main-vad.cc) +target_link_libraries(sherpa-onnx-wasm-main-vad sherpa-onnx-c-api) +install(TARGETS sherpa-onnx-wasm-main-vad DESTINATION bin/wasm/vad) + +install( + FILES + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad>/sherpa-onnx-wasm-main-vad.js" + "index.html" + "sherpa-onnx-vad.js" + "app-vad.js" + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad>/sherpa-onnx-wasm-main-vad.wasm" + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad>/sherpa-onnx-wasm-main-vad.data" + DESTINATION + bin/wasm/vad +) diff --git a/wasm/vad/app-vad.js b/wasm/vad/app-vad.js new file mode 100644 index 000000000..db35c1fea --- /dev/null +++ b/wasm/vad/app-vad.js @@ -0,0 +1,307 @@ +// This file copies and modifies code +// from https://mdn.github.io/web-dictaphone/scripts/app.js +// and https://gist.github.com/meziantou/edb7217fddfbb70e899e + +const startBtn = document.getElementById('startBtn'); +const stopBtn = document.getElementById('stopBtn'); +const clearBtn = document.getElementById('clearBtn'); +const hint = document.getElementById('hint'); +const soundClips = document.getElementById('sound-clips'); + +let textArea = document.getElementById('results'); + +let lastResult = ''; +let resultList = []; + +clearBtn.onclick = function() { + resultList = []; + textArea.value = getDisplayResult(); + textArea.scrollTop = textArea.scrollHeight; // auto scroll +}; + +function getDisplayResult() { + let i = 0; + let ans = ''; + for (let s in resultList) { + if (resultList[s] == '') { + continue; + } + + if (resultList[s] == 
'Speech detected') { + ans += '' + i + ': ' + resultList[s]; + i += 1; + } else { + ans += ', ' + resultList[s] + '\n'; + } + } + + if (lastResult.length > 0) { + ans += '' + i + ': ' + lastResult + '\n'; + } + return ans; +} + + +Module = {}; +Module.onRuntimeInitialized = function() { + console.log('inited!'); + hint.innerText = 'Model loaded! Please click start'; + + startBtn.disabled = false; + + vad = createVad(Module); + console.log('vad is created!', vad); + + buffer = new CircularBuffer(30 * 16000, Module); + console.log('CircularBuffer is created!', buffer); +}; + +let audioCtx; +let mediaStream; + +let expectedSampleRate = 16000; +let recordSampleRate; // the sampleRate of the microphone +let recorder = null; // the microphone +let leftchannel = []; // TODO: Use a single channel + +let recordingLength = 0; // number of samples so far + +let vad = null; +let buffer = null; +let printed = false; + +if (navigator.mediaDevices.getUserMedia) { + console.log('getUserMedia supported.'); + + // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia + const constraints = {audio: true}; + + let onSuccess = function(stream) { + if (!audioCtx) { + audioCtx = new AudioContext({sampleRate: expectedSampleRate}); + } + console.log(audioCtx); + recordSampleRate = audioCtx.sampleRate; + console.log('sample rate ' + recordSampleRate); + + // creates an audio node from the microphone incoming stream + mediaStream = audioCtx.createMediaStreamSource(stream); + console.log('media stream', mediaStream); + + // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor + // bufferSize: the onaudioprocess event is called when the buffer is full + var bufferSize = 4096; + var numberOfInputChannels = 1; + var numberOfOutputChannels = 2; + if (audioCtx.createScriptProcessor) { + recorder = audioCtx.createScriptProcessor( + bufferSize, numberOfInputChannels, numberOfOutputChannels); + } else { + recorder = audioCtx.createJavaScriptNode( + 
bufferSize, numberOfInputChannels, numberOfOutputChannels); + } + console.log('recorder', recorder); + + recorder.onaudioprocess = function(e) { + let samples = new Float32Array(e.inputBuffer.getChannelData(0)) + samples = downsampleBuffer(samples, expectedSampleRate); + buffer.push(samples); + while (buffer.size() > vad.config.sileroVad.windowSize) { + const s = buffer.get(buffer.head(), vad.config.sileroVad.windowSize); + vad.acceptWaveform(s); + buffer.pop(vad.config.sileroVad.windowSize); + + if (vad.isDetected() && !printed) { + printed = true; + lastResult = 'Speech detected'; + } + + if (!vad.isDetected()) { + printed = false; + if (lastResult != '') { + resultList.push(lastResult); + } + lastResult = ''; + } + + while (!vad.isEmpty()) { + const segment = vad.front(); + const duration = segment.samples.length / expectedSampleRate; + const durationStr = `Duration: ${duration.toFixed(3)} seconds`; + resultList.push(durationStr); + vad.pop(); + + // now save the segment to a wav file + let buf = new Int16Array(segment.samples.length); + for (var i = 0; i < segment.samples.length; ++i) { + let s = segment.samples[i]; + if (s >= 1) + s = 1; + else if (s <= -1) + s = -1; + + buf[i] = s * 32767; + } + + let clipName = new Date().toISOString() + '--' + durationStr; + + const clipContainer = document.createElement('article'); + const clipLabel = document.createElement('p'); + const audio = document.createElement('audio'); + const deleteButton = document.createElement('button'); + + clipContainer.classList.add('clip'); + audio.setAttribute('controls', ''); + deleteButton.textContent = 'Delete'; + deleteButton.className = 'delete'; + + clipLabel.textContent = clipName; + + clipContainer.appendChild(audio); + + clipContainer.appendChild(clipLabel); + clipContainer.appendChild(deleteButton); + soundClips.appendChild(clipContainer); + + audio.controls = true; + const blob = toWav(buf); + + leftchannel = []; + const audioURL = window.URL.createObjectURL(blob); + audio.src 
= audioURL; + + deleteButton.onclick = function(e) { + let evtTgt = e.target; + evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode); + }; + + clipLabel.onclick = function() { + const existingName = clipLabel.textContent; + const newClipName = prompt('Enter a new name for your sound clip?'); + if (newClipName === null) { + clipLabel.textContent = existingName; + } else { + clipLabel.textContent = newClipName; + } + }; + } + } + + textArea.value = getDisplayResult(); + textArea.scrollTop = textArea.scrollHeight; // auto scroll + }; + + startBtn.onclick = function() { + mediaStream.connect(recorder); + recorder.connect(audioCtx.destination); + + console.log('recorder started'); + + stopBtn.disabled = false; + startBtn.disabled = true; + }; + + stopBtn.onclick = function() { + vad.reset(); + buffer.reset(); + console.log('recorder stopped'); + + // stopBtn recording + recorder.disconnect(audioCtx.destination); + mediaStream.disconnect(recorder); + + startBtn.style.background = ''; + startBtn.style.color = ''; + // mediaRecorder.requestData(); + + stopBtn.disabled = true; + startBtn.disabled = false; + }; + }; + + let onError = function(err) { + console.log('The following error occured: ' + err); + }; + + navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError); +} else { + console.log('getUserMedia not supported on your browser!'); + alert('getUserMedia not supported on your browser!'); +} + + +// this function is copied/modified from +// https://gist.github.com/meziantou/edb7217fddfbb70e899e +function flatten(listOfSamples) { + let n = 0; + for (let i = 0; i < listOfSamples.length; ++i) { + n += listOfSamples[i].length; + } + let ans = new Int16Array(n); + + let offset = 0; + for (let i = 0; i < listOfSamples.length; ++i) { + ans.set(listOfSamples[i], offset); + offset += listOfSamples[i].length; + } + return ans; +} + +// this function is copied/modified from +// https://gist.github.com/meziantou/edb7217fddfbb70e899e +function toWav(samples) 
{ + let buf = new ArrayBuffer(44 + samples.length * 2); + var view = new DataView(buf); + + // http://soundfile.sapp.org/doc/WaveFormat/ + // F F I R + view.setUint32(0, 0x46464952, true); // chunkID + view.setUint32(4, 36 + samples.length * 2, true); // chunkSize + // E V A W + view.setUint32(8, 0x45564157, true); // format + // + // t m f + view.setUint32(12, 0x20746d66, true); // subchunk1ID + view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM + view.setUint32(20, 1, true); // audioFormat, 1 for PCM + view.setUint16(22, 1, true); // numChannels: 1 channel + view.setUint32(24, expectedSampleRate, true); // sampleRate + view.setUint32(28, expectedSampleRate * 2, true); // byteRate + view.setUint16(32, 2, true); // blockAlign + view.setUint16(34, 16, true); // bitsPerSample + view.setUint32(36, 0x61746164, true); // Subchunk2ID + view.setUint32(40, samples.length * 2, true); // subchunk2Size + + let offset = 44; + for (let i = 0; i < samples.length; ++i) { + view.setInt16(offset, samples[i], true); + offset += 2; + } + + return new Blob([view], {type: 'audio/wav'}); +} + +// this function is copied from +// https://github.com/awslabs/aws-lex-browser-audio-capture/blob/master/lib/worker.js#L46 +function downsampleBuffer(buffer, exportSampleRate) { + if (exportSampleRate === recordSampleRate) { + return buffer; + } + var sampleRateRatio = recordSampleRate / exportSampleRate; + var newLength = Math.round(buffer.length / sampleRateRatio); + var result = new Float32Array(newLength); + var offsetResult = 0; + var offsetBuffer = 0; + while (offsetResult < result.length) { + var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio); + var accum = 0, count = 0; + for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) { + accum += buffer[i]; + count++; + } + result[offsetResult] = accum / count; + offsetResult++; + offsetBuffer = nextOffsetBuffer; + } + return result; +}; diff --git a/wasm/vad/assets/README.md 
b/wasm/vad/assets/README.md new file mode 100644 index 000000000..99510982a --- /dev/null +++ b/wasm/vad/assets/README.md @@ -0,0 +1,5 @@ +# Introduction + +Please download +https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`. diff --git a/wasm/vad/index.html b/wasm/vad/index.html new file mode 100644 index 000000000..5d8e0372c --- /dev/null +++ b/wasm/vad/index.html @@ -0,0 +1,42 @@ + + + + + + Next-gen Kaldi WebAssembly with sherpa-onnx for VAD + + + + +

+ Next-gen Kaldi + WebAssembly
+ VAD Demo with sherpa-onnx
+ (with silero-vad) +

+ +
+ Loading model ... ... +
+
+ + + +
+
+ +
+ +
+
+
+
+
+
+
diff --git a/wasm/vad/sherpa-onnx-vad.js b/wasm/vad/sherpa-onnx-vad.js
new file mode 100644
index 000000000..154bbea0f
--- /dev/null
+++ b/wasm/vad/sherpa-onnx-vad.js
@@ -0,0 +1,273 @@
+// Frees the WASM heap memory referenced by a config object returned by one of
+// the init*Config() helpers below: its optional string buffer, its nested
+// sileroVad config (recursively), and finally the struct itself (config.ptr).
+function freeConfig(config, Module) {
+  if ('buffer' in config) {
+    Module._free(config.buffer);
+  }
+
+  if ('sileroVad' in config) {
+    freeConfig(config.sileroVad, Module);
+  }
+
+
+  Module._free(config.ptr);
+}
+
+// Writes a 5-field, 20-byte silero-vad config struct into the WASM heap:
+// model (char*), threshold, minSilenceDuration, minSpeechDuration (floats)
+// and windowSize (i32). Falsy fields fall back to the defaults written below.
+// Returns {buffer, ptr, len}; buffer holds the UTF-8 model path.
+// The user should free the returned pointers
+function initSherpaOnnxSileroVadModelConfig(config, Module) {
+  const modelLen = Module.lengthBytesUTF8(config.model || '') + 1;
+
+  const n = modelLen;
+
+  const buffer = Module._malloc(n);
+
+  const len = 5 * 4;
+  const ptr = Module._malloc(len);
+
+  Module.stringToUTF8(config.model || '', buffer, modelLen);
+
+  let offset = 0;  // byte offset of the next field inside the struct
+  Module.setValue(ptr, buffer, 'i8*');
+  offset += 4;
+
+  Module.setValue(ptr + offset, config.threshold || 0.5, 'float');
+  offset += 4;
+
+  Module.setValue(ptr + offset, config.minSilenceDuration || 0.5, 'float');
+  offset += 4;
+
+  Module.setValue(ptr + offset, config.minSpeechDuration || 0.25, 'float');
+  offset += 4;
+
+  Module.setValue(ptr + offset, config.windowSize || 512, 'i32');
+  offset += 4;
+
+  return {
+    buffer: buffer, ptr: ptr, len: len,
+  }
+}
+
+// Writes the full VAD model config struct into the WASM heap: the nested
+// silero-vad struct followed by sampleRate, numThreads, provider (char*) and
+// debug. A missing config.sileroVad is filled in on the caller's object.
+// Returns {buffer, ptr, len, sileroVad}; release it with freeConfig().
+function initSherpaOnnxVadModelConfig(config, Module) {
+  if (!('sileroVad' in config)) {
+    config.sileroVad = {
+      model: '',
+      threshold: 0.50,
+      minSilenceDuration: 0.50,
+      minSpeechDuration: 0.25,
+      windowSize: 512,
+    };
+  }
+
+  const sileroVad =
+      initSherpaOnnxSileroVadModelConfig(config.sileroVad, Module);
+
+  const len = sileroVad.len + 4 * 4;
+  const ptr = Module._malloc(len);
+
+  const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
+  const buffer = Module._malloc(providerLen);
+  Module.stringToUTF8(config.provider || 'cpu', buffer, providerLen);
+
+  let offset = 0;
+  Module._CopyHeap(sileroVad.ptr, sileroVad.len, ptr + offset);
+  offset += sileroVad.len;
+
+  Module.setValue(ptr + offset, config.sampleRate || 16000, 'i32');
+  offset += 4;
+
+  Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
+  offset += 4;
+
+  Module.setValue(ptr + offset, buffer, 'i8*');  // provider
+  offset += 4;
+
+  Module.setValue(ptr + offset, config.debug || 0, 'i32');
+  offset += 4;
+
+  return {
+    buffer: buffer, ptr: ptr, len: len, sileroVad: sileroVad,
+  }
+}
+
+// Creates a Vad. When myConfig is given it replaces the built-in default
+// config entirely (the two are not merged).
+function createVad(Module, myConfig) {
+  const sileroVad = {
+    model: './silero_vad.onnx',
+    threshold: 0.50,
+    minSilenceDuration: 0.50,
+    minSpeechDuration: 0.25,
+    windowSize: 512,
+  };
+
+  let config = {
+    sileroVad: sileroVad,
+    sampleRate: 16000,
+    numThreads: 1,
+    provider: 'cpu',
+    debug: 1,
+    bufferSizeInSeconds: 30,
+  };
+
+  if (myConfig) {
+    config = myConfig;
+  }
+
+  return new Vad(config, Module);
+}
+
+
+// Thin wrapper around the SherpaOnnxCircularBuffer C API. Call free() when
+// done to destroy the native buffer.
+class CircularBuffer {
+  constructor(capacity, Module) {
+    this.handle = Module._SherpaOnnxCreateCircularBuffer(capacity);
+    this.Module = Module;
+  }
+
+  free() {
+    this.Module._SherpaOnnxDestroyCircularBuffer(this.handle);
+    this.handle = 0;
+  }
+
+  /**
+   * @param samples {Float32Array}
+   */
+  push(samples) {
+    const pointer =
+        this.Module._malloc(samples.length * samples.BYTES_PER_ELEMENT);
+    this.Module.HEAPF32.set(samples, pointer / samples.BYTES_PER_ELEMENT);
+    this.Module._SherpaOnnxCircularBufferPush(
+        this.handle, pointer, samples.length);
+    this.Module._free(pointer);
+  }
+
+  // Copies the n floats returned by the native Get(startIndex, n) call into a
+  // new Float32Array, then frees the native array.
+  get(startIndex, n) {
+    const p =
+        this.Module._SherpaOnnxCircularBufferGet(this.handle, startIndex, n);
+
+    const samplesPtr = p / 4;
+    const samples = new Float32Array(n);
+    for (let i = 0; i < n; i++) {
+      samples[i] = this.Module.HEAPF32[samplesPtr + i];
+    }
+
+    this.Module._SherpaOnnxCircularBufferFree(p);
+
+    return samples;
+  }
+
+  pop(n) {
+    this.Module._SherpaOnnxCircularBufferPop(this.handle, n);
+  }
+
+  size() {
+    return this.Module._SherpaOnnxCircularBufferSize(this.handle);
+  }
+
+  head() {
+    return this.Module._SherpaOnnxCircularBufferHead(this.handle);
+  }
+
+  reset() {
+    this.Module._SherpaOnnxCircularBufferReset(this.handle);
+  }
+}
+
+// Wrapper around the SherpaOnnxVoiceActivityDetector C API. Call free() when
+// done to destroy the native detector.
+class Vad {
+  constructor(configObj, Module) {
+    this.config = configObj;
+    const config = initSherpaOnnxVadModelConfig(configObj, Module);
+    Module._MyPrint(config.ptr);
+    const handle = Module._SherpaOnnxCreateVoiceActivityDetector(
+        config.ptr, configObj.bufferSizeInSeconds || 30);
+    freeConfig(config, Module);
+
+    this.handle = handle;
+    this.Module = Module;
+  }
+
+  free() {
+    this.Module._SherpaOnnxDestroyVoiceActivityDetector(this.handle);
+    this.handle = 0;
+  }
+
+  // samples is a float32 array
+  acceptWaveform(samples) {
+    const pointer =
+        this.Module._malloc(samples.length * samples.BYTES_PER_ELEMENT);
+    this.Module.HEAPF32.set(samples, pointer / samples.BYTES_PER_ELEMENT);
+    this.Module._SherpaOnnxVoiceActivityDetectorAcceptWaveform(
+        this.handle, pointer, samples.length);
+    this.Module._free(pointer);
+  }
+
+  isEmpty() {
+    return this.Module._SherpaOnnxVoiceActivityDetectorEmpty(this.handle) == 1;
+  }
+
+  isDetected() {
+    return this.Module._SherpaOnnxVoiceActivityDetectorDetected(this.handle) ==
+        1;
+  }
+
+  pop() {
+    this.Module._SherpaOnnxVoiceActivityDetectorPop(this.handle);
+  }
+
+  clear() {
+    this.Module._SherpaOnnxVoiceActivityDetectorClear(this.handle);
+  }
+
+  /*
+   * Reads the speech segment returned by the native Front() call, copies its
+   * samples out of the WASM heap and destroys the native segment.
+   *
+   * Returns {samples: a 1-d float32 array, start: an int32}
+   */
+  front() {
+    const h = this.Module._SherpaOnnxVoiceActivityDetectorFront(this.handle);
+
+    const start = this.Module.HEAP32[h / 4];
+    const samplesPtr = this.Module.HEAP32[h / 4 + 1] / 4;
+    const numSamples = this.Module.HEAP32[h / 4 + 2];
+
+    const samples = new Float32Array(numSamples);
+    for (let i = 0; i < numSamples; i++) {
+      samples[i] = this.Module.HEAPF32[samplesPtr + i];
+    }
+
+    this.Module._SherpaOnnxDestroySpeechSegment(h);
+    return {samples: samples, start: start};
+  }
+
+  reset() {
+    this.Module._SherpaOnnxVoiceActivityDetectorReset(this.handle);
+  }
+
+  flush() {
+    this.Module._SherpaOnnxVoiceActivityDetectorFlush(this.handle);
+  }
+};
+
+// Only export when running under Node.js.
+if (typeof process == 'object' && typeof process.versions == 'object' &&
+    typeof process.versions.node == 'string') {
+  module.exports = {
+    createVad,
+    CircularBuffer,
+  };
+}
diff --git a/wasm/vad/sherpa-onnx-wasm-main-vad.cc b/wasm/vad/sherpa-onnx-wasm-main-vad.cc
new file mode 100644
index 000000000..3c1600ba1
--- /dev/null
+++ b/wasm/vad/sherpa-onnx-wasm-main-vad.cc
@@ -0,0 +1,50 @@
+// wasm/sherpa-onnx-wasm-main-vad.cc
+//
+// Copyright (c) 2024 Xiaomi Corporation
+#include <stdio.h>
+
+#include <algorithm>
+#include <memory>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+// see also
+// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html
+
+extern "C" {
+
+// The JavaScript side (sherpa-onnx-vad.js) fills these structs by hand using
+// fixed 4-byte fields, so fail the build if the C layout ever changes.
+static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 5 * 4, "");
+
+static_assert(sizeof(SherpaOnnxVadModelConfig) ==
+                  sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4,
+              "");
+
+// Prints every field of *config to stdout. Called from the Vad constructor in JS.
+void MyPrint(SherpaOnnxVadModelConfig *config) {
+  auto silero_vad = &config->silero_vad;
+
+  fprintf(stdout, "----------silero_vad config----------\n");
+  fprintf(stdout, "model: %s\n", silero_vad->model);
+  fprintf(stdout, "threshold: %.3f\n", silero_vad->threshold);
+  fprintf(stdout, "min_silence_duration: %.3f\n",
+          silero_vad->min_silence_duration);
+  fprintf(stdout, "min_speech_duration: %.3f\n",
+          silero_vad->min_speech_duration);
+  fprintf(stdout, "window_size: %d\n", silero_vad->window_size);
+
+  fprintf(stdout, "----------config----------\n");
+
+  fprintf(stdout, "sample_rate: %d\n", config->sample_rate);
+  fprintf(stdout, "num_threads: %d\n", config->num_threads);
+
+  fprintf(stdout, "provider: %s\n", config->provider);
+  fprintf(stdout, "debug: %d\n", config->debug);
+}
+
+// Copies num_bytes bytes from src to dst; used by JS to embed one struct in another.
+void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
+  std::copy(src, src + num_bytes, dst);
+}
+}