diff --git a/.github/workflows/build-wheels-aarch64-cuda.yaml b/.github/workflows/build-wheels-aarch64-cuda.yaml
new file mode 100644
index 000000000..9c226c431
--- /dev/null
+++ b/.github/workflows/build-wheels-aarch64-cuda.yaml
@@ -0,0 +1,120 @@
+name: build-wheels-aarch64-cuda
+
+on:
+  push:
+    branches:
+      - wheel
+  workflow_dispatch:
+
+env:
+  SHERPA_ONNX_IS_IN_GITHUB_ACTIONS: 1
+
+concurrency:
+  group: build-wheels-aarch64-cuda-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build_wheels_aarch64_cuda:
+    name: ${{ matrix.manylinux }} ${{ matrix.python-version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["cp37", "cp38", "cp39", "cp310", "cp311", "cp312", "cp313"]
+        manylinux: [manylinux2014] #, manylinux_2_28]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+        with:
+          platforms: all
+
+      # see https://cibuildwheel.readthedocs.io/en/stable/changelog/
+      # for a list of versions
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.21.3
+        env:
+          CIBW_BEFORE_ALL: |
+            git clone --depth 1 --branch v1.2.12 https://github.com/alsa-project/alsa-lib
+            cd alsa-lib
+            ./gitcompile
+            cd ..
+            echo "PWD"
+            ls -lh /project/alsa-lib/src/.libs
+
+          CIBW_ENVIRONMENT: CPLUS_INCLUDE_PATH=/project/alsa-lib/include:$CPLUS_INCLUDE_PATH SHERPA_ONNX_ALSA_LIB_DIR=/project/alsa-lib/src/.libs LD_LIBRARY_PATH=/project/build/bdist.linux-x86_64/wheel/sherpa_onnx/lib:$SHERPA_ONNX_ALSA_LIB_DIR SHERPA_ONNX_MAKE_ARGS="VERBOSE=1" SHERPA_ONNX_ENABLE_ALSA=1 SHERPA_ONNX_ENABLE_GPU=ON
+          CIBW_BUILD: "${{ matrix.python-version }}-*"
+          CIBW_SKIP: "cp27-* cp35-* cp36-* *-win32 pp* *-musllinux* *-manylinux_i686"
+          CIBW_BUILD_VERBOSITY: 3
+          CIBW_ARCHS_LINUX: aarch64
+          CIBW_MANYLINUX_AARCH64_IMAGE: quay.io/pypa/${{ matrix.manylinux }}_aarch64
+          # onnxruntime >= 1.17.0 drops support for CentOS 7 and supports only manylinux_2_28;
+          # manylinux_2_24 is no longer supported.
+
+      - name: Display wheels
+        shell: bash
+        run: |
+          ls -lh ./wheelhouse/
+
+      - name: Install patchelf
+        if: matrix.os == 'ubuntu-latest'
+        shell: bash
+        run: |
+          sudo apt-get update -q
+          sudo apt-get install -q -y patchelf
+          patchelf --help
+
+      - name: Patch wheels
+        shell: bash
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          mkdir ./wheels
+          sudo ./scripts/wheel/patch_wheel.py --in-dir ./wheelhouse --out-dir ./wheels
+
+          ls -lh ./wheels/
+          rm -rf ./wheelhouse
+          mv ./wheels ./wheelhouse
+
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            rm -rf huggingface
+            export GIT_LFS_SKIP_SMUDGE=1
+            export GIT_CLONE_PROTECTION_ACTIVE=false
+
+            SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
+            echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"
+
+            d=cuda/$SHERPA_ONNX_VERSION
+
+            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-wheels huggingface
+            cd huggingface
+            git fetch
+            git pull
+            git merge -m "merge remote" --ff origin main
+
+            mkdir -p $d
+
+            cp -v ../wheelhouse/*.whl $d/
+
+            git status
+            git add .
+            git commit -m "add more wheels"
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-wheels main
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: wheel-${{ matrix.python-version }}-${{ matrix.manylinux }}
+          path: ./wheelhouse/*.whl
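A note on the `Patch wheels` step above: `scripts/wheel/patch_wheel.py` is not part of this diff, but the intent is to point each bundled shared object's RPATH at the wheel's own `sherpa_onnx/lib` directory so that the CUDA-enabled onnxruntime libraries resolve at import time. The following is only a minimal Python sketch of that idea, not the actual script; the `$ORIGIN`-relative lib path is an assumption, and a production script must also regenerate the wheel's RECORD hashes, which this sketch skips:

```python
#!/usr/bin/env python3
# Hypothetical sketch of what a wheel-patching step can do; the real logic
# lives in scripts/wheel/patch_wheel.py, which is not shown in this diff.
import argparse
import glob
import os
import subprocess
import tempfile
import zipfile


def patch_wheel(in_wheel: str, out_dir: str) -> None:
    with tempfile.TemporaryDirectory() as work:
        # A wheel is a zip archive: unpack, patch, repack.
        with zipfile.ZipFile(in_wheel) as zf:
            zf.extractall(work)

        # Assumption: native libraries live in sherpa_onnx/lib inside the wheel,
        # so an $ORIGIN-relative RPATH lets every .so find its siblings.
        for so in glob.glob(os.path.join(work, "**/*.so*"), recursive=True):
            subprocess.run(
                ["patchelf", "--set-rpath", "$ORIGIN/../sherpa_onnx/lib", so],
                check=True,
            )

        # NOTE: a real script must also update the RECORD file's hashes here.
        out_wheel = os.path.join(out_dir, os.path.basename(in_wheel))
        with zipfile.ZipFile(out_wheel, "w", zipfile.ZIP_DEFLATED) as zf:
            for root, _, files in os.walk(work):
                for name in files:
                    path = os.path.join(root, name)
                    zf.write(path, os.path.relpath(path, work))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-dir", required=True)
    parser.add_argument("--out-dir", required=True)
    args = parser.parse_args()
    os.makedirs(args.out_dir, exist_ok=True)
    for wheel in glob.glob(os.path.join(args.in_dir, "*.whl")):
        patch_wheel(wheel, args.out_dir)
```

Invoked like the workflow step, e.g. `./patch_wheel_sketch.py --in-dir ./wheelhouse --out-dir ./wheels` (the file name here is made up for illustration).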
diff --git a/cxx-api-examples/streaming-zipformer-rtf-cxx-api.cc b/cxx-api-examples/streaming-zipformer-rtf-cxx-api.cc
index 9ec69aaba..2e74d30be 100644
--- a/cxx-api-examples/streaming-zipformer-rtf-cxx-api.cc
+++ b/cxx-api-examples/streaming-zipformer-rtf-cxx-api.cc
@@ -7,10 +7,28 @@
 //
 // clang-format off
 //
+// cd /path/sherpa-onnx/
+// mkdir build
+// cd build
+// cmake ..
+// make
+//
 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
 // tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
 // rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
 //
+// # 1. Test on CPU, run once
+//
+// ./bin/streaming-zipformer-rtf-cxx-api
+//
+// # 2. Test on CPU, run 10 times
+//
+// ./bin/streaming-zipformer-rtf-cxx-api 10
+//
+// # 3. Test on GPU, run 10 times
+//
+// ./bin/streaming-zipformer-rtf-cxx-api 10 cuda
+//
 // clang-format on
 
 #include <chrono>  // NOLINT
@@ -21,13 +39,15 @@
 
 int32_t main(int argc, char *argv[]) {
   int32_t num_runs = 1;
-  if (argc == 2) {
+  if (argc >= 2) {
     num_runs = atoi(argv[1]);
     if (num_runs < 0) {
       num_runs = 1;
     }
   }
 
+  bool use_gpu = (argc == 3);
+
   using namespace sherpa_onnx::cxx;  // NOLINT
 
   OnlineRecognizerConfig config;
@@ -50,6 +70,7 @@
       "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt";
 
   config.model_config.num_threads = 1;
+  config.model_config.provider = use_gpu ? "cuda" : "cpu";
 
   std::cout << "Loading model\n";
   OnlineRecognizer recongizer = OnlineRecognizer::Create(config);
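As an end-to-end check of the wheels published by the workflow above, the same CPU-vs-GPU RTF comparison can be sketched against the Python bindings. Treat this as an illustration rather than a supported script: the `from_transducer`, `accept_waveform`, and `decode_stream` names follow sherpa-onnx's Python API as documented, and the model file names are assumed to match the tarball referenced in the C++ comments.

```python
#!/usr/bin/env python3
# Sketch only: mirrors streaming-zipformer-rtf-cxx-api.cc using a Python wheel
# built by the workflow above. Model file names are assumed from the tarball
# used in the C++ comments.
import sys
import time
import wave

import numpy as np
import sherpa_onnx

d = "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20"
num_runs = int(sys.argv[1]) if len(sys.argv) >= 2 else 1
use_gpu = len(sys.argv) == 3  # any third argument selects CUDA, as in the C++ example

recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
    tokens=f"{d}/tokens.txt",
    encoder=f"{d}/encoder-epoch-99-avg-1.onnx",
    decoder=f"{d}/decoder-epoch-99-avg-1.onnx",
    joiner=f"{d}/joiner-epoch-99-avg-1.onnx",
    num_threads=1,
    provider="cuda" if use_gpu else "cpu",
)

# Load a 16-bit mono test wave and scale it to [-1, 1].
with wave.open(f"{d}/test_wavs/0.wav") as f:
    sample_rate = f.getframerate()
    samples = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)
    samples = samples.astype(np.float32) / 32768

elapsed = 0.0
for _ in range(num_runs):
    s = recognizer.create_stream()
    start = time.perf_counter()
    s.accept_waveform(sample_rate, samples)
    s.input_finished()
    while recognizer.is_ready(s):
        recognizer.decode_stream(s)
    elapsed += time.perf_counter() - start
    print(recognizer.get_result(s))

# RTF = processing time / audio duration; values below 1 are faster than real time.
audio_seconds = len(samples) / sample_rate
print(f"RTF: {elapsed / num_runs / audio_seconds:.3f}")
```

As with the C++ binary, running the sketch with `10 cuda` would decode the test wave ten times on the GPU and print the averaged RTF.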