From 15bd9a841e347a8881fc6df599fd440ebb118da4 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Wed, 13 Mar 2024 17:39:01 +0800
Subject: [PATCH] add CI for ljspeech (#1548)

---
 .github/scripts/librispeech/ASR/run.sh        |   6 +-
 .github/scripts/ljspeech/TTS/run.sh           | 157 ++++++++++++++++++
 .github/workflows/ljspeech.yml                | 102 ++++++++++++
 docs/source/recipes/TTS/ljspeech/vits.rst     |  69 ++++++++
 .../TTS/local/prepare_tokens_ljspeech.py      |   6 +-
 egs/ljspeech/TTS/prepare.sh                   |   5 +-
 .../TTS/vits/monotonic_align/__init__.py      |   6 +-
 egs/ljspeech/TTS/vits/tokenizer.py            |   4 +-
 egs/ljspeech/TTS/vits/tts_datamodule.py       |   2 +
 9 files changed, 347 insertions(+), 10 deletions(-)
 create mode 100755 .github/scripts/ljspeech/TTS/run.sh
 create mode 100644 .github/workflows/ljspeech.yml

diff --git a/.github/scripts/librispeech/ASR/run.sh b/.github/scripts/librispeech/ASR/run.sh
index 7e9bd8a478..293ed66e53 100755
--- a/.github/scripts/librispeech/ASR/run.sh
+++ b/.github/scripts/librispeech/ASR/run.sh
@@ -15,9 +15,9 @@ function prepare_data() {
   # cause OOM error for CI later.
   mkdir -p download/lm
   pushd download/lm
-  wget -q http://www.openslr.org/resources/11/librispeech-vocab.txt
-  wget -q http://www.openslr.org/resources/11/librispeech-lexicon.txt
-  wget -q http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-lm-norm.txt.gz
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-lexicon.txt
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-vocab.txt
   ls -lh
   gunzip librispeech-lm-norm.txt.gz
 
diff --git a/.github/scripts/ljspeech/TTS/run.sh b/.github/scripts/ljspeech/TTS/run.sh
new file mode 100755
index 0000000000..707361782f
--- /dev/null
+++ b/.github/scripts/ljspeech/TTS/run.sh
@@ -0,0 +1,157 @@
+#!/usr/bin/env bash
+
+set -ex
+
+python3 -m pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
+python3 -m pip install espnet_tts_frontend
+python3 -m pip install numba
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/ljspeech/TTS
+
+sed -i.bak s/600/8/g ./prepare.sh
+sed -i.bak s/"first 100"/"first 3"/g ./prepare.sh
+sed -i.bak s/500/5/g ./prepare.sh
+git diff
+
+function prepare_data() {
+  # Fetch the small LJSpeech subset hosted for CI, then run ./prepare.sh on it.
+  # "mkdir -p" keeps a re-run from aborting (set -e) if download/ already exists.
+  mkdir -p download
+  pushd download
+  wget -q https://huggingface.co/csukuangfj/ljspeech-subset-for-ci-test/resolve/main/LJSpeech-1.1.tar.bz2
+  tar xvf LJSpeech-1.1.tar.bz2
+  popd
+
+  ./prepare.sh
+  tree .
+}
+
+function train() {
+  pushd ./vits
+  sed -i.bak s/200/3/g ./train.py
+  git diff .
+  popd
+
+  for t in low medium high; do
+    ./vits/train.py \
+      --exp-dir vits/exp-$t \
+      --model-type $t \
+      --num-epochs 1 \
+      --save-every-n 1 \
+      --num-buckets 2 \
+      --tokens data/tokens.txt \
+      --max-duration 20
+
+    ls -lh vits/exp-$t
+  done
+}
+
+function infer() {
+  for t in low medium high; do
+    ./vits/infer.py \
+      --num-buckets 2 \
+      --model-type $t \
+      --epoch 1 \
+      --exp-dir ./vits/exp-$t \
+      --tokens data/tokens.txt \
+      --max-duration 20
+  done
+}
+
+function export_onnx() {
+  for t in low medium high; do
+    ./vits/export-onnx.py \
+      --model-type $t \
+      --epoch 1 \
+      --exp-dir ./vits/exp-$t \
+      --tokens data/tokens.txt
+
+    ls -lh vits/exp-$t/
+  done
+}
+
+function test_medium() {
+  git clone https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-medium-2024-03-12
+
+  ./vits/export-onnx.py \
+    --model-type medium \
+    --epoch 820 \
+    --exp-dir ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp \
+    --tokens ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt
+
+  ls -lh ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp
+
+  ./vits/test_onnx.py \
+    --model-filename ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp/vits-epoch-820.onnx \
+    --tokens ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt \
+    --output-filename /icefall/test-medium.wav
+
+  ls -lh /icefall/test-medium.wav
+
+  d=/icefall/vits-icefall-en_US-ljspeech-medium
+  mkdir $d
+  cp -v ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt $d/
+  cp -v ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp/vits-epoch-820.onnx $d/model.onnx
+
+  rm -rf icefall-tts-ljspeech-vits-medium-2024-03-12
+
+  pushd $d
+  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+  tar xf espeak-ng-data.tar.bz2
+  rm espeak-ng-data.tar.bz2
+  cd ..
+  tar cjf vits-icefall-en_US-ljspeech-medium.tar.bz2 vits-icefall-en_US-ljspeech-medium
+  rm -rf vits-icefall-en_US-ljspeech-medium
+  ls -lh *.tar.bz2
+  popd
+}
+
+function test_low() {
+  git clone https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-low-2024-03-12
+
+  ./vits/export-onnx.py \
+    --model-type low \
+    --epoch 1600 \
+    --exp-dir ./icefall-tts-ljspeech-vits-low-2024-03-12/exp \
+    --tokens ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt
+
+  ls -lh ./icefall-tts-ljspeech-vits-low-2024-03-12/exp
+
+  ./vits/test_onnx.py \
+    --model-filename ./icefall-tts-ljspeech-vits-low-2024-03-12/exp/vits-epoch-1600.onnx \
+    --tokens ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt \
+    --output-filename /icefall/test-low.wav
+
+  ls -lh /icefall/test-low.wav
+
+  d=/icefall/vits-icefall-en_US-ljspeech-low
+  mkdir $d
+  cp -v ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt $d/
+  cp -v ./icefall-tts-ljspeech-vits-low-2024-03-12/exp/vits-epoch-1600.onnx $d/model.onnx
+
+  rm -rf icefall-tts-ljspeech-vits-low-2024-03-12
+
+  pushd $d
+  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+  tar xf espeak-ng-data.tar.bz2
+  rm espeak-ng-data.tar.bz2
+  cd ..
+  tar cjf vits-icefall-en_US-ljspeech-low.tar.bz2 vits-icefall-en_US-ljspeech-low
+  rm -rf vits-icefall-en_US-ljspeech-low
+  ls -lh *.tar.bz2
+  popd
+}
+
+prepare_data
+train
+infer
+export_onnx
+rm -rf vits/exp-{low,medium,high}
+test_medium
+test_low
diff --git a/.github/workflows/ljspeech.yml b/.github/workflows/ljspeech.yml
new file mode 100644
index 0000000000..25402275b4
--- /dev/null
+++ b/.github/workflows/ljspeech.yml
@@ -0,0 +1,102 @@
+name: ljspeech
+
+on:
+  push:
+    branches:
+      - master
+
+  pull_request:
+    branches:
+      - master
+
+  workflow_dispatch:
+
+concurrency:
+  group: ljspeech-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  generate_build_matrix:
+    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Log the matrix for debugging, then publish it via $GITHUB_OUTPUT
+          python ./.github/scripts/docker/generate_build_matrix.py
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"
+
+  ljspeech:
+    needs: generate_build_matrix
+    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Free space
+        shell: bash
+        run: |
+          ls -lh
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+          echo "pwd: $PWD"
+          echo "github.workspace ${{ github.workspace }}"
+
+      - name: Run tests
+        uses: addnab/docker-run-action@v3
+        with:
+            image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
+            options: |
+              --volume ${{ github.workspace }}/:/icefall
+            shell: bash
+            run: |
+              export PYTHONPATH=/icefall:$PYTHONPATH
+              cd /icefall
+              git config --global --add safe.directory /icefall
+
+              .github/scripts/ljspeech/TTS/run.sh
+
+      - name: display files
+        shell: bash
+        run: |
+          ls -lh
+
+      - uses: actions/upload-artifact@v4
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
+        with:
+          name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
+          path: ./*.wav
+
+      - uses: actions/upload-artifact@v4
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
+        with:
+          name: generated-models-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}
+          path: ./vits-icefall-*.tar.bz2
+
+      - name: Release exported onnx models
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name != 'pull_request'
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          overwrite: true
+          file: vits-icefall-*.tar.bz2
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: tts-models
+
diff --git a/docs/source/recipes/TTS/ljspeech/vits.rst b/docs/source/recipes/TTS/ljspeech/vits.rst
index d31bf63022..9499a3aea2 100644
--- a/docs/source/recipes/TTS/ljspeech/vits.rst
+++ b/docs/source/recipes/TTS/ljspeech/vits.rst
@@ -13,6 +13,14 @@ with the `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ dataset.
    The VITS paper: `Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech <https://arxiv.org/pdf/2106.06103.pdf>`_
 
 
+Install extra dependencies
+--------------------------
+
+.. code-block:: bash
+
+  pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
+  pip install numba espnet_tts_frontend
+
 Data preparation
 ----------------
 
@@ -130,3 +138,64 @@ by visiting the following link:
   - ``--model-type=medium``: `<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-medium-2024-03-12>`_
   - ``--model-type=low``: `<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-low-2024-03-12>`_
 
+Usage in sherpa-onnx
+--------------------
+
+The following describes how to test the exported ONNX model in `sherpa-onnx`_.
+
+.. hint::
+
+   `sherpa-onnx`_ supports different programming languages, e.g., C++, C, Python,
+   Kotlin, Java, Swift, Go, C#, etc. It also supports Android and iOS.
+
+   We only describe how to use pre-built binaries from `sherpa-onnx`_ below.
+   Please refer to `<https://k2-fsa.github.io/sherpa/onnx/>`_
+   for more documentation.
+
+Install sherpa-onnx
+^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+   pip install sherpa-onnx
+
+To check that you have installed `sherpa-onnx`_ successfully, please run:
+
+.. code-block:: bash
+
+   which sherpa-onnx-offline-tts
+   sherpa-onnx-offline-tts --help
+
+Download lexicon files
+^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+   cd /tmp
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+   tar xf espeak-ng-data.tar.bz2
+
+Run sherpa-onnx
+^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+  cd egs/ljspeech/TTS
+
+  sherpa-onnx-offline-tts \
+    --vits-model=vits/exp/vits-epoch-1000.onnx \
+    --vits-tokens=data/tokens.txt \
+    --vits-data-dir=/tmp/espeak-ng-data \
+    --num-threads=1 \
+    --output-filename=./high.wav \
+    "Ask not what your country can do for you; ask what you can do for your country."
+
+.. hint::
+
+   You can also use ``sherpa-onnx-offline-tts-play`` to play the audio
+   as it is being generated.
+
+You should get a file ``high.wav`` after running the above command.
+
+Congratulations! You have successfully trained and exported a text-to-speech
+model and run it with `sherpa-onnx`_.
diff --git a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
index 08fe7430ef..4ba88604ce 100755
--- a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
+++ b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
@@ -23,7 +23,11 @@
 import logging
 from pathlib import Path
 
-import tacotron_cleaner.cleaners
+try:
+    import tacotron_cleaner.cleaners
+except ModuleNotFoundError as ex:
+    raise RuntimeError(f"{ex}\nPlease run\n  pip install espnet_tts_frontend\n")
+
 from lhotse import CutSet, load_manifest
 from piper_phonemize import phonemize_espeak
 
diff --git a/egs/ljspeech/TTS/prepare.sh b/egs/ljspeech/TTS/prepare.sh
index bded423ac9..9ed0f93fde 100755
--- a/egs/ljspeech/TTS/prepare.sh
+++ b/egs/ljspeech/TTS/prepare.sh
@@ -28,7 +28,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
   log "Stage -1: build monotonic_align lib"
   if [ ! -d vits/monotonic_align/build ]; then
     cd vits/monotonic_align
-    python setup.py build_ext --inplace
+    python3 setup.py build_ext --inplace
     cd ../../
   else
     log "monotonic_align lib already built"
@@ -82,8 +82,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Prepare phoneme tokens for LJSpeech"
   # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
-  #                      could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+  #   - piper_phonemize, `pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html`
   #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
     ./local/prepare_tokens_ljspeech.py
diff --git a/egs/ljspeech/TTS/vits/monotonic_align/__init__.py b/egs/ljspeech/TTS/vits/monotonic_align/__init__.py
index 2b35654f51..5dc3641e59 100644
--- a/egs/ljspeech/TTS/vits/monotonic_align/__init__.py
+++ b/egs/ljspeech/TTS/vits/monotonic_align/__init__.py
@@ -10,7 +10,11 @@
 
 import numpy as np
 import torch
-from numba import njit, prange
+
+try:
+    from numba import njit, prange
+except ModuleNotFoundError as ex:
+    raise RuntimeError(f"{ex}\nPlease run\n  pip install numba")
 
 try:
     from .core import maximum_path_c
diff --git a/egs/ljspeech/TTS/vits/tokenizer.py b/egs/ljspeech/TTS/vits/tokenizer.py
index 8144ffe1eb..3c9046adde 100644
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@@ -23,8 +23,8 @@
     from piper_phonemize import phonemize_espeak
 except Exception as ex:
     raise RuntimeError(
-        f"{ex}\nPlease follow instructions in "
-        "../prepare.sh to install piper-phonemize"
+        f"{ex}\nPlease run\n"
+        "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
     )
 
 from utils import intersperse
diff --git a/egs/ljspeech/TTS/vits/tts_datamodule.py b/egs/ljspeech/TTS/vits/tts_datamodule.py
index 8ff868bc8b..e1a9c7b3ca 100644
--- a/egs/ljspeech/TTS/vits/tts_datamodule.py
+++ b/egs/ljspeech/TTS/vits/tts_datamodule.py
@@ -255,6 +255,7 @@ def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
         valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
+            num_buckets=self.args.num_buckets,
             shuffle=False,
         )
         logging.info("About to create valid dataloader")
@@ -294,6 +295,7 @@ def test_dataloaders(self, cuts: CutSet) -> DataLoader:
         test_sampler = DynamicBucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
+            num_buckets=self.args.num_buckets,
             shuffle=False,
         )
         logging.info("About to create test dataloader")