Skip to content

Commit

Permalink
Add Matcha-TTS (#1773)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Oct 29, 2024
1 parent 7e9eea6 commit 516b486
Show file tree
Hide file tree
Showing 47 changed files with 5,442 additions and 26 deletions.
120 changes: 120 additions & 0 deletions .github/scripts/ljspeech/TTS/run-matcha.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env bash

# Stop at the first failing command and trace every command as it is executed.
set -o errexit -o xtrace

# System package: sox (the script later calls `soxi` on the generated wave files).
apt-get update
apt-get install --yes sox

# Python packages. piper_phonemize is resolved with an additional find-links
# page hosted by the icefall project.
python3 -m pip install \
  --find-links https://k2-fsa.github.io/icefall/piper_phonemize.html \
  piper_phonemize
python3 -m pip install espnet_tts_frontend
python3 -m pip install numba conformer==0.3.2 diffusers librosa

# Print "<timestamp> (<file>:<line>:<function>) <message>" on stdout, where
# file/line/function describe the caller. Backslash escapes in the message are
# interpreted (echo -e). Adapted from espnet.
log() {
  local caller_path=${BASH_SOURCE[1]}
  local caller_line=${BASH_LINENO[0]}
  local caller_func=${FUNCNAME[1]}
  # Keep only the file name, not the directory part, of the caller's path.
  local origin="${caller_path##*/}:${caller_line}:${caller_func}"
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${origin}) $*"
}

# Everything below uses paths relative to the LJSpeech TTS recipe directory.
cd egs/ljspeech/TTS

# Shrink several constants in prepare.sh (presumably so that data preparation
# stays small on CI), then show the resulting change.
#
# The three substitutions run in a single sed pass, in the same order as before,
# so prepare.sh ends up identical. Doing it in one pass means prepare.sh.bak
# holds the untouched original; with three separate `-i.bak` invocations each
# later run overwrote the backup with an already-edited copy.
sed -i.bak \
  -e 's/600/8/g' \
  -e 's/first 100/first 3/g' \
  -e 's/500/5/g' \
  ./prepare.sh
git diff

# Download the LJSpeech subset created for CI testing into ./download, unpack
# it there, then run the recipe's prepare.sh and print the directory tree.
prepare_data() {
  local archive=LJSpeech-1.1.tar.bz2
  local url=https://huggingface.co/csukuangfj/ljspeech-subset-for-ci-test/resolve/main/${archive}

  # -p: succeed even when ./download already exists.
  mkdir -p download

  pushd download
  wget -q "${url}"
  tar xvf "${archive}"
  popd

  ./prepare.sh
  tree .
}

# Patch matcha/train.py, run it for a single epoch with small settings, and
# list the contents of the experiment directory.
train() {
  # Replace every "1500" in train.py with "3" (a train.py.bak copy is kept)
  # and display the modification.
  pushd ./matcha
  sed -i.bak -e 's/1500/3/g' ./train.py
  git diff .
  popd

  local -a train_opts=(
    --exp-dir matcha/exp
    --num-epochs 1
    --save-every-n 1
    --num-buckets 2
    --tokens data/tokens.txt
    --max-duration 20
  )
  ./matcha/train.py "${train_opts[@]}"

  ls -lh matcha/exp
}

# Synthesize one test utterance with the epoch-1 checkpoint in ./matcha/exp and
# a downloaded HiFi-GAN generator, print the wave-file information with soxi,
# then delete the generated wave file and the downloaded generator.
infer() {
  local vocoder=generator_v1

  # --fail: make curl exit non-zero on an HTTP error instead of exiting 0 and
  # saving the server's error page under the vocoder's file name, which would
  # only surface later as a confusing failure inside inference.py.
  curl --fail -SL -O "https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/${vocoder}"

  ./matcha/inference.py \
    --epoch 1 \
    --exp-dir ./matcha/exp \
    --tokens data/tokens.txt \
    --vocoder "./${vocoder}" \
    --input-text "how are you doing?" \
    --output-wav ./generated.wav

  ls -lh *.wav
  soxi ./generated.wav
  rm -v ./generated.wav
  rm -v "${vocoder}"
}

# Export a downloaded Matcha-TTS checkpoint (epoch 4000) to ONNX, fetch an ONNX
# HiFi-GAN vocoder, and synthesize a test utterance with the two ONNX models.
# The resulting wave file is written to /icefall.
#
# Every download uses `curl --fail` so that an HTTP error aborts the step
# instead of exiting 0 and leaving an error page on disk under the expected
# file name.
export_onnx() {
  local hf_repo=https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main
  local vocoder_repo=https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan
  local out_wav=/icefall/generated-matcha-tts-steps-6-v1.wav

  # Checkpoint to export.
  pushd matcha/exp
  curl --fail -SL -O "${hf_repo}/exp/epoch-4000.pt"
  popd

  # Remove the existing json files in data/fbank and fetch the cmvn.json that
  # is published in the same repository as the checkpoint.
  pushd data/fbank
  rm -v *.json
  curl --fail -SL -O "${hf_repo}/data/cmvn.json"
  popd

  ./matcha/export_onnx.py \
    --exp-dir ./matcha/exp \
    --epoch 4000 \
    --tokens ./data/tokens.txt \
    --cmvn ./data/fbank/cmvn.json

  ls -lh *.onnx

  if false; then
    # Disabled: the CI machine does not have enough memory to run it.
    local version
    for version in v1 v2 v3; do
      curl --fail -SL -O "${vocoder_repo}/generator_${version}"
    done
    python3 ./matcha/export_onnx_hifigan.py
  else
    # Use an already exported vocoder instead.
    curl --fail -SL -O "${hf_repo}/exp/hifigan_v1.onnx"
  fi

  ls -lh *.onnx

  python3 ./matcha/onnx_pretrained.py \
    --acoustic-model ./model-steps-6.onnx \
    --vocoder ./hifigan_v1.onnx \
    --tokens ./data/tokens.txt \
    --input-text "how are you doing?" \
    --output-wav "${out_wav}"

  ls -lh /icefall/*.wav
  soxi "${out_wav}"
}

# Run the CI stages in order. The script runs under `set -e`, so the first
# failing stage stops everything.
for stage in prepare_data train infer export_onnx; do
  "${stage}"
done

# Clean up: remove any downloaded generator_v* files and the experiment directory.
rm -rfv generator_v* matcha/exp
2 changes: 1 addition & 1 deletion .github/scripts/ljspeech/TTS/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ git diff
function prepare_data() {
# We have created a subset of the data for testing
#
mkdir download
mkdir -p download
pushd download
wget -q https://huggingface.co/csukuangfj/ljspeech-subset-for-ci-test/resolve/main/LJSpeech-1.1.tar.bz2
tar xvf LJSpeech-1.1.tar.bz2
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/audioset.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ jobs:
ls -lh ./model-onnx/*
- name: Upload model to huggingface
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
Expand Down Expand Up @@ -116,7 +116,7 @@ jobs:
rm -rf huggingface
- name: Prepare for release
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
shell: bash
run: |
d=sherpa-onnx-zipformer-audio-tagging-2024-04-09
Expand All @@ -125,7 +125,7 @@ jobs:
ls -lh
- name: Release exported onnx models
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
Expand Down
12 changes: 3 additions & 9 deletions .github/workflows/ljspeech.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ jobs:
cd /icefall
git config --global --add safe.directory /icefall
.github/scripts/ljspeech/TTS/run-matcha.sh
.github/scripts/ljspeech/TTS/run.sh
- name: display files
Expand All @@ -78,19 +79,13 @@ jobs:
ls -lh
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
with:
name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
path: ./*.wav

- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
with:
name: generated-models-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}
path: ./*.wav

- name: Release exported onnx models
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
Expand All @@ -99,4 +94,3 @@ jobs:
repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: tts-models

7 changes: 7 additions & 0 deletions egs/ljspeech/TTS/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
build
core.c
*.so
my-output*
*.wav
*.onnx
generator_v*
118 changes: 118 additions & 0 deletions egs/ljspeech/TTS/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,121 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7

# (Note it is killed after `epoch-820.pt`)
```
# matcha

[./matcha](./matcha) contains the code for training [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS)

This recipe provides a Matcha-TTS model trained on the LJSpeech dataset.

Checkpoints and training logs can be found [here](https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28).
The pull-request for this recipe can be found at <https://github.com/k2-fsa/icefall/pull/1773>

The training command is given below:
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3

python3 ./matcha/train.py \
--exp-dir ./matcha/exp-new-3/ \
--num-workers 4 \
--world-size 4 \
--num-epochs 4000 \
--max-duration 1000 \
--bucketing-sampler 1 \
--start-epoch 1
```

To run inference, use:

```bash
# Download Hifigan vocoder. We use Hifigan v1 below. You can select from v1, v2, or v3

wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1

./matcha/inference.py \
--exp-dir ./matcha/exp-new-3 \
--epoch 4000 \
--tokens ./data/tokens.txt \
--vocoder ./generator_v1 \
--input-text "how are you doing?" \
--output-wav ./generated.wav
```

```bash
soxi ./generated.wav
```
prints:
```
Input File : './generated.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:01.29 = 28416 samples ~ 96.6531 CDDA sectors
File Size : 56.9k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
```

To export the checkpoint to onnx:

```bash
# export the acoustic model to onnx

./matcha/export_onnx.py \
--exp-dir ./matcha/exp-new-3 \
--epoch 4000 \
--tokens ./data/tokens.txt
```

The above command generates the following files:

- model-steps-2.onnx
- model-steps-3.onnx
- model-steps-4.onnx
- model-steps-5.onnx
- model-steps-6.onnx

where the 2 in `model-steps-2.onnx` means it uses 2 steps for the ODE solver.


To export the Hifigan vocoder to onnx, please use:

```bash
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3

python3 ./matcha/export_onnx_hifigan.py
```

The above command generates 3 files:

- hifigan_v1.onnx
- hifigan_v2.onnx
- hifigan_v3.onnx

To use the generated onnx files to generate speech from text, please run:

```bash
python3 ./matcha/onnx_pretrained.py \
--acoustic-model ./model-steps-6.onnx \
--vocoder ./hifigan_v1.onnx \
--tokens ./data/tokens.txt \
--input-text "Ask not what your country can do for you; ask what you can do for your country." \
--output-wav ./matcha-epoch-4000-step6-hifigan-v1.wav
```

```bash
soxi ./matcha-epoch-4000-step6-hifigan-v1.wav

Input File : './matcha-epoch-4000-step6-hifigan-v1.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:05.46 = 120320 samples ~ 409.252 CDDA sectors
File Size : 241k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
```

https://github.com/user-attachments/assets/b7c197a6-3870-49c6-90ca-db4d3776869b

Loading

0 comments on commit 516b486

Please sign in to comment.