add infer code

k2-fsa · Nov 19, 2024 · d55a534 · d55a534
1 parent 5361ecd
commit d55a534
Show file tree

Hide file tree

Showing 15 changed files with 1,029 additions and 855 deletions.
diff --git a/egs/libritts/TTS/local/compute_neural_codec_and_prepare_text_tokens.py b/egs/libritts/TTS/local/compute_neural_codec_and_prepare_text_tokens.py
diff --git a/egs/libritts/TTS/local/compute_neural_codec_and_prepare_text_tokens.py b/egs/libritts/TTS/local/compute_neural_codec_and_prepare_text_tokens.py
@@ -0,0 +1 @@
+../../../wenetspeech4tts/TTS/local/compute_neural_codec_and_prepare_text_tokens.py
diff --git a/egs/libritts/TTS/prepare.sh b/egs/libritts/TTS/prepare.sh
@@ -32,7 +32,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
     cd vits/monotonic_align
     python setup.py build_ext --inplace
     cd ../../
-  else 
+  else
     log "monotonic_align lib already built"
   fi
 fi
@@ -75,11 +75,11 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Compute Spectrogram for LibriTTS"
   mkdir -p data/spectrogram
   if [ ! -e data/spectrogram/.libritts.done ]; then
-    ./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate 
+    ./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate
     touch data/spectrogram/.libritts.done
   fi
 
-  # Here we shuffle and combine the train-clean-100, train-clean-360 and 
+  # Here we shuffle and combine the train-clean-100, train-clean-360 and
   # train-other-500 together to form the training set.
   if [ ! -f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]; then
     cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
@@ -88,7 +88,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
       shuf | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
   fi
 
-  # Here we shuffle and combine the train-clean-100, train-clean-360 
+  # Here we shuffle and combine the train-clean-100, train-clean-360
   # together to form the training set.
   if [ ! -f data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz ]; then
     cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
@@ -108,10 +108,10 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Prepare phoneme tokens for LibriTTS"
   # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  #   - piper_phonemize: 
+  #   - piper_phonemize:
   #       refer to https://github.com/rhasspy/piper-phonemize,
   #       could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
-  #   - espnet_tts_frontend: 
+  #   - espnet_tts_frontend:
   #       `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/spectrogram/.libritts_with_token.done ]; then
     ./local/prepare_tokens_libritts.py
@@ -123,12 +123,39 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "Stage 4: Generate token file"
   # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  #   - piper_phonemize: 
+  #   - piper_phonemize:
   #       refer to https://github.com/rhasspy/piper-phonemize,
   #       could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
-  #   - espnet_tts_frontend: 
+  #   - espnet_tts_frontend:
   #       `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/tokens.txt ]; then
     ./local/prepare_token_file.py --tokens data/tokens.txt
   fi
 fi
+
+# Stage 5: run the neural-codec/text tokenizer on the LibriTTS manifests and
+# assemble the train/dev/test cut files under ${audio_feats_dir}.
+audio_feats_dir=data/tokenized
+# NOTE(review): this value embeds the flag name and is passed below as the
+# single argument of --dataset-parts; presumably the Python script strips the
+# flag text itself -- confirm before simplifying.
+dataset_parts="--dataset-parts all"  # debug "-p dev-clean -p test-clean"
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Tokenize/Fbank LibriTTS for valle"
+  mkdir -p "${audio_feats_dir}"
+  if [ ! -e "${audio_feats_dir}/.libritts.tokenize.done" ]; then
+    python3 ./local/compute_neural_codec_and_prepare_text_tokens.py --dataset-parts "${dataset_parts}" \
+        --audio-extractor "Encodec" \
+        --batch-duration 400 \
+        --src-dir "data/manifests" \
+        --output-dir "${audio_feats_dir}"
+    # Create the marker only when the tokenizer was actually run in this
+    # invocation, the same way the spectrogram stage handles its .done file.
+    touch "${audio_feats_dir}/.libritts.tokenize.done"
+  fi
+
+  # Merge the three training subsets into a single training manifest.
+  lhotse combine \
+    "${audio_feats_dir}/libritts_cuts_train-clean-100.jsonl.gz" \
+    "${audio_feats_dir}/libritts_cuts_train-clean-360.jsonl.gz" \
+    "${audio_feats_dir}/libritts_cuts_train-other-500.jsonl.gz" \
+    "${audio_feats_dir}/cuts_train.jsonl.gz"
+  # dev-clean becomes cuts_dev.jsonl.gz.
+  lhotse copy \
+    "${audio_feats_dir}/libritts_cuts_dev-clean.jsonl.gz" \
+    "${audio_feats_dir}/cuts_dev.jsonl.gz"
+  # test-clean becomes cuts_test.jsonl.gz.
+  lhotse copy \
+    "${audio_feats_dir}/libritts_cuts_test-clean.jsonl.gz" \
+    "${audio_feats_dir}/cuts_test.jsonl.gz"
+fi
diff --git a/egs/libritts/TTS/valle b/egs/libritts/TTS/valle
@@ -0,0 +1 @@
+../../wenetspeech4tts/TTS/valle/
diff --git a/egs/wenetspeech4tts/TTS/README.md b/egs/wenetspeech4tts/TTS/README.md
@@ -0,0 +1,51 @@
+# Introduction
+
+LibriTTS is a multi-speaker English corpus of approximately 585 hours of read English speech at 24kHz sampling rate, prepared by Heiga Zen with the assistance of Google Speech and Google Brain team members.
+The LibriTTS corpus is designed for TTS research. It is derived from the original materials (mp3 audio files from LibriVox and text files from Project Gutenberg) of the LibriSpeech corpus.
+The main differences from the LibriSpeech corpus are listed below:
+1. The audio files are at 24kHz sampling rate.
+2. The speech is split at sentence breaks.
+3. Both original and normalized texts are included.
+4. Contextual information (e.g., neighbouring sentences) can be extracted.
+5. Utterances with significant background noise are excluded.
+For more information, refer to the paper "LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech", Heiga Zen, Viet Dang, Rob Clark, Yu Zhang, Ron J. Weiss, Ye Jia, Zhifeng Chen, and Yonghui Wu, arXiv, 2019. If you use the LibriTTS corpus in your work, please cite this paper where it was introduced.
+
+> [!CAUTION]
+> The next-gen Kaldi framework provides tools and models for generating high-quality, synthetic speech (Text-to-Speech, TTS).
+> While these recipes have the potential to advance various fields such as accessibility, language education, and AI-driven solutions, they also carry certain ethical and legal responsibilities.
+>
+> By using this framework, you agree to the following:
+> 1.	Legal and Ethical Use: You shall not use this framework, or any models derived from it, for any unlawful or unethical purposes. This includes, but is not limited to: Creating voice clones without the explicit, informed consent of the individual whose voice is being cloned. Engaging in any form of identity theft, impersonation, or fraud using cloned voices. Violating any local, national, or international laws regarding privacy, intellectual property, or personal data.
+>
+> 2.	Responsibility of Use: The users of this framework are solely responsible for ensuring that their use of voice cloning technologies complies with all applicable laws and ethical guidelines. We explicitly disclaim any liability for misuse of the technology.
+>
+> 3.	Attribution and Use of Open-Source Components: This project is provided under the Apache 2.0 license. Users must adhere to the terms of this license and provide appropriate attribution when required.
+>
+> 4.	No Warranty: This framework is provided “as-is,” without warranty of any kind, either express or implied. We do not guarantee that the use of this software will comply with legal requirements or that it will not infringe the rights of third parties.
+
+
+# VITS
+
+This recipe provides a VITS model trained on the LibriTTS dataset.
+
+A pretrained model can be found [here](https://huggingface.co/zrjin/icefall-tts-libritts-vits-2024-10-30).
+
+The training command is given below:
+```
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+./vits/train.py \
+  --world-size 4 \
+  --num-epochs 400 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir vits/exp \
+  --max-duration 500
+```
+
+To run inference, use:
+```
+./vits/infer.py \
+  --exp-dir vits/exp \
+  --epoch 400 \
+  --tokens data/tokens.txt
+```
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		../../../wenetspeech4tts/TTS/local/compute_neural_codec_and_prepare_text_tokens.py