From 04c5e6d3a6a4f4fd4c3dc17550c03272cd992bfb Mon Sep 17 00:00:00 2001 From: jinzr Date: Thu, 29 Feb 2024 14:52:58 +0800 Subject: [PATCH 1/9] Delete prepare_token_file.py --- egs/vctk/TTS/local/prepare_token_file.py | 104 ----------------------- 1 file changed, 104 deletions(-) delete mode 100755 egs/vctk/TTS/local/prepare_token_file.py diff --git a/egs/vctk/TTS/local/prepare_token_file.py b/egs/vctk/TTS/local/prepare_token_file.py deleted file mode 100755 index c6636c3ad6..0000000000 --- a/egs/vctk/TTS/local/prepare_token_file.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This file reads the texts in given manifest and generates the file that maps tokens to IDs. -""" - -import argparse -import logging -from pathlib import Path -from typing import Dict - -from lhotse import load_manifest - - -def get_args(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--manifest-file", - type=Path, - default=Path("data/spectrogram/vctk_cuts_all.jsonl.gz"), - help="Path to the manifest file", - ) - - parser.add_argument( - "--tokens", - type=Path, - default=Path("data/tokens.txt"), - help="Path to the tokens", - ) - - return parser.parse_args() - - -def write_mapping(filename: str, sym2id: Dict[str, int]) -> None: - """Write a symbol to ID mapping to a file. - - Note: - No need to implement `read_mapping` as it can be done - through :func:`k2.SymbolTable.from_file`. - - Args: - filename: - Filename to save the mapping. - sym2id: - A dict mapping symbols to IDs. - Returns: - Return None. - """ - with open(filename, "w", encoding="utf-8") as f: - for sym, i in sym2id.items(): - f.write(f"{sym} {i}\n") - - -def get_token2id(manifest_file: Path) -> Dict[str, int]: - """Return a dict that maps token to IDs.""" - extra_tokens = [ - "", # 0 for blank - "", # 1 for sos and eos symbols. 
- "", # 2 for OOV - ] - all_tokens = set() - - cut_set = load_manifest(manifest_file) - - for cut in cut_set: - # Each cut only contain one supervision - assert len(cut.supervisions) == 1, len(cut.supervisions) - for t in cut.tokens: - all_tokens.add(t) - - all_tokens = extra_tokens + list(all_tokens) - - token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)} - return token2id - - -if __name__ == "__main__": - formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - - logging.basicConfig(format=formatter, level=logging.INFO) - - args = get_args() - manifest_file = Path(args.manifest_file) - out_file = Path(args.tokens) - - token2id = get_token2id(manifest_file) - write_mapping(out_file, token2id) From 42d68f0755c64d5278a1d7db363b29c525cb18bd Mon Sep 17 00:00:00 2001 From: jinzr Date: Thu, 29 Feb 2024 14:53:10 +0800 Subject: [PATCH 2/9] Create prepare_token_file.py --- egs/vctk/TTS/local/prepare_token_file.py | 1 + 1 file changed, 1 insertion(+) create mode 120000 egs/vctk/TTS/local/prepare_token_file.py diff --git a/egs/vctk/TTS/local/prepare_token_file.py b/egs/vctk/TTS/local/prepare_token_file.py new file mode 120000 index 0000000000..afc29a22ba --- /dev/null +++ b/egs/vctk/TTS/local/prepare_token_file.py @@ -0,0 +1 @@ +../../../ljspeech/TTS/local/prepare_token_file.py \ No newline at end of file From a0dc097ad93938c648ba0e5e463983cf046f99a0 Mon Sep 17 00:00:00 2001 From: jinzr Date: Thu, 29 Feb 2024 15:04:08 +0800 Subject: [PATCH 3/9] init commit --- egs/vctk/TTS/local/prepare_tokens_vctk.py | 11 +++++++---- egs/vctk/TTS/prepare.sh | 20 ++++++++++++++------ egs/vctk/TTS/vits/export-onnx.py | 3 +-- egs/vctk/TTS/vits/infer.py | 9 +++++---- egs/vctk/TTS/vits/test_onnx.py | 7 +++++-- egs/vctk/TTS/vits/train.py | 9 +++++---- 6 files changed, 37 insertions(+), 22 deletions(-) diff --git a/egs/vctk/TTS/local/prepare_tokens_vctk.py b/egs/vctk/TTS/local/prepare_tokens_vctk.py index 32e1c7dfad..0748eba5af 100755 --- a/egs/vctk/TTS/local/prepare_tokens_vctk.py +++ b/egs/vctk/TTS/local/prepare_tokens_vctk.py @@ -24,9 +24,9 @@ import logging from pathlib import Path -import g2p_en import tacotron_cleaner.cleaners from lhotse import CutSet, load_manifest +from piper_phonemize import phonemize_espeak from tqdm.auto import tqdm @@ -37,17 +37,20 @@ def prepare_tokens_vctk(): partition = "all" cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}") - g2p = g2p_en.G2p() new_cuts = [] for cut in tqdm(cut_set): # Each cut only contains one supervision - assert len(cut.supervisions) == 1, len(cut.supervisions) + assert len(cut.supervisions) == 1, (len(cut.supervisions), cut) text = cut.supervisions[0].text # Text normalization text = tacotron_cleaner.cleaners.custom_english_cleaners(text) # Convert to phonemes - cut.tokens = g2p(text) + tokens_list = phonemize_espeak(text, "en-us") + tokens = [] + for t in tokens_list: + tokens.extend(t) + cut.tokens = tokens new_cuts.append(cut) new_cut_set = CutSet.from_cuts(new_cuts) diff --git a/egs/vctk/TTS/prepare.sh b/egs/vctk/TTS/prepare.sh index 152c7b1680..aab0753125 100755 --- a/egs/vctk/TTS/prepare.sh +++ b/egs/vctk/TTS/prepare.sh @@ -78,6 +78,13 @@ fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then log "Stage 3: Prepare phoneme tokens for VCTK" + # We assume you have installed piper_phonemize and espnet_tts_frontend. 
+ # If not, please install them with: + # - piper_phonemize: + # refer to https://github.com/rhasspy/piper-phonemize, + # could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5 + # - espnet_tts_frontend: + # `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/ if [ ! -e data/spectrogram/.vctk_with_token.done ]; then ./local/prepare_tokens_vctk.py mv data/spectrogram/vctk_cuts_with_tokens_all.jsonl.gz \ @@ -111,14 +118,15 @@ fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then log "Stage 5: Generate token file" - # We assume you have installed g2p_en and espnet_tts_frontend. + # We assume you have installed piper_phonemize and espnet_tts_frontend. # If not, please install them with: - # - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p - # - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/ + # - piper_phonemize: + # refer to https://github.com/rhasspy/piper-phonemize, + # could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5 + # - espnet_tts_frontend: + # `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/ if [ ! -e data/tokens.txt ]; then - ./local/prepare_token_file.py \ - --manifest-file data/spectrogram/vctk_cuts_train.jsonl.gz \ - --tokens data/tokens.txt + ./local/prepare_token_file.py --tokens data/tokens.txt fi fi diff --git a/egs/vctk/TTS/vits/export-onnx.py b/egs/vctk/TTS/vits/export-onnx.py index 80d1556261..fd90da8ee3 100755 --- a/egs/vctk/TTS/vits/export-onnx.py +++ b/egs/vctk/TTS/vits/export-onnx.py @@ -231,8 +231,7 @@ def main(): params.update(vars(args)) tokenizer = Tokenizer(params.tokens) - params.blank_id = tokenizer.blank_id - params.oov_id = tokenizer.oov_id + params.blank_id = tokenizer.pad_id params.vocab_size = tokenizer.vocab_size with open(args.speakers) as f: diff --git a/egs/vctk/TTS/vits/infer.py b/egs/vctk/TTS/vits/infer.py index 06c25f02eb..2e1abdefb5 100755 --- a/egs/vctk/TTS/vits/infer.py +++ b/egs/vctk/TTS/vits/infer.py @@ -135,14 +135,16 @@ def _save_worker( batch_size = len(batch["tokens"]) tokens = batch["tokens"] - tokens = tokenizer.tokens_to_token_ids(tokens) + tokens = tokenizer.tokens_to_token_ids( + tokens, intersperse_blank=True, add_sos=True, add_eos=True + ) tokens = k2.RaggedTensor(tokens) row_splits = tokens.shape.row_splits(1) tokens_lens = row_splits[1:] - row_splits[:-1] tokens = tokens.to(device) tokens_lens = tokens_lens.to(device) # tensor of shape (B, T) - tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id) + tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id) speakers = ( torch.Tensor([speaker_map[sid] for sid in batch["speakers"]]) .int() @@ -214,8 +216,7 @@ def main(): device = torch.device("cuda", 0) tokenizer = Tokenizer(params.tokens) - params.blank_id = tokenizer.blank_id - params.oov_id = tokenizer.oov_id + params.blank_id = tokenizer.pad_id params.vocab_size = tokenizer.vocab_size # we need cut ids to display recognition results. 
diff --git a/egs/vctk/TTS/vits/test_onnx.py b/egs/vctk/TTS/vits/test_onnx.py index d85c0a27bd..ae6587338e 100755 --- a/egs/vctk/TTS/vits/test_onnx.py +++ b/egs/vctk/TTS/vits/test_onnx.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # -# Copyright 2023 Xiaomi Corporation (Author: Zengwei Yao) +# Copyright 2023-2024 Xiaomi Corporation (Author: Zengwei Yao, +# Zengrui Jin,) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -122,7 +123,9 @@ def main(): model = OnnxModel(args.model_filename) text = "I went there to see the land, the people and how their system works, end quote." - tokens = tokenizer.texts_to_token_ids([text]) + tokens = tokenizer.texts_to_token_ids( + [text], intersperse_blank=True, add_sos=True, add_eos=True + ) tokens = torch.tensor(tokens) # (1, T) tokens_lens = torch.tensor([tokens.shape[1]], dtype=torch.int64) # (1, T) speaker = torch.tensor([1], dtype=torch.int64) # (1, ) diff --git a/egs/vctk/TTS/vits/train.py b/egs/vctk/TTS/vits/train.py index 56f167a178..300b9a8a12 100755 --- a/egs/vctk/TTS/vits/train.py +++ b/egs/vctk/TTS/vits/train.py @@ -342,14 +342,16 @@ def prepare_input( torch.Tensor([speaker_map[sid] for sid in batch["speakers"]]).int().to(device) ) - tokens = tokenizer.tokens_to_token_ids(tokens) + tokens = tokenizer.texts_to_token_ids( + tokens, intersperse_blank=True, add_sos=True, add_eos=True + ) tokens = k2.RaggedTensor(tokens) row_splits = tokens.shape.row_splits(1) tokens_lens = row_splits[1:] - row_splits[:-1] tokens = tokens.to(device) tokens_lens = tokens_lens.to(device) # a tensor of shape (B, T) - tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id) + tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id) return audio, audio_lens, features, features_lens, tokens, tokens_lens, speakers @@ -812,8 +814,7 @@ def run(rank, world_size, args): logging.info(f"Device: {device}") tokenizer = Tokenizer(params.tokens) - params.blank_id = tokenizer.blank_id - params.oov_id = tokenizer.oov_id + params.blank_id = tokenizer.pad_id params.vocab_size = tokenizer.vocab_size vctk = VctkTtsDataModule(args) From 8f8e516e29a7879cb782e43941ee8bc9715b5bb7 Mon Sep 17 00:00:00 2001 From: jinzr Date: Fri, 1 Mar 2024 09:40:43 +0800 Subject: [PATCH 4/9] Update train.py --- egs/vctk/TTS/vits/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/vctk/TTS/vits/train.py b/egs/vctk/TTS/vits/train.py index 300b9a8a12..9b2f2b2467 100755 --- a/egs/vctk/TTS/vits/train.py +++ b/egs/vctk/TTS/vits/train.py @@ -342,7 +342,7 @@ def prepare_input( torch.Tensor([speaker_map[sid] for sid in batch["speakers"]]).int().to(device) ) - tokens = tokenizer.texts_to_token_ids( + tokens = tokenizer.tokens_to_token_ids( tokens, intersperse_blank=True, add_sos=True, add_eos=True ) tokens = k2.RaggedTensor(tokens) From da181ac2b61a14be13fcbd0738da4bec5705efc4 Mon Sep 17 00:00:00 2001 From: jinzr Date: Mon, 4 Mar 2024 14:57:11 +0800 Subject: [PATCH 5/9] to align with PR #1524 --- egs/vctk/TTS/vits/export-onnx.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/egs/vctk/TTS/vits/export-onnx.py b/egs/vctk/TTS/vits/export-onnx.py index fd90da8ee3..31be01a2d4 100755 --- a/egs/vctk/TTS/vits/export-onnx.py +++ b/egs/vctk/TTS/vits/export-onnx.py @@ -97,7 +97,7 @@ def add_meta_data(filename: str, meta_data: Dict[str, str]): for key, value in meta_data.items(): meta = model.metadata_props.add() meta.key = key - meta.value = value + meta.value = str(value) onnx.save(model, filename) @@ -212,10 +212,15 @@ def 
export_model_onnx( ) meta_data = { - "model_type": "VITS", + "model_type": "vits", "version": "1", "model_author": "k2-fsa", - "comment": "VITS generator", + "comment": "icefall", # must be icefall for models from icefall + "language": "English", + "voice": "en-us", # Choose your language appropriately + "has_espeak": 1, + "n_speakers": 108, + "sample_rate": 22050, # Must match the real sample rate } logging.info(f"meta_data: {meta_data}") From 89632bf748f90565e2c75b6aeb6c6460309713d5 Mon Sep 17 00:00:00 2001 From: jinzr Date: Fri, 8 Mar 2024 02:56:53 +0800 Subject: [PATCH 6/9] Update train.py --- egs/vctk/TTS/vits/train.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/egs/vctk/TTS/vits/train.py b/egs/vctk/TTS/vits/train.py index 9b2f2b2467..d65dfd5c36 100755 --- a/egs/vctk/TTS/vits/train.py +++ b/egs/vctk/TTS/vits/train.py @@ -514,9 +514,10 @@ def save_bad_model(suffix: str = ""): logging.warning(f"Grad scale is small: {cur_grad_scale}") if cur_grad_scale < 1.0e-05: save_bad_model() - raise RuntimeError( - f"grad_scale is too small, exiting: {cur_grad_scale}" - ) + logging.warning(f"Grad scale is small: {cur_grad_scale}") + # raise RuntimeError( + # f"grad_scale is too small, exiting: {cur_grad_scale}" + # ) if params.batch_idx_train % params.log_interval == 0: cur_lr_g = max(scheduler_g.get_last_lr()) From e69b60e579482b73ae6e6d9ff584190176e07b03 Mon Sep 17 00:00:00 2001 From: jinzr Date: Mon, 11 Mar 2024 23:14:14 +0800 Subject: [PATCH 7/9] enable the grad_scale is too small error --- egs/vctk/TTS/vits/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/vctk/TTS/vits/train.py b/egs/vctk/TTS/vits/train.py index d65dfd5c36..8dca57a6a8 100755 --- a/egs/vctk/TTS/vits/train.py +++ b/egs/vctk/TTS/vits/train.py @@ -515,9 +515,9 @@ def save_bad_model(suffix: str = ""): if cur_grad_scale < 1.0e-05: save_bad_model() logging.warning(f"Grad scale is small: {cur_grad_scale}") - # raise RuntimeError( - # f"grad_scale is too small, exiting: {cur_grad_scale}" - # ) + raise RuntimeError( + f"grad_scale is too small, exiting: {cur_grad_scale}" + ) if params.batch_idx_train % params.log_interval == 0: cur_lr_g = max(scheduler_g.get_last_lr()) From fa73dc54a55dc2c2d24fdcc59c282ed01fe08f3d Mon Sep 17 00:00:00 2001 From: jinzr Date: Mon, 18 Mar 2024 10:39:01 +0800 Subject: [PATCH 8/9] misc. update --- egs/vctk/TTS/README.md | 3 +-- egs/vctk/TTS/vits/export-onnx.py | 7 +++++-- egs/vctk/TTS/vits/train.py | 3 ++- egs/vctk/TTS/vits/tts_datamodule.py | 5 +++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/egs/vctk/TTS/README.md b/egs/vctk/TTS/README.md index c07516b777..c2703dbe2c 100644 --- a/egs/vctk/TTS/README.md +++ b/egs/vctk/TTS/README.md @@ -10,7 +10,7 @@ The above information is from the [CSTR VCTK website](https://datashare.ed.ac.uk This recipe provides a VITS model trained on the VCTK dataset. -Pretrained model can be found [here](https://huggingface.co/zrjin/icefall-tts-vctk-vits-2023-12-05), note that this model was pretrained on the Edinburgh DataShare VCTK dataset. +Pretrained model can be found [here](https://huggingface.co/zrjin/icefall-tts-vctk-vits-2024-03-18), note that this model was pretrained on the Edinburgh DataShare VCTK dataset. For tutorial and more details, please refer to the [VITS documentation](https://k2-fsa.github.io/icefall/recipes/TTS/vctk/vits.html). 
@@ -21,7 +21,6 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" --world-size 4 \ --num-epochs 1000 \ --start-epoch 1 \ - --use-fp16 1 \ --exp-dir vits/exp \ --tokens data/tokens.txt --max-duration 350 diff --git a/egs/vctk/TTS/vits/export-onnx.py b/egs/vctk/TTS/vits/export-onnx.py index 31be01a2d4..d00450f080 100755 --- a/egs/vctk/TTS/vits/export-onnx.py +++ b/egs/vctk/TTS/vits/export-onnx.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # -# Copyright 2023 Xiaomi Corporation (Author: Zengwei Yao) +# Copyright 2023-2024 Xiaomi Corporation (Author: Zengwei Yao, +# Zengrui Jin,) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -160,6 +161,7 @@ def export_model_onnx( model: nn.Module, model_filename: str, vocab_size: int, + n_speakers: int, opset_version: int = 11, ) -> None: """Export the given generator model to ONNX format. @@ -219,7 +221,7 @@ def export_model_onnx( "language": "English", "voice": "en-us", # Choose your language appropriately "has_espeak": 1, - "n_speakers": 108, + "n_speakers": n_speakers, "sample_rate": 22050, # Must match the real sample rate } logging.info(f"meta_data: {meta_data}") @@ -269,6 +271,7 @@ def main(): model, model_filename, params.vocab_size, + params.num_spks, opset_version=opset_version, ) logging.info(f"Exported generator to {model_filename}") diff --git a/egs/vctk/TTS/vits/train.py b/egs/vctk/TTS/vits/train.py index 8dca57a6a8..81e318360b 100755 --- a/egs/vctk/TTS/vits/train.py +++ b/egs/vctk/TTS/vits/train.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao) +# Copyright 2023-2024 Xiaomi Corporation (Author: Zengwei Yao, +# Zengrui Jin,) # # See ../../../../LICENSE for clarification regarding multiple authors # diff --git a/egs/vctk/TTS/vits/tts_datamodule.py b/egs/vctk/TTS/vits/tts_datamodule.py index 52fc5179f4..6c785d8c36 100644 --- a/egs/vctk/TTS/vits/tts_datamodule.py +++ b/egs/vctk/TTS/vits/tts_datamodule.py @@ -1,6 +1,7 @@ # Copyright 2021 Piotr Żelasko -# Copyright 2022-2023 Xiaomi Corporation (Authors: Mingshuang Luo, -# Zengwei Yao) +# Copyright 2022-2024 Xiaomi Corporation (Authors: Mingshuang Luo, +# Zengwei Yao, +# Zengrui Jin,) # # See ../../../../LICENSE for clarification regarding multiple authors # From 4a8bd4294a58b5fd8ef2d66ad12fc99fb1d2a7be Mon Sep 17 00:00:00 2001 From: jinzr Date: Mon, 18 Mar 2024 16:50:49 +0800 Subject: [PATCH 9/9] removed unnecessary warning --- egs/vctk/TTS/vits/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/egs/vctk/TTS/vits/train.py b/egs/vctk/TTS/vits/train.py index 81e318360b..55bd693275 100755 --- a/egs/vctk/TTS/vits/train.py +++ b/egs/vctk/TTS/vits/train.py @@ -515,7 +515,6 @@ def save_bad_model(suffix: str = ""): logging.warning(f"Grad scale is small: {cur_grad_scale}") if cur_grad_scale < 1.0e-05: save_bad_model() - logging.warning(f"Grad scale is small: {cur_grad_scale}") raise RuntimeError( f"grad_scale is too small, exiting: {cur_grad_scale}" )
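
Note on the phonemization change (patch 3/9): local/prepare_tokens_vctk.py now uses piper_phonemize instead of g2p_en. A minimal standalone sketch of that flow, assuming piper_phonemize and espnet_tts_frontend are installed; this is an illustration of the same calls used in the patch, not code taken from it verbatim:

    #!/usr/bin/env python3
    # Sketch: normalize text with tacotron_cleaner, then produce espeak phonemes
    # via piper_phonemize, flattening the per-sentence phoneme lists.
    import tacotron_cleaner.cleaners
    from piper_phonemize import phonemize_espeak

    text = "I went there to see the land, the people and how their system works, end quote."
    text = tacotron_cleaner.cleaners.custom_english_cleaners(text)

    tokens_list = phonemize_espeak(text, "en-us")  # one token list per sentence
    tokens = []
    for t in tokens_list:
        tokens.extend(t)
    print(tokens)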
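
Note on the ONNX metadata (patches 5/9 and 8/9): export-onnx.py now stores model_type, comment, language, voice, has_espeak, n_speakers and sample_rate as string metadata properties on the exported model, and n_speakers is passed in from the speaker map instead of being hard-coded. A small sketch for reading the metadata back with onnxruntime (assumed installed; the model filename below is a placeholder, not one defined by these patches):

    # Sketch: inspect the metadata written by add_meta_data() in export-onnx.py.
    import onnxruntime as ort

    session = ort.InferenceSession("vits-vctk.onnx", providers=["CPUExecutionProvider"])
    meta = session.get_modelmeta().custom_metadata_map  # dict of str -> str
    for key, value in meta.items():
        print(key, "=", value)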
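
Note on the tokenizer changes (patches 3/9 and 4/9, plus the infer.py and test_onnx.py hunks): blank_id/oov_id are replaced by pad_id, and token-ID conversion now intersperses blanks and adds sos/eos explicitly. A hedged usage sketch, assuming data/tokens.txt has been generated by stage 5 of prepare.sh and that the recipe's Tokenizer lives in vits/tokenizer.py as in the ljspeech recipe:

    # Sketch: the updated Tokenizer usage pattern (illustration only).
    from tokenizer import Tokenizer  # assumed module path, as in egs/ljspeech/TTS/vits

    tokenizer = Tokenizer("data/tokens.txt")
    print("pad_id:", tokenizer.pad_id, "vocab_size:", tokenizer.vocab_size)

    token_ids = tokenizer.texts_to_token_ids(
        ["I went there to see the land."],
        intersperse_blank=True,
        add_sos=True,
        add_eos=True,
    )
    print(token_ids[0][:10])  # first few IDs of the single input text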