Use piper_phonemize as text tokenizer in ljspeech recipe #1511
First changed file (the local script that generates the token-to-ID mapping):

@@ -17,88 +17,42 @@
 """
-This file reads the texts in given manifest and generates the file that maps tokens to IDs.
+This file generates the file that maps tokens to IDs.
 """

 import argparse
 import logging
 from pathlib import Path
 from typing import Dict

-from lhotse import load_manifest
+from piper_phonemize import get_espeak_map


 def get_args():
     parser = argparse.ArgumentParser()

-    parser.add_argument(
-        "--manifest-file",
-        type=Path,
-        default=Path("data/spectrogram/ljspeech_cuts_train.jsonl.gz"),
-        help="Path to the manifest file",
-    )
-
     parser.add_argument(
         "--tokens",
         type=Path,
         default=Path("data/tokens.txt"),
-        help="Path to the tokens",
+        help="Path to the dict that maps the text tokens to IDs",
     )

     return parser.parse_args()


-def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
-    """Write a symbol to ID mapping to a file.
-
-    Note:
-      No need to implement `read_mapping` as it can be done
-      through :func:`k2.SymbolTable.from_file`.
-
-    Args:
-      filename:
-        Filename to save the mapping.
-      sym2id:
-        A dict mapping symbols to IDs.
-    Returns:
-      Return None.
-    """
+def get_token2id(filename: Path) -> Dict[str, int]:
+    """Get a dict that maps token to IDs, and save it to the given filename."""
+    all_tokens = get_espeak_map()
     with open(filename, "w", encoding="utf-8") as f:
-        for sym, i in sym2id.items():
-            f.write(f"{sym} {i}\n")
-
-
-def get_token2id(manifest_file: Path) -> Dict[str, int]:
-    """Return a dict that maps token to IDs."""
-    extra_tokens = [
-        "<blk>",  # 0 for blank
-        "<sos/eos>",  # 1 for sos and eos symbols.
-        "<unk>",  # 2 for OOV
-    ]
-    all_tokens = set()
-
-    cut_set = load_manifest(manifest_file)
-
-    for cut in cut_set:
-        # Each cut only contains one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
-        for t in cut.tokens:
-            all_tokens.add(t)
-
-    all_tokens = extra_tokens + list(all_tokens)
-
-    token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)}
-    return token2id
+        for token, token_id in all_tokens.items():
+            f.write(f"{token} {token_id[0]}\n")
Review comment: Could you sort by token_id in the generated file? That is, sort the second column from 0 to vocab_size - 1 in ascending order.
Reply: Ok.
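A minimal sketch of one way to implement the requested ordering. The helper name write_tokens_sorted is hypothetical, and it assumes, as the diff above does, that get_espeak_map() returns a Dict[str, List[int]]:

```python
from piper_phonemize import get_espeak_map


def write_tokens_sorted(filename: str) -> None:
    """Write the espeak token table sorted by ID, from 0 to vocab_size - 1."""
    all_tokens = get_espeak_map()
    with open(filename, "w", encoding="utf-8") as f:
        # Sort the rows by the first (and only used) ID of each token.
        for token, ids in sorted(all_tokens.items(), key=lambda kv: kv[1][0]):
            f.write(f"{token} {ids[0]}\n")
```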

 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

     logging.basicConfig(format=formatter, level=logging.INFO)

     args = get_args()
-    manifest_file = Path(args.manifest_file)
     out_file = Path(args.tokens)

-    token2id = get_token2id(manifest_file)
-    write_mapping(out_file, token2id)
+    get_token2id(out_file)
Second changed file (the script that computes phoneme tokens for the LJSpeech cuts):

@@ -23,9 +23,9 @@
 import logging
 from pathlib import Path

-import g2p_en
 import tacotron_cleaner.cleaners
 from lhotse import CutSet, load_manifest
+from piper_phonemize import phonemize_espeak


 def prepare_tokens_ljspeech():
@@ -35,7 +35,6 @@ def prepare_tokens_ljspeech():
     partition = "all"

     cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
-    g2p = g2p_en.G2p()

     new_cuts = []
     for cut in cut_set:
@@ -45,7 +44,11 @@ def prepare_tokens_ljspeech():
         # Text normalization
         text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
         # Convert to phonemes
-        cut.tokens = g2p(text)
+        tokens_list = phonemize_espeak(text, "en-us")
Review comment: At line 42, please use
    assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
It is helpful to print the problematic cut on error.
+        tokens = []
+        for t in tokens_list:
+            tokens.extend(t)
+        cut.tokens = tokens
         new_cuts.append(cut)

     new_cut_set = CutSet.from_cuts(new_cuts)
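For context, phonemize_espeak returns one phoneme list per sentence, which is why the loop above flattens the result before assigning it to cut.tokens. A small illustrative sketch; the exact return structure is an assumption based on the loop in this diff:

```python
from piper_phonemize import phonemize_espeak

# phonemize_espeak(text, voice) is expected to yield List[List[str]]:
# one inner list of phoneme tokens per sentence of the input text.
tokens_list = phonemize_espeak("Hello world. How are you?", "en-us")

tokens = []
for sentence_tokens in tokens_list:
    tokens.extend(sentence_tokens)  # concatenate the per-sentence phoneme lists

print(tokens)  # a flat list of phoneme tokens for the whole utterance
```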
Third changed file (the Tokenizer class used by the training and inference code):

@@ -16,8 +16,8 @@
 from typing import Dict, List

-import g2p_en
 import tacotron_cleaner.cleaners
+from piper_phonemize import phonemize_espeak
 from utils import intersperse


@@ -38,21 +38,34 @@ def __init__(self, tokens: str):
                     id = int(info[0])
                 else:
                     token, id = info[0], int(info[1])
                 assert token not in self.token2id, token
                 self.token2id[token] = id

-        self.blank_id = self.token2id["<blk>"]
-        self.oov_id = self.token2id["<unk>"]
-        self.vocab_size = len(self.token2id)
+        # Refer to https://github.com/rhasspy/piper/blob/master/TRAINING.md
+        self.pad_id = self.token2id["_"]  # padding
+        self.sos_id = self.token2id["^"]  # beginning of an utterance (bos)
+        self.eos_id = self.token2id["$"]  # end of an utterance (eos)
+        self.space_id = self.token2id[" "]  # word separator (whitespace)

-        self.g2p = g2p_en.G2p()
+        self.vocab_size = len(self.token2id)

-    def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
+    def texts_to_token_ids(
+        self,
+        texts: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
+    ):
Review comment: Please give the return value a type hint.
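One way the requested annotation could look. A sketch only: the return type is inferred from the docstring below, and the body is elided:

```python
from typing import List


class Tokenizer:
    def texts_to_token_ids(
        self,
        texts: List[str],
        intersperse_blank: bool = True,
        add_sos: bool = False,
        add_eos: bool = False,
    ) -> List[List[int]]:  # the added return type hint
        ...  # body unchanged from the diff below
```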
         """
         Args:
           texts:
             A list of transcripts.
           intersperse_blank:
             Whether to intersperse blanks in the token sequence.
+          add_sos:
+            Whether to add sos token at the start.
+          add_eos:
+            Whether to add eos token at the end.

         Returns:
           Return a list of token id list [utterance][token_id]
@@ -63,30 +76,44 @@ def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
             # Text normalization
             text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
             # Convert to phonemes
-            tokens = self.g2p(text)
+            tokens_list = phonemize_espeak(text, "en-us")
Review comment: Please pass lang: str = "en-us" as the last argument for this function.
Reply: Thanks a lot!
+            tokens = []
+            for t in tokens_list:
+                tokens.extend(t)

             token_ids = []
             for t in tokens:
-                if t in self.token2id:
-                    token_ids.append(self.token2id[t])
-                else:
-                    token_ids.append(self.oov_id)
+                assert t in self.token2id, t
Review comment (suggested change): We just skip OOVs instead of throwing an assertion error, which…
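The concrete replacement lines of that suggestion are not shown on this page. A hedged sketch of what skipping OOVs, rather than asserting, could look like inside the loop over tokens; it is a fragment of the method above and assumes logging is imported in this module:

```python
token_ids = []
for t in tokens:
    if t not in self.token2id:
        # Skip out-of-vocabulary tokens instead of raising an assertion error.
        logging.warning(f"Skipping OOV token: {t!r}")
        continue
    token_ids.append(self.token2id[t])
```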
+                token_ids.append(self.token2id[t])

             if intersperse_blank:
-                token_ids = intersperse(token_ids, self.blank_id)
+                token_ids = intersperse(token_ids, self.pad_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]

             token_ids_list.append(token_ids)

         return token_ids_list

     def tokens_to_token_ids(
-        self, tokens_list: List[str], intersperse_blank: bool = True
+        self,
+        tokens_list: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
     ):
         """
         Args:
           tokens_list:
             A list of token list, each corresponding to one utterance.
           intersperse_blank:
             Whether to intersperse blanks in the token sequence.
+          add_sos:
+            Whether to add sos token at the start.
+          add_eos:
+            Whether to add eos token at the end.

         Returns:
           Return a list of token id list [utterance][token_id]
@@ -96,13 +123,15 @@ def tokens_to_token_ids(
         for tokens in tokens_list:
             token_ids = []
             for t in tokens:
-                if t in self.token2id:
-                    token_ids.append(self.token2id[t])
-                else:
-                    token_ids.append(self.oov_id)
+                assert t in self.token2id, t
+                token_ids.append(self.token2id[t])

             if intersperse_blank:
-                token_ids = intersperse(token_ids, self.blank_id)
+                token_ids = intersperse(token_ids, self.pad_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]

             token_ids_list.append(token_ids)
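A hypothetical usage sketch of the updated tokenizer API after this change. The tokens-file path and input text are placeholders, not part of the PR:

```python
tokenizer = Tokenizer("data/tokens.txt")

token_ids_list = tokenizer.texts_to_token_ids(
    ["Ask not what your country can do for you."],
    intersperse_blank=True,  # intersperse the padding token between phonemes
    add_sos=True,            # prepend "^"
    add_eos=True,            # append "$"
)
print(token_ids_list)  # one list of integer token IDs per input text
```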
Review comment: The copyright should be 2023-2024.