Skip to content

Commit

Permalink
Bug fix in Japanese tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
Edresson committed Nov 6, 2023
1 parent 1a9ca35 commit a1c441f
Showing 1 changed file with 19 additions and 15 deletions.
34 changes: 19 additions & 15 deletions TTS/tts/layers/xtts/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,31 +529,18 @@ def korean_cleaners(text):
return r.translit(text)


def preprocess_text(txt, lang):
    """Run the language-specific text cleaner for ``lang`` over ``txt``.

    Args:
        txt: The raw input text.
        lang: Language code selecting the cleaner (e.g. ``"en"``, ``"ja"``,
            ``"zh-cn"``, ``"ko"``).

    Returns:
        The text returned by the selected cleaner.

    Raises:
        NotImplementedError: If ``lang`` matches none of the handled codes.
    """
    if lang in ("en", "es", "fr", "de", "pt", "it", "pl", "zh", "ar", "cs", "ru", "nl", "tr", "hu"):
        return multilingual_cleaners(txt, lang)
    if lang == "ja":
        return japanese_cleaners(txt)
    # "zh" is already consumed by the multilingual branch above, so only
    # "zh-cn" can ever reach the transliteration path.
    # NOTE(review): confirm whether "zh" was meant to be transliterated too.
    if lang == "zh-cn":
        return chinese_transliterate(txt)
    if lang == "ko":
        return korean_cleaners(txt)
    # Name the offending code so the failure is diagnosable by the caller.
    raise NotImplementedError(f"Language '{lang}' is not supported.")


# Vocabulary path "../data/tokenizer.json", resolved relative to the directory
# that holds this file (``realpath`` resolves symlinks before ``dirname``).
DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json")


class VoiceBpeTokenizer:
# Create the tokenizer wrapper. When ``vocab_file`` is None no tokenizer is
# loaded and ``self.tokenizer`` stays None.
def __init__(self, vocab_file=None):
self.tokenizer = None
# Starts empty; preprocess_text assigns a cutlet.Cutlet() here the first
# time it is called with lang == "ja".
self.katsu = None
if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file)

# Clean ``txt`` for ``lang``, prefix it with the "[<lang>]" tag, replace spaces
# with the literal "[SPACE]" marker, and return the ids from the tokenizer.
def encode(self, txt, lang):
# NOTE(review): the next two lines look like the removed/added pair of a
# flattened diff (module-level call replaced by the method call) — confirm
# that only the ``self.preprocess_text`` call is meant to remain.
txt = preprocess_text(txt, lang)
txt = self.preprocess_text(txt, lang)
txt = f"[{lang}]{txt}"
txt = txt.replace(" ", "[SPACE]")
# ``self.tokenizer`` is None unless a vocab file was given to __init__.
return self.tokenizer.encode(txt).ids
Expand All @@ -567,6 +554,23 @@ def decode(self, seq):
txt = txt.replace("[UNK]", "")
return txt

def preprocess_text(self, txt, lang):
    """Run the language-specific text cleaner for ``lang`` over ``txt``.

    Args:
        txt: The raw input text.
        lang: Language code selecting the cleaner (e.g. ``"en"``, ``"ja"``,
            ``"zh-cn"``, ``"ko"``).

    Returns:
        The text returned by the selected cleaner.

    Raises:
        NotImplementedError: If ``lang`` matches none of the handled codes.
    """
    if lang in ("en", "es", "fr", "de", "pt", "it", "pl", "zh", "ar", "cs", "ru", "nl", "tr", "hu"):
        return multilingual_cleaners(txt, lang)
    if lang == "ja":
        # Build the Cutlet instance only on the first Japanese request and
        # keep it on the tokenizer so later calls reuse it.
        if self.katsu is None:
            import cutlet

            self.katsu = cutlet.Cutlet()
        return japanese_cleaners(txt, self.katsu)
    # "zh" is already consumed by the multilingual branch above, so only
    # "zh-cn" can ever reach the transliteration path.
    # NOTE(review): confirm whether "zh" was meant to be transliterated too.
    if lang == "zh-cn":
        return chinese_transliterate(txt)
    if lang == "ko":
        return korean_cleaners(txt)
    # Name the offending code so the failure is diagnosable by the caller.
    raise NotImplementedError(f"Language '{lang}' is not supported.")

# Return the vocabulary size reported by the tokenizer's ``get_vocab_size()``.
def __len__(self):
return self.tokenizer.get_vocab_size()

Expand Down

0 comments on commit a1c441f

Please sign in to comment.