Skip to content

Commit

Permalink
update text2token
Browse files Browse the repository at this point in the history
  • Loading branch information
pkufool committed Jan 1, 2024
1 parent d6124dc commit fdf7369
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 3 deletions.
5 changes: 4 additions & 1 deletion scripts/text2token.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,10 @@ def get_args():
"--tokens-type",
type=str,
required=True,
help="The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
help="""The type of modeling units, should be cjkchar, bpe, cjkchar+bpe, fpinyin or ppinyin.
fpinyin means full pinyin, each cjkchar has a pinyin(with tone).
ppinyin means partial pinyin, it splits pinyin into initial and final,
""",
)

parser.add_argument(
Expand Down
5 changes: 4 additions & 1 deletion sherpa-onnx/python/sherpa_onnx/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ def cli():
"--tokens-type",
type=str,
required=True,
help="The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
help="""The type of modeling units, should be cjkchar, bpe, cjkchar+bpe, fpinyin or ppinyin.
fpinyin means full pinyin, each cjkchar has a pinyin(with tone).
ppinyin means partial pinyin, it splits pinyin into initial and final,
""",
)
@click.option(
"--bpe-model",
Expand Down
25 changes: 24 additions & 1 deletion sherpa-onnx/python/sherpa_onnx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

import sentencepiece as spm

from pypinyin import pinyin
from pypinyin.contrib.tone_convert import to_initials, to_finals_tone


def text2token(
texts: List[str],
Expand All @@ -23,7 +26,9 @@ def text2token(
tokens:
The path of the tokens.txt.
tokens_type:
The valid values are cjkchar, bpe, cjkchar+bpe.
The valid values are cjkchar, bpe, cjkchar+bpe, fpinyin, ppinyin.
fpinyin means full pinyin: each cjkchar is converted to one pinyin (with tone).
ppinyin means partial pinyin: each pinyin is split into its initial and its final (the final carries the tone).
bpe_model:
The path of the bpe model. Only required when tokens_type is bpe or
cjkchar+bpe.
Expand Down Expand Up @@ -53,6 +58,24 @@ def text2token(
texts_list = [list("".join(text.split())) for text in texts]
elif tokens_type == "bpe":
texts_list = sp.encode(texts, out_type=str)
elif "pinyin" in tokens_type:
for txt in texts:
py = [x[0] for x in pinyin(txt)]
if "ppinyin" == tokens_type:
res = []
for x in py:
initial = to_initials(x, strict=False)
final = to_finals_tone(x, strict=False)
if initial == "" and final == "":
res.append(x)
else:
if initial != "":
res.append(initial)
if final != "":
res.append(final)
texts_list.append(res)
else:
texts_list.append(py)
else:
assert (
tokens_type == "cjkchar+bpe"
Expand Down

0 comments on commit fdf7369

Please sign in to comment.