Skip to content

Commit

Permalink
updated scripts for text
Browse files: browse the repository at this point in the history
  • Loading branch information
JinZr committed Mar 12, 2024
1 parent d45e4c6 commit d887bf8
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
12 changes: 11 additions & 1 deletion egs/commonvoice/ASR/local/word_segment_yue.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from typing import List

import pycantonese
from preprocess_commonvoice import normalize_text
from tqdm.auto import tqdm

from icefall.utils import is_cjk
Expand All @@ -54,6 +55,13 @@ def get_parser():
type=str,
help="The output directory",
)
parser.add_argument(
"--lang",
"-l",
default="yue",
type=str,
help="The language",
)
return parser


Expand Down Expand Up @@ -102,13 +110,15 @@ def get_words(lines: List[str]) -> List[str]:

input_file = Path(args.input_file)
output_dir = Path(args.output_dir)
lang = Path(args.lang)

assert input_file.is_file(), f"{input_file} does not exist"
assert output_dir.is_dir(), f"{output_dir} does not exist"

lines = input_file.read_text(encoding="utf-8").strip().split("\n")
norm_lines = [normalize_text(line, lang) for line in lines]

text_words_segments = get_word_segments(lines)
text_words_segments = get_word_segments(norm_lines)
with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
f.writelines(text_words_segments)

Expand Down
3 changes: 2 additions & 1 deletion egs/commonvoice/ASR/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
# Get words.txt and words_no_ids.txt
./local/word_segment_yue.py \
--input-file $lang_dir/text \
--output-dir $lang_dir
--output-dir $lang_dir \
--lang $lang

mv $lang_dir/text $lang_dir/_text
cp $lang_dir/transcript_words.txt $lang_dir/text
Expand Down

0 comments on commit d887bf8

Please sign in to comment.