Skip to content

Commit

Permalink
Update prepare.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
JinZr committed Mar 12, 2024
1 parent 9820bf9 commit a9df06c
Showing 1 changed file with 14 additions and 8 deletions.
22 changes: 14 additions & 8 deletions egs/commonvoice/ASR/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -181,14 +181,14 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then

if [ ! -f $lang_dir/transcript_words.txt ]; then
log "Generate data for lang preparation"
file=$(
find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
)
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/text

# Ensure space only appears once
sed -i 's/\t/ /g' $lang_dir/text
sed -i 's/[ ][ ]*/ /g' $lang_dir/text
# Prepare text.
# Note: on Linux, you can install jq with the following commands:
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
# 2. chmod +x ./jq
# 3. cp jq /usr/bin
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \
| jq '.text' | sed 's/"//g' > $lang_char_dir/text

if [ $lang == "yue" ]; then
# Get words.txt and words_no_ids.txt
Expand Down Expand Up @@ -218,7 +218,13 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
file=$(
find "data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz"
)
gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
# Prepare text.
# Note: on Linux, you can install jq with the following commands:
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
# 2. chmod +x ./jq
# 3. cp jq /usr/bin
gunzip -c ${file} \
| jq '.text' | sed 's/"//g' > $lang_dir/transcript_words.txt

# Ensure space only appears once
sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
Expand Down

0 comments on commit a9df06c

Please sign in to comment.