Skip to content

Commit

Permalink
fix index error
Browse files Browse the repository at this point in the history
  • Loading branch information
yuekaizhang committed Feb 20, 2024
1 parent 6fd14d2 commit be001a8
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 10 deletions.
4 changes: 2 additions & 2 deletions egs/multi_zh-hans/ASR/local/compute_fbank_kespeech_splits.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ def compute_fbank_kespeech_splits(args):
set_audio_duration_mismatch_tolerance(0.01) # 10ms tolerance
set_caching_enabled(False)
for i in range(start, stop):
-idx = f"{i + 1}".zfill(num_digits)
-logging.info(f"Processing {idx}/{num_splits}")
+idx = f"{i}".zfill(num_digits)
+logging.info(f"Processing {i+1}/{num_splits}")

cuts_path = output_dir / f"kespeech-asr_cuts_{subset}.{idx}.jsonl.gz"
if cuts_path.is_file():
Expand Down
27 changes: 21 additions & 6 deletions egs/multi_zh-hans/ASR/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

-stage=120
-stop_stage=120
+stage=121
+stop_stage=121
num_splits=100

dl_dir=$PWD/download
Expand Down Expand Up @@ -274,15 +274,15 @@ if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
touch data/fbank/.kespeech_preprocess_complete
fi

-if [ -f data/fbank/.kespeech.train_phase1.split.${num_splits}.done ]; then
+if [ ! -f data/fbank/.kespeech.train_phase1.split.${num_splits}.done ]; then
log "Spliting KeSpeech train_phase1"
lhotse split ${num_splits} \
data/fbank/kespeech/kespeech-asr_cuts_train_phase1_raw.jsonl.gz \
data/fbank/kespeech/train_phase1_split_${num_splits}
touch data/fbank/.kespeech.train_phase1.split.${num_splits}.done
fi

-if [ -f data/fbank/.kespeech.train_phase2.split.${num_splits}.done ]; then
+if [ ! -f data/fbank/.kespeech.train_phase2.split.${num_splits}.done ]; then
log "Spliting KeSpeech train_phase2"
lhotse split ${num_splits} \
data/fbank/kespeech/kespeech-asr_cuts_train_phase2_raw.jsonl.gz \
Expand Down Expand Up @@ -327,15 +327,15 @@ if [ $stage -le 120 ] && [ $stop_stage -ge 120 ]; then
touch data/fbank/.kespeech_preprocess_complete
fi

-if [ -f data/fbank/.kespeech.train_phase1.split.${num_splits}.done ]; then
+if [ ! -f data/fbank/.kespeech.train_phase1.split.${num_splits}.done ]; then
log "Spliting KeSpeech train_phase1"
lhotse split ${num_splits} \
data/fbank/kespeech/kespeech-asr_cuts_train_phase1_raw.jsonl.gz \
data/fbank/kespeech/train_phase1_split_${num_splits}
touch data/fbank/.kespeech.train_phase1.split.${num_splits}.done
fi

-if [ -f data/fbank/.kespeech.train_phase2.split.${num_splits}.done ]; then
+if [ ! -f data/fbank/.kespeech.train_phase2.split.${num_splits}.done ]; then
log "Spliting KeSpeech train_phase2"
lhotse split ${num_splits} \
data/fbank/kespeech/kespeech-asr_cuts_train_phase2_raw.jsonl.gz \
Expand All @@ -356,6 +356,21 @@ if [ $stage -le 120 ] && [ $stop_stage -ge 120 ]; then
fi
fi

+if [ $stage -le 121 ] && [ $stop_stage -ge 121 ]; then
+log "Stage 121: tmp"
+log "Compute KeSpeech fbank for train_phase1"
+./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --stop 1 --training-subset train_phase1 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+
+log "Compute KeSpeech fbank for train_phase2"
+./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase2 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+
+log "Compute KeSpeech fbank for test/dev"
+./local/compute_fbank_kespeech_dev_test.py --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+
+touch data/fbank/.kespeech.done
+fi
+fi
+

if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
log "Stage 13: BPE model training (note that we use transcripts of wenetspeech only for BPE training)"
Expand Down
4 changes: 2 additions & 2 deletions egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ def compute_fbank_wenetspeech_splits(args):
set_caching_enabled(False)
#with get_executor() as ex: # Initialize the executor only once.
for i in range(start, stop):
-idx = f"{i + 1}".zfill(num_digits)
-logging.info(f"Processing {idx}/{num_splits}")
+idx = f"{i}".zfill(num_digits)
+logging.info(f"Processing {i+1}/{num_splits}")

cuts_path = output_dir / f"cuts_{subset}.{idx}.jsonl.gz"
if cuts_path.is_file():
Expand Down

0 comments on commit be001a8

Please sign in to comment.