Skip to content

Commit

Permalink
[recipe] refine yaml for aishell4 (#2236)
Browse files Browse the repository at this point in the history
  • Loading branch information
xingchensong authored Dec 13, 2023
1 parent 45a53aa commit 256a3ad
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 10 deletions.
25 changes: 25 additions & 0 deletions examples/aishell4/s0/conf/train_conformer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,37 @@ decoder_conf:
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0

# Tokenizer selection and its options. The option keys are nested under
# `tokenizer_conf` so they load as a single mapping rather than as unrelated
# top-level keys.
tokenizer: char
tokenizer_conf:
  symbol_table_path: 'data/dict/lang_char.txt'
  split_with_space: false
  bpe_path: null
  non_lang_syms_path: null
  is_multilingual: false
  num_languages: 1
  # Ids reserved for the special symbols. <sos> and <eos> share id 2;
  # presumably this mirrors a single combined <sos/eos> entry in the symbol
  # table -- confirm against the file at symbol_table_path.
  special_tokens:
    <blank>: 0
    <unk>: 1
    <sos>: 2
    <eos>: 2

# CTC module selection and its options. `ctc_blank_id` is nested under
# `ctc_conf` so it is read as a CTC option, not as a top-level key.
ctc: ctc
ctc_conf:
  # Same id that special_tokens assigns to <blank>.
  ctc_blank_id: 0

# Feature normalization selection and its options, nested under `cmvn_conf`.
cmvn: global_cmvn
cmvn_conf:
  cmvn_file: 'data/aishell4_train/global_cmvn'
  # NOTE(review): presumably marks cmvn_file as JSON-formatted -- confirm
  # against the code that loads it.
  is_json_cmvn: true

# hybrid CTC/attention
# Model selection and its loss options, nested under `model_conf`.
model: asr_model
model_conf:
  # NOTE(review): presumably the share of the CTC loss in the hybrid
  # objective, with the attention loss taking the remainder -- confirm.
  ctc_weight: 0.3
  lsm_weight: 0.1  # label smoothing option
  length_normalized_loss: false

dataset: asr
dataset_conf:
filter_conf:
max_length: 4096
Expand Down
13 changes: 3 additions & 10 deletions examples/aishell4/s0/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ dev_set=aishell4_test
test_sets=aishell4_test

train_config=conf/train_conformer.yaml
cmvn=true
dir=exp/conformer
checkpoint=

Expand Down Expand Up @@ -88,10 +87,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
mkdir -p $(dirname $dict)
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
echo "<sos/eos> 2" >> $dict # <eos>
tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict # <eos>
| sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+2}' >> ${dict}
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
Expand All @@ -111,9 +109,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
cmvn_opts=
$cmvn && cp data/${train_set}/global_cmvn $dir
$cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
Expand All @@ -129,14 +124,12 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--train_engine ${train_engine} \
--config $train_config \
--data_type shard \
--symbol_table $dict \
--train_data data/$train_set/data.list \
--cv_data data/${dev_set}/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir $dir \
--ddp.dist_backend $dist_backend \
--num_workers 1 \
$cmvn_opts \
--pin_memory \
--deepspeed_config ${deepspeed_config} \
--deepspeed.save_states ${deepspeed_save_states}
Expand Down Expand Up @@ -169,7 +162,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--beam_size 10 \
--batch_size 32 \
--penalty 0.0 \
--dict $dict \
--ctc_weight $ctc_weight \
--result_dir $test_dir \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
Expand All @@ -178,6 +170,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
data/${test_set}/text $test_dir/$mode/text > $test_dir/$mode/wer
done
}
done
fi

if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
Expand Down

0 comments on commit 256a3ad

Please sign in to comment.