From 256a3ad31e02907c22f58fa300826ea2b22c1d47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xingchen=20Song=28=E5=AE=8B=E6=98=9F=E8=BE=B0=29?=
 <sxc19@mails.tsinghua.edu.cn>
Date: Wed, 13 Dec 2023 21:57:50 +0800
Subject: [PATCH] [recipe] refine yaml for aishell4 (#2236)

---
 .../aishell4/s0/conf/train_conformer.yaml | 25 +++++++++++++++++++
 examples/aishell4/s0/run.sh               | 13 +++-------
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/examples/aishell4/s0/conf/train_conformer.yaml b/examples/aishell4/s0/conf/train_conformer.yaml
index dcd115b63..bf2f65026 100644
--- a/examples/aishell4/s0/conf/train_conformer.yaml
+++ b/examples/aishell4/s0/conf/train_conformer.yaml
@@ -28,12 +28,37 @@ decoder_conf:
     self_attention_dropout_rate: 0.0
     src_attention_dropout_rate: 0.0
 
+tokenizer: char
+tokenizer_conf:
+  symbol_table_path: 'data/dict/lang_char.txt'
+  split_with_space: false
+  bpe_path: null
+  non_lang_syms_path: null
+  is_multilingual: false
+  num_languages: 1
+  special_tokens:
+    <blank>: 0
+    <unk>: 1
+    <sos>: 2
+    <eos>: 2
+
+ctc: ctc
+ctc_conf:
+  ctc_blank_id: 0
+
+cmvn: global_cmvn
+cmvn_conf:
+  cmvn_file: 'data/aishell4_train/global_cmvn'
+  is_json_cmvn: true
+
 # hybrid CTC/attention
+model: asr_model
 model_conf:
     ctc_weight: 0.3
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
 
+dataset: asr
 dataset_conf:
     filter_conf:
         max_length: 4096
diff --git a/examples/aishell4/s0/run.sh b/examples/aishell4/s0/run.sh
index 4e29c0f15..a3e408fa6 100755
--- a/examples/aishell4/s0/run.sh
+++ b/examples/aishell4/s0/run.sh
@@ -37,7 +37,6 @@ dev_set=aishell4_test
 test_sets=aishell4_test
 
 train_config=conf/train_conformer.yaml
-cmvn=true
 dir=exp/conformer
 checkpoint=
 
@@ -88,10 +87,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   mkdir -p $(dirname $dict)
   echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
   echo "<unk> 1" >> ${dict} # <unk> must be 1
+  echo "<sos/eos> 2" >> $dict # <sos/eos>
   tools/text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
-    | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
-  num_token=$(cat $dict | wc -l)
-  echo "<sos/eos> $num_token" >> $dict # <eos>
+    | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+2}' >> ${dict}
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
@@ -111,9 +109,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
   # Use "nccl" if it works, otherwise use "gloo"
   dist_backend="nccl"
-  cmvn_opts=
-  $cmvn && cp data/${train_set}/global_cmvn $dir
-  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
   # train.py will write $train_config to $dir/train.yaml with model input
   # and output dimension, train.yaml will be used for inference or model
   # export later
@@ -129,14 +124,12 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
       --train_engine ${train_engine} \
       --config $train_config \
       --data_type shard \
-      --symbol_table $dict \
       --train_data data/$train_set/data.list \
      --cv_data data/${dev_set}/data.list \
       ${checkpoint:+--checkpoint $checkpoint} \
       --model_dir $dir \
       --ddp.dist_backend $dist_backend \
       --num_workers 1 \
-      $cmvn_opts \
       --pin_memory \
       --deepspeed_config ${deepspeed_config} \
       --deepspeed.save_states ${deepspeed_save_states}
@@ -169,7 +162,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
       --beam_size 10 \
       --batch_size 32 \
       --penalty 0.0 \
-      --dict $dict \
       --ctc_weight $ctc_weight \
       --result_dir $test_dir \
       ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
@@ -178,6 +170,7 @@
         data/${test_set}/text $test_dir/$mode/text > $test_dir/$mode/wer
     done
   }
+  done
 fi
 
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then