add the pruned_transducer_stateless7_streaming recipe for commonvoice (#1018)

* add the pruned_transducer_stateless7_streaming recipe for commonvoice

* fix the symlinks

* Update RESULTS.md
manbaaaa authored Nov 9, 2023
1 parent 231bbcd commit 1b2e99d
Showing 42 changed files with 6,260 additions and 840 deletions.
25 changes: 25 additions & 0 deletions egs/commonvoice/ASR/RESULTS.md
@@ -57,3 +57,28 @@ Pretrained model is available at

The tensorboard log for training is available at
<https://tensorboard.dev/experiment/j4pJQty6RMOkMJtRySREKw/>


### CommonVoice (fr) BPE training results (Pruned Stateless Transducer 7, streaming)

#### [pruned_transducer_stateless7_streaming](./pruned_transducer_stateless7_streaming)

See #1018 for more details.

Number of model parameters: 70369391, i.e., 70.37 M

The best WERs for Common Voice French 12.0 (cv-corpus-12.0-2022-12-07/fr) are:

| decoding method      | Test WER (%) |
|----------------------|--------------|
| greedy search        | 9.95         |
| modified beam search | 9.57         |
| fast beam search     | 9.67         |

Note: This best result was obtained by training on the full LibriSpeech and GigaSpeech corpora and then fine-tuning on the full CommonVoice (fr) data.

Detailed experimental results and the pretrained model are available at
<https://huggingface.co/shaojieli/icefall-asr-commonvoice-fr-pruned-transducer-stateless7-streaming-2023-04-02>
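
For readers unfamiliar with the three decoding methods in the table: greedy search keeps a single hypothesis and emits the top-scoring token frame by frame; modified beam search keeps several partial hypotheses per frame; fast beam search constrains the search with an FSA (optionally the LG graph built by the recipe). The following is a schematic NumPy sketch of transducer greedy search with toy stand-ins for the encoder output, decoder state, and joiner; it is not icefall's implementation.

```python
# Schematic transducer greedy search with toy stand-ins (NOT icefall's code).
import numpy as np

VOCAB_SIZE, BLANK_ID, DIM = 500, 0, 8
rng = np.random.default_rng(0)
joiner_weight = rng.normal(size=(2 * DIM, VOCAB_SIZE))


def joiner(enc_frame: np.ndarray, dec_state: np.ndarray) -> np.ndarray:
    """Toy joiner: score every output token for one (frame, decoder-state) pair."""
    return np.concatenate([enc_frame, dec_state]) @ joiner_weight


def greedy_search(encoder_out: np.ndarray, max_sym_per_frame: int = 3) -> list:
    hyp = []
    dec_state = np.zeros(DIM)  # stand-in for the prediction-network state
    for frame in encoder_out:  # one step per acoustic frame
        for _ in range(max_sym_per_frame):
            token = int(np.argmax(joiner(frame, dec_state)))
            if token == BLANK_ID:
                break  # blank emitted: advance to the next frame
            hyp.append(token)
            dec_state = np.tanh(dec_state + token / VOCAB_SIZE)  # toy state update
    return hyp


print(greedy_search(rng.normal(size=(20, DIM))))  # token IDs for 20 random frames
```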

1 change: 1 addition & 0 deletions egs/commonvoice/ASR/local/compile_hlg.py
1 change: 1 addition & 0 deletions egs/commonvoice/ASR/local/compile_lg.py
@@ -56,8 +56,8 @@ def get_args():
def compute_fbank_commonvoice_dev_test(language: str):
    src_dir = Path(f"data/{language}/manifests")
    output_dir = Path(f"data/{language}/fbank")
-    num_workers = 42
-    batch_duration = 600
+    num_workers = 16
+    batch_duration = 200

    subsets = ("dev", "test")

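The two values changed above are passed to lhotse's batched feature extraction; lowering them reduces peak memory at the cost of throughput. A minimal sketch of how such a call looks (the manifest and storage paths below are placeholders, not taken from the recipe):

```python
# Sketch only: how batch_duration / num_workers enter lhotse's batched
# feature extraction. Manifest and storage paths are placeholders.
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter

cuts = CutSet.from_file("data/fr/manifests/placeholder_cuts_dev.jsonl.gz")
extractor = Fbank(FbankConfig(num_mel_bins=80))  # 80-dim fbank, as in icefall recipes

cuts = cuts.compute_and_store_features_batch(
    extractor=extractor,
    storage_path="data/fr/fbank/placeholder_feats_dev",
    batch_duration=200,  # seconds of audio per batch (600 before this commit)
    num_workers=16,      # dataloader workers feeding the extractor (42 before)
    storage_type=LilcomChunkyWriter,
)
cuts.to_file("data/fr/fbank/placeholder_cuts_dev.jsonl.gz")
```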
10 changes: 7 additions & 3 deletions egs/commonvoice/ASR/local/preprocess_commonvoice.py
@@ -43,9 +43,13 @@ def get_args():
return parser.parse_args()


-def normalize_text(utt: str) -> str:
+def normalize_text(utt: str, language: str) -> str:
    utt = re.sub(r"[{0}]+".format("-"), " ", utt)
-    return re.sub(r"[^a-zA-Z\s']", "", utt).upper()
+    utt = re.sub("’", "'", utt)
+    if language == "en":
+        return re.sub(r"[^a-zA-Z\s]", "", utt).upper()
+    if language == "fr":
+        return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()


def preprocess_commonvoice(
@@ -94,7 +98,7 @@ def preprocess_commonvoice(
for sup in m["supervisions"]:
    text = str(sup.text)
    orig_text = text
-   sup.text = normalize_text(sup.text)
+   sup.text = normalize_text(sup.text, language)
    text = str(sup.text)
    if len(orig_text) != len(text):
        logging.info(
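For reference, a self-contained sketch of the new normalization, together with a tiny demo; the function body mirrors the diff above, while the demo strings are made up:

```python
# Self-contained sketch of the new normalization, plus a tiny demo.
# The function body mirrors the diff above; the demo strings are made up.
import re


def normalize_text(utt: str, language: str) -> str:
    utt = re.sub(r"[{0}]+".format("-"), " ", utt)  # hyphens become spaces
    utt = re.sub("’", "'", utt)  # curly apostrophe -> ASCII apostrophe
    if language == "en":
        # keep only ASCII letters and whitespace, then uppercase
        return re.sub(r"[^a-zA-Z\s]", "", utt).upper()
    if language == "fr":
        # keep uppercase French letters (incl. accents), apostrophe and space
        return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
    # other languages fall through (and return None) in the version shown above


print(normalize_text("it’s a test-case", "en"))  # -> ITS A TEST CASE
print(normalize_text("C’EST-À-DIRE", "fr"))      # -> C'EST À DIRE
```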
64 changes: 58 additions & 6 deletions egs/commonvoice/ASR/prepare.sh
@@ -36,8 +36,8 @@ num_splits=1000
# - speech

dl_dir=$PWD/download
-release=cv-corpus-13.0-2023-03-09
-lang=en
+release=cv-corpus-12.0-2022-12-07
+lang=fr

. shared/parse_options.sh || exit 1

@@ -146,7 +146,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  if [ ! -e data/${lang}/fbank/.cv-${lang}_train.done ]; then
    ./local/compute_fbank_commonvoice_splits.py \
      --num-workers $nj \
-      --batch-duration 600 \
+      --batch-duration 200 \
      --start 0 \
      --num-splits $num_splits \
      --language $lang
@@ -189,7 +189,7 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
sed -i 's/[ ][ ]*/ /g' $lang_dir/transcript_words.txt
fi

if [ ! -f $lang_dir/words.txt ]; then
cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \
| sort -u | sed '/^$/d' > $lang_dir/words.txt
@@ -216,14 +216,14 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
}' > $lang_dir/words || exit 1;
mv $lang_dir/words $lang_dir/words.txt
fi

if [ ! -f $lang_dir/bpe.model ]; then
./local/train_bpe_model.py \
--lang-dir $lang_dir \
--vocab-size $vocab_size \
--transcript $lang_dir/transcript_words.txt
fi

if [ ! -f $lang_dir/L_disambig.pt ]; then
./local/prepare_lang_bpe.py --lang-dir $lang_dir

@@ -250,3 +250,55 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
fi
done
fi

if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
log "Stage 10: Prepare G"
# We assume you have install kaldilm, if not, please install
# it using: pip install kaldilm

for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/${lang}/lang_bpe_${vocab_size}
mkdir -p $lang_dir/lm
#3-gram used in building HLG, 4-gram used for LM rescoring
for ngram in 3 4; do
if [ ! -f $lang_dir/lm/${ngram}gram.arpa ]; then
./shared/make_kn_lm.py \
-ngram-order ${ngram} \
-text $lang_dir/transcript_words.txt \
-lm $lang_dir/lm/${ngram}gram.arpa
fi

if [ ! -f $lang_dir/lm/${ngram}gram.fst.txt ]; then
python3 -m kaldilm \
--read-symbol-table="$lang_dir/words.txt" \
--disambig-symbol='#0' \
--max-order=${ngram} \
$lang_dir/lm/${ngram}gram.arpa > $lang_dir/lm/G_${ngram}_gram.fst.txt
fi
done
done
fi
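
The LMs built in this stage are consumed later: the 3-gram feeds HLG compilation (Stage 11), while the 4-gram is typically loaded for whole-lattice rescoring at decode time. A hedged sketch of reading the generated G_4_gram.fst.txt back with k2 (the lang directory below assumes a 500-token BPE vocabulary):

```python
# Sketch of loading the kaldilm output with k2 for LM rescoring; the exact
# path depends on the ${lang} and ${vocab_size} values used in prepare.sh.
import k2

with open("data/fr/lang_bpe_500/lm/G_4_gram.fst.txt") as f:
    G = k2.Fsa.from_openfst(f.read(), acceptor=False)  # word-level 4-gram as an FST
G = k2.arc_sort(G)  # arc-sorted FSAs are required for composition/intersection
```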

if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
log "Stage 11: Compile HLG"

for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/${lang}/lang_bpe_${vocab_size}
./local/compile_hlg.py --lang-dir $lang_dir

# Note If ./local/compile_hlg.py throws OOM,
# please switch to the following command
#
# ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir
done
fi

# Compile LG for RNN-T fast_beam_search decoding
if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
log "Stage 12: Compile LG"

for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/${lang}/lang_bpe_${vocab_size}
./local/compile_lg.py --lang-dir $lang_dir
done
fi
@@ -0,0 +1,9 @@
This recipe implements the Streaming Zipformer-Transducer model.

See https://k2-fsa.github.io/icefall/recipes/Streaming-ASR/librispeech/zipformer_transducer.html for detailed tutorials.

[./emformer.py](./emformer.py) and [./train.py](./train.py)
are basically the same as
[./emformer2.py](./emformer2.py) and [./train2.py](./train2.py).
The only purpose of [./emformer2.py](./emformer2.py) and [./train2.py](./train2.py)
is to support exporting to [sherpa-ncnn](https://github.com/k2-fsa/sherpa-ncnn).