Skip to content

Commit

Permalink
a bilingual recipe similar to the multi-zh_hans (#1265)
Browse files Browse the repository at this point in the history
  • Loading branch information
JinZr authored Nov 26, 2023
1 parent 238b45b commit ae67f75
Show file tree
Hide file tree
Showing 45 changed files with 4,363 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,41 @@ for method in modified_beam_search fast_beam_search; do
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav
done

# Clean up the model downloaded by the previous test section.
rm -rf "$repo"

cd ../../../egs/multi_zh_en/ASR
log "==== Test icefall-asr-zipformer-multi-zh-en-2023-11-22 ===="
repo_url=https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/

log "Downloading pre-trained model from $repo_url"
git lfs install
git clone "$repo_url"
# basename strips the trailing slash, leaving the repo directory name.
repo=$(basename "$repo_url")

log "Display test files"
tree "$repo"/
ls -lh "$repo"/test_wavs/*.wav

# Smoke-test greedy search with the released checkpoint and the
# byte-level BPE (bbpe) model shipped in the repo.
./zipformer/pretrained.py \
  --checkpoint "$repo"/exp/pretrained.pt \
  --bpe-model "$repo"/data/lang_bbpe_2000/bbpe.model \
  --method greedy_search \
  "$repo"/test_wavs/_1634_210_2577_1_1525157964032_3712259_29.wav \
  "$repo"/test_wavs/_1634_210_2577_1_1525157964032_3712259_55.wav \
  "$repo"/test_wavs/_1634_210_2577_1_1525157964032_3712259_75.wav

# Exercise the beam-search decoding methods on the same test wavs.
for method in modified_beam_search fast_beam_search; do
  log "$method"

  ./zipformer/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo"/exp/pretrained.pt \
    --bpe-model "$repo"/data/lang_bbpe_2000/bbpe.model \
    "$repo"/test_wavs/_1634_210_2577_1_1525157964032_3712259_29.wav \
    "$repo"/test_wavs/_1634_210_2577_1_1525157964032_3712259_55.wav \
    "$repo"/test_wavs/_1634_210_2577_1_1525157964032_3712259_75.wav
done

rm -rf "$repo"
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name: run-multi-zh_hans-zipformer
name: run-multi-corpora-zipformer

on:
push:
Expand All @@ -24,12 +24,12 @@ on:
types: [labeled]

concurrency:
group: run_multi-zh_hans_zipformer-${{ github.ref }}
group: run_multi-corpora_zipformer-${{ github.ref }}
cancel-in-progress: true

jobs:
run_multi-zh_hans_zipformer:
if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'multi-zh_hans' || github.event.label.name == 'zipformer'
run_multi-corpora_zipformer:
if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'multi-zh_hans' || github.event.label.name == 'zipformer' || github.event.label.name == 'multi-corpora'
runs-on: ${{ matrix.os }}
strategy:
matrix:
Expand Down Expand Up @@ -81,4 +81,4 @@ jobs:
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-multi-zh_hans-zipformer.sh
.github/scripts/run-multi-corpora-zipformer.sh
19 changes: 19 additions & 0 deletions egs/multi_zh_en/ASR/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Introduction

This recipe includes scripts for training Zipformer model using both English and Chinese datasets.

# Included Training Sets

1. LibriSpeech (English)
2. AiShell-2 (Chinese)
3. TAL-CSASR (Code-Switching, Chinese and English)

|Dataset| Number of hours| URL|
|---|---:|---|
|**TOTAL**|2,547|---|
|LibriSpeech|960|https://www.openslr.org/12/|
|AiShell-2|1,000|http://www.aishelltech.com/aishell_2|
|TAL-CSASR|587|https://ai.100tal.com/openData/voice|



44 changes: 44 additions & 0 deletions egs/multi_zh_en/ASR/RESULTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
## Results

### Zh-En datasets bpe-based training results (Non-streaming) on Zipformer model

This is the [pull request #1265](https://github.com/k2-fsa/icefall/pull/1265) in icefall.

#### Non-streaming (Byte-Level BPE vocab_size=2000)

Best results (num of params : ~69M):

The training command:

```
./zipformer/train.py \
--world-size 4 \
--num-epochs 35 \
--use-fp16 1 \
--max-duration 1000 \
--num-workers 8
```

The decoding command:

```
for method in greedy_search modified_beam_search fast_beam_search; do
./zipformer/decode.py \
--epoch 34 \
--avg 19 \
--decoding-method $method
done
```

Word Error Rates (WERs) listed below were produced with the decoding command above (epoch 34, avg 19) and the byte-level BPE model (# tokens is 2000).

| Datasets | TAL-CSASR | TAL-CSASR | AiShell-2 | AiShell-2 | LibriSpeech | LibriSpeech |
|----------------------|-----------|-----------|-----------|-----------|-------------|-------------|
| Zipformer WER (%) | dev | test | dev | test | test-clean | test-other |
| greedy_search | 6.65 | 6.69 | 6.57 | 7.03 | 2.43 | 5.70 |
| modified_beam_search | 6.46 | 6.51 | 6.18 | 6.60 | 2.41 | 5.57 |
| fast_beam_search | 6.57 | 6.68 | 6.40 | 6.74 | 2.40 | 5.56 |

Pre-trained model can be found here : https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22, which is trained on LibriSpeech 960-hour training set (with speed perturbation), TAL-CSASR training set (with speed perturbation) and AiShell-2 (w/o speed perturbation).


1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/compile_lg.py
1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/prepare_char.py
65 changes: 65 additions & 0 deletions egs/multi_zh_en/ASR/local/prepare_for_bpe_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Zengrui Jin)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script tokenizes the training transcript by CJK characters
# and saves the result to transcript_chars.txt, which is used
# to train the BPE model later.

import argparse
from pathlib import Path

from tqdm.auto import tqdm

from icefall.utils import tokenize_by_CJK_char


def get_args():
    """Parse and return the command-line arguments.

    Returns:
      argparse.Namespace with two attributes:
        lang_dir: output directory where transcript_chars.txt is written.
        text: path to the training transcript.
    """
    parser = argparse.ArgumentParser(
        description="Tokenize a training transcript by CJK characters "
        "to prepare data for BPE model training."
    )
    parser.add_argument(
        "--lang-dir",
        type=str,
        # Without required=True a missing flag surfaces later as
        # Path(None) raising TypeError; fail early with a clear message.
        required=True,
        help="""Output directory.
        The generated transcript_chars.txt is saved to this directory.
        """,
    )

    parser.add_argument(
        "--text",
        type=str,
        required=True,
        help="Training transcript.",
    )

    return parser.parse_args()


def main():
    """Tokenize the training transcript by CJK characters and write the
    result to ``<lang_dir>/transcript_chars.txt`` (used later to train
    the BPE model)."""
    options = get_args()
    out_dir = Path(options.lang_dir)
    transcript = Path(options.text)

    assert (
        out_dir.exists() and transcript.exists()
    ), f"{out_dir} or {transcript} does not exist!"

    # Stream the transcript line by line so arbitrarily large files work;
    # tqdm only adds a progress bar.
    with open(transcript, "r", encoding="utf-8") as src, open(
        out_dir / "transcript_chars.txt", "w+", encoding="utf-8"
    ) as dst:
        for row in tqdm(src):
            dst.write(tokenize_by_CJK_char(row) + "\n")


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/prepare_lang.py
1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/prepare_lang_bbpe.py
1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/prepare_lang_bpe.py
1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/prepare_words.py
1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/text2segments.py
1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/text2token.py
1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/train_bbpe_model.py
1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/local/validate_bpe_lexicon.py
149 changes: 149 additions & 0 deletions egs/multi_zh_en/ASR/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#!/usr/bin/env bash
# Data preparation for the multi_zh_en (bilingual Chinese/English) recipe:
# soft links fbank features from the librispeech/aishell2/musan recipes
# and builds byte-level BPE (bbpe) lang dirs (see stages below).

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

# Only stages in [stage, stop_stage] are run; both can be overridden on
# the command line (e.g. --stage 2 --stop-stage 2) via parse_options.sh.
stage=-1
stop_stage=100

dl_dir=$PWD/download

. shared/parse_options.sh || exit 1

# Byte-level BPE vocabulary sizes to train; one lang dir is created
# per entry (data/lang_bbpe_<size>).
vocab_sizes=(
2000
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # Emit a message prefixed with a timestamp and the caller's location
  # (file:line:function); format borrowed from espnet recipes.
  local caller_file=${BASH_SOURCE[1]##*/}
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

# Stages 1-3 do not compute features; they soft link fbank dirs already
# produced by the sibling librispeech/aishell2 recipes into ./data/fbank.
# Note: the realpath arguments are resolved relative to data/fbank after
# the cd, hence the extra ../ levels.
log "Dataset: musan"
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Soft link fbank of musan"
mkdir -p data/fbank
# .musan.done is presumably written by the librispeech recipe once musan
# feature extraction finished — used here only as a readiness marker.
if [ -e ../../librispeech/ASR/data/fbank/.musan.done ]; then
cd data/fbank
ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/musan_feats) .
ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/musan_cuts.jsonl.gz) .
cd ../..
else
log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 4 --stop-stage 4"
exit 1
fi
fi

log "Dataset: LibriSpeech"
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Soft link fbank of LibriSpeech"
mkdir -p data/fbank
if [ -e ../../librispeech/ASR/data/fbank/.librispeech.done ]; then
cd data/fbank
# Link all cut manifests and feature dirs (train/dev/test splits).
ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_cuts*) .
ln -svf $(realpath ../../../../librispeech/ASR/data/fbank/librispeech_feats*) .
cd ../..
else
log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 3 --stop-stage 3"
exit 1
fi
fi

log "Dataset: AiShell-2"
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Soft link fbank of AiShell-2"
mkdir -p data/fbank
if [ -e ../../aishell2/ASR/data/fbank/.aishell2.done ]; then
cd data/fbank
ln -svf $(realpath ../../../../aishell2/ASR/data/fbank/aishell2_cuts*) .
ln -svf $(realpath ../../../../aishell2/ASR/data/fbank/aishell2_feats*) .
cd ../..
else
log "Abort! Please run ../../aishell2/ASR/prepare.sh --stage 3 --stop-stage 3"
exit 1
fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare Byte BPE based lang"
  mkdir -p data/fbank
  # Both source lang dirs must exist (locally or in the sibling recipes)
  # before the combined byte-level BPE lang dir can be assembled.
  if [ ! -d ../../aishell2/ASR/data/lang_char ] && [ ! -d ./data/lang_char ]; then
    log "Abort! Please run ../../aishell2/ASR/prepare.sh --stage 3 --stop-stage 3"
    exit 1
  fi

  if [ ! -d ../../librispeech/ASR/data/lang_bpe_500 ] && [ ! -d ./data/lang_bpe_500 ]; then
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 6 --stop-stage 6"
    exit 1
  fi

  # Soft link the per-corpus lang dirs into ./data so the paths below are
  # uniform regardless of where the data actually lives.
  cd data/
  if [ ! -d ./lang_char ]; then
    ln -svf "$(realpath ../../../aishell2/ASR/data/lang_char)" .
  fi
  if [ ! -d ./lang_bpe_500 ]; then
    ln -svf "$(realpath ../../../librispeech/ASR/data/lang_bpe_500)" .
  fi
  cd ../

  for vocab_size in "${vocab_sizes[@]}"; do
    lang_dir=data/lang_bbpe_${vocab_size}
    mkdir -p "$lang_dir"

    # Combined transcript: Chinese (char-level) + English (word-level).
    cat data/lang_char/text data/lang_bpe_500/transcript_words.txt \
      > "$lang_dir"/text

    if [ ! -f "$lang_dir"/transcript_chars.txt ]; then
      ./local/prepare_for_bpe_model.py \
        --lang-dir ./"$lang_dir" \
        --text "$lang_dir"/text
    fi

    if [ ! -f "$lang_dir"/text_words_segmentation ]; then
      python3 ./local/text2segments.py \
        --input-file ./data/lang_char/text \
        --output-file "$lang_dir"/text_words_segmentation

      cat ./data/lang_bpe_500/transcript_words.txt \
        >> "$lang_dir"/text_words_segmentation

      # NOTE(review): data/lang_char/text is already part of $lang_dir/text
      # (cat above), so this appends it a second time — confirm whether the
      # duplication is intentional (it doubles the weight of the Chinese
      # transcripts when training the BPE model).
      cat ./data/lang_char/text \
        >> "$lang_dir"/text
    fi

    # One word per line, de-duplicated, blank lines removed.
    # (sort -u already de-duplicates, so the former `| uniq` was redundant;
    # the `cat file | sed` was also replaced by reading the file directly.)
    sed 's/ /\n/g' "$lang_dir"/text_words_segmentation \
      | sort -u | sed '/^$/d' > "$lang_dir"/words_no_ids.txt

    if [ ! -f "$lang_dir"/words.txt ]; then
      python3 ./local/prepare_words.py \
        --input-file "$lang_dir"/words_no_ids.txt \
        --output-file "$lang_dir"/words.txt
    fi

    if [ ! -f "$lang_dir"/bbpe.model ]; then
      ./local/train_bbpe_model.py \
        --lang-dir "$lang_dir" \
        --vocab-size "$vocab_size" \
        --transcript "$lang_dir"/text
    fi

    if [ ! -f "$lang_dir"/L_disambig.pt ]; then
      ./local/prepare_lang_bbpe.py --lang-dir "$lang_dir"

      log "Validating $lang_dir/lexicon.txt"
      ./local/validate_bpe_lexicon.py \
        --lexicon "$lang_dir"/lexicon.txt \
        --bpe-model "$lang_dir"/bbpe.model
    fi
  done
fi

1 change: 1 addition & 0 deletions egs/multi_zh_en/ASR/shared
Loading

0 comments on commit ae67f75

Please sign in to comment.