From 1a6729e7ca7955877a29dfabea164d739d947a79 Mon Sep 17 00:00:00 2001
From: liu zhengxi <380185688@qq.com>
Date: Tue, 7 Sep 2021 21:47:57 +0800
Subject: [PATCH] Add custom vocab file for ce (#963)

---
 .../transformer/deploy/python/inference.py    | 26 ++++++++++++++
 .../transformer/export_model.py               | 26 ++++++++++++++
 .../encoder_decoding_predict.py               | 26 ++++++++++++++
 .../faster_transformer/export_model.py        | 26 ++++++++++++++
 .../transformer/predict.py                    | 26 ++++++++++++++
 .../machine_translation/transformer/reader.py | 31 ++++++++++++----
 .../transformer/static/predict.py             | 26 ++++++++++++++
 .../transformer/static/train.py               | 26 ++++++++++++++
 .../machine_translation/transformer/train.py  | 26 ++++++++++++++
 tests/prepare.sh                              | 36 +++++++++----------
 tests/transformer_base_dygraph_params.txt     |  8 ++---
 tests/transformer_base_static_params.txt      |  4 +--
 tests/transformer_big_dygraph_params.txt      |  8 ++---
 tests/transformer_big_static_params.txt       |  4 +--
 14 files changed, 261 insertions(+), 38 deletions(-)

diff --git a/examples/machine_translation/transformer/deploy/python/inference.py b/examples/machine_translation/transformer/deploy/python/inference.py
index 36fb8500a280a..d5220f185e7ae 100644
--- a/examples/machine_translation/transformer/deploy/python/inference.py
+++ b/examples/machine_translation/transformer/deploy/python/inference.py
@@ -64,6 +64,28 @@ def parse_args():
         default="./output/",
         type=str,
         help="The path to save logs when profile is enabled. ")
+    parser.add_argument(
+        "--vocab_file",
+        default=None,
+        type=str,
+        help="The vocab file. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used."
+    )
+    parser.add_argument(
+        "--unk_token",
+        default=None,
+        type=str,
+        help="The unknown token. It should be provided when use custom vocab_file. "
+    )
+    parser.add_argument(
+        "--bos_token",
+        default=None,
+        type=str,
+        help="The bos token. It should be provided when use custom vocab_file. ")
+    parser.add_argument(
+        "--eos_token",
+        default=None,
+        type=str,
+        help="The eos token. It should be provided when use custom vocab_file. ")
     args = parser.parse_args()
     return args
 
@@ -222,6 +244,10 @@ def do_inference(args):
         args.inference_model_dir = ARGS.model_dir
     args.test_file = ARGS.test_file
     args.save_log_path = ARGS.save_log_path
+    args.vocab_file = ARGS.vocab_file
+    args.unk_token = ARGS.unk_token
+    args.bos_token = ARGS.bos_token
+    args.eos_token = ARGS.eos_token
     pprint(args)
 
     if args.profile:
diff --git a/examples/machine_translation/transformer/export_model.py b/examples/machine_translation/transformer/export_model.py
index dcf8779552be4..9bf1564102bbd 100644
--- a/examples/machine_translation/transformer/export_model.py
+++ b/examples/machine_translation/transformer/export_model.py
@@ -24,6 +24,28 @@ def parse_args():
         action="store_true",
         help="Whether to print logs on each cards and use benchmark vocab. Normally, not necessary to set --benchmark. "
     )
+    parser.add_argument(
+        "--vocab_file",
+        default=None,
+        type=str,
+        help="The vocab file. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used."
+    )
+    parser.add_argument(
+        "--unk_token",
+        default=None,
+        type=str,
+        help="The unknown token. It should be provided when use custom vocab_file. "
+    )
+    parser.add_argument(
+        "--bos_token",
+        default=None,
+        type=str,
+        help="The bos token. It should be provided when use custom vocab_file. ")
+    parser.add_argument(
+        "--eos_token",
+        default=None,
+        type=str,
+        help="The eos token. It should be provided when use custom vocab_file. ")
     args = parser.parse_args()
     return args
 
@@ -87,6 +109,10 @@ def do_export(args):
     with open(yaml_file, 'rt') as f:
         args = AttrDict(yaml.safe_load(f))
     args.benchmark = ARGS.benchmark
+    args.vocab_file = ARGS.vocab_file
+    args.unk_token = ARGS.unk_token
+    args.bos_token = ARGS.bos_token
+    args.eos_token = ARGS.eos_token
     pprint(args)
 
     do_export(args)
diff --git a/examples/machine_translation/transformer/faster_transformer/encoder_decoding_predict.py b/examples/machine_translation/transformer/faster_transformer/encoder_decoding_predict.py
index 6204d5c8c1be2..5764b692c3408 100644
--- a/examples/machine_translation/transformer/faster_transformer/encoder_decoding_predict.py
+++ b/examples/machine_translation/transformer/faster_transformer/encoder_decoding_predict.py
@@ -74,6 +74,28 @@ def parse_args():
         action="store_true",
         help="Whether to print logs on each cards and use benchmark vocab. Normally, not necessary to set --benchmark. "
     )
+    parser.add_argument(
+        "--vocab_file",
+        default=None,
+        type=str,
+        help="The vocab file. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used."
+    )
+    parser.add_argument(
+        "--unk_token",
+        default=None,
+        type=str,
+        help="The unknown token. It should be provided when use custom vocab_file. "
+    )
+    parser.add_argument(
+        "--bos_token",
+        default=None,
+        type=str,
+        help="The bos token. It should be provided when use custom vocab_file. ")
+    parser.add_argument(
+        "--eos_token",
+        default=None,
+        type=str,
+        help="The eos token. It should be provided when use custom vocab_file. ")
     args = parser.parse_args()
     return args
 
@@ -191,6 +213,10 @@ def do_predict(args):
     if ARGS.batch_size:
         args.infer_batch_size = ARGS.batch_size
     args.test_file = ARGS.test_file
+    args.vocab_file = ARGS.vocab_file
+    args.unk_token = ARGS.unk_token
+    args.bos_token = ARGS.bos_token
+    args.eos_token = ARGS.eos_token
     pprint(args)
 
     do_predict(args)
diff --git a/examples/machine_translation/transformer/faster_transformer/export_model.py b/examples/machine_translation/transformer/faster_transformer/export_model.py
index a38cb1a3b30b2..c2ef1258711e4 100644
--- a/examples/machine_translation/transformer/faster_transformer/export_model.py
+++ b/examples/machine_translation/transformer/faster_transformer/export_model.py
@@ -62,6 +62,28 @@ def parse_args():
         action="store_true",
         help="Whether to print logs on each cards and use benchmark vocab. Normally, not necessary to set --benchmark. "
     )
+    parser.add_argument(
+        "--vocab_file",
+        default=None,
+        type=str,
+        help="The vocab file. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used."
+    )
+    parser.add_argument(
+        "--unk_token",
+        default=None,
+        type=str,
+        help="The unknown token. It should be provided when use custom vocab_file. "
+    )
+    parser.add_argument(
+        "--bos_token",
+        default=None,
+        type=str,
+        help="The bos token. It should be provided when use custom vocab_file. ")
+    parser.add_argument(
+        "--eos_token",
+        default=None,
+        type=str,
+        help="The eos token. It should be provided when use custom vocab_file. ")
     args = parser.parse_args()
     return args
 
@@ -133,6 +155,10 @@ def do_predict(args):
     args.topk = ARGS.topk
     args.topp = ARGS.topp
     args.benchmark = ARGS.benchmark
+    args.vocab_file = ARGS.vocab_file
+    args.unk_token = ARGS.unk_token
+    args.bos_token = ARGS.bos_token
+    args.eos_token = ARGS.eos_token
     pprint(args)
 
     do_predict(args)
diff --git a/examples/machine_translation/transformer/predict.py b/examples/machine_translation/transformer/predict.py
index 1f72c333a02e5..6a1f1550a564f 100644
--- a/examples/machine_translation/transformer/predict.py
+++ b/examples/machine_translation/transformer/predict.py
@@ -35,6 +35,28 @@ def parse_args():
         "--without_ft",
         action="store_true",
         help="Whether to use Faster Transformer to do predict. ")
+    parser.add_argument(
+        "--vocab_file",
+        default=None,
+        type=str,
+        help="The vocab file. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used."
+    )
+    parser.add_argument(
+        "--unk_token",
+        default=None,
+        type=str,
+        help="The unknown token. It should be provided when use custom vocab_file. "
+    )
+    parser.add_argument(
+        "--bos_token",
+        default=None,
+        type=str,
+        help="The bos token. It should be provided when use custom vocab_file. ")
+    parser.add_argument(
+        "--eos_token",
+        default=None,
+        type=str,
+        help="The eos token. It should be provided when use custom vocab_file. ")
     args = parser.parse_args()
     return args
 
@@ -127,6 +149,10 @@ def do_predict(args):
     args.benchmark = ARGS.benchmark
     args.test_file = ARGS.test_file
     args.without_ft = ARGS.without_ft
+    args.vocab_file = ARGS.vocab_file
+    args.unk_token = ARGS.unk_token
+    args.bos_token = ARGS.bos_token
+    args.eos_token = ARGS.eos_token
     pprint(args)
 
     do_predict(args)
diff --git a/examples/machine_translation/transformer/reader.py b/examples/machine_translation/transformer/reader.py
index b908ef2276d0f..cf8989e4ebdef 100644
--- a/examples/machine_translation/transformer/reader.py
+++ b/examples/machine_translation/transformer/reader.py
@@ -45,7 +45,13 @@ def create_data_loader(args, places=None):
         raise ValueError(
             "--train_file and --dev_file must be both or neither set. ")
 
-    if not args.benchmark:
+    if args.vocab_file is not None:
+        src_vocab = Vocab.load_vocabulary(
+            filepath=args.vocab_file,
+            unk_token=args.unk_token,
+            bos_token=args.bos_token,
+            eos_token=args.eos_token)
+    elif not args.benchmark:
         src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info["bpe"])
     else:
         src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info["benchmark"])
@@ -109,7 +115,13 @@ def create_infer_loader(args):
     else:
         dataset = load_dataset('wmt14ende', splits=('test'))
 
-    if not args.benchmark:
+    if args.vocab_file is not None:
+        src_vocab = Vocab.load_vocabulary(
+            filepath=args.vocab_file,
+            unk_token=args.unk_token,
+            bos_token=args.bos_token,
+            eos_token=args.eos_token)
+    elif not args.benchmark:
         src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
     else:
         src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
@@ -151,11 +163,18 @@ def convert_samples(sample):
 
 
 def adapt_vocab_size(args):
-    dataset = load_dataset('wmt14ende', splits=('test'))
-    if not args.benchmark:
-        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
+    if args.vocab_file is not None:
+        src_vocab = Vocab.load_vocabulary(
+            filepath=args.vocab_file,
+            unk_token=args.unk_token,
+            bos_token=args.bos_token,
+            eos_token=args.eos_token)
     else:
-        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
+        dataset = load_dataset('wmt14ende', splits=('test'))
+        if not args.benchmark:
+            src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
+        else:
+            src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
     trg_vocab = src_vocab
 
     padding_vocab = (
diff --git a/examples/machine_translation/transformer/static/predict.py b/examples/machine_translation/transformer/static/predict.py
index 5fee0b55b22f0..242c9d97549b6 100644
--- a/examples/machine_translation/transformer/static/predict.py
+++ b/examples/machine_translation/transformer/static/predict.py
@@ -51,6 +51,28 @@ def parse_args():
         type=str,
         help="The file for testing. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used to process testing."
     )
+    parser.add_argument(
+        "--vocab_file",
+        default=None,
+        type=str,
+        help="The vocab file. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used."
+    )
+    parser.add_argument(
+        "--unk_token",
+        default=None,
+        type=str,
+        help="The unknown token. It should be provided when use custom vocab_file. "
+    )
+    parser.add_argument(
+        "--bos_token",
+        default=None,
+        type=str,
+        help="The bos token. It should be provided when use custom vocab_file. ")
+    parser.add_argument(
+        "--eos_token",
+        default=None,
+        type=str,
+        help="The eos token. It should be provided when use custom vocab_file. ")
     args = parser.parse_args()
     return args
 
@@ -146,6 +168,10 @@ def do_predict(args):
         args = AttrDict(yaml.safe_load(f))
     args.benchmark = ARGS.benchmark
     args.test_file = ARGS.test_file
+    args.vocab_file = ARGS.vocab_file
+    args.unk_token = ARGS.unk_token
+    args.bos_token = ARGS.bos_token
+    args.eos_token = ARGS.eos_token
     pprint(args)
 
     do_predict(args)
diff --git a/examples/machine_translation/transformer/static/train.py b/examples/machine_translation/transformer/static/train.py
index 94c72935b8dc6..4f8e40753e61d 100644
--- a/examples/machine_translation/transformer/static/train.py
+++ b/examples/machine_translation/transformer/static/train.py
@@ -60,6 +60,28 @@ def parse_args():
         type=str,
         help="The files for validation, including [source language file, target language file]. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used to do validation. "
     )
+    parser.add_argument(
+        "--vocab_file",
+        default=None,
+        type=str,
+        help="The vocab file. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used."
+    )
+    parser.add_argument(
+        "--unk_token",
+        default=None,
+        type=str,
+        help="The unknown token. It should be provided when use custom vocab_file. "
+    )
+    parser.add_argument(
+        "--bos_token",
+        default=None,
+        type=str,
+        help="The bos token. It should be provided when use custom vocab_file. ")
+    parser.add_argument(
+        "--eos_token",
+        default=None,
+        type=str,
+        help="The eos token. It should be provided when use custom vocab_file. ")
     args = parser.parse_args()
     return args
 
@@ -299,6 +321,10 @@ def do_train(args):
         args.max_iter = ARGS.max_iter
     args.train_file = ARGS.train_file
     args.dev_file = ARGS.dev_file
+    args.vocab_file = ARGS.vocab_file
+    args.unk_token = ARGS.unk_token
+    args.bos_token = ARGS.bos_token
+    args.eos_token = ARGS.eos_token
     pprint(args)
 
     do_train(args)
diff --git a/examples/machine_translation/transformer/train.py b/examples/machine_translation/transformer/train.py
index c179752644bd3..6268881845180 100644
--- a/examples/machine_translation/transformer/train.py
+++ b/examples/machine_translation/transformer/train.py
@@ -48,6 +48,28 @@ def parse_args():
         type=str,
         help="The files for validation, including [source language file, target language file]. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used to do validation. "
     )
+    parser.add_argument(
+        "--vocab_file",
+        default=None,
+        type=str,
+        help="The vocab file. Normally, it shouldn't be set and in this case, the default WMT14 dataset will be used."
+    )
+    parser.add_argument(
+        "--unk_token",
+        default=None,
+        type=str,
+        help="The unknown token. It should be provided when use custom vocab_file. "
+    )
+    parser.add_argument(
+        "--bos_token",
+        default=None,
+        type=str,
+        help="The bos token. It should be provided when use custom vocab_file. ")
+    parser.add_argument(
+        "--eos_token",
+        default=None,
+        type=str,
+        help="The eos token. It should be provided when use custom vocab_file. ")
     args = parser.parse_args()
     return args
 
@@ -270,6 +292,10 @@ def do_train(args):
         args.max_iter = ARGS.max_iter
     args.train_file = ARGS.train_file
     args.dev_file = ARGS.dev_file
+    args.vocab_file = ARGS.vocab_file
+    args.unk_token = ARGS.unk_token
+    args.bos_token = ARGS.bos_token
+    args.eos_token = ARGS.eos_token
     pprint(args)
 
     do_train(args)
diff --git a/tests/prepare.sh b/tests/prepare.sh
index b5abeb9d8716b..2e5601dc354bf 100644
--- a/tests/prepare.sh
+++ b/tests/prepare.sh
@@ -38,12 +38,11 @@ if [ ${MODE} = "lite_train_infer" ]; then
     if [ -f test.de ]; then
         rm -f test.de
     fi
-    mkdir -p ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/
-    rm -f ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712
-    rm -f ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708
+    rm -f vocab_all.bpe.33712
+    rm -f vocab_all.bpe.33708
     # Vocab
-    cp -f WMT14.en-de.partial/wmt14_ende_data_bpe/vocab_all.bpe.33712 ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712
-    cp -f WMT14.en-de.partial/wmt14_ende_data_bpe/vocab_all.bpe.33708 ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708
+    cp -f WMT14.en-de.partial/wmt14_ende_data_bpe/vocab_all.bpe.33712 ./
+    cp -f WMT14.en-de.partial/wmt14_ende_data_bpe/vocab_all.bpe.33708 ./
     # Train
     ln -s WMT14.en-de.partial/wmt14_ende_data_bpe/train.tok.clean.bpe.en train.en
     ln -s WMT14.en-de.partial/wmt14_ende_data_bpe/train.tok.clean.bpe.de train.de
@@ -103,12 +102,11 @@ elif [ ${MODE} = "whole_infer" ]; then
     if [ -f test.de ]; then
         rm -f test.de
     fi
-    mkdir -p ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/
-    rm -f ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712
-    rm -f ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708
+    rm -f vocab_all.bpe.33712
+    rm -f vocab_all.bpe.33708
     # Vocab
-    cp -f WMT14.en-de.partial/wmt14_ende_data_bpe/vocab_all.bpe.33712 ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712
-    cp -f WMT14.en-de.partial/wmt14_ende_data_bpe/vocab_all.bpe.33708 ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708
+    cp -f WMT14.en-de.partial/wmt14_ende_data_bpe/vocab_all.bpe.33712 ./
+    cp -f WMT14.en-de.partial/wmt14_ende_data_bpe/vocab_all.bpe.33708 ./
     # Train with partial data. 
     ln -s WMT14.en-de.partial/wmt14_ende_data_bpe/train.tok.clean.bpe.en train.en
     ln -s WMT14.en-de.partial/wmt14_ende_data_bpe/train.tok.clean.bpe.de train.de
@@ -156,12 +154,11 @@ elif [ ${MODE} = "whole_train_infer" ]; then
     if [ -f test.de ]; then
         rm -f test.de
     fi
-    mkdir -p ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/
-    rm -f ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712
-    rm -f ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708
+    rm -f vocab_all.bpe.33712
+    rm -f vocab_all.bpe.33708
     # Vocab
-    cp -f WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712 ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712
-    cp -f WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708 ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708
+    cp -f WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712 ./
+    cp -f WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708 ./
     # Train with whole data. 
     ln -s WMT14.en-de/wmt14_ende_data_bpe/train.tok.clean.bpe.33708.en train.en
     ln -s WMT14.en-de/wmt14_ende_data_bpe/train.tok.clean.bpe.33708.de train.de
@@ -198,12 +195,11 @@ else # infer
     if [ -f test.de ]; then
         rm -f test.de
     fi
-    mkdir -p ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/
-    rm -f ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712
-    rm -f ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708
+    rm -f vocab_all.bpe.33712
+    rm -f vocab_all.bpe.33708
     # Vocab
-    cp -f WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712 ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712
-    cp -f WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708 ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708
+    cp -f WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33712 ./
+    cp -f WMT14.en-de/wmt14_ende_data_bpe/vocab_all.bpe.33708 ./
     # Test with whole data. 
     ln -s WMT14.en-de/wmt14_ende_data_bpe/newstest2014.tok.bpe.33708.en test.en
     ln -s WMT14.en-de/wmt14_ende_data_bpe/newstest2014.tok.bpe.33708.de test.de
diff --git a/tests/transformer_base_dygraph_params.txt b/tests/transformer_base_dygraph_params.txt
index 3c73580eb894f..88c09c144a951 100644
--- a/tests/transformer_base_dygraph_params.txt
+++ b/tests/transformer_base_dygraph_params.txt
@@ -13,7 +13,7 @@ null:null
 null:null
 ##
 trainer:norm_train
-norm_train:../examples/machine_translation/transformer/train.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --train_file ../examples/machine_translation/transformer/train.en ../examples/machine_translation/transformer/train.de --dev_file ../examples/machine_translation/transformer/dev.en ../examples/machine_translation/transformer/dev.de
+norm_train:../examples/machine_translation/transformer/train.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --train_file ../examples/machine_translation/transformer/train.en ../examples/machine_translation/transformer/train.de --dev_file ../examples/machine_translation/transformer/dev.en ../examples/machine_translation/transformer/dev.de --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 pact_train:null
 fpgm_train:null
 distill_train:null
@@ -21,13 +21,13 @@ null:null
 null:null
 ##
 ===========================eval_params=========================== 
-eval:../examples/machine_translation/transformer/predict.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de --without_ft
+eval:../examples/machine_translation/transformer/predict.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de --without_ft --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 null:null
 ##
 ===========================infer_params===========================
 null:null
 null:null
-norm_export:../examples/machine_translation/transformer/export_model.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml
+norm_export:../examples/machine_translation/transformer/export_model.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 quant_export:null
 fpgm_export:null
 distill_export:null
@@ -37,7 +37,7 @@ export2:null
 infer_model:null
 infer_export:null
 infer_quant:null
-inference:../examples/machine_translation/transformer/deploy/python/inference.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --profile --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de
+inference:../examples/machine_translation/transformer/deploy/python/inference.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --profile --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de  --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 --device:gpu|cpu
 --use_mkl:True
 --threads:1|6
diff --git a/tests/transformer_base_static_params.txt b/tests/transformer_base_static_params.txt
index 973524aefb01b..d38745c4ab287 100644
--- a/tests/transformer_base_static_params.txt
+++ b/tests/transformer_base_static_params.txt
@@ -13,7 +13,7 @@ null:null
 null:null
 ##
 trainer:norm_train
-norm_train:../examples/machine_translation/transformer/static/train.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --distributed --train_file ../examples/machine_translation/transformer/train.en ../examples/machine_translation/transformer/train.de --dev_file ../examples/machine_translation/transformer/dev.en ../examples/machine_translation/transformer/dev.de
+norm_train:../examples/machine_translation/transformer/static/train.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --distributed --train_file ../examples/machine_translation/transformer/train.en ../examples/machine_translation/transformer/train.de --dev_file ../examples/machine_translation/transformer/dev.en ../examples/machine_translation/transformer/dev.de  --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 pact_train:null
 fpgm_train:null
 distill_train:null
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params=========================== 
-eval:../examples/machine_translation/transformer/static/predict.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de
+eval:../examples/machine_translation/transformer/static/predict.py --config ../examples/machine_translation/transformer/configs/transformer.base.yaml --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de  --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 null:null
 ##
 ===========================infer_params===========================
diff --git a/tests/transformer_big_dygraph_params.txt b/tests/transformer_big_dygraph_params.txt
index ea860d19a9551..82fd24e63bded 100644
--- a/tests/transformer_big_dygraph_params.txt
+++ b/tests/transformer_big_dygraph_params.txt
@@ -13,7 +13,7 @@ null:null
 null:null
 ##
 trainer:norm_train
-norm_train:../examples/machine_translation/transformer/train.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --train_file ../examples/machine_translation/transformer/train.en ../examples/machine_translation/transformer/train.de --dev_file ../examples/machine_translation/transformer/dev.en ../examples/machine_translation/transformer/dev.de
+norm_train:../examples/machine_translation/transformer/train.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --train_file ../examples/machine_translation/transformer/train.en ../examples/machine_translation/transformer/train.de --dev_file ../examples/machine_translation/transformer/dev.en ../examples/machine_translation/transformer/dev.de --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 pact_train:null
 fpgm_train:null
 distill_train:null
@@ -21,13 +21,13 @@ null:null
 null:null
 ##
 ===========================eval_params=========================== 
-eval:../examples/machine_translation/transformer/predict.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de --without_ft
+eval:../examples/machine_translation/transformer/predict.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de --without_ft  --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 null:null
 ##
 ===========================infer_params===========================
 null:null
 null:null
-norm_export:../examples/machine_translation/transformer/export_model.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml
+norm_export:../examples/machine_translation/transformer/export_model.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml  --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 quant_export:null
 fpgm_export:null
 distill_export:null
@@ -37,7 +37,7 @@ export2:null
 infer_model:null
 infer_export:null
 infer_quant:null
-inference:../examples/machine_translation/transformer/deploy/python/inference.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --profile --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de
+inference:../examples/machine_translation/transformer/deploy/python/inference.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --profile --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de  --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 --device:gpu|cpu
 --use_mkl:True
 --threads:1|6
diff --git a/tests/transformer_big_static_params.txt b/tests/transformer_big_static_params.txt
index 8f899cdad81cc..98aa9171fcb5f 100644
--- a/tests/transformer_big_static_params.txt
+++ b/tests/transformer_big_static_params.txt
@@ -13,7 +13,7 @@ null:null
 null:null
 ##
 trainer:norm_train
-norm_train:../examples/machine_translation/transformer/static/train.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --distributed --train_file ../examples/machine_translation/transformer/train.en ../examples/machine_translation/transformer/train.de --dev_file ../examples/machine_translation/transformer/dev.en ../examples/machine_translation/transformer/dev.de
+norm_train:../examples/machine_translation/transformer/static/train.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --distributed --train_file ../examples/machine_translation/transformer/train.en ../examples/machine_translation/transformer/train.de --dev_file ../examples/machine_translation/transformer/dev.en ../examples/machine_translation/transformer/dev.de --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 pact_train:null
 fpgm_train:null
 distill_train:null
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params=========================== 
-eval:../examples/machine_translation/transformer/static/predict.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de
+eval:../examples/machine_translation/transformer/static/predict.py --config ../examples/machine_translation/transformer/configs/transformer.big.yaml --test_file ../examples/machine_translation/transformer/test.en ../examples/machine_translation/transformer/test.de --vocab_file ../examples/machine_translation/transformer/vocab_all.bpe.33708 --unk_token "<unk>" --bos_token "<s>" --eos_token "<e>"
 null:null
 ##
 ===========================infer_params===========================