diff --git a/examples/machine_translation/transformer/configs/transformer.base.yaml b/examples/machine_translation/transformer/configs/transformer.base.yaml index ab7622858715f..eb98e37f448f9 100644 --- a/examples/machine_translation/transformer/configs/transformer.base.yaml +++ b/examples/machine_translation/transformer/configs/transformer.base.yaml @@ -18,6 +18,8 @@ random_seed: None output_file: "predict.txt" # The , and tokens in the dictionary. special_token: ["", "", ""] +# The data type of input ids. +input_dtype: "int64" # Device to use. device: "gpu" diff --git a/examples/machine_translation/transformer/configs/transformer.big.yaml b/examples/machine_translation/transformer/configs/transformer.big.yaml index 31292d2e0fc58..f9cd40b73a627 100644 --- a/examples/machine_translation/transformer/configs/transformer.big.yaml +++ b/examples/machine_translation/transformer/configs/transformer.big.yaml @@ -18,6 +18,8 @@ random_seed: None output_file: "predict.txt" # The , and tokens in the dictionary. special_token: ["", "", ""] +# The data type of input ids. +input_dtype: "int64" # Device to use. device: "gpu" diff --git a/examples/machine_translation/transformer/reader.py b/examples/machine_translation/transformer/reader.py index 3c2d258b9b558..b908ef2276d0f 100644 --- a/examples/machine_translation/transformer/reader.py +++ b/examples/machine_translation/transformer/reader.py @@ -95,7 +95,8 @@ def convert_samples(sample): bos_idx=args.bos_idx, eos_idx=args.eos_idx, pad_idx=args.bos_idx, - pad_seq=args.pad_seq), + pad_seq=args.pad_seq, + dtype=args.input_dtype), num_workers=0) data_loaders[i] = (data_loader) return data_loaders @@ -142,7 +143,8 @@ def convert_samples(sample): bos_idx=args.bos_idx, eos_idx=args.eos_idx, pad_idx=args.bos_idx, - pad_seq=args.pad_seq), + pad_seq=args.pad_seq, + dtype=args.input_dtype), num_workers=0, return_list=True) return data_loader, trg_vocab.to_tokens @@ -163,11 +165,16 @@ def adapt_vocab_size(args): args.trg_vocab_size = padding_vocab(len(trg_vocab)) -def prepare_train_input(insts, bos_idx, eos_idx, pad_idx, pad_seq=1): +def prepare_train_input(insts, + bos_idx, + eos_idx, + pad_idx, + pad_seq=1, + dtype="int64"): """ Put all padded data needed by training into a list. """ - word_pad = Pad(pad_idx, dtype="int64") + word_pad = Pad(pad_idx, dtype=dtype) src_max_len = ( max([len(inst[0]) for inst in insts]) + pad_seq) // pad_seq * pad_seq trg_max_len = ( @@ -190,11 +197,16 @@ def prepare_train_input(insts, bos_idx, eos_idx, pad_idx, pad_seq=1): return data_inputs -def prepare_infer_input(insts, bos_idx, eos_idx, pad_idx, pad_seq=1): +def prepare_infer_input(insts, + bos_idx, + eos_idx, + pad_idx, + pad_seq=1, + dtype="int64"): """ Put all padded data needed by beam search decoder into a list. 
""" - word_pad = Pad(pad_idx, dtype="int64") + word_pad = Pad(pad_idx, dtype=dtype) src_max_len = ( max([len(inst[0]) for inst in insts]) + pad_seq) // pad_seq * pad_seq src_word = word_pad([ diff --git a/examples/machine_translation/transformer/static/predict.py b/examples/machine_translation/transformer/static/predict.py index d9fb3807d0357..5fee0b55b22f0 100644 --- a/examples/machine_translation/transformer/static/predict.py +++ b/examples/machine_translation/transformer/static/predict.py @@ -85,7 +85,7 @@ def do_predict(args): startup_program = paddle.static.Program() with paddle.static.program_guard(test_program, startup_program): src_word = paddle.static.data( - name="src_word", shape=[None, None], dtype="int64") + name="src_word", shape=[None, None], dtype=args.input_dtype) # Define model transformer = InferTransformerModel( diff --git a/examples/machine_translation/transformer/static/train.py b/examples/machine_translation/transformer/static/train.py index e5b6854b1a303..94c72935b8dc6 100644 --- a/examples/machine_translation/transformer/static/train.py +++ b/examples/machine_translation/transformer/static/train.py @@ -92,11 +92,11 @@ def do_train(args): startup_program = paddle.static.Program() with paddle.static.program_guard(train_program, startup_program): src_word = paddle.static.data( - name="src_word", shape=[None, None], dtype="int64") + name="src_word", shape=[None, None], dtype=args.input_dtype) trg_word = paddle.static.data( - name="trg_word", shape=[None, None], dtype="int64") + name="trg_word", shape=[None, None], dtype=args.input_dtype) lbl_word = paddle.static.data( - name="lbl_word", shape=[None, None, 1], dtype="int64") + name="lbl_word", shape=[None, None, 1], dtype=args.input_dtype) # Define model transformer = TransformerModel( diff --git a/paddlenlp/ops/faster_transformer/transformer/faster_transformer.py b/paddlenlp/ops/faster_transformer/transformer/faster_transformer.py index 5c4e9d68f2c47..8bee55af23d20 100644 --- a/paddlenlp/ops/faster_transformer/transformer/faster_transformer.py +++ b/paddlenlp/ops/faster_transformer/transformer/faster_transformer.py @@ -108,7 +108,7 @@ def forward(self, src_word): src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 src_pos = paddle.cast( - src_word != self.bos_id, dtype="int64") * paddle.arange( + src_word != self.bos_id, dtype=src_word.dtype) * paddle.arange( start=0, end=src_max_len) # Run encoder diff --git a/paddlenlp/transformers/transformer/modeling.py b/paddlenlp/transformers/transformer/modeling.py index 75a479c768dcd..bf3c86f4cfbd4 100644 --- a/paddlenlp/transformers/transformer/modeling.py +++ b/paddlenlp/transformers/transformer/modeling.py @@ -754,10 +754,10 @@ def forward(self, src_word, trg_word): trg_slf_attn_bias.stop_gradient = True trg_src_attn_bias = src_slf_attn_bias src_pos = paddle.cast( - src_word != self.bos_id, dtype="int64") * paddle.arange( + src_word != self.bos_id, dtype=src_word.dtype) * paddle.arange( start=0, end=src_max_len) trg_pos = paddle.cast( - trg_word != self.bos_id, dtype="int64") * paddle.arange( + trg_word != self.bos_id, dtype=src_word.dtype) * paddle.arange( start=0, end=trg_max_len) with paddle.static.amp.fp16_guard(): src_emb = self.src_word_embedding(src_word) @@ -785,6 +785,207 @@ def forward(self, src_word, trg_word): return predict + +class InferTransformerModel(TransformerModel): + """ + The Transformer model for auto-regressive generation. + + Args: + src_vocab_size (int): + The size of source vocabulary. 
+ trg_vocab_size (int): + The size of target vocabulary. + max_length (int): + The maximum length of input sequences. + num_encoder_layers (int): + The number of sub-layers to be stacked in the encoder. + num_decoder_layers (int): + The number of sub-layers to be stacked in the decoder. + n_head (int): + The number of head used in multi-head attention. + d_model (int): + The dimension for word embeddings, which is also the last dimension of + the input and output of multi-head attention, position-wise feed-forward + networks, encoder and decoder. + d_inner_hid (int): + Size of the hidden layer in position-wise feed-forward networks. + dropout (float): + Dropout rates. Used for pre-process, activation and inside attention. + weight_sharing (bool): + Whether to use weight sharing. + attn_dropout (float): + The dropout probability used in MHA to drop some attention target. + If None, use the value of dropout. Defaults to None. + act_dropout (float): + The dropout probability used after FFN activition. If None, use + the value of dropout. Defaults to None. + bos_id (int, optional): + The start token id and also is used as padding id. Defaults to 0. + eos_id (int, optional): + The end token id. Defaults to 1. + beam_size (int, optional): + The beam width for beam search. Defaults to 4. + max_out_len (int, optional): + The maximum output length. Defaults to 256. + output_time_major(bool, optional): + Indicate the data layout of predicted + Tensor. If `False`, the data layout would be batch major with shape + `[batch_size, seq_len, beam_size]`. If `True`, the data layout would + be time major with shape `[seq_len, batch_size, beam_size]`. Default + to `False`. + beam_search_version (str): + Specify beam search version. It should be in one + of [`v1`, `v2`]. If `v2`, need to set `alpha`(default to 0.6) for length + penalty. Default to `v1`. + """ + + def __init__(self, + src_vocab_size, + trg_vocab_size, + max_length, + num_encoder_layers, + num_decoder_layers, + n_head, + d_model, + d_inner_hid, + dropout, + weight_sharing, + attn_dropout=None, + act_dropout=None, + bos_id=0, + eos_id=1, + beam_size=4, + max_out_len=256, + output_time_major=False, + beam_search_version='v1', + **kwargs): + args = dict(locals()) + args.pop("self") + args.pop("__class__", None) + self.beam_size = args.pop("beam_size") + self.max_out_len = args.pop("max_out_len") + self.output_time_major = args.pop("output_time_major") + self.dropout = dropout + self.beam_search_version = args.pop('beam_search_version') + kwargs = args.pop("kwargs") + if self.beam_search_version == 'v2': + if 'alpha' in kwargs: + self.alpha = kwargs['alpha'] + else: + self.alpha = 0.6 + super(InferTransformerModel, self).__init__(**args) + + cell = TransformerDecodeCell( + self.transformer.decoder, self.trg_word_embedding, + self.trg_pos_embedding, self.linear, self.dropout) + + self.decode = TransformerBeamSearchDecoder( + cell, bos_id, eos_id, beam_size, var_dim_in_state=2) + + def forward(self, src_word, trg_word=None): + r""" + The Transformer forward method. + + Args: + src_word (Tensor): + The ids of source sequence words. It is a tensor with shape + `[batch_size, source_sequence_length]` and its data type can be + int or int64. + trg_word (Tensor): + The ids of target sequence words. Normally, it should NOT be + given. If it's given, force decoding with previous output token + will be trigger. Defaults to None. + + Returns: + Tensor: + An int64 tensor shaped indicating the predicted ids. 
Its shape is + `[batch_size, seq_len, beam_size]` or `[seq_len, batch_size, beam_size]` + according to `output_time_major`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import InferTransformerModel + + transformer = InferTransformerModel( + src_vocab_size=30000, + trg_vocab_size=30000, + max_length=256, + num_encoder_layers=6, + num_decoder_layers=6, + n_head=8, + d_model=512, + d_inner_hid=2048, + dropout=0.1, + weight_sharing=True, + bos_id=0, + eos_id=1, + beam_size=4, + max_out_len=256) + + batch_size = 5 + seq_len = 10 + transformer( + src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len])) + """ + if self.beam_search_version == 'v1': + src_max_len = paddle.shape(src_word)[-1] + src_slf_attn_bias = paddle.cast( + src_word == self.bos_id, + dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 + trg_src_attn_bias = src_slf_attn_bias + src_pos = paddle.cast( + src_word != self.bos_id, dtype=src_word.dtype) * paddle.arange( + start=0, end=src_max_len) + + # Run encoder + src_emb = self.src_word_embedding(src_word) + src_pos_emb = self.src_pos_embedding(src_pos) + src_emb = src_emb + src_pos_emb + enc_input = F.dropout( + src_emb, p=self.dropout, + training=False) if self.dropout else src_emb + enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias) + + # Init states (caches) for transformer, need to be updated according to selected beam + incremental_cache, static_cache = self.transformer.decoder.gen_cache( + enc_output, do_zip=True) + + static_cache, enc_output, trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + (static_cache, enc_output, trg_src_attn_bias), self.beam_size) + + if trg_word is not None: + trg_length = paddle.sum(paddle.cast( + trg_word != self.bos_id, dtype="int32"), + axis=-1) + else: + trg_length = None + + rs, _ = nn.decode.dynamic_decode( + decoder=self.decode, + inits=incremental_cache, + max_step_num=self.max_out_len, + memory=enc_output, + trg_src_attn_bias=trg_src_attn_bias, + static_cache=static_cache, + is_test=True, + output_time_major=self.output_time_major, + trg_word=trg_word, + trg_length=trg_length) + + return rs + + elif self.beam_search_version == 'v2': + finished_seq, finished_scores = self.beam_search_v2( + src_word, self.beam_size, self.max_out_len, self.alpha) + if self.output_time_major: + finished_seq = finished_seq.transpose([2, 0, 1]) + else: + finished_seq = finished_seq.transpose([0, 2, 1]) + + return finished_seq + def beam_search_v2(self, src_word, beam_size=4, max_len=None, alpha=0.6): """ Beam search with the alive and finished two queues, both have a beam size @@ -816,7 +1017,7 @@ def merge_beam_dim(tensor): dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 src_slf_attn_bias.stop_gradient = True src_pos = paddle.cast( - src_word != self.bos_id, dtype="int64") * paddle.arange( + src_word != self.bos_id, dtype=src_word.dtype) * paddle.arange( start=0, end=src_max_len) src_emb = self.src_word_embedding(src_word) src_pos_emb = self.src_pos_embedding(src_pos) @@ -842,8 +1043,8 @@ def merge_beam_dim(tensor): alive_seq = paddle.to_tensor( np.tile( np.array( - [[[self.bos_id]]], dtype="int64"), (batch_size, beam_size, 1 - ))) + [[[self.bos_id]]], dtype=src_word.dtype), (batch_size, + beam_size, 1))) ## init for the finished ## finished_scores = paddle.to_tensor( @@ -853,8 +1054,8 @@ def merge_beam_dim(tensor): finished_seq = paddle.to_tensor( np.tile( np.array( - [[[self.bos_id]]], dtype="int64"), (batch_size, beam_size, 1 - ))) + 
[[[self.bos_id]]], dtype=src_word.dtype), (batch_size, + beam_size, 1))) finished_flags = paddle.zeros_like(finished_scores) ### initialize inputs and states of transformer decoder ### @@ -952,7 +1153,9 @@ def grow_topk(i, logits, alive_seq, alive_log_probs, states): axis=2) states = update_states(states, topk_coordinates, beam_size) eos = paddle.full( - shape=topk_ids.shape, dtype="int64", fill_value=self.eos_id) + shape=topk_ids.shape, + dtype=alive_seq.dtype, + fill_value=self.eos_id) topk_finished = paddle.cast(paddle.equal(topk_ids, eos), "float32") # topk_seq: [batch_size, 2*beam_size, i+1] @@ -987,7 +1190,7 @@ def grow_finished(finished_seq, finished_scores, finished_flags, [ finished_seq, paddle.full( shape=[batch_size, beam_size, 1], - dtype="int64", + dtype=finished_seq.dtype, fill_value=self.eos_id) ], axis=2) @@ -1012,7 +1215,7 @@ def grow_finished(finished_seq, finished_scores, finished_flags, def inner_loop(i, trg_word, alive_seq, alive_log_probs, finished_seq, finished_scores, finished_flags, caches): trg_pos = paddle.full( - shape=trg_word.shape, dtype="int64", fill_value=i) + shape=trg_word.shape, dtype=alive_seq.dtype, fill_value=i) trg_emb = self.trg_word_embedding(trg_word) trg_pos_emb = self.trg_pos_embedding(trg_pos) trg_emb = trg_emb + trg_pos_emb @@ -1059,202 +1262,3 @@ def is_not_finish(i, trg_word, alive_seq, alive_log_probs, finished_seq, finished_scores = paddle.where(finished_flags, finished_scores, alive_log_probs) return finished_seq, finished_scores - - -class InferTransformerModel(TransformerModel): - """ - The Transformer model for auto-regressive generation. - - Args: - src_vocab_size (int): - The size of source vocabulary. - trg_vocab_size (int): - The size of target vocabulary. - max_length (int): - The maximum length of input sequences. - num_encoder_layers (int): - The number of sub-layers to be stacked in the encoder. - num_decoder_layers (int): - The number of sub-layers to be stacked in the decoder. - n_head (int): - The number of head used in multi-head attention. - d_model (int): - The dimension for word embeddings, which is also the last dimension of - the input and output of multi-head attention, position-wise feed-forward - networks, encoder and decoder. - d_inner_hid (int): - Size of the hidden layer in position-wise feed-forward networks. - dropout (float): - Dropout rates. Used for pre-process, activation and inside attention. - weight_sharing (bool): - Whether to use weight sharing. - attn_dropout (float): - The dropout probability used in MHA to drop some attention target. - If None, use the value of dropout. Defaults to None. - act_dropout (float): - The dropout probability used after FFN activition. If None, use - the value of dropout. Defaults to None. - bos_id (int, optional): - The start token id and also is used as padding id. Defaults to 0. - eos_id (int, optional): - The end token id. Defaults to 1. - beam_size (int, optional): - The beam width for beam search. Defaults to 4. - max_out_len (int, optional): - The maximum output length. Defaults to 256. - output_time_major(bool, optional): Indicate the data layout of predicted - Tensor. If `False`, the data layout would be batch major with shape - `[batch_size, seq_len, beam_size]`. If `True`, the data layout would - be time major with shape `[seq_len, batch_size, beam_size]`. Default - to `False`. - beam_search_version (str): Specify beam search version. It should be in one - of [`v1`, `v2`]. If `v2`, need to set `alpha`(default to 0.6) for length - penalty. Default to `v1`. 
- """ - - def __init__(self, - src_vocab_size, - trg_vocab_size, - max_length, - num_encoder_layers, - num_decoder_layers, - n_head, - d_model, - d_inner_hid, - dropout, - weight_sharing, - attn_dropout=None, - act_dropout=None, - bos_id=0, - eos_id=1, - beam_size=4, - max_out_len=256, - output_time_major=False, - beam_search_version='v1', - **kwargs): - args = dict(locals()) - args.pop("self") - args.pop("__class__", None) - self.beam_size = args.pop("beam_size") - self.max_out_len = args.pop("max_out_len") - self.output_time_major = args.pop("output_time_major") - self.dropout = dropout - self.beam_search_version = args.pop('beam_search_version') - kwargs = args.pop("kwargs") - if self.beam_search_version == 'v2': - if 'alpha' in kwargs: - self.alpha = kwargs['alpha'] - else: - self.alpha = 0.6 - super(InferTransformerModel, self).__init__(**args) - - cell = TransformerDecodeCell( - self.transformer.decoder, self.trg_word_embedding, - self.trg_pos_embedding, self.linear, self.dropout) - - self.decode = TransformerBeamSearchDecoder( - cell, bos_id, eos_id, beam_size, var_dim_in_state=2) - - def forward(self, src_word, trg_word=None): - r""" - The Transformer forward method. - - Args: - src_word (Tensor): - The ids of source sequence words. It is a tensor with shape - `[batch_size, source_sequence_length]` and its data type can be - int or int64. - trg_word (Tensor): - The ids of target sequence words. Normally, it should NOT be - given. If it's given, force decoding with previous output token - will be trigger. Defaults to None. - - Returns: - Tensor: - An int64 tensor shaped indicating the predicted ids. Its shape is - `[batch_size, seq_len, beam_size]` or `[seq_len, batch_size, beam_size]` - according to `output_time_major`. - - Example: - .. code-block:: - - import paddle - from paddlenlp.transformers import InferTransformerModel - - transformer = InferTransformerModel( - src_vocab_size=30000, - trg_vocab_size=30000, - max_length=256, - num_encoder_layers=6, - num_decoder_layers=6, - n_head=8, - d_model=512, - d_inner_hid=2048, - dropout=0.1, - weight_sharing=True, - bos_id=0, - eos_id=1, - beam_size=4, - max_out_len=256) - - batch_size = 5 - seq_len = 10 - transformer( - src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len])) - """ - if self.beam_search_version == 'v1': - src_max_len = paddle.shape(src_word)[-1] - src_slf_attn_bias = paddle.cast( - src_word == self.bos_id, - dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 - trg_src_attn_bias = src_slf_attn_bias - src_pos = paddle.cast( - src_word != self.bos_id, dtype="int64") * paddle.arange( - start=0, end=src_max_len) - - # Run encoder - src_emb = self.src_word_embedding(src_word) - src_pos_emb = self.src_pos_embedding(src_pos) - src_emb = src_emb + src_pos_emb - enc_input = F.dropout( - src_emb, p=self.dropout, - training=False) if self.dropout else src_emb - enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias) - - # Init states (caches) for transformer, need to be updated according to selected beam - incremental_cache, static_cache = self.transformer.decoder.gen_cache( - enc_output, do_zip=True) - - static_cache, enc_output, trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - (static_cache, enc_output, trg_src_attn_bias), self.beam_size) - - if trg_word is not None: - trg_length = paddle.sum(paddle.cast( - trg_word != self.bos_id, dtype="int64"), - axis=-1) - else: - trg_length = None - - rs, _ = nn.decode.dynamic_decode( - decoder=self.decode, - 
inits=incremental_cache, - max_step_num=self.max_out_len, - memory=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_cache=static_cache, - is_test=True, - output_time_major=self.output_time_major, - trg_word=trg_word, - trg_length=trg_length) - - return rs - - elif self.beam_search_version == 'v2': - finished_seq, finished_scores = self.beam_search_v2( - src_word, self.beam_size, self.max_out_len, self.alpha) - if self.output_time_major: - finished_seq = finished_seq.transpose([2, 0, 1]) - else: - finished_seq = finished_seq.transpose([0, 2, 1]) - - return finished_seq
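
The core of this change is threading the new `input_dtype` entry from the YAML configs through `reader.py` into the `Pad` collator, so batches can be produced as int32 instead of int64 ids. Below is a minimal standalone sketch of that pattern; the `batchify` helper and the toy batch are illustrative only (not part of the patch), and only the `paddlenlp.data.Pad` call mirrors the diff:

```python
from paddlenlp.data import Pad


def batchify(insts, eos_idx=1, pad_idx=0, pad_seq=1, dtype="int64"):
    # Mirror prepare_train_input / prepare_infer_input: round the batch's max
    # length up to a multiple of pad_seq, append <eos>, and pad with pad_idx
    # in the requested dtype (the new `input_dtype` setting).
    word_pad = Pad(pad_idx, dtype=dtype)
    max_len = (max(len(inst) for inst in insts) + pad_seq) // pad_seq * pad_seq
    return word_pad(
        [list(inst) + [eos_idx] + [pad_idx] * (max_len - 1 - len(inst))
         for inst in insts])


batch = batchify([[2, 5, 7], [2, 9]], dtype="int32")  # e.g. input_dtype: "int32"
print(batch.shape, batch.dtype)  # (2, 4) int32
```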
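
On the static-graph side (`static/train.py`, `static/predict.py`), the data placeholders pick up the same setting. A hedged sketch follows, using a `SimpleNamespace` as a stand-in for the project's parsed `args` object:

```python
from types import SimpleNamespace

import paddle

args = SimpleNamespace(input_dtype="int32")  # stand-in for the YAML-backed config

paddle.enable_static()
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(train_program, startup_program):
    # Placeholders take their dtype from the config instead of a fixed "int64",
    # so int32 id batches from the reader can be fed without an extra cast.
    src_word = paddle.static.data(
        name="src_word", shape=[None, None], dtype=args.input_dtype)
    trg_word = paddle.static.data(
        name="trg_word", shape=[None, None], dtype=args.input_dtype)
    lbl_word = paddle.static.data(
        name="lbl_word", shape=[None, None, 1], dtype=args.input_dtype)
```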
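
Inside the models, the diff replaces hard-coded `dtype="int64"` casts with `src_word.dtype`, so derived tensors such as position ids follow whatever integer type the ids were fed in. A self-contained toy sketch of that position computation (the values are made up, and the arange dtype is pinned explicitly here so the standalone snippet multiplies like-typed tensors):

```python
import paddle

bos_id = 0
# Ids may now arrive as int32 or int64, depending on the `input_dtype` config.
src_word = paddle.to_tensor([[0, 5, 7, 0],
                             [0, 9, 0, 0]], dtype="int32")
src_max_len = src_word.shape[-1]

# Position is the running index for real tokens and 0 for bos/padding
# positions; the cast follows the ids' own dtype rather than "int64".
positions = paddle.arange(start=0, end=src_max_len, dtype=src_word.dtype)
src_pos = paddle.cast(src_word != bos_id, dtype=src_word.dtype) * positions
print(src_pos)  # int32 tensor: [[0, 1, 2, 0], [0, 1, 0, 0]]
```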
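
The beam-search paths follow the same rule: step constants that used to be created as int64 now inherit the dtype of the sequences they are compared against. A toy sketch of that pattern (shapes and ids are illustrative, not the real decode loop):

```python
import paddle

eos_id = 1
# Pretend the alive beam holds int32 ids, as it would with input_dtype "int32".
alive_seq = paddle.zeros([2, 4, 1], dtype="int32")
topk_ids = paddle.to_tensor([[1, 3, 5, 1, 2, 1, 4, 6],
                             [2, 1, 1, 3, 5, 1, 1, 1]], dtype=alive_seq.dtype)

# The eos constant and the step-index tensor are built with the sequences'
# dtype instead of a hard-coded "int64".
eos = paddle.full(shape=topk_ids.shape, dtype=alive_seq.dtype, fill_value=eos_id)
topk_finished = paddle.cast(paddle.equal(topk_ids, eos), "float32")
trg_pos = paddle.full(shape=topk_ids.shape, dtype=alive_seq.dtype, fill_value=3)  # step i == 3
print(topk_finished.sum())  # number of finished hypotheses at this step
```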