diff --git a/CHANGELOG.md b/CHANGELOG.md
index b3b398bfa..9b24fcf68 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -102,6 +102,7 @@ This release is compatible with TensorFlow 2 RC1.
 - Support string dtype in InputLayer (#PR 1017)
 - Support Dynamic RNN in RNN (#PR 1023)
 - Add ResNet50 static model (#PR 1030)
+- Add Transformer model (#PR 1027)
 - Add performance test code in static model (#PR 1041)
 
 ### Changed
@@ -139,8 +140,8 @@ This release is compatible with TensorFlow 2 RC1.
 - @luomai
 - @ChrisWu1997: #1010 #1015 #1025 #1030 #1040
 - @warshallrho: #1017 #1021 #1026 #1029 #1032 #1041
-- @ArnoldLIULJ: #1023
-- @JingqingZ: #1023
+- @ArnoldLIULJ: #1023 #1027
+- @JingqingZ: #1023 #1027
 
 ## [2.1.0]
diff --git a/docs/modules/models.rst b/docs/modules/models.rst
index 272f1d9c6..b8cb3f5f0 100644
--- a/docs/modules/models.rst
+++ b/docs/modules/models.rst
@@ -16,6 +16,7 @@ TensorLayer provides many pretrained models, you can easily use the whole or a p
    ResNet50
    Seq2seq
    Seq2seqLuongAttention
+   Transformer
 
 
 Base Model
@@ -57,3 +58,8 @@ Seq2seq Luong Attention
 ------------------------
 
 .. autoclass:: Seq2seqLuongAttention
+
+Transformer
+------------------------
+
+.. autoclass:: Transformer
\ No newline at end of file
diff --git a/examples/translation_task/tutorial_transformer.py b/examples/translation_task/tutorial_transformer.py
new file mode 100644
index 000000000..cc3cf4bd4
--- /dev/null
+++ b/examples/translation_task/tutorial_transformer.py
@@ -0,0 +1,157 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+import tensorflow_datasets as tfds
+import tensorflow as tf
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+from tensorlayer.models.transformer import Transformer
+from tensorlayer.models.transformer.utils import metrics
+from tensorlayer.models.transformer.utils import attention_visualisation
+import tensorlayer as tl
+"""Translation from Portuguese to English with the Transformer model.
+This tutorial provides basic instructions on how to define and train a Transformer model in TensorLayer for
+a translation task. It also shows how to visualise the attention weights of the model.
+"""
+
+
+def set_up_dataset():
+    # Set up the dataset for Portuguese-English translation from the TED Talks Open Translation Project.
+    # This dataset contains approximately 50000 training examples, 1100 validation examples, and 2000 test examples.
+    # https://www.ted.com/participate/translate
+
+    examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
+    train_examples, val_examples = examples['train'], examples['validation']
+
+    # Build a shared sub-word tokenizer from both languages and save it, so that a
+    # single vocabulary covers the Portuguese inputs and the English targets.
+    tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
+        (pt.numpy() + b' ' + en.numpy() for pt, en in train_examples), target_vocab_size=2**14
+    )
+
+    tokenizer.save_to_file("tokenizer")
+    tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file("tokenizer")
+
+    return tokenizer, train_examples
+
+
+def test_tokenizer_success(tokenizer):
+    sample_string = 'TensorLayer is awesome.'
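+    # The sub-word tokenizer is reversible: decoding the encoded ids should give
+    # back the original string exactly, which is what the assert below checks.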
+ + tokenized_string = tokenizer.encode(sample_string) + print('Tokenized string is {}'.format(tokenized_string)) + + original_string = tokenizer.decode(tokenized_string) + print('The original string: {}'.format(original_string)) + assert original_string == sample_string + + +def generate_training_dataset(train_examples, tokenizer): + + def encode(lang1, lang2): + lang1 = tokenizer.encode(lang1.numpy()) + [tokenizer.vocab_size + 1] + + lang2 = tokenizer.encode(lang2.numpy()) + [tokenizer.vocab_size + 1] + + return lang1, lang2 + + MAX_LENGTH = 50 + + def filter_max_length(x, y, max_length=MAX_LENGTH): + return tf.logical_and(tf.size(x) <= max_length, tf.size(y) <= max_length) + + def tf_encode(pt, en): + return tf.py_function(encode, [pt, en], [tf.int64, tf.int64]) + + train_dataset = train_examples.map(tf_encode) + train_dataset = train_dataset.filter(filter_max_length) + # cache the dataset to memory to get a speedup while reading from it. + train_dataset = train_dataset.cache() + BUFFER_SIZE = 20000 + BATCH_SIZE = 64 + train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1])) + train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE) + + return train_dataset + + +def model_setup(tokenizer): + # define Hyper parameters for transformer + class HYPER_PARAMS(object): + vocab_size = tokenizer.vocab_size + 10 + encoder_num_layers = 4 + decoder_num_layers = 4 + hidden_size = 128 + ff_size = 512 + num_heads = 8 + keep_prob = 0.9 + + # Default prediction params + extra_decode_length = 50 + beam_size = 5 + alpha = 0.6 # used to calculate length normalization in beam search + + label_smoothing = 0.1 + learning_rate = 2.0 + learning_rate_decay_rate = 1.0 + learning_rate_warmup_steps = 4000 + + sos_id = 0 + eos_id = tokenizer.vocab_size + 1 + + model = Transformer(HYPER_PARAMS) + + # Set the optimizer + learning_rate = CustomSchedule(HYPER_PARAMS.hidden_size, warmup_steps=HYPER_PARAMS.learning_rate_warmup_steps) + optimizer = tl.optimizers.LazyAdamOptimizer(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9) + return model, optimizer, HYPER_PARAMS + + +# Use the Adam optimizer with a custom learning rate scheduler according to the formula in the Paper "Attention is All you need" +class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): + + def __init__(self, d_model, warmup_steps=5): + super(CustomSchedule, self).__init__() + + self.d_model = d_model + self.d_model = tf.cast(self.d_model, tf.float32) + + self.warmup_steps = warmup_steps + + def __call__(self, step): + arg1 = tf.math.rsqrt(step) + arg2 = step * (self.warmup_steps**-1.5) + + return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2) + + +def tutorial_transformer(): + tokenizer, train_examples = set_up_dataset() + train_dataset = generate_training_dataset(train_examples, tokenizer) + model, optimizer, HYPER_PARAMS = model_setup(tokenizer) + + num_epochs = 10 + for epoch in range(num_epochs): + model.train() + for (batch, (inp, tar)) in enumerate(train_dataset): + with tf.GradientTape() as tape: + logits, weights_encoder, weights_decoder = model(inputs=inp, targets=tar) + logits = metrics.MetricLayer(HYPER_PARAMS.vocab_size)([logits, tar]) + logits, loss = metrics.LossLayer(HYPER_PARAMS.vocab_size, 0.1)([logits, tar]) + grad = tape.gradient(loss, model.all_weights) + optimizer.apply_gradients(zip(grad, model.all_weights)) + if (batch % 50 == 0): + print('Batch ID {} at Epoch [{}/{}]: loss {:.4f}'.format(batch, epoch + 1, num_epochs, loss)) + + 
model.eval() + sentence_en = tokenizer.encode('TensorLayer is awesome.') + [prediction, weights_decoder], weights_encoder = model(inputs=[sentence_en]) + + predicted_sentence = tokenizer.decode([i for i in prediction["outputs"][0] if i < tokenizer.vocab_size]) + print("Translated: ", predicted_sentence) + + # visualize the self attention + tokenizer_str = [tokenizer.decode([ts]) for ts in (sentence_en)] + attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], tokenizer_str, tokenizer_str) + + +if __name__ == "__main__": + tutorial_transformer() diff --git a/tensorlayer/models/__init__.py b/tensorlayer/models/__init__.py index 19f5bb665..6241bce97 100644 --- a/tensorlayer/models/__init__.py +++ b/tensorlayer/models/__init__.py @@ -10,3 +10,4 @@ from .vgg import * from .seq2seq import Seq2seq from .seq2seq_with_attention import Seq2seqLuongAttention +from .transformer.transformer import Transformer diff --git a/tensorlayer/models/transformer/__init__.py b/tensorlayer/models/transformer/__init__.py new file mode 100644 index 000000000..28c174abc --- /dev/null +++ b/tensorlayer/models/transformer/__init__.py @@ -0,0 +1,6 @@ +from .attention_layer import * +from .transformer import Transformer +from .beamsearchHelper import * +from .feedforward_layer import * +from .embedding_layer import * +from .utils import * \ No newline at end of file diff --git a/tensorlayer/models/transformer/attention_layer.py b/tensorlayer/models/transformer/attention_layer.py new file mode 100644 index 000000000..5d9e5cca7 --- /dev/null +++ b/tensorlayer/models/transformer/attention_layer.py @@ -0,0 +1,156 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of multiheaded attention and self-attention layers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl + + +class MultiHeadAttentionLayer(tl.layers.Layer): + """The :class:`MultiHeadAttentionLayer` layer is for multi-head attention computation. + The weight computation is between "key" and "query", which will then matmul with "value" to generate information + that selectively focuses on the "query" messages. 
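+    Concretely, each head computes softmax(Q K^T / sqrt(d_k)) V with
+    d_k = hidden_size / num_heads, and the per-head outputs are concatenated and
+    passed through a final linear projection back to hidden_size.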
+ + Parameters + ----------- + num_heads : int + The number of heads which allow attention computation for different features + hidden_size : int + Out dim for the layer + keep_prob : float + Keep probablity for drop-out mechanism between 0 and 1 + """ + + def __init__(self, num_heads, hidden_size, keep_prob): + + if hidden_size % num_heads: + raise ValueError( + "Hidden size ({}) must be divisible by the number of heads ({}).".format(hidden_size, num_heads) + ) + + super(MultiHeadAttentionLayer, self).__init__() + self.hidden_size = hidden_size + self.num_heads = num_heads + self.attention_dropout = 1 - keep_prob + + self.build(None) + self._built = True + + def get_config(self): + return { + "hidden_size": self.hidden_size, + "num_heads": self.num_heads, + "attention_dropout": self.attention_dropout, + } + + def build(self, inputs_shape): + + # Transformation for linearly projecting the queries, keys, and values. + self.q_transformation = self._get_weights( + "q_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') + ) + self.v_transformation = self._get_weights( + "v_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') + ) + self.k_transformation = self._get_weights( + "k_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') + ) + self.out_transformation = self._get_weights( + "out_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') + ) + + def split_heads(self, x): + + with tf.name_scope("split_heads"): + batch_size = tf.shape(x)[0] + length = tf.shape(x)[1] + + # Calculate depth of last dimension after it has been split. + depth = (self.hidden_size // self.num_heads) + + # Split the last dimension + x = tf.reshape(x, [batch_size, length, self.num_heads, depth]) + + # Transpose the result + return tf.transpose(x, [0, 2, 1, 3]) + + def combine_heads(self, x): + + with tf.name_scope("combine_heads"): + batch_size = tf.shape(x)[0] + length = tf.shape(x)[2] + x = tf.transpose(x, [0, 2, 1, 3]) # --> [batch, length, num_heads, depth] + return tf.reshape(x, [batch_size, length, self.hidden_size]) + + def forward(self, x, y, mask, cache=None): + """Apply attention mechanism to x and y.""" + # Linearly project the query (q), key (k) and value (v) using different + # learned projections. This is in preparation of splitting them into + # multiple heads. Multi-head attention uses multiple queries, keys, and + # values rather than regular attention (which uses a single q, k, v). + + v = k = y + q = x + + q = tf.tensordot(q, self.q_transformation, axes=[[2], [0]]) + k = tf.tensordot(k, self.k_transformation, axes=[[2], [0]]) + v = tf.tensordot(v, self.v_transformation, axes=[[2], [0]]) + + if cache is not None: + + # Combine cached keys and values with new keys and values. + k = tf.concat([cache["k"], k], axis=1) + v = tf.concat([cache["v"], v], axis=1) + + # Update cache + cache["k"] = k + cache["v"] = v + + # Split q, k, v into heads. + q = self.split_heads(q) + k = self.split_heads(k) + v = self.split_heads(v) #(Batch, num_head, length_v, dk) + + # Scale q to prevent the dot product between q and k from growing too large. 
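+        # (dot products grow with the key dimension d_k = hidden_size / num_heads,
+        # so q is multiplied by 1 / sqrt(d_k), as in "Attention Is All You Need")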
+ depth = (self.hidden_size // self.num_heads) + q *= depth**-0.5 + + # Calculate dot product attention + logits = tf.matmul(q, k, transpose_b=True) #(Batch, num_head, length_q, length_k) + logits += mask + weights = tf.nn.softmax(logits, name="attention_weights") #(Batch, num_head, length_q, length_k) + weights_store = weights + if self.is_train: + weights = tf.nn.dropout(weights, rate=self.attention_dropout) + + attention_output = tf.matmul(weights, v) + + # Recombine heads --> [batch_size, length, hidden_size] + attention_output = self.combine_heads(attention_output) + + # Run the combined outputs through another linear projection layer. + attention_output = tf.tensordot(attention_output, self.out_transformation, axes=[[2], [0]]) + return attention_output, weights_store + + +class SelfAttentionLayer(MultiHeadAttentionLayer): + """Multiheaded self-attention layer.""" + + def forward(self, inputs, mask, cache=None): + return super(SelfAttentionLayer, self).forward(x=inputs, y=inputs, mask=mask, cache=cache) diff --git a/tensorlayer/models/transformer/beamsearchHelper/__init__.py b/tensorlayer/models/transformer/beamsearchHelper/__init__.py new file mode 100644 index 000000000..83c248180 --- /dev/null +++ b/tensorlayer/models/transformer/beamsearchHelper/__init__.py @@ -0,0 +1 @@ +from .beam_search import * diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py new file mode 100644 index 000000000..b1959f901 --- /dev/null +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py @@ -0,0 +1,115 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Beam search in TF v2. 
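+
+The functional entry point is `sequence_beam_search` below, which wraps
+`SequenceBeamSearchV2` and returns the top decoded sequences together with their
+scores.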
+""" + +import tensorflow as tf +import tensorlayer.models.transformer.beamsearchHelper.beam_search_v1 as v1 + +_StateKeys = v1._StateKeys # pylint: disable=protected-access + + +class SequenceBeamSearchV2(v1.SequenceBeamSearch): + """Implementation of beam search loop in v2.""" + + def search(self, initial_ids, initial_cache): + """Beam search for sequences with highest scores.""" + state, state_shapes = self._create_initial_state(initial_ids, initial_cache) + finished_state = tf.while_loop( + self._continue_search, self._search_step, loop_vars=[state], shape_invariants=[state_shapes], + parallel_iterations=1, back_prop=False + ) + finished_state = finished_state[0] + + alive_seq = finished_state[_StateKeys.ALIVE_SEQ] + alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS] + finished_seq = finished_state[_StateKeys.FINISHED_SEQ] + finished_scores = finished_state[_StateKeys.FINISHED_SCORES] + finished_flags = finished_state[_StateKeys.FINISHED_FLAGS] + + return finished_seq, finished_scores + + +def sequence_beam_search( + symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id +): + """Search for sequence of subtoken ids with the largest probability. + + Parameters + ----------- + symbols_to_logits_fn : A function with ids, index, and cache as arguments. + The passed in arguments will have shape: + ids -> [batch_size * beam_size, index] + index -> [] (scalar) + cache -> nested dictionary of tensors [batch_size * beam_size, ...] + The function must return logits and new cache. + logits -> [batch * beam_size, vocab_size] + new cache -> same shape/structure as inputted cache + initial_ids : int with shape [batch_size] + Starting ids for each batch item. + initial_cache: dict + contain starting decoder variables information + vocab_size: int + size of tokens + beam_size: int + number of beams + alpha: float + strength of length normalization + max_decode_length: int + maximum length to decoded sequence + eos_id: int + id of eos token, used to determine when a sequence has finished + + Notes + ------- + The function would return: + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] + """ + + batch_size = tf.shape(initial_ids)[0] + + sbs = SequenceBeamSearchV2( + symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id + ) + return sbs.search(initial_ids, initial_cache) + + +def _expand_to_same_rank(tensor, target): + """Expands a given tensor to target's rank to be broadcastable. + + Parameters + ----------- + + tensor: input tensor to tile. Shape: [b, d1, ..., da] + target: target tensor. Shape: [b, d1, ..., da, ..., dn] + + Returns: + ----------- + Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target. + + Raises: + ValueError, if the shape rank of rank tensor/target is None. 
+ """ + if tensor.shape.rank is None: + raise ValueError("Expect rank for tensor shape, but got None.") + if target.shape.rank is None: + raise ValueError("Expect rank for target shape, but got None.") + + with tf.name_scope("expand_rank"): + diff_rank = target.shape.rank - tensor.shape.rank + for _ in range(diff_rank): + tensor = tf.expand_dims(tensor, -1) + return tensor diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py new file mode 100644 index 000000000..db3166366 --- /dev/null +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py @@ -0,0 +1,493 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Beam search to find the translated sequence with the highest probability. + +Source implementation from Tensor2Tensor: +https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/beam_search.py +""" + +import tensorflow as tf +from tensorflow.python.util import nest + +# Default value for INF +INF = 1. * 1e7 + + +class _StateKeys(object): + """Keys to dictionary storing the state of the beam search loop.""" + + # Variable storing the loop index. + CUR_INDEX = "CUR_INDEX" + + # Top sequences that are alive for each batch item. Alive sequences are ones + # that have not generated an EOS token. Sequences that reach EOS are marked as + # finished and moved to the FINISHED_SEQ tensor. + # Has shape [batch_size, beam_size, CUR_INDEX + 1] + ALIVE_SEQ = "ALIVE_SEQ" + # Log probabilities of each alive sequence. Shape [batch_size, beam_size] + ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS" + # Dictionary of cached values for each alive sequence. The cache stores + # the encoder output, attention bias, and the decoder attention output from + # the previous iteration. + ALIVE_CACHE = "ALIVE_CACHE" + + # Top finished sequences for each batch item. + # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are + # shorter than CUR_INDEX + 1 are padded with 0s. + FINISHED_SEQ = "FINISHED_SEQ" + # Scores for each finished sequence. Score = log probability / length norm + # Shape [batch_size, beam_size] + FINISHED_SCORES = "FINISHED_SCORES" + # Flags indicating which sequences in the finished sequences are finished. + # At the beginning, all of the sequences in FINISHED_SEQ are filler values. + # True -> finished sequence, False -> filler. 
Shape [batch_size, beam_size] + FINISHED_FLAGS = "FINISHED_FLAGS" + + +class SequenceBeamSearch(object): + """Implementation of beam search loop.""" + + def __init__(self, symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id): + self.symbols_to_logits_fn = symbols_to_logits_fn + self.vocab_size = vocab_size + self.batch_size = batch_size + self.beam_size = beam_size + self.alpha = alpha + self.max_decode_length = max_decode_length + self.eos_id = eos_id + + def search(self, initial_ids, initial_cache): + """Beam search for sequences with highest scores.""" + state, state_shapes = self._create_initial_state(initial_ids, initial_cache) + + finished_state = tf.while_loop( + self._continue_search, self._search_step, loop_vars=[state], shape_invariants=[state_shapes], + parallel_iterations=1, back_prop=False + ) + finished_state = finished_state[0] + + alive_seq = finished_state[_StateKeys.ALIVE_SEQ] + alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS] + finished_seq = finished_state[_StateKeys.FINISHED_SEQ] + finished_scores = finished_state[_StateKeys.FINISHED_SCORES] + finished_flags = finished_state[_StateKeys.FINISHED_FLAGS] + + # Account for corner case where there are no finished sequences for a + # particular batch item. In that case, return alive sequences for that batch + # item. + finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq, alive_seq) + finished_scores = tf.where(tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs) + return finished_seq, finished_scores + + def _create_initial_state(self, initial_ids, initial_cache): + """Return initial state dictionary and its shape invariants. + + Parameters + ----------- + initial_ids: initial ids to pass into the symbols_to_logits_fn. + int tensor with shape [batch_size, 1] + initial_cache: dictionary storing values to be passed into the + symbols_to_logits_fn. + + Returns: + ----------- + state and shape invariant dictionaries with keys from _StateKeys + """ + # Current loop index (starts at 0) + cur_index = tf.constant(0) + + # Create alive sequence with shape [batch_size, beam_size, 1] + alive_seq = _expand_to_beam_size(initial_ids, self.beam_size) + alive_seq = tf.expand_dims(alive_seq, axis=2) + + # Create tensor for storing initial log probabilities. + # Assume initial_ids are prob 1.0 + initial_log_probs = tf.constant([[0.] + [-float("inf")] * (self.beam_size - 1)]) + alive_log_probs = tf.tile(initial_log_probs, [self.batch_size, 1]) + + # Expand all values stored in the dictionary to the beam size, so that each + # beam has a separate cache. + alive_cache = nest.map_structure(lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache) + + # Initialize tensor storing finished sequences with filler values. + finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32) + + # Set scores of the initial finished seqs to negative infinity. + finished_scores = tf.ones([self.batch_size, self.beam_size]) * -INF + + # Initialize finished flags with all False values. + finished_flags = tf.zeros([self.batch_size, self.beam_size], tf.bool) + + # Create state dictionary + state = { + _StateKeys.CUR_INDEX: cur_index, + _StateKeys.ALIVE_SEQ: alive_seq, + _StateKeys.ALIVE_LOG_PROBS: alive_log_probs, + _StateKeys.ALIVE_CACHE: alive_cache, + _StateKeys.FINISHED_SEQ: finished_seq, + _StateKeys.FINISHED_SCORES: finished_scores, + _StateKeys.FINISHED_FLAGS: finished_flags + } + + # Create state invariants for each value in the state dictionary. 
Each + # dimension must be a constant or None. A None dimension means either: + # 1) the dimension's value is a tensor that remains the same but may + # depend on the input sequence to the model (e.g. batch size). + # 2) the dimension may have different values on different iterations. + state_shape_invariants = { + _StateKeys.CUR_INDEX: tf.TensorShape([]), + _StateKeys.ALIVE_SEQ: tf.TensorShape([None, self.beam_size, None]), + _StateKeys.ALIVE_LOG_PROBS: tf.TensorShape([None, self.beam_size]), + _StateKeys.ALIVE_CACHE: nest.map_structure(_get_shape_keep_last_dim, alive_cache), + _StateKeys.FINISHED_SEQ: tf.TensorShape([None, self.beam_size, None]), + _StateKeys.FINISHED_SCORES: tf.TensorShape([None, self.beam_size]), + _StateKeys.FINISHED_FLAGS: tf.TensorShape([None, self.beam_size]) + } + + return state, state_shape_invariants + + def _continue_search(self, state): + """Return whether to continue the search loop. + + The loops should terminate when + 1) when decode length has been reached, or + 2) when the worst score in the finished sequences is better than the best + score in the alive sequences (i.e. the finished sequences are provably + unchanging) + """ + i = state[_StateKeys.CUR_INDEX] + alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] + finished_scores = state[_StateKeys.FINISHED_SCORES] + finished_flags = state[_StateKeys.FINISHED_FLAGS] + + not_at_max_decode_length = tf.less(i, self.max_decode_length) + + # Calculate largest length penalty (the larger penalty, the better score). + max_length_norm = _length_normalization(self.alpha, self.max_decode_length) + # Get the best possible scores from alive sequences. + best_alive_scores = alive_log_probs[:, 0] / max_length_norm + + # Compute worst score in finished sequences for each batch element + finished_scores *= tf.cast(finished_flags, tf.float32) # set filler scores to zero + lowest_finished_scores = tf.reduce_min(finished_scores, axis=1) + + # If there are no finished sequences in a batch element, then set the lowest + # finished score to -INF for that element. + finished_batches = tf.reduce_any(finished_flags, 1) + lowest_finished_scores += (1.0 - tf.cast(finished_batches, tf.float32)) * -INF + + worst_finished_score_better_than_best_alive_score = tf.reduce_all( + tf.greater(lowest_finished_scores, best_alive_scores) + ) + + return tf.logical_and( + not_at_max_decode_length, tf.logical_not(worst_finished_score_better_than_best_alive_score) + ) + + def _search_step(self, state): + """Beam search loop body. + + Grow alive sequences by a single ID. Sequences that have reached the EOS + token are marked as finished. The alive and finished sequences with the + highest log probabilities and scores are returned. + + A sequence's finished score is calculating by dividing the log probability + by the length normalization factor. Without length normalization, the + search is more likely to return shorter sequences. + + """ + # Grow alive sequences by one token. + new_seq, new_log_probs, new_cache = self._grow_alive_seq(state) + # Collect top beam_size alive sequences + alive_state = self._get_new_alive_state(new_seq, new_log_probs, new_cache) + + # Combine newly finished sequences with existing finished sequences, and + # collect the top k scoring sequences. 
+ finished_state = self._get_new_finished_state(state, new_seq, new_log_probs) + + # Increment loop index and create new state dictionary + new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1} + new_state.update(alive_state) + new_state.update(finished_state) + return [new_state] + + def _grow_alive_seq(self, state): + """Grow alive sequences by one token, and collect top 2*beam_size sequences. + 2*beam_size sequences are collected because some sequences may have reached + the EOS token. 2*beam_size ensures that at least beam_size sequences are + still alive. + """ + i = state[_StateKeys.CUR_INDEX] + alive_seq = state[_StateKeys.ALIVE_SEQ] + alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] + alive_cache = state[_StateKeys.ALIVE_CACHE] + + beams_to_keep = 2 * self.beam_size + + # Get logits for the next candidate IDs for the alive sequences. Get the new + # cache values at the same time. + flat_ids = _flatten_beam_dim(alive_seq) # [batch_size * beam_size] + flat_cache = nest.map_structure(_flatten_beam_dim, alive_cache) + + flat_logits, flat_cache = self.symbols_to_logits_fn(flat_ids, i, flat_cache) + + # Unflatten logits to shape [batch_size, beam_size, vocab_size] + logits = _unflatten_beam_dim(flat_logits, self.batch_size, self.beam_size) + new_cache = nest.map_structure(lambda t: _unflatten_beam_dim(t, self.batch_size, self.beam_size), flat_cache) + + # Convert logits to normalized log probs + candidate_log_probs = _log_prob_from_logits(logits) + + # Calculate new log probabilities if each of the alive sequences were + # extended # by the the candidate IDs. + # Shape [batch_size, beam_size, vocab_size] + log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2) + + # Each batch item has beam_size * vocab_size candidate sequences. For each + # batch item, get the k candidates with the highest log probabilities. + flat_log_probs = tf.reshape(log_probs, [-1, self.beam_size * self.vocab_size]) + topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs, k=beams_to_keep) + + # Extract the alive sequences that generate the highest log probabilities + # after being extended. + topk_beam_indices = topk_indices // self.vocab_size + topk_seq, new_cache = _gather_beams([alive_seq, new_cache], topk_beam_indices, self.batch_size, beams_to_keep) + + # Append the most probable IDs to the topk sequences + topk_ids = topk_indices % self.vocab_size + topk_ids = tf.expand_dims(topk_ids, axis=2) + topk_seq = tf.concat([topk_seq, topk_ids], axis=2) + return topk_seq, topk_log_probs, new_cache + + def _get_new_alive_state(self, new_seq, new_log_probs, new_cache): + """Gather the top k sequences that are still alive. + """ + # To prevent finished sequences from being considered, set log probs to -INF + new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id) + new_log_probs += tf.cast(new_finished_flags, tf.float32) * -INF + + top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams( + [new_seq, new_log_probs, new_cache], new_log_probs, self.batch_size, self.beam_size + ) + + return { + _StateKeys.ALIVE_SEQ: top_alive_seq, + _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs, + _StateKeys.ALIVE_CACHE: top_alive_cache + } + + def _get_new_finished_state(self, state, new_seq, new_log_probs): + """Combine new and old finished sequences, and gather the top k sequences. 
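+
+        A finished sequence's score is its log probability divided by the length
+        normalization factor ((5 + length) / 6) ** alpha, so longer sequences are
+        not unduly penalized relative to shorter ones.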
+ """ + i = state[_StateKeys.CUR_INDEX] + finished_seq = state[_StateKeys.FINISHED_SEQ] + finished_scores = state[_StateKeys.FINISHED_SCORES] + finished_flags = state[_StateKeys.FINISHED_FLAGS] + + # First append a column of 0-ids to finished_seq to increment the length. + # New shape of finished_seq: [batch_size, beam_size, i + 1] + finished_seq = tf.concat([finished_seq, tf.zeros([self.batch_size, self.beam_size, 1], tf.int32)], axis=2) + + # Calculate new seq scores from log probabilities. + length_norm = _length_normalization(self.alpha, i + 1) + new_scores = new_log_probs / length_norm + + # Set the scores of the still-alive seq in new_seq to large negative values. + new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id) + new_scores += (1. - tf.cast(new_finished_flags, tf.float32)) * -INF + + # Combine sequences, scores, and flags. + finished_seq = tf.concat([finished_seq, new_seq], axis=1) + finished_scores = tf.concat([finished_scores, new_scores], axis=1) + finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1) + + # Return the finished sequences with the best scores. + top_finished_seq, top_finished_scores, top_finished_flags = ( + _gather_topk_beams( + [finished_seq, finished_scores, finished_flags], finished_scores, self.batch_size, self.beam_size + ) + ) + + return { + _StateKeys.FINISHED_SEQ: top_finished_seq, + _StateKeys.FINISHED_SCORES: top_finished_scores, + _StateKeys.FINISHED_FLAGS: top_finished_flags + } + + +def sequence_beam_search( + symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id +): + """Search for sequence of subtoken ids with the largest probability. + + Parameters + ----------- + symbols_to_logits_fn : A function with ids, index, and cache as arguments. + The passed in arguments will have shape: + ids -> [batch_size * beam_size, index] + index -> [] (scalar) + cache -> nested dictionary of tensors [batch_size * beam_size, ...] + The function must return logits and new cache. + logits -> [batch * beam_size, vocab_size] + new cache -> same shape/structure as inputted cache + initial_ids : int with shape [batch_size] + Starting ids for each batch item. + initial_cache: dict + contain starting decoder variables information + vocab_size: int + size of tokens + beam_size: int + number of beams + alpha: float + strength of length normalization + max_decode_length: int + maximum length to decoded sequence + eos_id: int + id of eos token, used to determine when a sequence has finished + + Notes + ------- + The function would return: + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] + """ + batch_size = tf.shape(initial_ids)[0] + sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id) + return sbs.search(initial_ids, initial_cache) + + +def _log_prob_from_logits(logits): + return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True) + + +def _length_normalization(alpha, length): + """Return length normalization factor.""" + return tf.pow(((5. + tf.cast(length, tf.float32)) / 6.), alpha) + + +def _expand_to_beam_size(tensor, beam_size): + """Tiles a given tensor by beam_size. + + Parameters + ----------- + tensor: tensor to tile [batch_size, ...] + beam_size: How much to tile the tensor by. + + Returns + ----------- + Tiled tensor [batch_size, beam_size, ...] 
+ """ + tensor = tf.expand_dims(tensor, axis=1) + tile_dims = [1] * tensor.shape.ndims + tile_dims[1] = beam_size + + return tf.tile(tensor, tile_dims) + + +def _shape_list(tensor): + """Return a list of the tensor's shape, and ensure no None values in list.""" + # Get statically known shape (may contain None's for unknown dimensions) + shape = tensor.get_shape().as_list() + + # Ensure that the shape values are not None + dynamic_shape = tf.shape(tensor) + for i in range(len(shape)): # pylint: disable=consider-using-enumerate + if shape[i] is None: + shape[i] = dynamic_shape[i] + return shape + + +def _get_shape_keep_last_dim(tensor): + shape_list = _shape_list(tensor) + + # Only the last + for i in range(len(shape_list) - 1): + shape_list[i] = None + + if isinstance(shape_list[-1], tf.Tensor): + shape_list[-1] = None + return tf.TensorShape(shape_list) + + +def _flatten_beam_dim(tensor): + """Reshapes first two dimensions in to single dimension. + """ + shape = _shape_list(tensor) + shape[0] *= shape[1] + shape.pop(1) # Remove beam dim + return tf.reshape(tensor, shape) + + +def _unflatten_beam_dim(tensor, batch_size, beam_size): + """Reshapes first dimension back to [batch_size, beam_size]. + + Parameters + ----------- + tensor: Tensor to reshape of shape [batch_size*beam_size, ...] + batch_size: Tensor, original batch size. + beam_size: int, original beam size. + + Returns + ----------- + Reshaped tensor of shape [batch_size, beam_size, ...] + """ + shape = _shape_list(tensor) + new_shape = [batch_size, beam_size] + shape[1:] + return tf.reshape(tensor, new_shape) + + +def _gather_beams(nested, beam_indices, batch_size, new_beam_size): + """Gather beams from nested structure of tensors. + + Each tensor in nested represents a batch of beams, where beam refers to a + single search state (beam search involves searching through multiple states + in parallel). + + This function is used to gather the top beams, specified by + beam_indices, from the nested tensors. + + Parameters + ----------- + nested: Nested structure (tensor, list, tuple or dict) containing tensors + with shape [batch_size, beam_size, ...]. + beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each + value in beam_indices must be between [0, beam_size), and are not + necessarily unique. + batch_size: int size of batch + new_beam_size: int number of beams to be pulled from the nested tensors. + + Returns: + ----------- + + Nested structure containing tensors with shape + [batch_size, new_beam_size, ...] + """ + # Computes the i'th coodinate that contains the batch index for gather_nd. + # Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..]. + batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size + batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size]) + + # Create coordinates to be passed to tf.gather_nd. Stacking creates a tensor + # with shape [batch_size, beam_size, 2], where the last dimension contains + # the (i, j) gathering coordinates. 
+ coordinates = tf.stack([batch_pos, beam_indices], axis=2) + + return nest.map_structure(lambda state: tf.gather_nd(state, coordinates), nested) + + +def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size): + """Gather top beams from nested structure.""" + _, topk_indexes = tf.nn.top_k(score_or_log_prob, k=beam_size) + return _gather_beams(nested, topk_indexes, batch_size, beam_size) diff --git a/tensorlayer/models/transformer/embedding_layer.py b/tensorlayer/models/transformer/embedding_layer.py new file mode 100644 index 000000000..1897b0a22 --- /dev/null +++ b/tensorlayer/models/transformer/embedding_layer.py @@ -0,0 +1,95 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of embedding layer with shared weights.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl + + +class EmbeddingLayer(tl.layers.Layer): + """Calculates input embeddings and pre-softmax linear with shared weights.""" + + def __init__(self, vocab_size, hidden_size): + """Specify characteristic parameters of embedding layer. + + Parameters + ----------- + vocab_size : int + Number of tokens in the embedding. (Typically ~32,000) + hidden_size : int + Dimensionality of the embedding. (Typically 512 or 1024) + + Examples + --------- + with TensorLayer + + + """ + super(EmbeddingLayer, self).__init__() + self.vocab_size = vocab_size + self.hidden_size = hidden_size + + self.build(tuple()) + self._built = True + + def build(self, inputs_shape): + with tf.name_scope("embedding_and_softmax"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. 
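+            # (stddev is hidden_size ** -0.5, which matches the sqrt(hidden_size)
+            # scaling applied to the embedding output in _embedding below)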
+ self.W = self._get_weights( + 'weights', shape=(self.vocab_size, self.hidden_size), + init=tf.random_normal_initializer(mean=0., stddev=self.hidden_size**-0.5) + ) + + def get_config(self): + return { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + } + + def forward(self, inputs, mode="embedding"): + """Get token embeddings of inputs.""" + if mode == "embedding": + return self._embedding(inputs) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs): + """Applies embedding based on inputs tensor.""" + with tf.name_scope("embedding"): + # Create binary mask of size [batch_size, length] + mask = tf.cast(tf.not_equal(inputs, 0), tf.float32) + embeddings = tf.gather(self.W, inputs) + embeddings *= tf.expand_dims(mask, -1) + # Scale embedding by the sqrt of the hidden size + embeddings *= self.hidden_size**0.5 + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer.""" + with tf.name_scope("presoftmax_linear"): + batch_size = tf.shape(inputs)[0] + length = tf.shape(inputs)[1] + + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.W, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) diff --git a/tensorlayer/models/transformer/feedforward_layer.py b/tensorlayer/models/transformer/feedforward_layer.py new file mode 100644 index 000000000..ecc9e5249 --- /dev/null +++ b/tensorlayer/models/transformer/feedforward_layer.py @@ -0,0 +1,81 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of fully connected network.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl + + +class TransformerFeedForwardLayer(tl.layers.Layer): + """Fully connected feedforward network.""" + + def __init__(self, hidden_size, filter_size, keep_prob): + """Initialize FeedForwardNetwork. + + Parameters + ----------- + hidden_size: int + output dim of hidden layer. + filter_size: int + filter size for the inner (first) dense layer. + relu_dropout: float + dropout rate for training. 
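+
+        Note: the constructor takes keep_prob; the dropout rate actually applied
+        between the two dense layers is relu_dropout = 1 - keep_prob.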
+ """ + super(TransformerFeedForwardLayer, self).__init__() + self.hidden_size = hidden_size + self.filter_size = filter_size + self.relu_dropout = 1 - keep_prob + self.filter_dense_layer = tl.layers.Dense( + self.filter_size, in_channels=self.hidden_size, W_init=tf.initializers.get('glorot_uniform'), + name="input_layer" + ) + self.output_dense_layer = tl.layers.Dense( + self.hidden_size, in_channels=self.filter_size, W_init=tf.initializers.get('glorot_uniform'), + name="output_layer" + ) + self.build(None) + self._built = True + + def build(self, inputs_shape): + pass + + def get_config(self): + return { + "hidden_size": self.hidden_size, + "filter_size": self.filter_size, + "relu_dropout": self.relu_dropout, + } + + def forward(self, inputs): + """Return outputs of the feedforward network.""" + # Retrieve dynamically known shapes + x = inputs + batch_size = tf.shape(x)[0] + length = tf.shape(x)[1] + x = tf.reshape(x, [-1, x.shape[-1]]) + output = self.filter_dense_layer(x) + output = tf.nn.relu(output) + output = tf.reshape(output, [batch_size, -1, output.shape[-1]]) + if self.is_train: + output = tf.nn.dropout(output, rate=self.relu_dropout) + output = tf.reshape(output, [-1, output.shape[-1]]) + output = self.output_dense_layer(output) + output = tf.reshape(output, [batch_size, -1, output.shape[-1]]) + + return output \ No newline at end of file diff --git a/tensorlayer/models/transformer/transformer.py b/tensorlayer/models/transformer/transformer.py new file mode 100644 index 000000000..c03701eb4 --- /dev/null +++ b/tensorlayer/models/transformer/transformer.py @@ -0,0 +1,529 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines the Transformer model in TF 2.0. + +Model paper: https://arxiv.org/pdf/1706.03762.pdf +Transformer model code source: https://github.com/tensorflow/tensor2tensor +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl +from tensorlayer.models import Model +import tensorlayer.models.transformer.embedding_layer as embedding_layer +from tensorlayer.models.transformer.attention_layer import SelfAttentionLayer, MultiHeadAttentionLayer +from tensorlayer.models.transformer.feedforward_layer import TransformerFeedForwardLayer +from tensorlayer.models.transformer.utils.model_utils import positional_encoding +from tensorlayer.models.transformer.utils.model_utils import get_decoder_self_attention_bias as get_target_mask +from tensorlayer.models.transformer.utils.model_utils import get_padding_bias as get_input_mask +import tensorlayer.models.transformer.beamsearchHelper.beam_search as beam_search + + +class Transformer(Model): + """Transformer model. 
+ + Parameters + ---------- + params: class + Hyper-parameters of the model including vocab_size, encoder_num_layers, decoder_num_layers, + hidden_size, ff_size, num_heads and keep_prob for training; + and extra_decode_length, beam_size and alpha for inference. + + Examples + --------- + example/translation_task/tutorial_transformer + + Returns + ------- + Stacked-layer transformer model. + """ + + def __init__(self, params, name=None): + + super(Transformer, self).__init__(name=name) + self.params = params + self.embedding_softmax_layer = embedding_layer.EmbeddingLayer(params.vocab_size, params.hidden_size) + self.encoder_stack = EncoderStack(params) + self.decoder_stack = DecoderStack(params) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, targets=None): + """Calculate target logits or inferred target sequences. + + Parameters + ---------- + inputs: input tensor list of size 1 or 2. + First item, inputs: int tensor with shape [batch_size, input_length]. + Second item (optional), targets: None or int tensor with shape + [batch_size, target_length]. + training: boolean + whether in training mode or not. + + Notes + ------- + The function would return: + If targets is defined: + Logits for each word in the target sequence: + float tensor with shape [batch_size, target_length, vocab_size] + Self-attention weights for encoder part: + a dictionary of float tensors { + "layer_0": [batch_size, number_of_heads, source_length, source_length], + "layer_1": [batch_size, number_of_heads, source_length, source_length], + ... + } + Weights for decoder part: + a dictionary of dictionary of float tensors { + "self": { + "layer_0": [batch_size, number_of_heads, target_length, target_length], + "layer_1": [batch_size, number_of_heads, target_length, target_length], + ... + } + "enc_dec": { + "layer_0": [batch_size, number_of_heads, source_length, target_length], + "layer_1": [batch_size, number_of_heads, source_length, target_length], + ... + } + } + + If target is none: + Auto-regressive beam-search decoding to generate output each one time step: + a dictionary { + outputs: [batch_size, decoded length] + scores: [batch_size, float]} + } + Weights for decoder part: + a dictionary of dictionary of float tensors { + "self": { + "layer_0": [batch_size, number_of_heads, target_length, target_length], + "layer_1": [batch_size, number_of_heads, target_length, target_length], + ... + } + "enc_dec": { + "layer_0": [batch_size, number_of_heads, source_length, target_length], + "layer_1": [batch_size, number_of_heads, source_length, target_length], + ... + } + } + Self-attention weights for encoder part: + a dictionary of float tensors { + "layer_0": [batch_size, number_of_heads, source_length, source_length], + "layer_1": [batch_size, number_of_heads, source_length, source_length], + ... + } + + """ + # # Variance scaling is used here because it seems to work in many problems. + # # Other reasonable initializers may also work just as well. + + # Calculate attention bias for encoder self-attention and decoder + # multi-headed attention layers. + attention_bias = get_input_mask(inputs) + + # Run the inputs through the encoder layer to map the symbol + # representations to continuous representations. + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. 
+ embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + encoder_outputs, weights_encoder = self.encode(inputs, inputs_padding) + # Generate output sequence if targets is None, or return logits if target + # sequence is known. + if targets is None: + return self.predict(encoder_outputs, attention_bias), weights_encoder + else: + logits, weights_decoder = self.decode(targets, encoder_outputs, attention_bias) + return logits, weights_encoder, weights_decoder + + def encode(self, inputs, attention_bias): + """Generate continuous representation for inputs. + + Parameters + ---------- + inputs: int tensor with shape [batch_size, input_length]. + attention_bias: float tensor with shape [batch_size, 1, 1, input_length]. + training: boolean, whether in training mode or not. + + Returns + ------- + Float tensor with shape [batch_size, input_length, hidden_size]: + The output of encoder + + Dictionary of float tensors { + "layer_0": [batch_size, number_of_heads, source_length, source_length], + "layer_1": [batch_size, number_of_heads, source_length, source_length], + ... + }: + Self-attention weights for encoder part + """ + + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. + embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + length = tf.shape(embedded_inputs)[1] + pos_encoding = positional_encoding(length, self.params.hidden_size) + encoder_inputs = embedded_inputs + pos_encoding + + if self.is_train: + encoder_inputs = tf.nn.dropout(encoder_inputs, rate=1 - self.params.keep_prob) + return self.encoder_stack(encoder_inputs, input_mask=attention_bias) + + def decode(self, targets, encoder_outputs, attention_bias): + """Generate logits for each value in the target sequence. + + Parameters + ---------- + targets: target values for the output sequence. int tensor with shape + [batch_size, target_length] + encoder_outputs: continuous representation of input sequence. float tensor + with shape [batch_size, input_length, hidden_size] + attention_bias: float tensor with shape [batch_size, 1, 1, input_length] + training: boolean, whether in training mode or not. + + Returns + ------- + Float32 tensor with shape [batch_size, target_length, vocab_size]: + Output of decoder part + + Dictionary of dictionary of float tensors { + "self": { + "layer_0": [batch_size, number_of_heads, target_length, target_length], + "layer_1": [batch_size, number_of_heads, target_length, target_length], + ... + } + "enc_dec": { + "layer_0": [batch_size, number_of_heads, source_length, target_length], + "layer_1": [batch_size, number_of_heads, source_length, target_length], + ... + } + }: + Weights for decoder part + """ + with tf.name_scope("decode"): + # Prepare inputs to decoder layers by shifting targets, adding positional + # encoding and applying dropout. 
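+            # Teacher forcing: the ground-truth target tokens, shifted right below,
+            # are used as the decoder inputs during training.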
+ decoder_inputs = self.embedding_softmax_layer(targets) + with tf.name_scope("shift_targets"): + # Shift targets to the right, and remove the last element + decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]], + constant_values=self.params.sos_id)[:, :-1, :] + with tf.name_scope("add_pos_encoding"): + length = tf.shape(decoder_inputs)[1] + decoder_inputs += positional_encoding(length, self.params.hidden_size) + if self.is_train: + decoder_inputs = tf.nn.dropout(decoder_inputs, rate=1 - self.params.keep_prob) + + # Run values + decoder_self_attention_bias = get_target_mask(length) + outputs, weights = self.decoder_stack( + decoder_inputs, + features=encoder_outputs, + input_mask=attention_bias, + target_mask=decoder_self_attention_bias, + ) + logits = self.embedding_softmax_layer(outputs, mode="linear") + return logits, weights + + def _get_symbols_to_logits_fn(self, max_decode_length): + """Returns a decoding function that calculates logits of the next tokens.""" + + timing_signal = positional_encoding(max_decode_length + 1, self.params.hidden_size) + decoder_self_attention_bias = get_target_mask(max_decode_length) + weights = [] + + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs.""" + + # Set decoder input to the last generated IDs + decoder_input = ids[:, -1:] + + # Preprocess decoder input by getting embeddings and adding timing signal. + decoder_input = self.embedding_softmax_layer(decoder_input) + decoder_input += timing_signal[i:i + 1] + + self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1] + decoder_outputs, weight = self.decoder_stack( + decoder_input, features=cache.get("encoder_outputs"), target_mask=self_attention_bias, + input_mask=cache.get("encoder_decoder_attention_bias"), cache=cache + ) + weights.append(weight) + logits = self.embedding_softmax_layer(decoder_outputs, mode="linear") + logits = tf.squeeze(logits, axis=[1]) + return logits, cache + + return symbols_to_logits_fn, weights + + def predict(self, encoder_outputs, encoder_decoder_attention_bias): + + batch_size = tf.shape(encoder_outputs)[0] + input_length = tf.shape(encoder_outputs)[1] + max_decode_length = input_length + self.params.extra_decode_length + + symbols_to_logits_fn, weights = self._get_symbols_to_logits_fn(max_decode_length) + + # Create initial set of IDs that will be passed into symbols_to_logits_fn. + initial_ids = tf.ones([batch_size], dtype=tf.int32) * self.params.sos_id + + # Create cache storing decoder attention values for each layer. + # pylint: disable=g-complex-comprehension + cache = { + "layer_%d" % layer: { + "k": tf.zeros([batch_size, 0, self.params.hidden_size]), + "v": tf.zeros([batch_size, 0, self.params.hidden_size]) + } for layer in range(self.params.encoder_num_layers) + } + # pylint: enable=g-complex-comprehension + + # Add encoder output and attention bias to the cache. + cache["encoder_outputs"] = encoder_outputs + cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias + + # Use beam search to find the top beam_size sequences and scores. 
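+        # decoded_ids has shape [batch_size, beam_size, max_decode_length] and
+        # scores has shape [batch_size, beam_size]; the best beam is taken below.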
+ decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, initial_ids=initial_ids, initial_cache=cache, + vocab_size=self.params.vocab_size, beam_size=self.params.beam_size, alpha=self.params.alpha, + max_decode_length=max_decode_length, eos_id=self.params.eos_id + ) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, 1:] + top_scores = scores[:, 0] + + # post-process the weight attention + for i, weight in enumerate(weights): + if (i == 0): + w = weight + else: + for k in range(len(w['self'])): + w['self']['layer_%d' % k + ] = tf.concat([w['self']['layer_%d' % k], weight['self']['layer_%d' % k]], 3) + w['enc_dec']['layer_%d' % k + ] = tf.concat([w['enc_dec']['layer_%d' % k], weight['enc_dec']['layer_%d' % k]], 2) + return {"outputs": top_decoded_ids, "scores": top_scores}, w + + +class LayerNormalization(tl.layers.Layer): + """ + Layer normalization + + Parameters + ---------- + hidden_size: int + hidden size of features + epsilon: float + value to prevent division by zero + """ + + def __init__(self, hidden_size, epsilon=1e-6): + super(LayerNormalization, self).__init__() + self.hidden_size = hidden_size + self.epsilon = epsilon + + self.build(tuple()) + self._built = True + + def build(self, inputs_shape): + self.scale = self._get_weights('scale', shape=(self.hidden_size), init=tl.initializers.Ones()) + self.bias = self._get_weights('bias', shape=(self.hidden_size), init=tl.initializers.Zeros()) + + def forward(self, inputs): + mean = tf.reduce_mean(inputs, axis=[-1], keepdims=True) + var = tf.reduce_mean(tf.square(inputs - mean), axis=[-1], keepdims=True) + norm_inputs = (inputs - mean) * tf.math.rsqrt(var + self.epsilon) + return norm_inputs * self.scale + self.bias + + def __repr__(self): + return "layer normalization" + + +class PrePostProcessingWrapper(Model): + """Wrapper class that applies layer pre-processing and post-processing.""" + + def __init__(self, layer, params): + super(PrePostProcessingWrapper, self).__init__() + self.layer = layer + self.params = params + self.postprocess_dropout = 1 - params.keep_prob + self.layer_norm = LayerNormalization(self.params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, get_weight=False, *args, **kwargs): + """Calls wrapped layer with same parameters.""" + + x = inputs + y = self.layer_norm(x) + + # Get layer output + if (get_weight): + y, weight = self.layer(y, *args, **kwargs) + else: + y = self.layer(y, *args, **kwargs) + + # Postprocessing: apply dropout and residual connection + if self.is_train: + y = tf.nn.dropout(y, rate=self.postprocess_dropout) + if (get_weight): + return x + y, weight + else: + return x + y + + +class EncoderStack(Model): + """Transformer encoder stack. + + The encoder stack is made up of N identical layers. Each layer is composed + of the sublayers: + 1. Self-attention layer + 2. Feedforward network (which is 2 fully-connected layers) + """ + + def __init__(self, params): + super(EncoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.encoder_num_layers): + # Create sublayers for each layer. 
+ self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + feed_forward_network = TransformerFeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append( + [ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ] + ) + + # Create final layer normalization layer. + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, input_mask): + """Return the output of the encoder layer stacks.""" + encoder_inputs = inputs + weights = {} + for n, layer in enumerate(self.layers): + # Run inputs through the sublayers. + self_attention_layer = layer[0] + feed_forward_network = layer[1] + + with tf.name_scope("layer_%d" % n): + with tf.name_scope("self_attention"): + encoder_inputs, weight = self_attention_layer(encoder_inputs, mask=input_mask, get_weight=True) + weights["layer_%d" % n] = weight + with tf.name_scope("ffn"): + encoder_inputs = feed_forward_network(encoder_inputs) + + return self.output_normalization(encoder_inputs), weights + + +class DecoderStack(Model): + """Transformer decoder stack. + + Like the encoder stack, the decoder stack is made up of N identical layers. + Each layer is composed of the sublayers: + 1. Self-attention layer + 2. Multi-headed attention layer combining encoder outputs with results from + the previous self-attention layer. + 3. Feedforward network (2 fully-connected layers) + """ + + def __init__(self, params): + super(DecoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.decoder_num_layers): + self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + enc_dec_attention_layer = MultiHeadAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + feed_forward_network = TransformerFeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append( + [ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(enc_dec_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ] + ) + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, features, input_mask, target_mask, cache=None): + """Return the output of the decoder layer stacks. + + Parameters + ----------- + decoder_inputs : tensor with shape [batch_size, target_length, hidden_size] + encoder_outputs : tensor with shape [batch_size, input_length, hidden_size] + decoder_self_attention_bias: bias for decoder self-attention layer. [1, 1, + target_len, target_length] + attention_bias : bias for encoder-decoder attention layer. [batch_size, 1, + 1, input_length] + training : boolean + whether in training mode or not. + cache: (Used for fast decoding) A nested dictionary storing previous + decoder self-attention values. 
The items are: + {layer_n: {"k": tensor with shape [batch_size, i, key_channels], + "v": tensor with shape [batch_size, i, value_channels]}, + ...} + + """ + decoder_inputs = inputs + decoder_self_attention_bias = target_mask + encoder_outputs = features + attention_bias = input_mask + weights_all = {"self": {}, "enc_dec": {}} + for n, layer in enumerate(self.layers): + self_attention_layer = layer[0] + enc_dec_attention_layer = layer[1] + feed_forward_network = layer[2] + + # Run inputs through the sublayers. + layer_name = "layer_%d" % n + layer_cache = cache[layer_name] if cache is not None else None + + with tf.name_scope(layer_name): + with tf.name_scope("self_attention"): + decoder_inputs, weight_self = self_attention_layer( + decoder_inputs, get_weight=True, mask=decoder_self_attention_bias, cache=layer_cache + ) + weights_all['self']["layer_%d" % n] = weight_self + with tf.name_scope("encdec_attention"): + decoder_inputs, weight_enc_dec = enc_dec_attention_layer( + decoder_inputs, get_weight=True, y=encoder_outputs, mask=attention_bias + ) + weights_all['enc_dec']["layer_%d" % n] = weight_enc_dec + with tf.name_scope("ffn"): + decoder_inputs = feed_forward_network(decoder_inputs) + + return self.output_normalization(decoder_inputs), weights_all diff --git a/tensorlayer/models/transformer/utils/__init__.py b/tensorlayer/models/transformer/utils/__init__.py new file mode 100644 index 000000000..830f64ecd --- /dev/null +++ b/tensorlayer/models/transformer/utils/__init__.py @@ -0,0 +1,3 @@ +from .model_utils import * +from .metrics import * +from .attention_visualisation import * \ No newline at end of file diff --git a/tensorlayer/models/transformer/utils/attention_visualisation.py b/tensorlayer/models/transformer/utils/attention_visualisation.py new file mode 100644 index 000000000..e98775b4e --- /dev/null +++ b/tensorlayer/models/transformer/utils/attention_visualisation.py @@ -0,0 +1,38 @@ +import matplotlib.pyplot as plt +import tensorflow as tf + + +def plot_attention_weights(attention, key, query): + '''Attention visualisation for Transformer + + Parameters + ---------- + attention : attention weights + shape of (1, number of head, length of key, length of query). + + key : key for attention computation + a list of values which would be shown as xtick labels + + value : value for attention computation + a list of values which would be shown as ytick labels + + ''' + + fig = plt.figure(figsize=(16, 8)) + attention = tf.squeeze(attention, axis=0) + + for head in range(attention.shape[0]): + ax = fig.add_subplot(attention.shape[0] // 2, 2, head + 1) + ax.matshow(attention[head], cmap='viridis') + fontdict = {'fontsize': 12} + ax.set_xticks(range(len(key))) + ax.set_yticks(range(len(query))) + + # ax.set_ylim(len(query)-1.5, -0.5) + ax.set_xticklabels([str(i) for i in key], fontdict=fontdict, rotation=90) + + ax.set_yticklabels([str(i) for i in query], fontdict=fontdict) + + ax.set_xlabel('Head {}'.format(head + 1), fontdict=fontdict) + plt.tight_layout() + plt.show() diff --git a/tensorlayer/models/transformer/utils/metrics.py b/tensorlayer/models/transformer/utils/metrics.py new file mode 100644 index 000000000..6a5aa5d35 --- /dev/null +++ b/tensorlayer/models/transformer/utils/metrics.py @@ -0,0 +1,680 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
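[Editor's aside, not part of the patch] A hedged usage sketch for the plot_attention_weights helper added above; per its docstring the attention tensor has shape (1, num_heads, key_length, query_length), and the key/query arguments are only used as axis tick labels. The token ids below are made up for illustration.

import tensorflow as tf
from tensorlayer.models.transformer.utils import attention_visualisation

key = [12, 7, 33, 4]    # e.g. source token ids
query = [5, 18, 9, 2]   # e.g. target token ids
attention = tf.nn.softmax(tf.random.normal([1, 4, len(key), len(query)]), axis=-1)
attention_visualisation.plot_attention_weights(attention, key, query)  # opens a matplotlib figure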
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functions for calculating loss, accuracy, and other model metrics. + +Metrics: + - Padded loss, accuracy, and negative log perplexity. Source: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py + - BLEU approximation. Source: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py + - ROUGE score. Source: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math + +import numpy as np +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + + +def _pad_tensors_to_same_length(x, y): + """Pad x and y so that the results have the same length (second dimension).""" + with tf.name_scope("pad_to_same_length"): + x_length = tf.shape(x)[1] + y_length = tf.shape(y)[1] + + max_length = tf.maximum(x_length, y_length) + + x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]]) + y = tf.pad(y, [[0, 0], [0, max_length - y_length]]) + return x, y + + +def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size): + """Calculate cross entropy loss while ignoring padding. + + Parameters +----------- + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + Returns: +----------- + Returns the cross entropy loss and weight tensors: float32 tensors with + shape [batch_size, max(length_logits, length_labels)] + """ + with tf.name_scope("loss", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + + # Calculate smoothing cross entropy + with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]): + confidence = 1.0 - smoothing + low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1) + soft_targets = tf.one_hot( + tf.cast(labels, tf.int32), depth=vocab_size, on_value=confidence, off_value=low_confidence + ) + xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=soft_targets) + + # Calculate the best (lowest) possible value of cross entropy, and + # subtract from the cross entropy loss. + normalizing_constant = -( + confidence * tf.log(confidence) + + tf.to_float(vocab_size - 1) * low_confidence * tf.log(low_confidence + 1e-20) + ) + xentropy -= normalizing_constant + + weights = tf.to_float(tf.not_equal(labels, 0)) + return xentropy * weights, weights + + +def _convert_to_eval_metric(metric_fn): + """Wrap a metric fn that returns scores and weights as an eval metric fn. + + The input metric_fn returns values for the current batch. The wrapper + aggregates the return values collected over all of the batches evaluated. + + Parameters +----------- + metric_fn: function that returns scores and weights for the current batch's + logits and predicted labels. 
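[Editor's aside, not part of the patch] A small numeric sketch of the label smoothing used in padded_cross_entropy_loss above: the true class receives probability 1 - smoothing, the remaining mass is spread evenly over the other vocab_size - 1 classes, and padding positions (label 0) are removed later through the returned weights.

import tensorflow as tf

vocab_size, smoothing = 5, 0.1
labels = tf.constant([[2, 0]])                    # second position is padding
confidence = 1.0 - smoothing                      # 0.9 for the true class
low_confidence = smoothing / (vocab_size - 1)     # 0.025 for every other class
soft_targets = tf.one_hot(labels, depth=vocab_size,
                          on_value=confidence, off_value=low_confidence)
print(soft_targets.numpy()[0, 0])                 # [0.025 0.025 0.9 0.025 0.025]
weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
print(weights.numpy())                            # [[1. 0.]] -> padding excluded from the loss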
+ + Returns: +----------- + function that aggregates the scores and weights from metric_fn. + """ + + def problem_metric_fn(*args): + """Returns an aggregation of the metric_fn's returned values.""" + (scores, weights) = metric_fn(*args) + + # The tf.metrics.mean function assures correct aggregation. + return tf.metrics.mean(scores, weights) + + return problem_metric_fn + + +def get_eval_metrics(logits, labels, params): + """Return dictionary of model evaluation metrics.""" + metrics = { + "accuracy": _convert_to_eval_metric(padded_accuracy)(logits, labels), + "accuracy_top5": _convert_to_eval_metric(padded_accuracy_top5)(logits, labels), + "accuracy_per_sequence": _convert_to_eval_metric(padded_sequence_accuracy)(logits, labels), + "neg_log_perplexity": _convert_to_eval_metric(padded_neg_log_perplexity)(logits, labels, params["vocab_size"]), + } + + if not params["use_tpu"]: + # TPU does not support tf.py_func + metrics.update( + { + "approx_bleu_score": _convert_to_eval_metric(bleu_score)(logits, labels), + "rouge_2_fscore": _convert_to_eval_metric(rouge_2_fscore)(logits, labels), + "rouge_L_fscore": _convert_to_eval_metric(rouge_l_fscore)(logits, labels), + } + ) + + # Prefix each of the metric names with "metrics/". This allows the metric + # graphs to display under the "metrics" category in TensorBoard. + metrics = {"metrics/%s" % k: v for k, v in six.iteritems(metrics)} + return metrics + + +def padded_accuracy(logits, labels): + """Percentage of times that predictions matches labels on non-0s.""" + with tf.variable_scope("padded_accuracy", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.to_float(tf.not_equal(labels, 0)) + outputs = tf.to_int32(tf.argmax(logits, axis=-1)) + padded_labels = tf.to_int32(labels) + return tf.to_float(tf.equal(outputs, padded_labels)), weights + + +def padded_accuracy_topk(logits, labels, k): + """Percentage of times that top-k predictions matches labels on non-0s.""" + with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.to_float(tf.not_equal(labels, 0)) + effective_k = tf.minimum(k, tf.shape(logits)[-1]) + _, outputs = tf.nn.top_k(logits, k=effective_k) + outputs = tf.to_int32(outputs) + padded_labels = tf.to_int32(labels) + padded_labels = tf.expand_dims(padded_labels, axis=-1) + padded_labels += tf.zeros_like(outputs) # Pad to same shape. + same = tf.to_float(tf.equal(outputs, padded_labels)) + same_topk = tf.reduce_sum(same, axis=-1) + return same_topk, weights + + +def padded_accuracy_top5(logits, labels): + return padded_accuracy_topk(logits, labels, 5) + + +def padded_sequence_accuracy(logits, labels): + """Percentage of times that predictions matches labels everywhere (non-0).""" + with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.to_float(tf.not_equal(labels, 0)) + outputs = tf.to_int32(tf.argmax(logits, axis=-1)) + padded_labels = tf.to_int32(labels) + not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights + axis = list(range(1, len(outputs.get_shape()))) + correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis)) + return correct_seq, tf.constant(1.0) + + +def padded_neg_log_perplexity(logits, labels, vocab_size): + """Average log-perplexity excluding padding 0s. 
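[Editor's aside, not part of the patch] A hedged sketch of the masking that padded_accuracy above relies on: positions whose label is 0 get weight 0, so they count neither as correct nor as incorrect when the scores are aggregated.

import tensorflow as tf

labels = tf.constant([[3, 1, 0, 0]])          # last two positions are padding
predictions = tf.constant([[3, 2, 5, 0]])     # stand-in for argmax of some logits
weights = tf.cast(tf.not_equal(labels, 0), tf.float32)        # [1, 1, 0, 0]
correct = tf.cast(tf.equal(predictions, labels), tf.float32)  # [1, 0, 0, 1]
accuracy = tf.reduce_sum(correct * weights) / tf.reduce_sum(weights)
print(accuracy.numpy())  # 0.5 -> only the two real tokens are scored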
No smoothing.""" + num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size) + return -num, den + + +def bleu_score(logits, labels): + """Approximate BLEU score computation between labels and predictions. + + An approximate BLEU scoring method since we do not glue word pieces or + decode the ids and tokenize the output. By default, we use ngram order of 4 + and use brevity penalty. Also, this does not have beam search. + + Parameters +----------- + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch-size, length_labels] + + Returns: +----------- + bleu: int, approx bleu score + """ + predictions = tf.to_int32(tf.argmax(logits, axis=-1)) + # TODO: Look into removing use of py_func + bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32) + return bleu, tf.constant(1.0) + + +def _get_ngrams_with_counter(segment, max_order): + """Extracts all n-grams up to a given maximum order from an input segment. + + Parameters +----------- + segment: text segment from which n-grams will be extracted. + max_order: maximum length in tokens of the n-grams returned by this + methods. + + Returns: +----------- + The Counter containing all n-grams upto max_order in segment + with a count of how many times each n-gram occurred. + """ + ngram_counts = collections.Counter() + for order in xrange(1, max_order + 1): + for i in xrange(0, len(segment) - order + 1): + ngram = tuple(segment[i:i + order]) + ngram_counts[ngram] += 1 + return ngram_counts + + +def compute_bleu(reference_corpus, translation_corpus, max_order=4, use_bp=True): + """Computes BLEU score of translated segments against one or more references. + + Parameters +----------- + reference_corpus: list of references for each translation. Each + reference should be tokenized into a list of tokens. + translation_corpus: list of translations to score. Each translation + should be tokenized into a list of tokens. + max_order: Maximum n-gram order to use when computing BLEU score. + use_bp: boolean, whether to apply brevity penalty. + + Returns: +----------- + BLEU score. + """ + reference_length = 0 + translation_length = 0 + bp = 1.0 + geo_mean = 0 + + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + precisions = [] + + for (references, translations) in zip(reference_corpus, translation_corpus): + reference_length += len(references) + translation_length += len(translations) + ref_ngram_counts = _get_ngrams_with_counter(references, max_order) + translation_ngram_counts = _get_ngrams_with_counter(translations, max_order) + + overlap = dict( + (ngram, min(count, translation_ngram_counts[ngram])) for ngram, count in ref_ngram_counts.items() + ) + + for ngram in overlap: + matches_by_order[len(ngram) - 1] += overlap[ngram] + for ngram in translation_ngram_counts: + possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[ngram] + + precisions = [0] * max_order + smooth = 1.0 + + for i in xrange(0, max_order): + if possible_matches_by_order[i] > 0: + precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i] + if matches_by_order[i] > 0: + precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i] + else: + smooth *= 2 + precisions[i] = 1.0 / (smooth * possible_matches_by_order[i]) + else: + precisions[i] = 0.0 + + if max(precisions) > 0: + p_log_sum = sum(math.log(p) for p in precisions if p) + geo_mean = math.exp(p_log_sum / max_order) + + if use_bp: + ratio = translation_length / reference_length + bp = math.exp(1 - 1. 
/ ratio) if ratio < 1.0 else 1.0 + bleu = geo_mean * bp + return np.float32(bleu) + + +def rouge_2_fscore(logits, labels): + """ROUGE-2 F1 score computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. + + Parameters +----------- + logits: tensor, model predictions + labels: tensor, gold output. + + Returns: +----------- + rouge2_fscore: approx rouge-2 f1 score. + """ + predictions = tf.to_int32(tf.argmax(logits, axis=-1)) + # TODO: Look into removing use of py_func + rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32) + return rouge_2_f_score, tf.constant(1.0) + + +def _get_ngrams(n, text): + """Calculates n-grams. + + Parameters +----------- + n: which n-grams to calculate + text: An array of tokens + + Returns: +----------- + A set of n-grams + """ + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i + n])) + return ngram_set + + +def rouge_n(eval_sentences, ref_sentences, n=2): + """Computes ROUGE-N f1 score of two text collections of sentences. + + Source: https://www.microsoft.com/en-us/research/publication/ + rouge-a-package-for-automatic-evaluation-of-summaries/ + + Parameters +----------- + eval_sentences: Predicted sentences. + ref_sentences: Sentences from the reference set + n: Size of ngram. Defaults to 2. + + Returns: +----------- + f1 score for ROUGE-N + """ + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + eval_ngrams = _get_ngrams(n, eval_sentence) + ref_ngrams = _get_ngrams(n, ref_sentence) + ref_count = len(ref_ngrams) + eval_count = len(eval_ngrams) + + # Count the overlapping ngrams between evaluated and reference + overlapping_ngrams = eval_ngrams.intersection(ref_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case. This isn't mathematically correct, but it's good enough + if eval_count == 0: + precision = 0.0 + else: + precision = float(overlapping_count) / eval_count + if ref_count == 0: + recall = 0.0 + else: + recall = float(overlapping_count) / ref_count + f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8))) + + # return overlapping_count / reference_count + return np.mean(f1_scores, dtype=np.float32) + + +def rouge_l_fscore(predictions, labels): + """ROUGE scores computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. + + Parameters +----------- + predictions: tensor, model predictions + labels: tensor, gold output. + + Returns: +----------- + rouge_l_fscore: approx rouge-l f1 score. + """ + outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) + rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (outputs, labels), tf.float32) + return rouge_l_f_score, tf.constant(1.0) + + +def rouge_l_sentence_level(eval_sentences, ref_sentences): + """Computes ROUGE-L (sentence level) of two collections of sentences. 
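[Editor's aside, not part of the patch] Hedged usage sketches for the n-gram metrics defined above; both take batches of already tokenised sequences (ids or strings) and return a numpy float32. The toy sequences are made up for illustration.

from tensorlayer.models.transformer.utils.metrics import compute_bleu, rouge_n

references = [[1, 2, 3, 4, 5, 6]]
translations = [[1, 2, 3, 4, 0, 6]]
print(compute_bleu(references, translations))   # below 1.0, one token differs
print(compute_bleu(references, references))     # 1.0 for an exact match

hypotheses = [[1, 2, 3, 4, 5]]
targets = [[1, 2, 3, 7, 5]]
print(rouge_n(hypotheses, targets, n=2))        # bigram F1, roughly 0.5 here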
+ + Source: https://www.microsoft.com/en-us/research/publication/ + rouge-a-package-for-automatic-evaluation-of-summaries/ + + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + + Parameters +----------- + eval_sentences: The sentences that have been picked by the summarizer + ref_sentences: The sentences from the reference set + + Returns: +----------- + A float: F_lcs + """ + + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + m = float(len(ref_sentence)) + n = float(len(eval_sentence)) + lcs = _len_lcs(eval_sentence, ref_sentence) + f1_scores.append(_f_lcs(lcs, m, n)) + return np.mean(f1_scores, dtype=np.float32) + + +def _len_lcs(x, y): + """Returns the length of the Longest Common Subsequence between two seqs. + + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Parameters +----------- + x: sequence of words + y: sequence of words + + Returns + integer: Length of LCS between x and y + """ + table = _lcs(x, y) + n, m = len(x), len(y) + return table[n, m] + + +def _lcs(x, y): + """Computes the length of the LCS between two seqs. + + The implementation below uses a DP programming algorithm and runs + in O(nm) time where n = len(x) and m = len(y). + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Parameters +----------- + x: collection of words + y: collection of words + + Returns: +----------- + Table of dictionary of coord and len lcs + """ + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table + + +def _f_lcs(llcs, m, n): + """Computes the LCS-based F-measure score. + + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Parameters +----------- + llcs: Length of LCS + m: number of words in reference summary + n: number of words in candidate summary + + Returns: +----------- + Float. LCS-based F-measure score + """ + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / (r_lcs + 1e-12) + num = (1 + (beta**2)) * r_lcs * p_lcs + denom = r_lcs + ((beta**2) * p_lcs) + f_lcs = num / (denom + 1e-12) + return f_lcs + + +def _pad_tensors_to_same_length(x, y): + """Pad x and y so that the results have the same length (second dimension).""" + with tf.name_scope("pad_to_same_length"): + x_length = tf.shape(x)[1] + y_length = tf.shape(y)[1] + + max_length = tf.maximum(x_length, y_length) + + x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]]) + y = tf.pad(y, [[0, 0], [0, max_length - y_length]]) + return x, y + + +def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size): + """Calculate cross entropy loss while ignoring padding. 
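[Editor's aside, not part of the patch] A hedged sketch of the LCS helpers above, imported here only for illustration since they are private, underscore-prefixed functions: the dynamic-programming table yields the length of the longest common subsequence, which feeds the ROUGE-L F-measure.

from tensorlayer.models.transformer.utils.metrics import _len_lcs, _f_lcs

candidate = [1, 3, 5, 7, 9]   # "evaluated" sequence
reference = [1, 5, 7, 8]
lcs = _len_lcs(candidate, reference)   # longest common subsequence [1, 5, 7] -> 3
print(lcs, _f_lcs(lcs, m=len(reference), n=len(candidate)))  # 3 and an F_lcs of about 0.65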
+ + Parameters +----------- + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + + Returns: +----------- + Returns the cross entropy loss and weight tensors: float32 tensors with + shape [batch_size, max(length_logits, length_labels)] + """ + with tf.name_scope("loss"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + + # Calculate smoothing cross entropy + with tf.name_scope("smoothing_cross_entropy"): + confidence = 1.0 - smoothing + low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32) + soft_targets = tf.one_hot( + tf.cast(labels, tf.int32), depth=vocab_size, on_value=confidence, off_value=low_confidence + ) + xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=soft_targets) + + # Calculate the best (lowest) possible value of cross entropy, and + # subtract from the cross entropy loss. + normalizing_constant = -( + confidence * tf.math.log(confidence) + + tf.cast(vocab_size - 1, tf.float32) * low_confidence * tf.math.log(low_confidence + 1e-20) + ) + xentropy -= normalizing_constant + + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + return xentropy * weights, weights + + +def padded_accuracy(logits, labels): + """Percentage of times that predictions matches labels on non-0s.""" + with tf.name_scope("padded_accuracy"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32) + padded_labels = tf.cast(labels, tf.int32) + return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights + + +def padded_accuracy_topk(logits, labels, k): + """Percentage of times that top-k predictions matches labels on non-0s.""" + with tf.name_scope("padded_accuracy_topk"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + effective_k = tf.minimum(k, tf.shape(logits)[-1]) + _, outputs = tf.nn.top_k(logits, k=effective_k) + outputs = tf.cast(outputs, tf.int32) + padded_labels = tf.cast(labels, tf.int32) + padded_labels = tf.expand_dims(padded_labels, axis=-1) + padded_labels += tf.zeros_like(outputs) # Pad to same shape. + same = tf.cast(tf.equal(outputs, padded_labels), tf.float32) + same_topk = tf.reduce_sum(same, axis=-1) + return same_topk, weights + + +def padded_accuracy_top5(logits, labels): + return padded_accuracy_topk(logits, labels, 5) + + +def padded_sequence_accuracy(logits, labels): + """Percentage of times that predictions matches labels everywhere (non-0).""" + with tf.name_scope("padded_sequence_accuracy"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32) + padded_labels = tf.cast(labels, tf.int32) + not_correct = tf.cast(tf.not_equal(outputs, padded_labels), tf.float32) * weights + axis = list(range(1, len(outputs.get_shape()))) + correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis)) + return correct_seq, tf.constant(1.0) + + +def padded_neg_log_perplexity(logits, labels, vocab_size): + """Average log-perplexity excluding padding 0s. 
No smoothing.""" + num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size) + return -num, den + + +import functools + + +class MetricLayer(tf.keras.layers.Layer): + """Custom a layer of metrics for Transformer model.""" + + def __init__(self, vocab_size): + super(MetricLayer, self).__init__() + self.vocab_size = vocab_size + self.metric_mean_fns = [] + + def build(self, input_shape): + """"Builds metric layer.""" + neg_log_perplexity = functools.partial(padded_neg_log_perplexity, vocab_size=self.vocab_size) + self.metric_mean_fns = [ + (tf.keras.metrics.Mean("accuracy"), padded_accuracy), + (tf.keras.metrics.Mean("accuracy_top5"), padded_accuracy_top5), + (tf.keras.metrics.Mean("accuracy_per_sequence"), padded_sequence_accuracy), + (tf.keras.metrics.Mean("neg_log_perplexity"), neg_log_perplexity), + ] + super(MetricLayer, self).build(input_shape) + + def get_config(self): + return {"vocab_size": self.vocab_size} + + def call(self, inputs): + logits, targets = inputs[0], inputs[1] + for mean, fn in self.metric_mean_fns: + m = mean(*fn(logits, targets)) + self.add_metric(m, name="metric", aggregation='mean') + return logits + + +def transformer_loss(logits, labels, smoothing, vocab_size): + """Calculates total loss containing cross entropy with padding ignored. + + Parameters +----------- + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + + Returns: +----------- + A scalar float tensor for loss. + """ + xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing, vocab_size) + return tf.reduce_sum(xentropy) / tf.reduce_sum(weights) + + +class LossLayer(tf.keras.layers.Layer): + """Custom a layer of transformer loss for Transformer model.""" + + def __init__(self, vocab_size, label_smoothing): + super(LossLayer, self).__init__() + self.vocab_size = vocab_size + self.label_smoothing = label_smoothing + + def get_config(self): + return { + "vocab_size": self.vocab_size, + "label_smoothing": self.label_smoothing, + } + + def call(self, inputs): + logits, targets = inputs[0], inputs[1] + loss = transformer_loss(logits, targets, self.label_smoothing, self.vocab_size) + self.add_loss(loss) + return logits, loss diff --git a/tensorlayer/models/transformer/utils/model_utils.py b/tensorlayer/models/transformer/utils/model_utils.py new file mode 100644 index 000000000..5410a97e2 --- /dev/null +++ b/tensorlayer/models/transformer/utils/model_utils.py @@ -0,0 +1,108 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
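[Editor's aside, not part of the patch] Because transformer_loss above reduces the padded, label-smoothed cross entropy to a scalar, it can also be called directly without going through LossLayer; a hedged sketch with random logits:

import tensorflow as tf
from tensorlayer.models.transformer.utils.metrics import transformer_loss

vocab_size = 12
logits = tf.random.normal([2, 7, vocab_size])   # [batch, length, vocab]
labels = tf.constant([[1, 4, 2, 9, 0, 0, 0],
                      [3, 3, 8, 1, 5, 0, 0]])   # 0 marks padding
loss = transformer_loss(logits, labels, smoothing=0.1, vocab_size=vocab_size)
print(float(loss))                              # scalar; padded positions carry zero weight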
+# ============================================================================== +"""Transformer model helper methods.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import tensorflow as tf + +_NEG_INF = -1e9 + + +def positional_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1.0e4): + """Return positional encoding. + + Calculates the position encoding as a mix of sine and cosine functions with + geometrically increasing wavelengths. + Defined and formulized in Attention is All You Need, section 3.5. + + Parameters +``----------- + length : int + Sequence length. + hidden_size : int + channel number of input + min_timescale : float + Minimum scale that will be applied at each position + max_timescale : float + Maximum scale that will be applied at each position + + """ + position = tf.cast(tf.range(length), tf.float32) + num_timescales = hidden_size // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / (tf.cast(num_timescales, tf.float32) - 1) + ) + inv_timescales = min_timescale * tf.exp(tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment) + scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) + signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) + return signal + + +def get_decoder_self_attention_bias(length): + """Calculate bias for decoder that maintains model's autoregressive property. + + Creates a tensor that masks out locations that correspond to illegal + connections, so prediction at position i cannot draw information from future + positions. + + Parameters + ----------- + length: int + length of sequences in batch. + + + """ + with tf.name_scope("decoder_self_attention_bias"): + valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0) + valid_locs = tf.reshape(valid_locs, [1, 1, length, length]) + decoder_bias = _NEG_INF * (1.0 - valid_locs) + return decoder_bias + + +def get_padding(x, padding_value=0): + """Return float tensor representing the padding values in x. + + Parameters + ----------- + x: int tensor with any shape + padding_value: int + + """ + with tf.name_scope("padding"): + return tf.cast(tf.equal(x, padding_value), tf.float32) + + +def get_padding_bias(x): + """Calculate bias tensor from padding values in tensor. + + Bias tensor that is added to the pre-softmax multi-headed attention logits, + which has shape [batch_size, num_heads, length, length]. The tensor is zero at + non-padding locations, and -1e9 (negative infinity) at padding locations. + + Parameters + ----------- + x: int tensor with shape [batch_size, length] + + """ + with tf.name_scope("attention_bias"): + padding = get_padding(x) + attention_bias = padding * _NEG_INF + attention_bias = tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1) + return attention_bias diff --git a/tensorlayer/optimizers/__init__.py b/tensorlayer/optimizers/__init__.py index e74b38801..0e9890929 100644 --- a/tensorlayer/optimizers/__init__.py +++ b/tensorlayer/optimizers/__init__.py @@ -10,3 +10,4 @@ """ from .amsgrad import AMSGrad +from .lazy_adam import LazyAdamOptimizer diff --git a/tensorlayer/optimizers/lazy_adam.py b/tensorlayer/optimizers/lazy_adam.py new file mode 100644 index 000000000..5cdbab982 --- /dev/null +++ b/tensorlayer/optimizers/lazy_adam.py @@ -0,0 +1,76 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
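[Editor's aside, not part of the patch] A hedged sketch of the model_utils helpers defined above: the positional encoding has shape [length, hidden_size], the decoder self-attention bias is a lower-triangular 0 / -1e9 pattern that enforces autoregressive decoding, and the padding bias marks padded ids with -1e9 before the softmax.

import tensorflow as tf
from tensorlayer.models.transformer.utils.model_utils import (
    positional_encoding, get_decoder_self_attention_bias, get_padding_bias)

print(positional_encoding(length=6, hidden_size=8).shape)              # (6, 8)
print(get_decoder_self_attention_bias(3)[0, 0].numpy())                # strict upper triangle is -1e9
print(get_padding_bias(tf.constant([[7, 4, 0, 0]]))[0, 0, 0].numpy())  # [0. 0. -1e9 -1e9]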
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Optimizer from addons and learning rate scheduler.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +class LazyAdamOptimizer(tf.optimizers.Adam): + """Variant of the Adam optimizer that handles sparse updates more efficiently. + + The original Adam algorithm maintains two moving-average accumulators for + each trainable variable; the accumulators are updated at every step. + This class provides lazier handling of gradient updates for sparse + variables. It only updates moving-average accumulators for sparse variable + indices that appear in the current batch, rather than updating the + accumulators for all indices. Compared with the original Adam optimizer, + it can provide large improvements in model training throughput for some + applications. However, it provides slightly different semantics than the + original Adam algorithm, and may lead to different empirical results. + Note, amsgrad is currently not supported and the argument can only be + False. + + This class is borrowed from: + https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py + """ + + def _resource_apply_sparse(self, grad, var, indices): + """Applies grad for one step.""" + var_dtype = var.dtype.base_dtype + lr_t = self._decayed_lr(var_dtype) + beta_1_t = self._get_hyper('beta_1', var_dtype) + beta_2_t = self._get_hyper('beta_2', var_dtype) + local_step = tf.cast(self.iterations + 1, var_dtype) + beta_1_power = tf.math.pow(beta_1_t, local_step) + beta_2_power = tf.math.pow(beta_2_t, local_step) + epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype) + lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power)) + + # \\(m := beta1 * m + (1 - beta1) * g_t\\) + m = self.get_slot(var, 'm') + m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad + + m_update_kwargs = {'resource': m.handle, 'indices': indices, 'updates': m_t_slice} + m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs) + + # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) + v = self.get_slot(var, 'v') + v_t_slice = (beta_2_t * tf.gather(v, indices) + (1 - beta_2_t) * tf.math.square(grad)) + + v_update_kwargs = {'resource': v.handle, 'indices': indices, 'updates': v_t_slice} + v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs) + + # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) + var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t) + + var_update_kwargs = {'resource': var.handle, 'indices': indices, 'updates': var_slice} + var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs) + + return tf.group(*[var_update_op, m_update_op, v_update_op]) diff --git a/tests/models/test_transformer.py b/tests/models/test_transformer.py new file mode 100644 index 000000000..a7ee307ce --- /dev/null +++ 
b/tests/models/test_transformer.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import unittest + +import numpy as np +import tensorflow as tf +import tensorlayer as tl +from tqdm import tqdm +from sklearn.utils import shuffle +from tensorlayer.models.transformer import Transformer +from tests.utils import CustomTestCase +from tensorlayer.models.transformer.utils import metrics +from tensorlayer.models.transformer.utils import attention_visualisation +import time + + +class TINY_PARAMS(object): + vocab_size = 50+2 + encoder_num_layers = 2 + decoder_num_layers = 2 + hidden_size = 64 + ff_size = 16 + num_heads = 4 + keep_prob = 0.9 + + # Default prediction params + extra_decode_length = 5 + beam_size = 1 + alpha = 0.6 # used to calculate length normalization in beam search + + eos_id = 51 + sos_id = 0 + + +class Model_Transformer_Test(CustomTestCase): + + @classmethod + def setUpClass(cls): + cls.batch_size = 50 + + cls.embedding_size = 32 + cls.dec_seq_length = 5 + cls.trainX = np.random.randint(low=0, high=50, size=(50, 11)) + cls.trainY = np.random.randint(low=0, high=50, size=(50, 10)) + + cls.trainX[:, -1] = 51 + cls.trainY[:, -1] = 51 + # Parameters + cls.src_len = len(cls.trainX) + cls.tgt_len = len(cls.trainY) + + assert cls.src_len == cls.tgt_len + + cls.num_epochs = 100 + cls.n_step = cls.src_len // cls.batch_size + + @classmethod + def tearDownClass(cls): + pass + + def test_basic_simpleSeq2Seq(self): + + model_ = Transformer(TINY_PARAMS) + + # print(", ".join(x for x in [t.name for t in model_.trainable_weights])) + + self.vocab_size = TINY_PARAMS.vocab_size + optimizer = tf.optimizers.Adam(learning_rate=0.01) + for epoch in range(self.num_epochs): + model_.train() + t = time.time() + trainX, trainY = shuffle(self.trainX, self.trainY) + total_loss, n_iter = 0, 0 + for X, Y in tqdm(tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=self.batch_size, + shuffle=False), total=self.n_step, + desc='Epoch[{}/{}]'.format(epoch + 1, self.num_epochs), leave=False): + + with tf.GradientTape() as tape: + + targets = Y + logits, weights_encoder, weights_decoder = model_(inputs=X, targets=Y) + logits = metrics.MetricLayer(self.vocab_size)([logits, targets]) + logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets]) + + grad = tape.gradient(loss, model_.all_weights) + optimizer.apply_gradients(zip(grad, model_.all_weights)) + + total_loss += loss + n_iter += 1 + print(time.time() - t) + tl.files.save_npz(model_.all_weights, name='./model_v4.npz') + model_.eval() + test_sample = trainX[0:2, :] + model_.eval() + [prediction, weights_decoder], weights_encoder = model_(inputs=test_sample) + + print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, :], "\n\n") + + print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter)) + + # visualise the self-attention weights at encoder during training + trainX, trainY = shuffle(self.trainX, self.trainY) + X = [trainX[0]] + Y = [trainY[0]] + logits, weights_encoder, weights_decoder = model_(inputs=X, targets=Y) + attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], X[0].numpy(), X[0].numpy()) + + # visualise the encoder-decoder-attention weights at decoder during training + trainX, trainY = shuffle(self.trainX, self.trainY) + X = [trainX[0]] + Y = [trainY[0]] + logits, weights_encoder, weights_decoder = model_(inputs=X, targets=Y) + 
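[Editor's aside, not part of the patch] Based on EncoderStack.forward and DecoderStack.forward earlier in this patch, weights_encoder is keyed "layer_0".."layer_{N-1}" and weights_decoder is nested as {"self": {...}, "enc_dec": {...}}; the per-layer tensors are presumed to have shape (batch, num_heads, key_length, query_length). A quick, step-level shape check that could precede the plots, using the names already in scope in this test:

for name, w in weights_encoder.items():
    print("encoder", name, w.shape)
for name, w in weights_decoder["enc_dec"].items():
    print("enc-dec", name, w.shape)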
attention_visualisation.plot_attention_weights(weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), Y[0]) + + # visualise the encoder-decoder-attention weights at decoder during inference + trainX, trainY = shuffle(self.trainX, self.trainY) + X = [trainX[0]] + # Y = [trainY[0]] + model_.eval() + [prediction, weights_decoder], weights_encoder = model_(inputs=X) + # print(X[0].numpy(), prediction["outputs"][0].numpy()) + attention_visualisation.plot_attention_weights( + weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), prediction["outputs"][0].numpy() + ) + + +if __name__ == '__main__': + unittest.main()
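[Editor's closing aside, not part of the patch] A hedged usage sketch for the LazyAdamOptimizer registered under tl.optimizers earlier in this diff: gradients flowing out of an embedding lookup arrive as IndexedSlices, so only the gathered rows have their Adam moment accumulators updated.

import tensorflow as tf
import tensorlayer as tl

embedding = tf.Variable(tf.random.normal([100, 8]))      # toy embedding table
optimizer = tl.optimizers.LazyAdamOptimizer(learning_rate=0.01)

with tf.GradientTape() as tape:
    rows = tf.gather(embedding, [3, 17])                  # only rows 3 and 17 are used
    loss = tf.reduce_sum(tf.square(rows))
grads = tape.gradient(loss, [embedding])                  # sparse IndexedSlices gradient
optimizer.apply_gradients(zip(grads, [embedding]))        # lazily updates rows 3 and 17 only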