From 5b85af75ff6602e77624d9376abad8419904e5c0 Mon Sep 17 00:00:00 2001
From: Lingjun Liu
Date: Mon, 22 Jul 2019 10:21:04 +0100
Subject: [PATCH 01/22] transformer updated

---
 tensorlayer/models/__init__.py                 |   1 +
 tensorlayer/models/transformer/__init__.py     |   6 +
 .../models/transformer/attention_layer.py      | 186 +++++
 .../transformer/beamsearchHelper/__init__.py   |   1 +
 .../beamsearchHelper/beam_search.py            | 107 +++
 .../beamsearchHelper/beam_search_v1.py         | 528 ++++++++++++++
 .../models/transformer/embedding_layer.py      | 103 +++
 .../models/transformer/feedforward_layer.py    |  86 +++
 tensorlayer/models/transformer/transformer.py  | 460 +++++++++++++
 .../models/transformer/utils/__init__.py       |   3 +
 .../models/transformer/utils/metrics.py        | 651 ++++++++++++++++++
 .../models/transformer/utils/model_utils.py    | 107 +++
 .../models/transformer/utils/optimizer.py      | 147 ++++
 13 files changed, 2386 insertions(+)
 create mode 100644 tensorlayer/models/transformer/__init__.py
 create mode 100644 tensorlayer/models/transformer/attention_layer.py
 create mode 100644 tensorlayer/models/transformer/beamsearchHelper/__init__.py
 create mode 100644 tensorlayer/models/transformer/beamsearchHelper/beam_search.py
 create mode 100644 tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py
 create mode 100644 tensorlayer/models/transformer/embedding_layer.py
 create mode 100644 tensorlayer/models/transformer/feedforward_layer.py
 create mode 100644 tensorlayer/models/transformer/transformer.py
 create mode 100644 tensorlayer/models/transformer/utils/__init__.py
 create mode 100644 tensorlayer/models/transformer/utils/metrics.py
 create mode 100644 tensorlayer/models/transformer/utils/model_utils.py
 create mode 100644 tensorlayer/models/transformer/utils/optimizer.py

diff --git a/tensorlayer/models/__init__.py b/tensorlayer/models/__init__.py
index 065b94885..53556f86e 100644
--- a/tensorlayer/models/__init__.py
+++ b/tensorlayer/models/__init__.py
@@ -9,3 +9,4 @@
 from .vgg import *
 from .seq2seq import Seq2seq
 from .seq2seq_with_attention import Seq2seqLuongAttention
+from .transformer import *
diff --git a/tensorlayer/models/transformer/__init__.py b/tensorlayer/models/transformer/__init__.py
new file mode 100644
index 000000000..28c174abc
--- /dev/null
+++ b/tensorlayer/models/transformer/__init__.py
@@ -0,0 +1,6 @@
+from .attention_layer import *
+from .transformer import Transformer
+from .beamsearchHelper import *
+from .feedforward_layer import *
+from .embedding_layer import *
+from .utils import *
\ No newline at end of file
diff --git a/tensorlayer/models/transformer/attention_layer.py b/tensorlayer/models/transformer/attention_layer.py
new file mode 100644
index 000000000..205ec8244
--- /dev/null
+++ b/tensorlayer/models/transformer/attention_layer.py
@@ -0,0 +1,186 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of multiheaded attention and self-attention layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+import tensorlayer as tl
+
+
+class MultiHeadAttentionLayer(tl.layers.Layer):
+    """Multi-headed attention layer."""
+
+    def __init__(self, num_heads, hidden_size, keep_prob):
+        """Initialize Attention.
+
+        Args:
+            num_heads: int, number of heads to repeat the same attention structure.
+            hidden_size: int, output dim of hidden layer.
+            keep_prob: float, keep probability for the dropout applied to the attention weights during training.
+        """
+        if hidden_size % num_heads:
+            raise ValueError(
+                "Hidden size ({}) must be divisible by the number of heads ({}).".format(hidden_size, num_heads)
+            )
+
+        super(MultiHeadAttentionLayer, self).__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.attention_dropout = 1 - keep_prob
+
+        self.build(None)
+        self._built = True
+
+    def get_config(self):
+        return {
+            "hidden_size": self.hidden_size,
+            "num_heads": self.num_heads,
+            "attention_dropout": self.attention_dropout,
+        }
+
+    def build(self, inputs_shape):
+        # Transformations for linearly projecting the queries, keys, and values.
+        self.q_transformation = self._get_weights(
+            "q_project", shape=(self.hidden_size, self.hidden_size), init=tf.keras.initializers.get('glorot_uniform')
+        )
+        self.v_transformation = self._get_weights(
+            "v_project", shape=(self.hidden_size, self.hidden_size), init=tf.keras.initializers.get('glorot_uniform')
+        )
+        self.k_transformation = self._get_weights(
+            "k_project", shape=(self.hidden_size, self.hidden_size), init=tf.keras.initializers.get('glorot_uniform')
+        )
+        self.out_transformation = self._get_weights(
+            "out_project", shape=(self.hidden_size, self.hidden_size), init=tf.keras.initializers.get('glorot_uniform')
+        )
+
+    def split_heads(self, x):
+        """Split x into different heads, and transpose the resulting value.
+
+        The tensor is transposed to ensure the inner dimensions hold the correct
+        values during the matrix multiplication.
+
+        Args:
+            x: A tensor with shape [batch_size, length, hidden_size]
+
+        Returns:
+            A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
+        """
+        with tf.name_scope("split_heads"):
+            batch_size = tf.shape(x)[0]
+            length = tf.shape(x)[1]
+
+            # Calculate depth of last dimension after it has been split.
+            depth = (self.hidden_size // self.num_heads)
+
+            # Split the last dimension
+            x = tf.reshape(x, [batch_size, length, self.num_heads, depth])
+
+            # Transpose the result
+            return tf.transpose(x, [0, 2, 1, 3])
+
+    def combine_heads(self, x):
+        """Combine a tensor that has been split into heads.
+
+        Args:
+            x: A tensor [batch_size, num_heads, length, hidden_size/num_heads]
+
+        Returns:
+            A tensor with shape [batch_size, length, hidden_size]
+        """
+        with tf.name_scope("combine_heads"):
+            batch_size = tf.shape(x)[0]
+            length = tf.shape(x)[2]
+            x = tf.transpose(x, [0, 2, 1, 3])  # --> [batch, length, num_heads, depth]
+            return tf.reshape(x, [batch_size, length, self.hidden_size])
+
+    def forward(self, inputs, mask, cache=None):
+        """Apply the attention mechanism to the query and key/value tensors.
+
+        Args:
+            inputs: a list of tensors [x, y] (or [q, k, v]); x is the query tensor with
+                shape [batch_size, length_x, hidden_size] and y supplies the keys and
+                values with shape [batch_size, length_y, hidden_size].
+            mask: attention bias that will be added to the result of the dot product.
+ cache: (Used during prediction) dictionary with tensors containing results + of previous attentions. The dictionary must have the items: + {"k": tensor with shape [batch_size, i, key_channels], + "v": tensor with shape [batch_size, i, value_channels]} + where i is the current decoded length. + + Returns: + Attention layer output with shape [batch_size, length_x, hidden_size] + """ + # Linearly project the query (q), key (k) and value (v) using different + # learned projections. This is in preparation of splitting them into + # multiple heads. Multi-head attention uses multiple queries, keys, and + # values rather than regular attention (which uses a single q, k, v). + + if (len(inputs) == 2): + q = inputs[0] + k = v = inputs[1] + + if (len(inputs) == 3): + q = inputs[0] + k = inputs[1] + v = inputs[2] + + q = tf.tensordot(q, self.q_transformation, axes=[[2], [0]]) + k = tf.tensordot(k, self.k_transformation, axes=[[2], [0]]) + v = tf.tensordot(v, self.v_transformation, axes=[[2], [0]]) + + if cache is not None: + + # Combine cached keys and values with new keys and values. + k = tf.concat([cache["k"], k], axis=1) + v = tf.concat([cache["v"], v], axis=1) + + # Update cache + cache["k"] = k + cache["v"] = v + + # Split q, k, v into heads. + q = self.split_heads(q) + k = self.split_heads(k) + v = self.split_heads(v) #(Batch, num_head, length_v, dk) + + # Scale q to prevent the dot product between q and k from growing too large. + depth = (self.hidden_size // self.num_heads) + q *= depth**-0.5 + + # Calculate dot product attention + logits = tf.matmul(q, k, transpose_b=True) #(Batch, num_head, length_q, length_k) + logits += mask + weights = tf.nn.softmax(logits, name="attention_weights") #(Batch, num_head, length_q, length_k) + if self.is_train: + weights = tf.nn.dropout(weights, rate=self.attention_dropout) + + attention_output = tf.matmul(weights, v) + + # Recombine heads --> [batch_size, length, hidden_size] + attention_output = self.combine_heads(attention_output) + + # Run the combined outputs through another linear projection layer. + attention_output = tf.tensordot(attention_output, self.out_transformation, axes=[[2], [0]]) + return attention_output + + +class SelfAttentionLayer(MultiHeadAttentionLayer): + """Multiheaded self-attention layer.""" + + def forward(self, inputs, mask, cache=None): + return super(SelfAttentionLayer, self).forward(inputs=[inputs, inputs], mask=mask, cache=cache) diff --git a/tensorlayer/models/transformer/beamsearchHelper/__init__.py b/tensorlayer/models/transformer/beamsearchHelper/__init__.py new file mode 100644 index 000000000..83c248180 --- /dev/null +++ b/tensorlayer/models/transformer/beamsearchHelper/__init__.py @@ -0,0 +1 @@ +from .beam_search import * diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py new file mode 100644 index 000000000..971e76fe0 --- /dev/null +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py @@ -0,0 +1,107 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Beam search in TF v2. +""" + +import tensorflow as tf +import tensorlayer.models.transformer.beamsearchHelper.beam_search_v1 as v1 + +_StateKeys = v1._StateKeys # pylint: disable=protected-access + + +class SequenceBeamSearchV2(v1.SequenceBeamSearch): + """Implementation of beam search loop in v2.""" + + def search(self, initial_ids, initial_cache): + """Beam search for sequences with highest scores.""" + state, state_shapes = self._create_initial_state(initial_ids, initial_cache) + finished_state = tf.while_loop( + self._continue_search, self._search_step, loop_vars=[state], shape_invariants=[state_shapes], + parallel_iterations=1, back_prop=False + ) + finished_state = finished_state[0] + + alive_seq = finished_state[_StateKeys.ALIVE_SEQ] + alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS] + finished_seq = finished_state[_StateKeys.FINISHED_SEQ] + finished_scores = finished_state[_StateKeys.FINISHED_SCORES] + finished_flags = finished_state[_StateKeys.FINISHED_FLAGS] + + # Account for corner case where there are no finished sequences for a + # particular batch item. In that case, return alive sequences for that batch + # item. + finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq, alive_seq) + finished_scores = tf.where(tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs) + return finished_seq, finished_scores + + +def sequence_beam_search( + symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id +): + """Search for sequence of subtoken ids with the largest probability. + + Args: + symbols_to_logits_fn: A function that takes in ids, index, and cache as + arguments. The passed in arguments will have shape: + ids -> [batch_size * beam_size, index] + index -> [] (scalar) + cache -> nested dictionary of tensors [batch_size * beam_size, ...] + The function must return logits and new cache. + logits -> [batch * beam_size, vocab_size] + new cache -> same shape/structure as inputted cache + initial_ids: Starting ids for each batch item. + int32 tensor with shape [batch_size] + initial_cache: dict containing starting decoder variables information + vocab_size: int size of tokens + beam_size: int number of beams + alpha: float defining the strength of length normalization + max_decode_length: maximum length to decoded sequence + eos_id: int id of eos token, used to determine when a sequence has finished + + Returns: + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] + """ + batch_size = tf.shape(initial_ids)[0] + + sbs = SequenceBeamSearchV2( + symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id + ) + return sbs.search(initial_ids, initial_cache) + + +def _expand_to_same_rank(tensor, target): + """Expands a given tensor to target's rank to be broadcastable. + + Args: + tensor: input tensor to tile. Shape: [b, d1, ..., da] + target: target tensor. Shape: [b, d1, ..., da, ..., dn] + + Returns: + Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target. + + Raises: + ValueError, if the shape rank of rank tensor/target is None. 
+ """ + if tensor.shape.rank is None: + raise ValueError("Expect rank for tensor shape, but got None.") + if target.shape.rank is None: + raise ValueError("Expect rank for target shape, but got None.") + + with tf.name_scope("expand_rank"): + diff_rank = target.shape.rank - tensor.shape.rank + for _ in range(diff_rank): + tensor = tf.expand_dims(tensor, -1) + return tensor diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py new file mode 100644 index 000000000..bf3f85c3f --- /dev/null +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py @@ -0,0 +1,528 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Beam search to find the translated sequence with the highest probability. + +Source implementation from Tensor2Tensor: +https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/beam_search.py +""" + +import tensorflow as tf +from tensorflow.python.util import nest + +# Default value for INF +INF = 1. * 1e7 + + +class _StateKeys(object): + """Keys to dictionary storing the state of the beam search loop.""" + + # Variable storing the loop index. + CUR_INDEX = "CUR_INDEX" + + # Top sequences that are alive for each batch item. Alive sequences are ones + # that have not generated an EOS token. Sequences that reach EOS are marked as + # finished and moved to the FINISHED_SEQ tensor. + # Has shape [batch_size, beam_size, CUR_INDEX + 1] + ALIVE_SEQ = "ALIVE_SEQ" + # Log probabilities of each alive sequence. Shape [batch_size, beam_size] + ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS" + # Dictionary of cached values for each alive sequence. The cache stores + # the encoder output, attention bias, and the decoder attention output from + # the previous iteration. + ALIVE_CACHE = "ALIVE_CACHE" + + # Top finished sequences for each batch item. + # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are + # shorter than CUR_INDEX + 1 are padded with 0s. + FINISHED_SEQ = "FINISHED_SEQ" + # Scores for each finished sequence. Score = log probability / length norm + # Shape [batch_size, beam_size] + FINISHED_SCORES = "FINISHED_SCORES" + # Flags indicating which sequences in the finished sequences are finished. + # At the beginning, all of the sequences in FINISHED_SEQ are filler values. + # True -> finished sequence, False -> filler. 
Shape [batch_size, beam_size] + FINISHED_FLAGS = "FINISHED_FLAGS" + + +class SequenceBeamSearch(object): + """Implementation of beam search loop.""" + + def __init__(self, symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id): + self.symbols_to_logits_fn = symbols_to_logits_fn + self.vocab_size = vocab_size + self.batch_size = batch_size + self.beam_size = beam_size + self.alpha = alpha + self.max_decode_length = max_decode_length + self.eos_id = eos_id + + def search(self, initial_ids, initial_cache): + """Beam search for sequences with highest scores.""" + state, state_shapes = self._create_initial_state(initial_ids, initial_cache) + + finished_state = tf.while_loop( + self._continue_search, self._search_step, loop_vars=[state], shape_invariants=[state_shapes], + parallel_iterations=1, back_prop=False + ) + finished_state = finished_state[0] + + alive_seq = finished_state[_StateKeys.ALIVE_SEQ] + alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS] + finished_seq = finished_state[_StateKeys.FINISHED_SEQ] + finished_scores = finished_state[_StateKeys.FINISHED_SCORES] + finished_flags = finished_state[_StateKeys.FINISHED_FLAGS] + + # Account for corner case where there are no finished sequences for a + # particular batch item. In that case, return alive sequences for that batch + # item. + finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq, alive_seq) + finished_scores = tf.where(tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs) + return finished_seq, finished_scores + + def _create_initial_state(self, initial_ids, initial_cache): + """Return initial state dictionary and its shape invariants. + + Args: + initial_ids: initial ids to pass into the symbols_to_logits_fn. + int tensor with shape [batch_size, 1] + initial_cache: dictionary storing values to be passed into the + symbols_to_logits_fn. + + Returns: + state and shape invariant dictionaries with keys from _StateKeys + """ + # Current loop index (starts at 0) + cur_index = tf.constant(0) + + # Create alive sequence with shape [batch_size, beam_size, 1] + alive_seq = _expand_to_beam_size(initial_ids, self.beam_size) + alive_seq = tf.expand_dims(alive_seq, axis=2) + + # Create tensor for storing initial log probabilities. + # Assume initial_ids are prob 1.0 + initial_log_probs = tf.constant([[0.] + [-float("inf")] * (self.beam_size - 1)]) + alive_log_probs = tf.tile(initial_log_probs, [self.batch_size, 1]) + + # Expand all values stored in the dictionary to the beam size, so that each + # beam has a separate cache. + alive_cache = nest.map_structure(lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache) + + # Initialize tensor storing finished sequences with filler values. + finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32) + + # Set scores of the initial finished seqs to negative infinity. + finished_scores = tf.ones([self.batch_size, self.beam_size]) * -INF + + # Initialize finished flags with all False values. + finished_flags = tf.zeros([self.batch_size, self.beam_size], tf.bool) + + # Create state dictionary + state = { + _StateKeys.CUR_INDEX: cur_index, + _StateKeys.ALIVE_SEQ: alive_seq, + _StateKeys.ALIVE_LOG_PROBS: alive_log_probs, + _StateKeys.ALIVE_CACHE: alive_cache, + _StateKeys.FINISHED_SEQ: finished_seq, + _StateKeys.FINISHED_SCORES: finished_scores, + _StateKeys.FINISHED_FLAGS: finished_flags + } + + # Create state invariants for each value in the state dictionary. Each + # dimension must be a constant or None. 
A None dimension means either: + # 1) the dimension's value is a tensor that remains the same but may + # depend on the input sequence to the model (e.g. batch size). + # 2) the dimension may have different values on different iterations. + state_shape_invariants = { + _StateKeys.CUR_INDEX: tf.TensorShape([]), + _StateKeys.ALIVE_SEQ: tf.TensorShape([None, self.beam_size, None]), + _StateKeys.ALIVE_LOG_PROBS: tf.TensorShape([None, self.beam_size]), + _StateKeys.ALIVE_CACHE: nest.map_structure(_get_shape_keep_last_dim, alive_cache), + _StateKeys.FINISHED_SEQ: tf.TensorShape([None, self.beam_size, None]), + _StateKeys.FINISHED_SCORES: tf.TensorShape([None, self.beam_size]), + _StateKeys.FINISHED_FLAGS: tf.TensorShape([None, self.beam_size]) + } + + return state, state_shape_invariants + + def _continue_search(self, state): + """Return whether to continue the search loop. + + The loops should terminate when + 1) when decode length has been reached, or + 2) when the worst score in the finished sequences is better than the best + score in the alive sequences (i.e. the finished sequences are provably + unchanging) + + Args: + state: A dictionary with the current loop state. + + Returns: + Bool tensor with value True if loop should continue, False if loop should + terminate. + """ + i = state[_StateKeys.CUR_INDEX] + alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] + finished_scores = state[_StateKeys.FINISHED_SCORES] + finished_flags = state[_StateKeys.FINISHED_FLAGS] + + not_at_max_decode_length = tf.less(i, self.max_decode_length) + + # Calculate largest length penalty (the larger penalty, the better score). + max_length_norm = _length_normalization(self.alpha, self.max_decode_length) + # Get the best possible scores from alive sequences. + best_alive_scores = alive_log_probs[:, 0] / max_length_norm + + # Compute worst score in finished sequences for each batch element + finished_scores *= tf.cast(finished_flags, tf.float32) # set filler scores to zero + lowest_finished_scores = tf.reduce_min(finished_scores, axis=1) + + # If there are no finished sequences in a batch element, then set the lowest + # finished score to -INF for that element. + finished_batches = tf.reduce_any(finished_flags, 1) + lowest_finished_scores += (1.0 - tf.cast(finished_batches, tf.float32)) * -INF + + worst_finished_score_better_than_best_alive_score = tf.reduce_all( + tf.greater(lowest_finished_scores, best_alive_scores) + ) + + return tf.logical_and( + not_at_max_decode_length, tf.logical_not(worst_finished_score_better_than_best_alive_score) + ) + + def _search_step(self, state): + """Beam search loop body. + + Grow alive sequences by a single ID. Sequences that have reached the EOS + token are marked as finished. The alive and finished sequences with the + highest log probabilities and scores are returned. + + A sequence's finished score is calculating by dividing the log probability + by the length normalization factor. Without length normalization, the + search is more likely to return shorter sequences. + + Args: + state: A dictionary with the current loop state. + + Returns: + new state dictionary. + """ + # Grow alive sequences by one token. + new_seq, new_log_probs, new_cache = self._grow_alive_seq(state) + # Collect top beam_size alive sequences + alive_state = self._get_new_alive_state(new_seq, new_log_probs, new_cache) + + # Combine newly finished sequences with existing finished sequences, and + # collect the top k scoring sequences. 
+ finished_state = self._get_new_finished_state(state, new_seq, new_log_probs) + + # Increment loop index and create new state dictionary + new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1} + new_state.update(alive_state) + new_state.update(finished_state) + return [new_state] + + def _grow_alive_seq(self, state): + """Grow alive sequences by one token, and collect top 2*beam_size sequences. + + 2*beam_size sequences are collected because some sequences may have reached + the EOS token. 2*beam_size ensures that at least beam_size sequences are + still alive. + + Args: + state: A dictionary with the current loop state. + Returns: + Tuple of + (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1], + Scores of returned sequences [batch_size, 2 * beam_size], + New alive cache, for each of the 2 * beam_size sequences) + """ + i = state[_StateKeys.CUR_INDEX] + alive_seq = state[_StateKeys.ALIVE_SEQ] + alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] + alive_cache = state[_StateKeys.ALIVE_CACHE] + + beams_to_keep = 2 * self.beam_size + + # Get logits for the next candidate IDs for the alive sequences. Get the new + # cache values at the same time. + flat_ids = _flatten_beam_dim(alive_seq) # [batch_size * beam_size] + flat_cache = nest.map_structure(_flatten_beam_dim, alive_cache) + + flat_logits, flat_cache = self.symbols_to_logits_fn(flat_ids, i, flat_cache) + + # Unflatten logits to shape [batch_size, beam_size, vocab_size] + logits = _unflatten_beam_dim(flat_logits, self.batch_size, self.beam_size) + new_cache = nest.map_structure(lambda t: _unflatten_beam_dim(t, self.batch_size, self.beam_size), flat_cache) + + # Convert logits to normalized log probs + candidate_log_probs = _log_prob_from_logits(logits) + + # Calculate new log probabilities if each of the alive sequences were + # extended # by the the candidate IDs. + # Shape [batch_size, beam_size, vocab_size] + log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2) + + # Each batch item has beam_size * vocab_size candidate sequences. For each + # batch item, get the k candidates with the highest log probabilities. + flat_log_probs = tf.reshape(log_probs, [-1, self.beam_size * self.vocab_size]) + topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs, k=beams_to_keep) + + # Extract the alive sequences that generate the highest log probabilities + # after being extended. + topk_beam_indices = topk_indices // self.vocab_size + topk_seq, new_cache = _gather_beams([alive_seq, new_cache], topk_beam_indices, self.batch_size, beams_to_keep) + + # Append the most probable IDs to the topk sequences + topk_ids = topk_indices % self.vocab_size + topk_ids = tf.expand_dims(topk_ids, axis=2) + topk_seq = tf.concat([topk_seq, topk_ids], axis=2) + return topk_seq, topk_log_probs, new_cache + + def _get_new_alive_state(self, new_seq, new_log_probs, new_cache): + """Gather the top k sequences that are still alive. + + Args: + new_seq: New sequences generated by growing the current alive sequences + int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1] + new_log_probs: Log probabilities of new sequences + float32 tensor with shape [batch_size, beam_size] + new_cache: Dict of cached values for each sequence. 
+ + Returns: + Dictionary with alive keys from _StateKeys: + {Top beam_size sequences that are still alive (don't end with eos_id) + Log probabilities of top alive sequences + Dict cache storing decoder states for top alive sequences} + """ + # To prevent finished sequences from being considered, set log probs to -INF + new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id) + new_log_probs += tf.cast(new_finished_flags, tf.float32) * -INF + + top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams( + [new_seq, new_log_probs, new_cache], new_log_probs, self.batch_size, self.beam_size + ) + + return { + _StateKeys.ALIVE_SEQ: top_alive_seq, + _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs, + _StateKeys.ALIVE_CACHE: top_alive_cache + } + + def _get_new_finished_state(self, state, new_seq, new_log_probs): + """Combine new and old finished sequences, and gather the top k sequences. + + Args: + state: A dictionary with the current loop state. + new_seq: New sequences generated by growing the current alive sequences + int32 tensor with shape [batch_size, beam_size, i + 1] + new_log_probs: Log probabilities of new sequences + float32 tensor with shape [batch_size, beam_size] + + Returns: + Dictionary with finished keys from _StateKeys: + {Top beam_size finished sequences based on score, + Scores of finished sequences, + Finished flags of finished sequences} + """ + i = state[_StateKeys.CUR_INDEX] + finished_seq = state[_StateKeys.FINISHED_SEQ] + finished_scores = state[_StateKeys.FINISHED_SCORES] + finished_flags = state[_StateKeys.FINISHED_FLAGS] + + # First append a column of 0-ids to finished_seq to increment the length. + # New shape of finished_seq: [batch_size, beam_size, i + 1] + finished_seq = tf.concat([finished_seq, tf.zeros([self.batch_size, self.beam_size, 1], tf.int32)], axis=2) + + # Calculate new seq scores from log probabilities. + length_norm = _length_normalization(self.alpha, i + 1) + new_scores = new_log_probs / length_norm + + # Set the scores of the still-alive seq in new_seq to large negative values. + new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id) + new_scores += (1. - tf.cast(new_finished_flags, tf.float32)) * -INF + + # Combine sequences, scores, and flags. + finished_seq = tf.concat([finished_seq, new_seq], axis=1) + finished_scores = tf.concat([finished_scores, new_scores], axis=1) + finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1) + + # Return the finished sequences with the best scores. + top_finished_seq, top_finished_scores, top_finished_flags = ( + _gather_topk_beams( + [finished_seq, finished_scores, finished_flags], finished_scores, self.batch_size, self.beam_size + ) + ) + + return { + _StateKeys.FINISHED_SEQ: top_finished_seq, + _StateKeys.FINISHED_SCORES: top_finished_scores, + _StateKeys.FINISHED_FLAGS: top_finished_flags + } + + +def sequence_beam_search( + symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id +): + """Search for sequence of subtoken ids with the largest probability. + + Args: + symbols_to_logits_fn: A function that takes in ids, index, and cache as + arguments. The passed in arguments will have shape: + ids -> [batch_size * beam_size, index] + index -> [] (scalar) + cache -> nested dictionary of tensors [batch_size * beam_size, ...] + The function must return logits and new cache. + logits -> [batch * beam_size, vocab_size] + new cache -> same shape/structure as inputted cache + initial_ids: Starting ids for each batch item. 
+ int32 tensor with shape [batch_size] + initial_cache: dict containing starting decoder variables information + vocab_size: int size of tokens + beam_size: int number of beams + alpha: float defining the strength of length normalization + max_decode_length: maximum length to decoded sequence + eos_id: int id of eos token, used to determine when a sequence has finished + + Returns: + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] + """ + batch_size = tf.shape(initial_ids)[0] + sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id) + return sbs.search(initial_ids, initial_cache) + + +def _log_prob_from_logits(logits): + return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True) + + +def _length_normalization(alpha, length): + """Return length normalization factor.""" + return tf.pow(((5. + tf.cast(length, tf.float32)) / 6.), alpha) + + +def _expand_to_beam_size(tensor, beam_size): + """Tiles a given tensor by beam_size. + + Args: + tensor: tensor to tile [batch_size, ...] + beam_size: How much to tile the tensor by. + + Returns: + Tiled tensor [batch_size, beam_size, ...] + """ + tensor = tf.expand_dims(tensor, axis=1) + tile_dims = [1] * tensor.shape.ndims + tile_dims[1] = beam_size + + return tf.tile(tensor, tile_dims) + + +def _shape_list(tensor): + """Return a list of the tensor's shape, and ensure no None values in list.""" + # Get statically known shape (may contain None's for unknown dimensions) + shape = tensor.get_shape().as_list() + + # Ensure that the shape values are not None + dynamic_shape = tf.shape(tensor) + for i in range(len(shape)): # pylint: disable=consider-using-enumerate + if shape[i] is None: + shape[i] = dynamic_shape[i] + return shape + + +def _get_shape_keep_last_dim(tensor): + shape_list = _shape_list(tensor) + + # Only the last + for i in range(len(shape_list) - 1): + shape_list[i] = None + + if isinstance(shape_list[-1], tf.Tensor): + shape_list[-1] = None + return tf.TensorShape(shape_list) + + +def _flatten_beam_dim(tensor): + """Reshapes first two dimensions in to single dimension. + + Args: + tensor: Tensor to reshape of shape [A, B, ...] + + Returns: + Reshaped tensor of shape [A*B, ...] + """ + shape = _shape_list(tensor) + shape[0] *= shape[1] + shape.pop(1) # Remove beam dim + return tf.reshape(tensor, shape) + + +def _unflatten_beam_dim(tensor, batch_size, beam_size): + """Reshapes first dimension back to [batch_size, beam_size]. + + Args: + tensor: Tensor to reshape of shape [batch_size*beam_size, ...] + batch_size: Tensor, original batch size. + beam_size: int, original beam size. + + Returns: + Reshaped tensor of shape [batch_size, beam_size, ...] + """ + shape = _shape_list(tensor) + new_shape = [batch_size, beam_size] + shape[1:] + return tf.reshape(tensor, new_shape) + + +def _gather_beams(nested, beam_indices, batch_size, new_beam_size): + """Gather beams from nested structure of tensors. + + Each tensor in nested represents a batch of beams, where beam refers to a + single search state (beam search involves searching through multiple states + in parallel). + + This function is used to gather the top beams, specified by + beam_indices, from the nested tensors. + + Args: + nested: Nested structure (tensor, list, tuple or dict) containing tensors + with shape [batch_size, beam_size, ...]. + beam_indices: int32 tensor with shape [batch_size, new_beam_size]. 
Each + value in beam_indices must be between [0, beam_size), and are not + necessarily unique. + batch_size: int size of batch + new_beam_size: int number of beams to be pulled from the nested tensors. + + Returns: + Nested structure containing tensors with shape + [batch_size, new_beam_size, ...] + """ + # Computes the i'th coodinate that contains the batch index for gather_nd. + # Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..]. + batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size + batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size]) + + # Create coordinates to be passed to tf.gather_nd. Stacking creates a tensor + # with shape [batch_size, beam_size, 2], where the last dimension contains + # the (i, j) gathering coordinates. + coordinates = tf.stack([batch_pos, beam_indices], axis=2) + + return nest.map_structure(lambda state: tf.gather_nd(state, coordinates), nested) + + +def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size): + """Gather top beams from nested structure.""" + _, topk_indexes = tf.nn.top_k(score_or_log_prob, k=beam_size) + return _gather_beams(nested, topk_indexes, batch_size, beam_size) diff --git a/tensorlayer/models/transformer/embedding_layer.py b/tensorlayer/models/transformer/embedding_layer.py new file mode 100644 index 000000000..5276ed48d --- /dev/null +++ b/tensorlayer/models/transformer/embedding_layer.py @@ -0,0 +1,103 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of embedding layer with shared weights.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl + + +class EmbeddingLayer(tl.layers.Layer): + """Calculates input embeddings and pre-softmax linear with shared weights.""" + + def __init__(self, vocab_size, hidden_size): + """Specify characteristic parameters of embedding layer. + + Args: + vocab_size: Number of tokens in the embedding. (Typically ~32,000) + hidden_size: Dimensionality of the embedding. (Typically 512 or 1024) + """ + super(EmbeddingLayer, self).__init__() + self.vocab_size = vocab_size + self.hidden_size = hidden_size + + self.build(tuple()) + self._built = True + + def build(self, inputs_shape): + with tf.name_scope("embedding_and_softmax"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. + self.W = self._get_weights( + 'weights', shape=(self.vocab_size, self.hidden_size), + init=tf.random_normal_initializer(mean=0., stddev=self.hidden_size**-0.5) + ) + + def get_config(self): + return { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + } + + def forward(self, inputs, mode="embedding"): + """Get token embeddings of inputs. 
+ + Args: + inputs: An int64 tensor with shape [batch_size, length] + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + """ + if mode == "embedding": + return self._embedding(inputs) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs): + """Applies embedding based on inputs tensor.""" + with tf.name_scope("embedding"): + # Create binary mask of size [batch_size, length] + mask = tf.cast(tf.not_equal(inputs, 0), tf.float32) + embeddings = tf.gather(self.W, inputs) + embeddings *= tf.expand_dims(mask, -1) + # Scale embedding by the sqrt of the hidden size + embeddings *= self.hidden_size**0.5 + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + + Args: + inputs: A float32 tensor with shape [batch_size, length, hidden_size] + Returns: + float32 tensor with shape [batch_size, length, vocab_size]. + """ + with tf.name_scope("presoftmax_linear"): + batch_size = tf.shape(inputs)[0] + length = tf.shape(inputs)[1] + + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.W, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) diff --git a/tensorlayer/models/transformer/feedforward_layer.py b/tensorlayer/models/transformer/feedforward_layer.py new file mode 100644 index 000000000..a6b1fc049 --- /dev/null +++ b/tensorlayer/models/transformer/feedforward_layer.py @@ -0,0 +1,86 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of fully connected network.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl + + +class FeedForwardLayer(tl.layers.Layer): + """Fully connected feedforward network.""" + + def __init__(self, hidden_size, filter_size, keep_prob): + """Initialize FeedForwardNetwork. + + Args: + hidden_size: int, output dim of hidden layer. + filter_size: int, filter size for the inner (first) dense layer. + relu_dropout: float, dropout rate for training. 
+ """ + super(FeedForwardLayer, self).__init__() + self.hidden_size = hidden_size + self.filter_size = filter_size + self.relu_dropout = 1 - keep_prob + self.filter_dense_layer = tl.layers.Dense( + self.filter_size, in_channels=self.hidden_size, W_init=tf.keras.initializers.get('glorot_uniform'), + name="input_layer" + ) + self.output_dense_layer = tl.layers.Dense( + self.hidden_size, in_channels=self.filter_size, W_init=tf.keras.initializers.get('glorot_uniform'), + name="output_layer" + ) + self.build(None) + self._built = True + + def build(self, inputs_shape): + pass + + def get_config(self): + return { + "hidden_size": self.hidden_size, + "filter_size": self.filter_size, + "relu_dropout": self.relu_dropout, + } + + def forward(self, inputs): + """Return outputs of the feedforward network. + + Args: + x: tensor with shape [batch_size, length, hidden_size] + training: boolean, whether in training mode or not. + + Returns: + Output of the feedforward network. + tensor with shape [batch_size, length, hidden_size] + """ + # Retrieve dynamically known shapes + x = inputs + batch_size = tf.shape(x)[0] + length = tf.shape(x)[1] + x = tf.reshape(x, [-1, x.shape[-1]]) + output = self.filter_dense_layer(x) + output = tf.nn.relu(output) + output = tf.reshape(output, [batch_size, -1, output.shape[-1]]) + if self.is_train: + output = tf.nn.dropout(output, rate=self.relu_dropout) + output = tf.reshape(output, [-1, output.shape[-1]]) + output = self.output_dense_layer(output) + output = tf.reshape(output, [batch_size, -1, output.shape[-1]]) + + return output \ No newline at end of file diff --git a/tensorlayer/models/transformer/transformer.py b/tensorlayer/models/transformer/transformer.py new file mode 100644 index 000000000..af1a3647e --- /dev/null +++ b/tensorlayer/models/transformer/transformer.py @@ -0,0 +1,460 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines the Transformer model in TF 2.0. 
+ +Model paper: https://arxiv.org/pdf/1706.03762.pdf +Transformer model code source: https://github.com/tensorflow/tensor2tensor +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl +from tensorlayer.models import Model +import tensorlayer.models.transformer.embedding_layer as embedding_layer +from tensorlayer.models.transformer.attention_layer import SelfAttentionLayer, MultiHeadAttentionLayer +from tensorlayer.models.transformer.feedforward_layer import FeedForwardLayer +from tensorlayer.models.transformer.utils.model_utils import positional_encoding +from tensorlayer.models.transformer.utils.model_utils import get_decoder_self_attention_bias as get_target_mask +from tensorlayer.models.transformer.utils.model_utils import get_padding_bias as get_input_mask +import tensorlayer.models.transformer.beamsearchHelper.beam_search as beam_search + + +class Transformer(Model): + """Transformer model with tensorlayer. + + Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf + + The Transformer model consists of an encoder and decoder. The input is an int + sequence (or a batch of sequences). The encoder produces a continuous + representation, and the decoder uses the encoder output to generate + probabilities for the output sequence. + """ + + def __init__(self, params, name=None): + """Initialize layers to build Transformer model. + + Args: + params: hyperparameter object defining layer sizes, dropout values, etc. + name: name of the model. + """ + super(Transformer, self).__init__(name=name) + self.params = params + self.embedding_softmax_layer = embedding_layer.EmbeddingLayer(params.vocab_size, params.hidden_size) + self.encoder_stack = EncoderStack(params) + self.decoder_stack = DecoderStack(params) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, targets=None): + """Calculate target logits or inferred target sequences. + + Args: + inputs: input tensor list of size 1 or 2. + First item, inputs: int tensor with shape [batch_size, input_length]. + Second item (optional), targets: None or int tensor with shape + [batch_size, target_length]. + training: boolean, whether in training mode or not. + + Returns: + If targets is defined, then return logits for each word in the target + sequence. float tensor with shape [batch_size, target_length, vocab_size] + If target is none, then generate output sequence one token at a time. + returns a dictionary { + outputs: [batch_size, decoded length] + scores: [batch_size, float]} + """ + # # Variance scaling is used here because it seems to work in many problems. + # # Other reasonable initializers may also work just as well. + + # Calculate attention bias for encoder self-attention and decoder + # multi-headed attention layers. + attention_bias = get_input_mask(inputs) + + # Run the inputs through the encoder layer to map the symbol + # representations to continuous representations. + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. + embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + encoder_outputs = self.encode(inputs, inputs_padding) + # Generate output sequence if targets is None, or return logits if target + # sequence is known. 
+ if targets is None: + return self.predict(encoder_outputs, attention_bias) + else: + logits = self.decode(targets, encoder_outputs, attention_bias) + return logits + + def encode(self, inputs, attention_bias): + """Generate continuous representation for inputs. + + Args: + inputs: int tensor with shape [batch_size, input_length]. + attention_bias: float tensor with shape [batch_size, 1, 1, input_length]. + training: boolean, whether in training mode or not. + + Returns: + float tensor with shape [batch_size, input_length, hidden_size] + """ + + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. + embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + length = tf.shape(embedded_inputs)[1] + pos_encoding = positional_encoding(length, self.params.hidden_size) + encoder_inputs = embedded_inputs + pos_encoding + + if self.is_train: + encoder_inputs = tf.nn.dropout(encoder_inputs, rate=1 - self.params.keep_prob) + return self.encoder_stack(encoder_inputs, input_mask=attention_bias) + + def decode(self, targets, encoder_outputs, attention_bias): + """Generate logits for each value in the target sequence. + + Args: + targets: target values for the output sequence. int tensor with shape + [batch_size, target_length] + encoder_outputs: continuous representation of input sequence. float tensor + with shape [batch_size, input_length, hidden_size] + attention_bias: float tensor with shape [batch_size, 1, 1, input_length] + training: boolean, whether in training mode or not. + + Returns: + float32 tensor with shape [batch_size, target_length, vocab_size] + """ + with tf.name_scope("decode"): + # Prepare inputs to decoder layers by shifting targets, adding positional + # encoding and applying dropout. + decoder_inputs = self.embedding_softmax_layer(targets) + with tf.name_scope("shift_targets"): + # Shift targets to the right, and remove the last element + decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] + with tf.name_scope("add_pos_encoding"): + length = tf.shape(decoder_inputs)[1] + decoder_inputs += positional_encoding(length, self.params.hidden_size) + if self.is_train: + decoder_inputs = tf.nn.dropout(decoder_inputs, rate=1 - self.params.keep_prob) + + # Run values + decoder_self_attention_bias = get_target_mask(length) + outputs = self.decoder_stack( + decoder_inputs, + features=encoder_outputs, + input_mask=attention_bias, + target_mask=decoder_self_attention_bias, + ) + logits = self.embedding_softmax_layer(outputs, mode="linear") + return logits + + def _get_symbols_to_logits_fn(self, max_decode_length): + """Returns a decoding function that calculates logits of the next tokens.""" + + timing_signal = positional_encoding(max_decode_length + 1, self.params.hidden_size) + decoder_self_attention_bias = get_target_mask(max_decode_length) + + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs. + + Args: + ids: Current decoded sequences. int tensor with shape [batch_size * + beam_size, i + 1] + i: Loop index + cache: dictionary of values storing the encoder output, encoder-decoder + attention bias, and previous decoder attention values. + + Returns: + Tuple of + (logits with shape [batch_size * beam_size, vocab_size], + updated cache values) + """ + # Set decoder input to the last generated IDs + decoder_input = ids[:, -1:] + + # Preprocess decoder input by getting embeddings and adding timing signal. 
+ decoder_input = self.embedding_softmax_layer(decoder_input) + decoder_input += timing_signal[i:i + 1] + + self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1] + decoder_outputs = self.decoder_stack( + decoder_input, features=cache.get("encoder_outputs"), target_mask=self_attention_bias, + input_mask=cache.get("encoder_decoder_attention_bias"), cache=cache + ) + logits = self.embedding_softmax_layer(decoder_outputs, mode="linear") + logits = tf.squeeze(logits, axis=[1]) + return logits, cache + + return symbols_to_logits_fn + + def predict(self, encoder_outputs, encoder_decoder_attention_bias): + """Return predicted sequence.""" + batch_size = tf.shape(encoder_outputs)[0] + input_length = tf.shape(encoder_outputs)[1] + max_decode_length = input_length + self.params.extra_decode_length + + symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length) + + # Create initial set of IDs that will be passed into symbols_to_logits_fn. + initial_ids = tf.zeros([batch_size], dtype=tf.int32) + + # Create cache storing decoder attention values for each layer. + # pylint: disable=g-complex-comprehension + cache = { + "layer_%d" % layer: { + "k": tf.zeros([batch_size, 0, self.params.hidden_size]), + "v": tf.zeros([batch_size, 0, self.params.hidden_size]) + } for layer in range(self.params.encoder_num_layers) + } + + # Add encoder output and attention bias to the cache. + cache["encoder_outputs"] = encoder_outputs + cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias + + # Use beam search to find the top beam_size sequences and scores. + decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, initial_ids=initial_ids, initial_cache=cache, + vocab_size=self.params.vocab_size, beam_size=self.params.beam_size, alpha=self.params.alpha, + max_decode_length=max_decode_length, eos_id=1 + ) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, 1:] + top_scores = scores[:, 0] + + return {"outputs": top_decoded_ids, "scores": top_scores} + + +class LayerNormalization(tl.layers.Layer): + """ + Layer normalization + + Parameters + ---------- + hidden_size: + hidden size of features + epsilon: + value to prevent division by zero + + """ + + def __init__(self, hidden_size, epsilon=1e-6): + super(LayerNormalization, self).__init__() + self.hidden_size = hidden_size + self.epsilon = epsilon + + self.build(tuple()) + self._built = True + + def build(self, inputs_shape): + self.scale = self._get_weights('scale', shape=(self.hidden_size), init=tl.initializers.Ones()) + self.bias = self._get_weights('bias', shape=(self.hidden_size), init=tl.initializers.Zeros()) + + def forward(self, inputs): + mean = tf.reduce_mean(inputs, axis=[-1], keepdims=True) + var = tf.reduce_mean(tf.square(inputs - mean), axis=[-1], keepdims=True) + norm_inputs = (inputs - mean) * tf.math.rsqrt(var + self.epsilon) + return norm_inputs * self.scale + self.bias + + def __repr__(self): + return "layer normalization" + + +class PrePostProcessingWrapper(Model): + """Wrapper class that applies layer pre-processing and post-processing.""" + + def __init__(self, layer, params): + super(PrePostProcessingWrapper, self).__init__() + self.layer = layer + self.params = params + self.postprocess_dropout = 1 - params.keep_prob + self.layer_norm = LayerNormalization(self.params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, *args, **kwargs): + """Calls wrapped layer with 
same parameters.""" + + if (type(inputs) == list): + if (len(inputs) == 2): + x = decoder_input = inputs[0] + encoder_output = inputs[1] + decoder_input = self.layer_norm(decoder_input) + # Get layer output + y = self.layer([decoder_input, encoder_output], *args, **kwargs) + + else: + x = inputs + y = self.layer_norm(inputs) + y = self.layer(y, *args, **kwargs) + + # Postprocessing: apply dropout and residual connection + if self.is_train: + y = tf.nn.dropout(y, rate=self.postprocess_dropout) + return x + y + + +class EncoderStack(Model): + """Transformer encoder stack. + + The encoder stack is made up of N identical layers. Each layer is composed + of the sublayers: + 1. Self-attention layer + 2. Feedforward network (which is 2 fully-connected layers) + """ + + def __init__(self, params): + super(EncoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.encoder_num_layers): + # Create sublayers for each layer. + self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + feed_forward_network = FeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append( + [ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ] + ) + + # Create final layer normalization layer. + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, input_mask): + """Return the output of the encoder layer stacks. + + Args: + encoder_inputs: tensor with shape [batch_size, input_length, hidden_size] + attention_bias: bias for the encoder self-attention layer. [batch_size, 1, + 1, input_length] + inputs_padding: tensor with shape [batch_size, input_length], inputs with + zero paddings. + training: boolean, whether in training mode or not. + + Returns: + Output of encoder layer stack. + float32 tensor with shape [batch_size, input_length, hidden_size] + """ + encoder_inputs = inputs + for n, layer in enumerate(self.layers): + # Run inputs through the sublayers. + self_attention_layer = layer[0] + feed_forward_network = layer[1] + + with tf.name_scope("layer_%d" % n): + with tf.name_scope("self_attention"): + encoder_inputs = self_attention_layer(encoder_inputs, mask=input_mask) + # with tf.name_scope("layer_attention"): + # encoder_inputs = (inputs, y=encoder_inputs, mask=input_mask) + with tf.name_scope("ffn"): + encoder_inputs = feed_forward_network(encoder_inputs) + + return self.output_normalization(encoder_inputs) + + +class DecoderStack(Model): + """Transformer decoder stack. + + Like the encoder stack, the decoder stack is made up of N identical layers. + Each layer is composed of the sublayers: + 1. Self-attention layer + 2. Multi-headed attention layer combining encoder outputs with results from + the previous self-attention layer. + 3. 
Feedforward network (2 fully-connected layers) + """ + + def __init__(self, params): + super(DecoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.decoder_num_layers): + self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + enc_dec_attention_layer = MultiHeadAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + feed_forward_network = FeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append( + [ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(enc_dec_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ] + ) + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, features, input_mask, target_mask, cache=None): + """Return the output of the decoder layer stacks. + + Args: + decoder_inputs: tensor with shape [batch_size, target_length, hidden_size] + encoder_outputs: tensor with shape [batch_size, input_length, hidden_size] + decoder_self_attention_bias: bias for decoder self-attention layer. [1, 1, + target_len, target_length] + attention_bias: bias for encoder-decoder attention layer. [batch_size, 1, + 1, input_length] + training: boolean, whether in training mode or not. + cache: (Used for fast decoding) A nested dictionary storing previous + decoder self-attention values. The items are: + {layer_n: {"k": tensor with shape [batch_size, i, key_channels], + "v": tensor with shape [batch_size, i, value_channels]}, + ...} + + Returns: + Output of decoder layer stack. + float32 tensor with shape [batch_size, target_length, hidden_size] + """ + decoder_inputs = inputs + decoder_self_attention_bias = target_mask + encoder_outputs = features + attention_bias = input_mask + for n, layer in enumerate(self.layers): + self_attention_layer = layer[0] + enc_dec_attention_layer = layer[1] + feed_forward_network = layer[2] + + # Run inputs through the sublayers. + layer_name = "layer_%d" % n + layer_cache = cache[layer_name] if cache is not None else None + with tf.name_scope(layer_name): + with tf.name_scope("self_attention"): + decoder_inputs = self_attention_layer( + decoder_inputs, mask=decoder_self_attention_bias, cache=layer_cache + ) + with tf.name_scope("encdec_attention"): + decoder_inputs = enc_dec_attention_layer([decoder_inputs, encoder_outputs], mask=attention_bias) + with tf.name_scope("ffn"): + decoder_inputs = feed_forward_network(decoder_inputs) + + return self.output_normalization(decoder_inputs) diff --git a/tensorlayer/models/transformer/utils/__init__.py b/tensorlayer/models/transformer/utils/__init__.py new file mode 100644 index 000000000..13b4fe535 --- /dev/null +++ b/tensorlayer/models/transformer/utils/__init__.py @@ -0,0 +1,3 @@ +from .model_utils import * +from .optimizer import * +from .metrics import * \ No newline at end of file diff --git a/tensorlayer/models/transformer/utils/metrics.py b/tensorlayer/models/transformer/utils/metrics.py new file mode 100644 index 000000000..25c4eaae4 --- /dev/null +++ b/tensorlayer/models/transformer/utils/metrics.py @@ -0,0 +1,651 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functions for calculating loss, accuracy, and other model metrics. + +Metrics: + - Padded loss, accuracy, and negative log perplexity. Source: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py + - BLEU approximation. Source: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py + - ROUGE score. Source: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math + +import numpy as np +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + + +def _pad_tensors_to_same_length(x, y): + """Pad x and y so that the results have the same length (second dimension).""" + with tf.name_scope("pad_to_same_length"): + x_length = tf.shape(x)[1] + y_length = tf.shape(y)[1] + + max_length = tf.maximum(x_length, y_length) + + x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]]) + y = tf.pad(y, [[0, 0], [0, max_length - y_length]]) + return x, y + + +def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size): + """Calculate cross entropy loss while ignoring padding. + + Args: + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + Returns: + Returns the cross entropy loss and weight tensors: float32 tensors with + shape [batch_size, max(length_logits, length_labels)] + """ + with tf.name_scope("loss", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + + # Calculate smoothing cross entropy + with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]): + confidence = 1.0 - smoothing + low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1) + soft_targets = tf.one_hot( + tf.cast(labels, tf.int32), depth=vocab_size, on_value=confidence, off_value=low_confidence + ) + xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=soft_targets) + + # Calculate the best (lowest) possible value of cross entropy, and + # subtract from the cross entropy loss. + normalizing_constant = -( + confidence * tf.log(confidence) + + tf.to_float(vocab_size - 1) * low_confidence * tf.log(low_confidence + 1e-20) + ) + xentropy -= normalizing_constant + + weights = tf.to_float(tf.not_equal(labels, 0)) + return xentropy * weights, weights + + +def _convert_to_eval_metric(metric_fn): + """Wrap a metric fn that returns scores and weights as an eval metric fn. + + The input metric_fn returns values for the current batch. The wrapper + aggregates the return values collected over all of the batches evaluated. + + Args: + metric_fn: function that returns scores and weights for the current batch's + logits and predicted labels. 
+ + Returns: + function that aggregates the scores and weights from metric_fn. + """ + + def problem_metric_fn(*args): + """Returns an aggregation of the metric_fn's returned values.""" + (scores, weights) = metric_fn(*args) + + # The tf.metrics.mean function assures correct aggregation. + return tf.metrics.mean(scores, weights) + + return problem_metric_fn + + +def get_eval_metrics(logits, labels, params): + """Return dictionary of model evaluation metrics.""" + metrics = { + "accuracy": _convert_to_eval_metric(padded_accuracy)(logits, labels), + "accuracy_top5": _convert_to_eval_metric(padded_accuracy_top5)(logits, labels), + "accuracy_per_sequence": _convert_to_eval_metric(padded_sequence_accuracy)(logits, labels), + "neg_log_perplexity": _convert_to_eval_metric(padded_neg_log_perplexity)(logits, labels, params["vocab_size"]), + } + + if not params["use_tpu"]: + # TPU does not support tf.py_func + metrics.update( + { + "approx_bleu_score": _convert_to_eval_metric(bleu_score)(logits, labels), + "rouge_2_fscore": _convert_to_eval_metric(rouge_2_fscore)(logits, labels), + "rouge_L_fscore": _convert_to_eval_metric(rouge_l_fscore)(logits, labels), + } + ) + + # Prefix each of the metric names with "metrics/". This allows the metric + # graphs to display under the "metrics" category in TensorBoard. + metrics = {"metrics/%s" % k: v for k, v in six.iteritems(metrics)} + return metrics + + +def padded_accuracy(logits, labels): + """Percentage of times that predictions matches labels on non-0s.""" + with tf.variable_scope("padded_accuracy", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.to_float(tf.not_equal(labels, 0)) + outputs = tf.to_int32(tf.argmax(logits, axis=-1)) + padded_labels = tf.to_int32(labels) + return tf.to_float(tf.equal(outputs, padded_labels)), weights + + +def padded_accuracy_topk(logits, labels, k): + """Percentage of times that top-k predictions matches labels on non-0s.""" + with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.to_float(tf.not_equal(labels, 0)) + effective_k = tf.minimum(k, tf.shape(logits)[-1]) + _, outputs = tf.nn.top_k(logits, k=effective_k) + outputs = tf.to_int32(outputs) + padded_labels = tf.to_int32(labels) + padded_labels = tf.expand_dims(padded_labels, axis=-1) + padded_labels += tf.zeros_like(outputs) # Pad to same shape. + same = tf.to_float(tf.equal(outputs, padded_labels)) + same_topk = tf.reduce_sum(same, axis=-1) + return same_topk, weights + + +def padded_accuracy_top5(logits, labels): + return padded_accuracy_topk(logits, labels, 5) + + +def padded_sequence_accuracy(logits, labels): + """Percentage of times that predictions matches labels everywhere (non-0).""" + with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.to_float(tf.not_equal(labels, 0)) + outputs = tf.to_int32(tf.argmax(logits, axis=-1)) + padded_labels = tf.to_int32(labels) + not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights + axis = list(range(1, len(outputs.get_shape()))) + correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis)) + return correct_seq, tf.constant(1.0) + + +def padded_neg_log_perplexity(logits, labels, vocab_size): + """Average log-perplexity excluding padding 0s. 
No smoothing.""" + num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size) + return -num, den + + +def bleu_score(logits, labels): + """Approximate BLEU score computation between labels and predictions. + + An approximate BLEU scoring method since we do not glue word pieces or + decode the ids and tokenize the output. By default, we use ngram order of 4 + and use brevity penalty. Also, this does not have beam search. + + Args: + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch-size, length_labels] + + Returns: + bleu: int, approx bleu score + """ + predictions = tf.to_int32(tf.argmax(logits, axis=-1)) + # TODO: Look into removing use of py_func + bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32) + return bleu, tf.constant(1.0) + + +def _get_ngrams_with_counter(segment, max_order): + """Extracts all n-grams up to a given maximum order from an input segment. + + Args: + segment: text segment from which n-grams will be extracted. + max_order: maximum length in tokens of the n-grams returned by this + methods. + + Returns: + The Counter containing all n-grams upto max_order in segment + with a count of how many times each n-gram occurred. + """ + ngram_counts = collections.Counter() + for order in xrange(1, max_order + 1): + for i in xrange(0, len(segment) - order + 1): + ngram = tuple(segment[i:i + order]) + ngram_counts[ngram] += 1 + return ngram_counts + + +def compute_bleu(reference_corpus, translation_corpus, max_order=4, use_bp=True): + """Computes BLEU score of translated segments against one or more references. + + Args: + reference_corpus: list of references for each translation. Each + reference should be tokenized into a list of tokens. + translation_corpus: list of translations to score. Each translation + should be tokenized into a list of tokens. + max_order: Maximum n-gram order to use when computing BLEU score. + use_bp: boolean, whether to apply brevity penalty. + + Returns: + BLEU score. + """ + reference_length = 0 + translation_length = 0 + bp = 1.0 + geo_mean = 0 + + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + precisions = [] + + for (references, translations) in zip(reference_corpus, translation_corpus): + reference_length += len(references) + translation_length += len(translations) + ref_ngram_counts = _get_ngrams_with_counter(references, max_order) + translation_ngram_counts = _get_ngrams_with_counter(translations, max_order) + + overlap = dict( + (ngram, min(count, translation_ngram_counts[ngram])) for ngram, count in ref_ngram_counts.items() + ) + + for ngram in overlap: + matches_by_order[len(ngram) - 1] += overlap[ngram] + for ngram in translation_ngram_counts: + possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[ngram] + + precisions = [0] * max_order + smooth = 1.0 + + for i in xrange(0, max_order): + if possible_matches_by_order[i] > 0: + precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i] + if matches_by_order[i] > 0: + precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i] + else: + smooth *= 2 + precisions[i] = 1.0 / (smooth * possible_matches_by_order[i]) + else: + precisions[i] = 0.0 + + if max(precisions) > 0: + p_log_sum = sum(math.log(p) for p in precisions if p) + geo_mean = math.exp(p_log_sum / max_order) + + if use_bp: + ratio = translation_length / reference_length + bp = math.exp(1 - 1. 
/ ratio) if ratio < 1.0 else 1.0 + bleu = geo_mean * bp + return np.float32(bleu) + + +def rouge_2_fscore(logits, labels): + """ROUGE-2 F1 score computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. + + Args: + logits: tensor, model predictions + labels: tensor, gold output. + + Returns: + rouge2_fscore: approx rouge-2 f1 score. + """ + predictions = tf.to_int32(tf.argmax(logits, axis=-1)) + # TODO: Look into removing use of py_func + rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32) + return rouge_2_f_score, tf.constant(1.0) + + +def _get_ngrams(n, text): + """Calculates n-grams. + + Args: + n: which n-grams to calculate + text: An array of tokens + + Returns: + A set of n-grams + """ + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i + n])) + return ngram_set + + +def rouge_n(eval_sentences, ref_sentences, n=2): + """Computes ROUGE-N f1 score of two text collections of sentences. + + Source: https://www.microsoft.com/en-us/research/publication/ + rouge-a-package-for-automatic-evaluation-of-summaries/ + + Args: + eval_sentences: Predicted sentences. + ref_sentences: Sentences from the reference set + n: Size of ngram. Defaults to 2. + + Returns: + f1 score for ROUGE-N + """ + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + eval_ngrams = _get_ngrams(n, eval_sentence) + ref_ngrams = _get_ngrams(n, ref_sentence) + ref_count = len(ref_ngrams) + eval_count = len(eval_ngrams) + + # Count the overlapping ngrams between evaluated and reference + overlapping_ngrams = eval_ngrams.intersection(ref_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case. This isn't mathematically correct, but it's good enough + if eval_count == 0: + precision = 0.0 + else: + precision = float(overlapping_count) / eval_count + if ref_count == 0: + recall = 0.0 + else: + recall = float(overlapping_count) / ref_count + f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8))) + + # return overlapping_count / reference_count + return np.mean(f1_scores, dtype=np.float32) + + +def rouge_l_fscore(predictions, labels): + """ROUGE scores computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. + + Args: + predictions: tensor, model predictions + labels: tensor, gold output. + + Returns: + rouge_l_fscore: approx rouge-l f1 score. + """ + outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) + rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (outputs, labels), tf.float32) + return rouge_l_f_score, tf.constant(1.0) + + +def rouge_l_sentence_level(eval_sentences, ref_sentences): + """Computes ROUGE-L (sentence level) of two collections of sentences. 
+ + Source: https://www.microsoft.com/en-us/research/publication/ + rouge-a-package-for-automatic-evaluation-of-summaries/ + + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + + Args: + eval_sentences: The sentences that have been picked by the summarizer + ref_sentences: The sentences from the reference set + + Returns: + A float: F_lcs + """ + + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + m = float(len(ref_sentence)) + n = float(len(eval_sentence)) + lcs = _len_lcs(eval_sentence, ref_sentence) + f1_scores.append(_f_lcs(lcs, m, n)) + return np.mean(f1_scores, dtype=np.float32) + + +def _len_lcs(x, y): + """Returns the length of the Longest Common Subsequence between two seqs. + + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Args: + x: sequence of words + y: sequence of words + + Returns + integer: Length of LCS between x and y + """ + table = _lcs(x, y) + n, m = len(x), len(y) + return table[n, m] + + +def _lcs(x, y): + """Computes the length of the LCS between two seqs. + + The implementation below uses a DP programming algorithm and runs + in O(nm) time where n = len(x) and m = len(y). + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Args: + x: collection of words + y: collection of words + + Returns: + Table of dictionary of coord and len lcs + """ + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table + + +def _f_lcs(llcs, m, n): + """Computes the LCS-based F-measure score. + + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Args: + llcs: Length of LCS + m: number of words in reference summary + n: number of words in candidate summary + + Returns: + Float. LCS-based F-measure score + """ + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / (r_lcs + 1e-12) + num = (1 + (beta**2)) * r_lcs * p_lcs + denom = r_lcs + ((beta**2) * p_lcs) + f_lcs = num / (denom + 1e-12) + return f_lcs + + +def _pad_tensors_to_same_length(x, y): + """Pad x and y so that the results have the same length (second dimension).""" + with tf.name_scope("pad_to_same_length"): + x_length = tf.shape(x)[1] + y_length = tf.shape(y)[1] + + max_length = tf.maximum(x_length, y_length) + + x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]]) + y = tf.pad(y, [[0, 0], [0, max_length - y_length]]) + return x, y + + +def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size): + """Calculate cross entropy loss while ignoring padding. 
+ + Args: + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + + Returns: + Returns the cross entropy loss and weight tensors: float32 tensors with + shape [batch_size, max(length_logits, length_labels)] + """ + with tf.name_scope("loss"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + + # Calculate smoothing cross entropy + with tf.name_scope("smoothing_cross_entropy"): + confidence = 1.0 - smoothing + low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32) + soft_targets = tf.one_hot( + tf.cast(labels, tf.int32), depth=vocab_size, on_value=confidence, off_value=low_confidence + ) + xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=soft_targets) + + # Calculate the best (lowest) possible value of cross entropy, and + # subtract from the cross entropy loss. + normalizing_constant = -( + confidence * tf.math.log(confidence) + + tf.cast(vocab_size - 1, tf.float32) * low_confidence * tf.math.log(low_confidence + 1e-20) + ) + xentropy -= normalizing_constant + + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + return xentropy * weights, weights + + +def padded_accuracy(logits, labels): + """Percentage of times that predictions matches labels on non-0s.""" + with tf.name_scope("padded_accuracy"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32) + padded_labels = tf.cast(labels, tf.int32) + return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights + + +def padded_accuracy_topk(logits, labels, k): + """Percentage of times that top-k predictions matches labels on non-0s.""" + with tf.name_scope("padded_accuracy_topk"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + effective_k = tf.minimum(k, tf.shape(logits)[-1]) + _, outputs = tf.nn.top_k(logits, k=effective_k) + outputs = tf.cast(outputs, tf.int32) + padded_labels = tf.cast(labels, tf.int32) + padded_labels = tf.expand_dims(padded_labels, axis=-1) + padded_labels += tf.zeros_like(outputs) # Pad to same shape. + same = tf.cast(tf.equal(outputs, padded_labels), tf.float32) + same_topk = tf.reduce_sum(same, axis=-1) + return same_topk, weights + + +def padded_accuracy_top5(logits, labels): + return padded_accuracy_topk(logits, labels, 5) + + +def padded_sequence_accuracy(logits, labels): + """Percentage of times that predictions matches labels everywhere (non-0).""" + with tf.name_scope("padded_sequence_accuracy"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32) + padded_labels = tf.cast(labels, tf.int32) + not_correct = tf.cast(tf.not_equal(outputs, padded_labels), tf.float32) * weights + axis = list(range(1, len(outputs.get_shape()))) + correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis)) + return correct_seq, tf.constant(1.0) + + +def padded_neg_log_perplexity(logits, labels, vocab_size): + """Average log-perplexity excluding padding 0s. 
No smoothing.""" + num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size) + return -num, den + + +import functools + + +class MetricLayer(tf.keras.layers.Layer): + """Custom a layer of metrics for Transformer model.""" + + def __init__(self, vocab_size): + super(MetricLayer, self).__init__() + self.vocab_size = vocab_size + self.metric_mean_fns = [] + + def build(self, input_shape): + """"Builds metric layer.""" + neg_log_perplexity = functools.partial(padded_neg_log_perplexity, vocab_size=self.vocab_size) + self.metric_mean_fns = [ + (tf.keras.metrics.Mean("accuracy"), padded_accuracy), + (tf.keras.metrics.Mean("accuracy_top5"), padded_accuracy_top5), + (tf.keras.metrics.Mean("accuracy_per_sequence"), padded_sequence_accuracy), + (tf.keras.metrics.Mean("neg_log_perplexity"), neg_log_perplexity), + ] + super(MetricLayer, self).build(input_shape) + + def get_config(self): + return {"vocab_size": self.vocab_size} + + def call(self, inputs): + logits, targets = inputs[0], inputs[1] + for mean, fn in self.metric_mean_fns: + m = mean(*fn(logits, targets)) + self.add_metric(m, name="metric", aggregation='mean') + return logits + + +def transformer_loss(logits, labels, smoothing, vocab_size): + """Calculates total loss containing cross entropy with padding ignored. + + Args: + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + + Returns: + A scalar float tensor for loss. + """ + xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing, vocab_size) + return tf.reduce_sum(xentropy) / tf.reduce_sum(weights) + + +class LossLayer(tf.keras.layers.Layer): + """Custom a layer of transformer loss for Transformer model.""" + + def __init__(self, vocab_size, label_smoothing): + super(LossLayer, self).__init__() + self.vocab_size = vocab_size + self.label_smoothing = label_smoothing + + def get_config(self): + return { + "vocab_size": self.vocab_size, + "label_smoothing": self.label_smoothing, + } + + def call(self, inputs): + logits, targets = inputs[0], inputs[1] + loss = transformer_loss(logits, targets, self.label_smoothing, self.vocab_size) + self.add_loss(loss) + return logits, loss diff --git a/tensorlayer/models/transformer/utils/model_utils.py b/tensorlayer/models/transformer/utils/model_utils.py new file mode 100644 index 000000000..10c4a3e2c --- /dev/null +++ b/tensorlayer/models/transformer/utils/model_utils.py @@ -0,0 +1,107 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Transformer model helper methods.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import tensorflow as tf + +_NEG_INF = -1e9 + + +def positional_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1.0e4): + """Return positional encoding. + + Calculates the position encoding as a mix of sine and cosine functions with + geometrically increasing wavelengths. + Defined and formulized in Attention is All You Need, section 3.5. + + Args: + length: Sequence length. + hidden_size: Size of the + min_timescale: Minimum scale that will be applied at each position + max_timescale: Maximum scale that will be applied at each position + + Returns: + Tensor with shape [length, hidden_size] + """ + position = tf.cast(tf.range(length), tf.float32) + num_timescales = hidden_size // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / (tf.cast(num_timescales, tf.float32) - 1) + ) + inv_timescales = min_timescale * tf.exp(tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment) + scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) + signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) + return signal + + +def get_decoder_self_attention_bias(length): + """Calculate bias for decoder that maintains model's autoregressive property. + + Creates a tensor that masks out locations that correspond to illegal + connections, so prediction at position i cannot draw information from future + positions. + + Args: + length: int length of sequences in batch. + + Returns: + float tensor of shape [1, 1, length, length] + """ + with tf.name_scope("decoder_self_attention_bias"): + valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0) + valid_locs = tf.reshape(valid_locs, [1, 1, length, length]) + decoder_bias = _NEG_INF * (1.0 - valid_locs) + return decoder_bias + + +def get_padding(x, padding_value=0): + """Return float tensor representing the padding values in x. + + Args: + x: int tensor with any shape + padding_value: int value that + + Returns: + float tensor with same shape as x containing values 0 or 1. + 0 -> non-padding, 1 -> padding + """ + with tf.name_scope("padding"): + return tf.cast(tf.equal(x, padding_value), tf.float32) + + +def get_padding_bias(x): + """Calculate bias tensor from padding values in tensor. + + Bias tensor that is added to the pre-softmax multi-headed attention logits, + which has shape [batch_size, num_heads, length, length]. The tensor is zero at + non-padding locations, and -1e9 (negative infinity) at padding locations. + + Args: + x: int tensor with shape [batch_size, length] + + Returns: + Attention bias tensor of shape [batch_size, 1, 1, length]. + """ + with tf.name_scope("attention_bias"): + padding = get_padding(x) + attention_bias = padding * _NEG_INF + attention_bias = tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1) + return attention_bias diff --git a/tensorlayer/models/transformer/utils/optimizer.py b/tensorlayer/models/transformer/utils/optimizer.py new file mode 100644 index 000000000..9fa27f69f --- /dev/null +++ b/tensorlayer/models/transformer/utils/optimizer.py @@ -0,0 +1,147 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
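To make the padding helpers in model_utils.py above concrete, a small hedged example of get_padding_bias (values are illustrative):

    # Sketch only: shows the shape and value convention of get_padding_bias.
    import tensorflow as tf
    x = tf.constant([[3, 7, 0, 0]])   # one sequence; id 0 marks padding
    bias = get_padding_bias(x)        # shape [1, 1, 1, 4]
    # bias == [[[[0., 0., -1e9, -1e9]]]]; added to the attention logits so that
    # padded positions get effectively zero weight after the softmax.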
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Optimizer from addons and learning rate scheduler.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +K = tf.keras.backend + + +class LazyAdam(tf.keras.optimizers.Adam): + """Variant of the Adam optimizer that handles sparse updates more efficiently. + + The original Adam algorithm maintains two moving-average accumulators for + each trainable variable; the accumulators are updated at every step. + This class provides lazier handling of gradient updates for sparse + variables. It only updates moving-average accumulators for sparse variable + indices that appear in the current batch, rather than updating the + accumulators for all indices. Compared with the original Adam optimizer, + it can provide large improvements in model training throughput for some + applications. However, it provides slightly different semantics than the + original Adam algorithm, and may lead to different empirical results. + Note, amsgrad is currently not supported and the argument can only be + False. + + This class is borrowed from: + https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py + """ + + def _resource_apply_sparse(self, grad, var, indices): + """Applies grad for one step.""" + var_dtype = var.dtype.base_dtype + lr_t = self._decayed_lr(var_dtype) + beta_1_t = self._get_hyper('beta_1', var_dtype) + beta_2_t = self._get_hyper('beta_2', var_dtype) + local_step = tf.cast(self.iterations + 1, var_dtype) + beta_1_power = tf.math.pow(beta_1_t, local_step) + beta_2_power = tf.math.pow(beta_2_t, local_step) + epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype) + lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power)) + + # \\(m := beta1 * m + (1 - beta1) * g_t\\) + m = self.get_slot(var, 'm') + m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad + + m_update_kwargs = {'resource': m.handle, 'indices': indices, 'updates': m_t_slice} + m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs) + + # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) + v = self.get_slot(var, 'v') + v_t_slice = (beta_2_t * tf.gather(v, indices) + (1 - beta_2_t) * tf.math.square(grad)) + + v_update_kwargs = {'resource': v.handle, 'indices': indices, 'updates': v_t_slice} + v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs) + + # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) + var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t) + + var_update_kwargs = {'resource': var.handle, 'indices': indices, 'updates': var_slice} + var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs) + + return tf.group(*[var_update_op, m_update_op, v_update_op]) + + +class LearningRateFn(object): + """Creates learning rate function.""" + + def __init__(self, learning_rate, hidden_size, warmup_steps): + self.learning_rate = learning_rate + self.hidden_size = hidden_size + self.warmup_steps = float(warmup_steps) + + def 
__call__(self, global_step): + """Calculate learning rate with linear warmup and rsqrt decay.""" + step = float(global_step) + learning_rate = self.learning_rate + learning_rate *= (self.hidden_size**-0.5) + # Apply linear warmup + learning_rate *= np.minimum(1.0, step / self.warmup_steps) + # Apply rsqrt decay + learning_rate /= np.sqrt(np.maximum(step, self.warmup_steps)) + return learning_rate + + +class LearningRateScheduler(tf.keras.callbacks.Callback): + """Keras callback to schedule learning rate. + + TODO(tianlin): Refactor this scheduler and LearningRateBatchScheduler in + official/resnet/keras/keras_common.py. + """ + + def __init__(self, schedule, init_steps=None, verbose=False): + super(LearningRateScheduler, self).__init__() + self.schedule = schedule + self.verbose = verbose + if init_steps is None: + init_steps = 0.0 + self.steps = float(init_steps) # Total steps during training. + + def on_epoch_begin(self, epoch, logs=None): + if not hasattr(self.model.optimizer, 'lr'): + raise ValueError('Optimizer must have a "lr" attribute.') + if not hasattr(self.model.optimizer, 'iterations'): + raise ValueError('Optimizer must have a "iterations" attribute.') + + def on_train_batch_begin(self, batch, logs=None): + """Adjusts learning rate for each train batch.""" + if self.verbose > 0: + iterations = K.get_value(self.model.optimizer.iterations) + print('Original iteration %d' % iterations) + + self.steps += 1.0 + try: # new API + lr = float(K.get_value(self.model.optimizer.lr)) + lr = self.schedule(self.steps, lr) + except TypeError: # Support for old API for backward compatibility + lr = self.schedule(self.steps) + if not isinstance(lr, (float, np.float32, np.float64)): + raise ValueError('The output of the "schedule" function ' 'should be float.') + K.set_value(self.model.optimizer.lr, lr) + K.set_value(self.model.optimizer.iterations, self.steps) + + if self.verbose > 0: + print( + 'Batch %05d Step %05d: LearningRateScheduler setting learning ' + 'rate to %s.' % (batch + 1, self.steps, lr) + ) + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + logs['lr'] = K.get_value(self.model.optimizer.lr) + logs['steps'] = self.steps From d1a20df4601fbe3b4f1d2572032f5b3d31470e33 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Mon, 22 Jul 2019 10:44:37 +0100 Subject: [PATCH 02/22] minor change --- tensorlayer/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorlayer/models/__init__.py b/tensorlayer/models/__init__.py index 53556f86e..dd56188d2 100644 --- a/tensorlayer/models/__init__.py +++ b/tensorlayer/models/__init__.py @@ -9,4 +9,4 @@ from .vgg import * from .seq2seq import Seq2seq from .seq2seq_with_attention import Seq2seqLuongAttention -from .transformer import * +from .transformer.transformer import Transformer From 21161cba0e5d304a6ce306718ae5839fb02a433f Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sat, 31 Aug 2019 12:29:06 +0100 Subject: [PATCH 03/22] adjust files --- tensorlayer/optimizers/lazyAdam.py | 147 +++++++++++++++++++++++++++++ tests/models/test_transformer.py | 119 +++++++++++++++++++++++ 2 files changed, 266 insertions(+) create mode 100644 tensorlayer/optimizers/lazyAdam.py create mode 100644 tests/models/test_transformer.py diff --git a/tensorlayer/optimizers/lazyAdam.py b/tensorlayer/optimizers/lazyAdam.py new file mode 100644 index 000000000..9fa27f69f --- /dev/null +++ b/tensorlayer/optimizers/lazyAdam.py @@ -0,0 +1,147 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
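As a quick sanity check on the LearningRateFn schedule defined above (the numbers are illustrative, not taken from the patch):

    # Illustrative only: evaluate the linear-warmup / rsqrt-decay schedule at two steps.
    fn = LearningRateFn(learning_rate=2.0, hidden_size=64, warmup_steps=100)
    fn(50)   # warmup:  2.0 * 64**-0.5 * (50 / 100) / sqrt(100) = 0.0125
    fn(900)  # decay:   2.0 * 64**-0.5 * 1.0        / sqrt(900) ~= 0.00833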
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Optimizer from addons and learning rate scheduler.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +K = tf.keras.backend + + +class LazyAdam(tf.keras.optimizers.Adam): + """Variant of the Adam optimizer that handles sparse updates more efficiently. + + The original Adam algorithm maintains two moving-average accumulators for + each trainable variable; the accumulators are updated at every step. + This class provides lazier handling of gradient updates for sparse + variables. It only updates moving-average accumulators for sparse variable + indices that appear in the current batch, rather than updating the + accumulators for all indices. Compared with the original Adam optimizer, + it can provide large improvements in model training throughput for some + applications. However, it provides slightly different semantics than the + original Adam algorithm, and may lead to different empirical results. + Note, amsgrad is currently not supported and the argument can only be + False. + + This class is borrowed from: + https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py + """ + + def _resource_apply_sparse(self, grad, var, indices): + """Applies grad for one step.""" + var_dtype = var.dtype.base_dtype + lr_t = self._decayed_lr(var_dtype) + beta_1_t = self._get_hyper('beta_1', var_dtype) + beta_2_t = self._get_hyper('beta_2', var_dtype) + local_step = tf.cast(self.iterations + 1, var_dtype) + beta_1_power = tf.math.pow(beta_1_t, local_step) + beta_2_power = tf.math.pow(beta_2_t, local_step) + epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype) + lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power)) + + # \\(m := beta1 * m + (1 - beta1) * g_t\\) + m = self.get_slot(var, 'm') + m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad + + m_update_kwargs = {'resource': m.handle, 'indices': indices, 'updates': m_t_slice} + m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs) + + # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) + v = self.get_slot(var, 'v') + v_t_slice = (beta_2_t * tf.gather(v, indices) + (1 - beta_2_t) * tf.math.square(grad)) + + v_update_kwargs = {'resource': v.handle, 'indices': indices, 'updates': v_t_slice} + v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs) + + # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) + var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t) + + var_update_kwargs = {'resource': var.handle, 'indices': indices, 'updates': var_slice} + var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs) + + return tf.group(*[var_update_op, m_update_op, v_update_op]) + + +class LearningRateFn(object): + """Creates learning rate function.""" + + def __init__(self, learning_rate, hidden_size, 
warmup_steps): + self.learning_rate = learning_rate + self.hidden_size = hidden_size + self.warmup_steps = float(warmup_steps) + + def __call__(self, global_step): + """Calculate learning rate with linear warmup and rsqrt decay.""" + step = float(global_step) + learning_rate = self.learning_rate + learning_rate *= (self.hidden_size**-0.5) + # Apply linear warmup + learning_rate *= np.minimum(1.0, step / self.warmup_steps) + # Apply rsqrt decay + learning_rate /= np.sqrt(np.maximum(step, self.warmup_steps)) + return learning_rate + + +class LearningRateScheduler(tf.keras.callbacks.Callback): + """Keras callback to schedule learning rate. + + TODO(tianlin): Refactor this scheduler and LearningRateBatchScheduler in + official/resnet/keras/keras_common.py. + """ + + def __init__(self, schedule, init_steps=None, verbose=False): + super(LearningRateScheduler, self).__init__() + self.schedule = schedule + self.verbose = verbose + if init_steps is None: + init_steps = 0.0 + self.steps = float(init_steps) # Total steps during training. + + def on_epoch_begin(self, epoch, logs=None): + if not hasattr(self.model.optimizer, 'lr'): + raise ValueError('Optimizer must have a "lr" attribute.') + if not hasattr(self.model.optimizer, 'iterations'): + raise ValueError('Optimizer must have a "iterations" attribute.') + + def on_train_batch_begin(self, batch, logs=None): + """Adjusts learning rate for each train batch.""" + if self.verbose > 0: + iterations = K.get_value(self.model.optimizer.iterations) + print('Original iteration %d' % iterations) + + self.steps += 1.0 + try: # new API + lr = float(K.get_value(self.model.optimizer.lr)) + lr = self.schedule(self.steps, lr) + except TypeError: # Support for old API for backward compatibility + lr = self.schedule(self.steps) + if not isinstance(lr, (float, np.float32, np.float64)): + raise ValueError('The output of the "schedule" function ' 'should be float.') + K.set_value(self.model.optimizer.lr, lr) + K.set_value(self.model.optimizer.iterations, self.steps) + + if self.verbose > 0: + print( + 'Batch %05d Step %05d: LearningRateScheduler setting learning ' + 'rate to %s.' 
% (batch + 1, self.steps, lr) + ) + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + logs['lr'] = K.get_value(self.model.optimizer.lr) + logs['steps'] = self.steps diff --git a/tests/models/test_transformer.py b/tests/models/test_transformer.py new file mode 100644 index 000000000..60654ede6 --- /dev/null +++ b/tests/models/test_transformer.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import unittest + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +import numpy as np +import tensorflow as tf +import tensorlayer as tl +from tqdm import tqdm +from sklearn.utils import shuffle +from tensorlayer.models.transformer import Transformer +from tests.utils import CustomTestCase +from tensorlayer.models.transformer.utils import metrics +from tensorlayer.cost import cross_entropy_seq +from tensorlayer.optimizers import lazyAdam as optimizer +import time + + + + + +class TINY_PARAMS(object): + vocab_size = 50 + encoder_num_layers = 2 + decoder_num_layers = 2 + filter_number = 256 + R1 = 4 + R2 = 8 + n_channels = 2 + n_units = 128 + H = 32 + light_filter_size=(1,3) + filter_size = light_filter_size[-1] + hidden_size = 64 + ff_size = 16 + num_heads = 4 + keep_prob = 0.9 + + + + # Default prediction params + extra_decode_length=5 + beam_size=2 + alpha=0.6 # used to calculate length normalization in beam search + + +class Model_SEQ2SEQ_Test(CustomTestCase): + + @classmethod + def setUpClass(cls): + cls.batch_size = 16 + + cls.embedding_size = 32 + cls.dec_seq_length = 5 + cls.trainX = np.random.randint(low=2, high=50, size=(50, 11)) + cls.trainY = np.random.randint(low=2, high=50, size=(50, 10)) + + cls.trainX[:,-1] = 1 + cls.trainY[:,-1] = 1 + # Parameters + cls.src_len = len(cls.trainX) + cls.tgt_len = len(cls.trainY) + + assert cls.src_len == cls.tgt_len + + cls.num_epochs = 1000 + cls.n_step = cls.src_len // cls.batch_size + + @classmethod + def tearDownClass(cls): + pass + + def test_basic_simpleSeq2Seq(self): + + model_ = Transformer(TINY_PARAMS) + + # print(", ".join(x for x in [t.name for t in model_.trainable_weights])) + + self.vocab_size = TINY_PARAMS.vocab_size + optimizer = tf.optimizers.Adam(learning_rate=0.01) + for epoch in range(self.num_epochs): + model_.train() + t = time.time() + trainX, trainY = shuffle(self.trainX, self.trainY) + total_loss, n_iter = 0, 0 + for X, Y in tqdm(tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=self.batch_size, + shuffle=False), total=self.n_step, + desc='Epoch[{}/{}]'.format(epoch + 1, self.num_epochs), leave=False): + + with tf.GradientTape() as tape: + + targets = Y + logits = model_(inputs = X, targets = Y) + logits = metrics.MetricLayer(self.vocab_size)([logits, targets]) + logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets]) + + grad = tape.gradient(loss, model_.all_weights) + optimizer.apply_gradients(zip(grad, model_.all_weights)) + + + total_loss += loss + n_iter += 1 + print(time.time()-t) + tl.files.save_npz(model_.all_weights, name='./model_v4.npz') + model_.eval() + test_sample = trainX[0:2, :] + model_.eval() + prediction = model_(inputs = test_sample) + + print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, :], "\n\n") + + print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter)) + + +if __name__ == '__main__': + unittest.main() From 005ab91b65e8ddbf59328bc6e460f2f2621b6a21 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sat, 31 Aug 2019 13:54:00 +0100 Subject: [PATCH 04/22] attention 
visualisation --- tests/models/test_transformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/test_transformer.py b/tests/models/test_transformer.py index 60654ede6..193e18401 100644 --- a/tests/models/test_transformer.py +++ b/tests/models/test_transformer.py @@ -74,7 +74,7 @@ def tearDownClass(cls): pass def test_basic_simpleSeq2Seq(self): - + model_ = Transformer(TINY_PARAMS) # print(", ".join(x for x in [t.name for t in model_.trainable_weights])) @@ -93,7 +93,7 @@ def test_basic_simpleSeq2Seq(self): with tf.GradientTape() as tape: targets = Y - logits = model_(inputs = X, targets = Y) + logits, weights_encoder, weights_decoder = model_(inputs = X, targets = Y) logits = metrics.MetricLayer(self.vocab_size)([logits, targets]) logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets]) @@ -108,7 +108,7 @@ def test_basic_simpleSeq2Seq(self): model_.eval() test_sample = trainX[0:2, :] model_.eval() - prediction = model_(inputs = test_sample) + [prediction, weights_decoder], weights_encoder = model_(inputs = test_sample) print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, :], "\n\n") From 8911654d416252b97e1a580f005ae8742af24cde Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sat, 31 Aug 2019 13:55:11 +0100 Subject: [PATCH 05/22] add attention visualisation --- .../models/transformer/attention_layer.py | 25 +- .../models/transformer/feedforward_layer.py | 4 +- tensorlayer/models/transformer/transformer.py | 570 ++++++++++-------- 3 files changed, 317 insertions(+), 282 deletions(-) diff --git a/tensorlayer/models/transformer/attention_layer.py b/tensorlayer/models/transformer/attention_layer.py index 205ec8244..acf474584 100644 --- a/tensorlayer/models/transformer/attention_layer.py +++ b/tensorlayer/models/transformer/attention_layer.py @@ -56,16 +56,16 @@ def get_config(self): def build(self, inputs_shape): # Transformation for linearly projecting the queries, keys, and values. self.q_transformation = self._get_weights( - "q_project", shape=(self.hidden_size, self.hidden_size), init=tf.keras.initializers.get('glorot_uniform') + "q_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') ) self.v_transformation = self._get_weights( - "v_project", shape=(self.hidden_size, self.hidden_size), init=tf.keras.initializers.get('glorot_uniform') + "v_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') ) self.k_transformation = self._get_weights( - "k_project", shape=(self.hidden_size, self.hidden_size), init=tf.keras.initializers.get('glorot_uniform') + "k_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') ) self.out_transformation = self._get_weights( - "out_project", shape=(self.hidden_size, self.hidden_size), init=tf.keras.initializers.get('glorot_uniform') + "out_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') ) def split_heads(self, x): @@ -108,7 +108,7 @@ def combine_heads(self, x): x = tf.transpose(x, [0, 2, 1, 3]) # --> [batch, length, num_heads, depth] return tf.reshape(x, [batch_size, length, self.hidden_size]) - def forward(self, inputs, mask, cache=None): + def forward(self, x, y, mask, cache=None): """Apply attention mechanism to x and y. Args: @@ -130,14 +130,8 @@ def forward(self, inputs, mask, cache=None): # multiple heads. 
Multi-head attention uses multiple queries, keys, and # values rather than regular attention (which uses a single q, k, v). - if (len(inputs) == 2): - q = inputs[0] - k = v = inputs[1] - - if (len(inputs) == 3): - q = inputs[0] - k = inputs[1] - v = inputs[2] + v = k = y + q = x q = tf.tensordot(q, self.q_transformation, axes=[[2], [0]]) k = tf.tensordot(k, self.k_transformation, axes=[[2], [0]]) @@ -166,6 +160,7 @@ def forward(self, inputs, mask, cache=None): logits = tf.matmul(q, k, transpose_b=True) #(Batch, num_head, length_q, length_k) logits += mask weights = tf.nn.softmax(logits, name="attention_weights") #(Batch, num_head, length_q, length_k) + weights_store = weights if self.is_train: weights = tf.nn.dropout(weights, rate=self.attention_dropout) @@ -176,11 +171,11 @@ def forward(self, inputs, mask, cache=None): # Run the combined outputs through another linear projection layer. attention_output = tf.tensordot(attention_output, self.out_transformation, axes=[[2], [0]]) - return attention_output + return attention_output, weights_store class SelfAttentionLayer(MultiHeadAttentionLayer): """Multiheaded self-attention layer.""" def forward(self, inputs, mask, cache=None): - return super(SelfAttentionLayer, self).forward(inputs=[inputs, inputs], mask=mask, cache=cache) + return super(SelfAttentionLayer, self).forward(x=inputs, y=inputs, mask=mask, cache=cache) diff --git a/tensorlayer/models/transformer/feedforward_layer.py b/tensorlayer/models/transformer/feedforward_layer.py index a6b1fc049..7ae6f5f68 100644 --- a/tensorlayer/models/transformer/feedforward_layer.py +++ b/tensorlayer/models/transformer/feedforward_layer.py @@ -38,11 +38,11 @@ def __init__(self, hidden_size, filter_size, keep_prob): self.filter_size = filter_size self.relu_dropout = 1 - keep_prob self.filter_dense_layer = tl.layers.Dense( - self.filter_size, in_channels=self.hidden_size, W_init=tf.keras.initializers.get('glorot_uniform'), + self.filter_size, in_channels=self.hidden_size, W_init=tf.initializers.get('glorot_uniform'), name="input_layer" ) self.output_dense_layer = tl.layers.Dense( - self.hidden_size, in_channels=self.filter_size, W_init=tf.keras.initializers.get('glorot_uniform'), + self.hidden_size, in_channels=self.filter_size, W_init=tf.initializers.get('glorot_uniform'), name="output_layer" ) self.build(None) diff --git a/tensorlayer/models/transformer/transformer.py b/tensorlayer/models/transformer/transformer.py index af1a3647e..cef8071f1 100644 --- a/tensorlayer/models/transformer/transformer.py +++ b/tensorlayer/models/transformer/transformer.py @@ -33,8 +33,11 @@ import tensorlayer.models.transformer.beamsearchHelper.beam_search as beam_search + + + class Transformer(Model): - """Transformer model with tensorlayer. + """Transformer model with weights visualisation. Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf @@ -44,26 +47,27 @@ class Transformer(Model): probabilities for the output sequence. """ - def __init__(self, params, name=None): - """Initialize layers to build Transformer model. + def __init__(self, params, name=None): + """Initialize layers to build Transformer model. Args: params: hyperparameter object defining layer sizes, dropout values, etc. name: name of the model. 
""" - super(Transformer, self).__init__(name=name) - self.params = params - self.embedding_softmax_layer = embedding_layer.EmbeddingLayer(params.vocab_size, params.hidden_size) - self.encoder_stack = EncoderStack(params) - self.decoder_stack = DecoderStack(params) + super(Transformer, self).__init__(name=name) + self.params = params + self.embedding_softmax_layer = embedding_layer.EmbeddingLayer( + params.vocab_size, params.hidden_size) + self.encoder_stack = EncoderStack(params) + self.decoder_stack = DecoderStack(params) - def get_config(self): - return { - "params": self.params, - } + def get_config(self): + return { + "params": self.params, + } - def forward(self, inputs, targets=None): - """Calculate target logits or inferred target sequences. + def forward(self, inputs, targets=None): + """Calculate target logits or inferred target sequences. Args: inputs: input tensor list of size 1 or 2. @@ -80,31 +84,32 @@ def forward(self, inputs, targets=None): outputs: [batch_size, decoded length] scores: [batch_size, float]} """ - # # Variance scaling is used here because it seems to work in many problems. - # # Other reasonable initializers may also work just as well. - - # Calculate attention bias for encoder self-attention and decoder - # multi-headed attention layers. - attention_bias = get_input_mask(inputs) - - # Run the inputs through the encoder layer to map the symbol - # representations to continuous representations. - # Prepare inputs to the layer stack by adding positional encodings and - # applying dropout. - embedded_inputs = self.embedding_softmax_layer(inputs) - inputs_padding = get_input_mask(inputs) - - encoder_outputs = self.encode(inputs, inputs_padding) - # Generate output sequence if targets is None, or return logits if target - # sequence is known. - if targets is None: - return self.predict(encoder_outputs, attention_bias) - else: - logits = self.decode(targets, encoder_outputs, attention_bias) - return logits - - def encode(self, inputs, attention_bias): - """Generate continuous representation for inputs. + # # Variance scaling is used here because it seems to work in many problems. + # # Other reasonable initializers may also work just as well. + + # Calculate attention bias for encoder self-attention and decoder + # multi-headed attention layers. + attention_bias = get_input_mask(inputs) + + # Run the inputs through the encoder layer to map the symbol + # representations to continuous representations. + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. + embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + + encoder_outputs, weights_encoder = self.encode(inputs, inputs_padding) + # Generate output sequence if targets is None, or return logits if target + # sequence is known. + if targets is None: + return self.predict(encoder_outputs, attention_bias), weights_encoder + else: + logits, weights_decoder = self.decode(targets, encoder_outputs, attention_bias) + return logits, weights_encoder, weights_decoder + + def encode(self, inputs, attention_bias): + """Generate continuous representation for inputs. Args: inputs: int tensor with shape [batch_size, input_length]. @@ -114,22 +119,26 @@ def encode(self, inputs, attention_bias): Returns: float tensor with shape [batch_size, input_length, hidden_size] """ - - # Prepare inputs to the layer stack by adding positional encodings and - # applying dropout. 
- embedded_inputs = self.embedding_softmax_layer(inputs) - inputs_padding = get_input_mask(inputs) - - length = tf.shape(embedded_inputs)[1] - pos_encoding = positional_encoding(length, self.params.hidden_size) - encoder_inputs = embedded_inputs + pos_encoding - - if self.is_train: - encoder_inputs = tf.nn.dropout(encoder_inputs, rate=1 - self.params.keep_prob) - return self.encoder_stack(encoder_inputs, input_mask=attention_bias) - - def decode(self, targets, encoder_outputs, attention_bias): - """Generate logits for each value in the target sequence. + + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. + embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + + length = tf.shape(embedded_inputs)[1] + pos_encoding = positional_encoding( + length, self.params.hidden_size) + encoder_inputs = embedded_inputs + pos_encoding + + if self.is_train: + encoder_inputs = tf.nn.dropout( + encoder_inputs, rate=1-self.params.keep_prob) + return self.encoder_stack( + encoder_inputs, input_mask=attention_bias) + + def decode(self, targets, encoder_outputs, attention_bias): + """Generate logits for each value in the target sequence. Args: targets: target values for the output sequence. int tensor with shape @@ -142,38 +151,43 @@ def decode(self, targets, encoder_outputs, attention_bias): Returns: float32 tensor with shape [batch_size, target_length, vocab_size] """ - with tf.name_scope("decode"): - # Prepare inputs to decoder layers by shifting targets, adding positional - # encoding and applying dropout. - decoder_inputs = self.embedding_softmax_layer(targets) - with tf.name_scope("shift_targets"): - # Shift targets to the right, and remove the last element - decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] - with tf.name_scope("add_pos_encoding"): - length = tf.shape(decoder_inputs)[1] - decoder_inputs += positional_encoding(length, self.params.hidden_size) - if self.is_train: - decoder_inputs = tf.nn.dropout(decoder_inputs, rate=1 - self.params.keep_prob) - - # Run values - decoder_self_attention_bias = get_target_mask(length) - outputs = self.decoder_stack( - decoder_inputs, - features=encoder_outputs, - input_mask=attention_bias, - target_mask=decoder_self_attention_bias, - ) - logits = self.embedding_softmax_layer(outputs, mode="linear") - return logits - - def _get_symbols_to_logits_fn(self, max_decode_length): - """Returns a decoding function that calculates logits of the next tokens.""" - - timing_signal = positional_encoding(max_decode_length + 1, self.params.hidden_size) - decoder_self_attention_bias = get_target_mask(max_decode_length) - - def symbols_to_logits_fn(ids, i, cache): - """Generate logits for next potential IDs. + with tf.name_scope("decode"): + # Prepare inputs to decoder layers by shifting targets, adding positional + # encoding and applying dropout. 
+ decoder_inputs = self.embedding_softmax_layer(targets) + with tf.name_scope("shift_targets"): + # Shift targets to the right, and remove the last element + decoder_inputs = tf.pad(decoder_inputs, + [[0, 0], [1, 0], [0, 0]])[:, :-1, :] + with tf.name_scope("add_pos_encoding"): + length = tf.shape(decoder_inputs)[1] + decoder_inputs += positional_encoding( + length, self.params.hidden_size) + if self.is_train: + decoder_inputs = tf.nn.dropout( + decoder_inputs, rate=1-self.params.keep_prob) + + # Run values + decoder_self_attention_bias = get_target_mask( + length) + outputs, weights = self.decoder_stack( + decoder_inputs, + features=encoder_outputs, + input_mask=attention_bias, + target_mask=decoder_self_attention_bias,) + logits = self.embedding_softmax_layer(outputs, mode="linear") + return logits, weights + + def _get_symbols_to_logits_fn(self, max_decode_length): + """Returns a decoding function that calculates logits of the next tokens.""" + + timing_signal = positional_encoding( + max_decode_length + 1, self.params.hidden_size) + decoder_self_attention_bias = get_target_mask( + max_decode_length) + weights = [] + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs. Args: ids: Current decoded sequences. int tensor with shape [batch_size * @@ -187,60 +201,69 @@ def symbols_to_logits_fn(ids, i, cache): (logits with shape [batch_size * beam_size, vocab_size], updated cache values) """ - # Set decoder input to the last generated IDs - decoder_input = ids[:, -1:] - - # Preprocess decoder input by getting embeddings and adding timing signal. - decoder_input = self.embedding_softmax_layer(decoder_input) - decoder_input += timing_signal[i:i + 1] - - self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1] - decoder_outputs = self.decoder_stack( - decoder_input, features=cache.get("encoder_outputs"), target_mask=self_attention_bias, - input_mask=cache.get("encoder_decoder_attention_bias"), cache=cache - ) - logits = self.embedding_softmax_layer(decoder_outputs, mode="linear") - logits = tf.squeeze(logits, axis=[1]) - return logits, cache - - return symbols_to_logits_fn - - def predict(self, encoder_outputs, encoder_decoder_attention_bias): - """Return predicted sequence.""" - batch_size = tf.shape(encoder_outputs)[0] - input_length = tf.shape(encoder_outputs)[1] - max_decode_length = input_length + self.params.extra_decode_length - - symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length) - - # Create initial set of IDs that will be passed into symbols_to_logits_fn. - initial_ids = tf.zeros([batch_size], dtype=tf.int32) - - # Create cache storing decoder attention values for each layer. - # pylint: disable=g-complex-comprehension - cache = { - "layer_%d" % layer: { - "k": tf.zeros([batch_size, 0, self.params.hidden_size]), - "v": tf.zeros([batch_size, 0, self.params.hidden_size]) - } for layer in range(self.params.encoder_num_layers) - } - - # Add encoder output and attention bias to the cache. - cache["encoder_outputs"] = encoder_outputs - cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias - - # Use beam search to find the top beam_size sequences and scores. 
- decoded_ids, scores = beam_search.sequence_beam_search( - symbols_to_logits_fn=symbols_to_logits_fn, initial_ids=initial_ids, initial_cache=cache, - vocab_size=self.params.vocab_size, beam_size=self.params.beam_size, alpha=self.params.alpha, - max_decode_length=max_decode_length, eos_id=1 - ) - - # Get the top sequence for each batch element - top_decoded_ids = decoded_ids[:, 0, 1:] - top_scores = scores[:, 0] - - return {"outputs": top_decoded_ids, "scores": top_scores} + # Set decoder input to the last generated IDs + decoder_input = ids[:, -1:] + + # Preprocess decoder input by getting embeddings and adding timing signal. + decoder_input = self.embedding_softmax_layer(decoder_input) + decoder_input += timing_signal[i:i + 1] + + self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1] + decoder_outputs, weight = self.decoder_stack( + decoder_input, + features=cache.get("encoder_outputs"), + target_mask=self_attention_bias, + input_mask=cache.get("encoder_decoder_attention_bias"), + cache=cache) + weights.append(weight) + logits = self.embedding_softmax_layer(decoder_outputs, mode="linear") + logits = tf.squeeze(logits, axis=[1]) + return logits, cache + + return symbols_to_logits_fn, weights + + def predict(self, encoder_outputs, encoder_decoder_attention_bias): + """Return predicted sequence.""" + batch_size = tf.shape(encoder_outputs)[0] + input_length = tf.shape(encoder_outputs)[1] + max_decode_length = input_length + self.params.extra_decode_length + + symbols_to_logits_fn, weights = self._get_symbols_to_logits_fn( + max_decode_length) + + # Create initial set of IDs that will be passed into symbols_to_logits_fn. + initial_ids = tf.zeros([batch_size], dtype=tf.int32) + + # Create cache storing decoder attention values for each layer. + # pylint: disable=g-complex-comprehension + cache = { + "layer_%d" % layer: { + "k": tf.zeros([batch_size, 0, self.params.hidden_size]), + "v": tf.zeros([batch_size, 0, self.params.hidden_size]) + } for layer in range(self.params.encoder_num_layers) + } + # pylint: enable=g-complex-comprehension + + # Add encoder output and attention bias to the cache. + cache["encoder_outputs"] = encoder_outputs + cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias + + # Use beam search to find the top beam_size sequences and scores. 
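For orientation, a minimal sketch of the shapes that the slicing below assumes the beam search to return (hypothetical sizes):

    import tensorflow as tf

    batch_size, beam_size, decoded_length = 2, 4, 7    # hypothetical sizes
    decoded_ids = tf.zeros([batch_size, beam_size, decoded_length], dtype=tf.int32)
    scores = tf.zeros([batch_size, beam_size])
    top_decoded_ids = decoded_ids[:, 0, 1:]            # keep the best beam, drop the initial id
    top_scores = scores[:, 0]
    print(top_decoded_ids.shape, top_scores.shape)     # (2, 6) (2,)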
+ decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, + initial_ids=initial_ids, + initial_cache=cache, + vocab_size=self.params.vocab_size, + beam_size=self.params.beam_size, + alpha=self.params.alpha, + max_decode_length=max_decode_length, + eos_id=1) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, 1:] + top_scores = scores[:, 0] + + return {"outputs": top_decoded_ids, "scores": top_scores}, weights class LayerNormalization(tl.layers.Layer): @@ -279,44 +302,42 @@ def __repr__(self): class PrePostProcessingWrapper(Model): - """Wrapper class that applies layer pre-processing and post-processing.""" - - def __init__(self, layer, params): - super(PrePostProcessingWrapper, self).__init__() - self.layer = layer - self.params = params - self.postprocess_dropout = 1 - params.keep_prob - self.layer_norm = LayerNormalization(self.params.hidden_size) - - def get_config(self): - return { - "params": self.params, - } - - def forward(self, inputs, *args, **kwargs): - """Calls wrapped layer with same parameters.""" - - if (type(inputs) == list): - if (len(inputs) == 2): - x = decoder_input = inputs[0] - encoder_output = inputs[1] - decoder_input = self.layer_norm(decoder_input) - # Get layer output - y = self.layer([decoder_input, encoder_output], *args, **kwargs) - - else: - x = inputs - y = self.layer_norm(inputs) - y = self.layer(y, *args, **kwargs) - - # Postprocessing: apply dropout and residual connection - if self.is_train: - y = tf.nn.dropout(y, rate=self.postprocess_dropout) - return x + y - + """Wrapper class that applies layer pre-processing and post-processing.""" + + def __init__(self, layer, params): + super(PrePostProcessingWrapper, self).__init__() + self.layer = layer + self.params = params + self.postprocess_dropout = 1-params.keep_prob + self.layer_norm = LayerNormalization(self.params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, get_weight=False, *args, **kwargs): + """Calls wrapped layer with same parameters.""" + + x = inputs + y = self.layer_norm(x) + + # Get layer output + if (get_weight): + y, weight = self.layer(y, *args, **kwargs) + else: + y = self.layer(y, *args, **kwargs) + + # Postprocessing: apply dropout and residual connection + if self.is_train: + y = tf.nn.dropout(y, rate=self.postprocess_dropout) + if (get_weight): + return x + y, weight + else: + return x+y class EncoderStack(Model): - """Transformer encoder stack. + """Transformer encoder stack. The encoder stack is made up of N identical layers. Each layer is composed of the sublayers: @@ -324,32 +345,33 @@ class EncoderStack(Model): 2. Feedforward network (which is 2 fully-connected layers) """ - def __init__(self, params): - super(EncoderStack, self).__init__() - self.params = params - self.layers = [] - for _ in range(params.encoder_num_layers): - # Create sublayers for each layer. - self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) - feed_forward_network = FeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) - - self.layers.append( - [ - PrePostProcessingWrapper(self_attention_layer, params), - PrePostProcessingWrapper(feed_forward_network, params) - ] - ) - - # Create final layer normalization layer. 
- self.output_normalization = LayerNormalization(params.hidden_size) - - def get_config(self): - return { - "params": self.params, - } - - def forward(self, inputs, input_mask): - """Return the output of the encoder layer stacks. + def __init__(self, params): + super(EncoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.encoder_num_layers): + # Create sublayers for each layer. + self_attention_layer = SelfAttentionLayer( + params.num_heads, params.hidden_size, + params.keep_prob) + feed_forward_network = FeedForwardLayer( + params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append([ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ]) + + # Create final layer normalization layer. + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, input_mask): + """Return the output of the encoder layer stacks. Args: encoder_inputs: tensor with shape [batch_size, input_length, hidden_size] @@ -363,25 +385,27 @@ def forward(self, inputs, input_mask): Output of encoder layer stack. float32 tensor with shape [batch_size, input_length, hidden_size] """ - encoder_inputs = inputs - for n, layer in enumerate(self.layers): - # Run inputs through the sublayers. - self_attention_layer = layer[0] - feed_forward_network = layer[1] + encoder_inputs = inputs + weights = {} + for n, layer in enumerate(self.layers): + # Run inputs through the sublayers. + self_attention_layer = layer[0] + feed_forward_network = layer[1] - with tf.name_scope("layer_%d" % n): - with tf.name_scope("self_attention"): - encoder_inputs = self_attention_layer(encoder_inputs, mask=input_mask) - # with tf.name_scope("layer_attention"): - # encoder_inputs = (inputs, y=encoder_inputs, mask=input_mask) - with tf.name_scope("ffn"): - encoder_inputs = feed_forward_network(encoder_inputs) + with tf.name_scope("layer_%d" % n): + with tf.name_scope("self_attention"): + encoder_inputs, weight= self_attention_layer( + encoder_inputs, mask=input_mask, get_weight=True) + weights["layer_%d" % n] = weight + with tf.name_scope("ffn"): + encoder_inputs = feed_forward_network( + encoder_inputs) - return self.output_normalization(encoder_inputs) + return self.output_normalization(encoder_inputs), weights class DecoderStack(Model): - """Transformer decoder stack. + """Transformer decoder stack. Like the encoder stack, the decoder stack is made up of N identical layers. Each layer is composed of the sublayers: @@ -391,31 +415,34 @@ class DecoderStack(Model): 3. 
Feedforward network (2 fully-connected layers) """ - def __init__(self, params): - super(DecoderStack, self).__init__() - self.params = params - self.layers = [] - for _ in range(params.decoder_num_layers): - self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) - enc_dec_attention_layer = MultiHeadAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) - feed_forward_network = FeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) - - self.layers.append( - [ - PrePostProcessingWrapper(self_attention_layer, params), - PrePostProcessingWrapper(enc_dec_attention_layer, params), - PrePostProcessingWrapper(feed_forward_network, params) - ] - ) - self.output_normalization = LayerNormalization(params.hidden_size) - - def get_config(self): - return { - "params": self.params, - } - - def forward(self, inputs, features, input_mask, target_mask, cache=None): - """Return the output of the decoder layer stacks. + def __init__(self, params): + super(DecoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.decoder_num_layers): + self_attention_layer = SelfAttentionLayer( + params.num_heads, params.hidden_size, + params.keep_prob) + enc_dec_attention_layer = MultiHeadAttentionLayer( + params.num_heads, params.hidden_size, + params.keep_prob) + feed_forward_network = FeedForwardLayer( + params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append([ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(enc_dec_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ]) + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, features, input_mask, target_mask, cache=None): + """Return the output of the decoder layer stacks. Args: decoder_inputs: tensor with shape [batch_size, target_length, hidden_size] @@ -435,26 +462,39 @@ def forward(self, inputs, features, input_mask, target_mask, cache=None): Output of decoder layer stack. float32 tensor with shape [batch_size, target_length, hidden_size] """ - decoder_inputs = inputs - decoder_self_attention_bias = target_mask - encoder_outputs = features - attention_bias = input_mask - for n, layer in enumerate(self.layers): - self_attention_layer = layer[0] - enc_dec_attention_layer = layer[1] - feed_forward_network = layer[2] - - # Run inputs through the sublayers. - layer_name = "layer_%d" % n - layer_cache = cache[layer_name] if cache is not None else None - with tf.name_scope(layer_name): - with tf.name_scope("self_attention"): - decoder_inputs = self_attention_layer( - decoder_inputs, mask=decoder_self_attention_bias, cache=layer_cache - ) - with tf.name_scope("encdec_attention"): - decoder_inputs = enc_dec_attention_layer([decoder_inputs, encoder_outputs], mask=attention_bias) - with tf.name_scope("ffn"): - decoder_inputs = feed_forward_network(decoder_inputs) - - return self.output_normalization(decoder_inputs) + decoder_inputs = inputs + decoder_self_attention_bias = target_mask + encoder_outputs = features + attention_bias = input_mask + weights_all = {"self":{}, "enc_dec":{}} + for n, layer in enumerate(self.layers): + self_attention_layer = layer[0] + enc_dec_attention_layer = layer[1] + feed_forward_network = layer[2] + + # Run inputs through the sublayers. 
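A small standalone sketch of the role of layer_cache in the lines below, with hypothetical sizes: cache is None during training, while during incremental decoding each layer's entry is expected to accumulate the keys and values of earlier positions so only the newest position has to be processed.

    import tensorflow as tf

    batch_size, hidden_size = 2, 64                    # hypothetical sizes
    layer_cache = {"k": tf.zeros([batch_size, 0, hidden_size]),
                   "v": tf.zeros([batch_size, 0, hidden_size])}
    new_k = tf.ones([batch_size, 1, hidden_size])      # key slice for the current position
    layer_cache["k"] = tf.concat([layer_cache["k"], new_k], axis=1)
    print(layer_cache["k"].shape)                      # (2, 1, 64), growing by one per step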
+ layer_name = "layer_%d" % n + layer_cache = cache[layer_name] if cache is not None else None + + with tf.name_scope(layer_name): + with tf.name_scope("self_attention"): + decoder_inputs,weight_self = self_attention_layer( + decoder_inputs, get_weight=True, + mask=decoder_self_attention_bias, + cache=layer_cache) + weights_all['self']["layer_%d" % n] = weight_self + with tf.name_scope("encdec_attention"): + decoder_inputs, weight_enc_dec = enc_dec_attention_layer( + decoder_inputs, get_weight=True, + y=encoder_outputs, + mask=attention_bias) + weights_all['enc_dec']["layer_%d" % n] = weight_enc_dec + with tf.name_scope("ffn"): + decoder_inputs = feed_forward_network( + decoder_inputs) + + return self.output_normalization(decoder_inputs), weights_all + + + + From 61bf27f2a4b3499b6c8164977e152fb05ed30165 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sat, 31 Aug 2019 13:57:45 +0100 Subject: [PATCH 06/22] optimizer update --- .../models/transformer/utils/optimizer.py | 147 ------------------ tensorlayer/optimizers/lazyAdam.py | 2 +- 2 files changed, 1 insertion(+), 148 deletions(-) delete mode 100644 tensorlayer/models/transformer/utils/optimizer.py diff --git a/tensorlayer/models/transformer/utils/optimizer.py b/tensorlayer/models/transformer/utils/optimizer.py deleted file mode 100644 index 9fa27f69f..000000000 --- a/tensorlayer/models/transformer/utils/optimizer.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Optimizer from addons and learning rate scheduler.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import tensorflow as tf -K = tf.keras.backend - - -class LazyAdam(tf.keras.optimizers.Adam): - """Variant of the Adam optimizer that handles sparse updates more efficiently. - - The original Adam algorithm maintains two moving-average accumulators for - each trainable variable; the accumulators are updated at every step. - This class provides lazier handling of gradient updates for sparse - variables. It only updates moving-average accumulators for sparse variable - indices that appear in the current batch, rather than updating the - accumulators for all indices. Compared with the original Adam optimizer, - it can provide large improvements in model training throughput for some - applications. However, it provides slightly different semantics than the - original Adam algorithm, and may lead to different empirical results. - Note, amsgrad is currently not supported and the argument can only be - False. 
- - This class is borrowed from: - https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py - """ - - def _resource_apply_sparse(self, grad, var, indices): - """Applies grad for one step.""" - var_dtype = var.dtype.base_dtype - lr_t = self._decayed_lr(var_dtype) - beta_1_t = self._get_hyper('beta_1', var_dtype) - beta_2_t = self._get_hyper('beta_2', var_dtype) - local_step = tf.cast(self.iterations + 1, var_dtype) - beta_1_power = tf.math.pow(beta_1_t, local_step) - beta_2_power = tf.math.pow(beta_2_t, local_step) - epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype) - lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power)) - - # \\(m := beta1 * m + (1 - beta1) * g_t\\) - m = self.get_slot(var, 'm') - m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad - - m_update_kwargs = {'resource': m.handle, 'indices': indices, 'updates': m_t_slice} - m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs) - - # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) - v = self.get_slot(var, 'v') - v_t_slice = (beta_2_t * tf.gather(v, indices) + (1 - beta_2_t) * tf.math.square(grad)) - - v_update_kwargs = {'resource': v.handle, 'indices': indices, 'updates': v_t_slice} - v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs) - - # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) - var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t) - - var_update_kwargs = {'resource': var.handle, 'indices': indices, 'updates': var_slice} - var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs) - - return tf.group(*[var_update_op, m_update_op, v_update_op]) - - -class LearningRateFn(object): - """Creates learning rate function.""" - - def __init__(self, learning_rate, hidden_size, warmup_steps): - self.learning_rate = learning_rate - self.hidden_size = hidden_size - self.warmup_steps = float(warmup_steps) - - def __call__(self, global_step): - """Calculate learning rate with linear warmup and rsqrt decay.""" - step = float(global_step) - learning_rate = self.learning_rate - learning_rate *= (self.hidden_size**-0.5) - # Apply linear warmup - learning_rate *= np.minimum(1.0, step / self.warmup_steps) - # Apply rsqrt decay - learning_rate /= np.sqrt(np.maximum(step, self.warmup_steps)) - return learning_rate - - -class LearningRateScheduler(tf.keras.callbacks.Callback): - """Keras callback to schedule learning rate. - - TODO(tianlin): Refactor this scheduler and LearningRateBatchScheduler in - official/resnet/keras/keras_common.py. - """ - - def __init__(self, schedule, init_steps=None, verbose=False): - super(LearningRateScheduler, self).__init__() - self.schedule = schedule - self.verbose = verbose - if init_steps is None: - init_steps = 0.0 - self.steps = float(init_steps) # Total steps during training. 
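For orientation, a small standalone check of the schedule implemented by LearningRateFn above, with hypothetical hyper-parameters: the rate rises linearly over the first warmup_steps steps and then decays as 1/sqrt(step).

    import numpy as np

    def noam_lr(step, learning_rate=2.0, hidden_size=512, warmup_steps=4000.0):
        lr = learning_rate * hidden_size ** -0.5
        lr *= np.minimum(1.0, step / warmup_steps)      # linear warmup
        lr /= np.sqrt(np.maximum(step, warmup_steps))   # rsqrt decay
        return lr

    print(noam_lr(400.0))     # ~1.4e-4, still warming up
    print(noam_lr(4000.0))    # ~1.4e-3, peak at the end of warmup
    print(noam_lr(16000.0))   # ~7.0e-4, down by 1/sqrt(4) from the peak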
- - def on_epoch_begin(self, epoch, logs=None): - if not hasattr(self.model.optimizer, 'lr'): - raise ValueError('Optimizer must have a "lr" attribute.') - if not hasattr(self.model.optimizer, 'iterations'): - raise ValueError('Optimizer must have a "iterations" attribute.') - - def on_train_batch_begin(self, batch, logs=None): - """Adjusts learning rate for each train batch.""" - if self.verbose > 0: - iterations = K.get_value(self.model.optimizer.iterations) - print('Original iteration %d' % iterations) - - self.steps += 1.0 - try: # new API - lr = float(K.get_value(self.model.optimizer.lr)) - lr = self.schedule(self.steps, lr) - except TypeError: # Support for old API for backward compatibility - lr = self.schedule(self.steps) - if not isinstance(lr, (float, np.float32, np.float64)): - raise ValueError('The output of the "schedule" function ' 'should be float.') - K.set_value(self.model.optimizer.lr, lr) - K.set_value(self.model.optimizer.iterations, self.steps) - - if self.verbose > 0: - print( - 'Batch %05d Step %05d: LearningRateScheduler setting learning ' - 'rate to %s.' % (batch + 1, self.steps, lr) - ) - - def on_epoch_end(self, epoch, logs=None): - logs = logs or {} - logs['lr'] = K.get_value(self.model.optimizer.lr) - logs['steps'] = self.steps diff --git a/tensorlayer/optimizers/lazyAdam.py b/tensorlayer/optimizers/lazyAdam.py index 9fa27f69f..75ae77f65 100644 --- a/tensorlayer/optimizers/lazyAdam.py +++ b/tensorlayer/optimizers/lazyAdam.py @@ -23,7 +23,7 @@ K = tf.keras.backend -class LazyAdam(tf.keras.optimizers.Adam): +class LazyAdam(tf.optimizers.Adam): """Variant of the Adam optimizer that handles sparse updates more efficiently. The original Adam algorithm maintains two moving-average accumulators for From 3ef8d8b76cb346b43cff76486fc857c3b0b16e31 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sat, 31 Aug 2019 15:18:03 +0100 Subject: [PATCH 07/22] fix --- tensorlayer/models/transformer/utils/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorlayer/models/transformer/utils/__init__.py b/tensorlayer/models/transformer/utils/__init__.py index 13b4fe535..63ab4a4f5 100644 --- a/tensorlayer/models/transformer/utils/__init__.py +++ b/tensorlayer/models/transformer/utils/__init__.py @@ -1,3 +1,2 @@ from .model_utils import * -from .optimizer import * from .metrics import * \ No newline at end of file From 048d9a34063566cdbac38dadf9f867187464216d Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sun, 1 Sep 2019 11:15:10 +0100 Subject: [PATCH 08/22] add attention visualisation --- tests/models/test_transformer.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/tests/models/test_transformer.py b/tests/models/test_transformer.py index 193e18401..27ffcdfc1 100644 --- a/tests/models/test_transformer.py +++ b/tests/models/test_transformer.py @@ -4,8 +4,6 @@ import os import unittest -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - import numpy as np import tensorflow as tf import tensorlayer as tl @@ -14,8 +12,8 @@ from tensorlayer.models.transformer import Transformer from tests.utils import CustomTestCase from tensorlayer.models.transformer.utils import metrics -from tensorlayer.cost import cross_entropy_seq from tensorlayer.optimizers import lazyAdam as optimizer +from tensorlayer.models.transformer.utils import attention_visualisation import time @@ -51,7 +49,7 @@ class Model_SEQ2SEQ_Test(CustomTestCase): @classmethod def setUpClass(cls): - cls.batch_size = 16 + cls.batch_size = 50 cls.embedding_size = 32 cls.dec_seq_length = 
5 @@ -66,7 +64,7 @@ def setUpClass(cls): assert cls.src_len == cls.tgt_len - cls.num_epochs = 1000 + cls.num_epochs = 20 cls.n_step = cls.src_len // cls.batch_size @classmethod @@ -99,8 +97,8 @@ def test_basic_simpleSeq2Seq(self): grad = tape.gradient(loss, model_.all_weights) optimizer.apply_gradients(zip(grad, model_.all_weights)) - + total_loss += loss n_iter += 1 print(time.time()-t) @@ -115,5 +113,20 @@ def test_basic_simpleSeq2Seq(self): print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter)) + # visualise the self-attention weights at encoder + trainX, trainY = shuffle(self.trainX, self.trainY) + X = [trainX[0]] + Y = [trainY[0]] + logits, weights_encoder, weights_decoder = model_(inputs = X, targets = Y) + attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], X[0].numpy(), X[0].numpy()) + + # visualise the self-attention weights at encoder + trainX, trainY = shuffle(self.trainX, self.trainY) + X = [trainX[0]] + Y = [trainY[0]] + logits, weights_encoder, weights_decoder = model_(inputs = X, targets = Y) + attention_visualisation.plot_attention_weights(weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), Y[0]) + + if __name__ == '__main__': unittest.main() From a47aee115a6145005a8bb09b1c140753f3933597 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sun, 1 Sep 2019 11:15:46 +0100 Subject: [PATCH 09/22] add attention visualisation --- .../models/transformer/utils/__init__.py | 4 +- .../utils/attention_visualisation.py | 41 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 tensorlayer/models/transformer/utils/attention_visualisation.py diff --git a/tensorlayer/models/transformer/utils/__init__.py b/tensorlayer/models/transformer/utils/__init__.py index 63ab4a4f5..1786a4e0f 100644 --- a/tensorlayer/models/transformer/utils/__init__.py +++ b/tensorlayer/models/transformer/utils/__init__.py @@ -1,2 +1,4 @@ from .model_utils import * -from .metrics import * \ No newline at end of file +from .metrics import * +from .subtokenizer import * +from .attention_visualisation import * \ No newline at end of file diff --git a/tensorlayer/models/transformer/utils/attention_visualisation.py b/tensorlayer/models/transformer/utils/attention_visualisation.py new file mode 100644 index 000000000..376a5c5cd --- /dev/null +++ b/tensorlayer/models/transformer/utils/attention_visualisation.py @@ -0,0 +1,41 @@ +import matplotlib.pyplot as plt +import tensorflow as tf +def plot_attention_weights(attention, key, query): + + '''Attention visualisation for Transformer + + Parameters + ---------- + attention : attention weights + shape of (1, number of head, length of key, length of query). 
+ + key : key for attention computation + a list of values which would be shown as xtick labels + + value : value for attention computation + a list of values which would be shown as ytick labels + + ''' + + + fig = plt.figure(figsize=(16, 8)) + + attention = tf.squeeze(attention, axis=0) + + for head in range(attention.shape[0]): + ax = fig.add_subplot(attention.shape[0]//2, 2, head+1) + ax.matshow(attention[head], cmap='viridis') + fontdict = {'fontsize': 12} + ax.set_xticks(range(len(key))) + ax.set_yticks(range(len(query))) + + # ax.set_ylim(len(query)-1.5, -0.5) + ax.set_xticklabels( + [str(i) for i in key], + fontdict=fontdict, rotation=90) + + ax.set_yticklabels([str(i) for i in query], fontdict=fontdict) + + ax.set_xlabel('Head {}'.format(head+1), fontdict = fontdict) + plt.tight_layout() + plt.show() \ No newline at end of file From 3c4cae17d134236338be566839b4e6b8691c44fe Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sun, 1 Sep 2019 12:24:31 +0100 Subject: [PATCH 10/22] add decoder part attention visualisation --- .../models/transformer/attention_layer.py | 1 + tensorlayer/models/transformer/transformer.py | 73 +++++++++++++++++-- .../models/transformer/utils/__init__.py | 1 - tests/models/test_transformer.py | 17 ++++- 4 files changed, 79 insertions(+), 13 deletions(-) diff --git a/tensorlayer/models/transformer/attention_layer.py b/tensorlayer/models/transformer/attention_layer.py index acf474584..36e27b380 100644 --- a/tensorlayer/models/transformer/attention_layer.py +++ b/tensorlayer/models/transformer/attention_layer.py @@ -124,6 +124,7 @@ def forward(self, x, y, mask, cache=None): Returns: Attention layer output with shape [batch_size, length_x, hidden_size] + Attention weights with shape [batch_size, number_of_head, length_x, length_y] """ # Linearly project the query (q), key (k) and value (v) using different # learned projections. This is in preparation of splitting them into diff --git a/tensorlayer/models/transformer/transformer.py b/tensorlayer/models/transformer/transformer.py index cef8071f1..fed6bb480 100644 --- a/tensorlayer/models/transformer/transformer.py +++ b/tensorlayer/models/transformer/transformer.py @@ -77,12 +77,55 @@ def forward(self, inputs, targets=None): training: boolean, whether in training mode or not. Returns: - If targets is defined, then return logits for each word in the target - sequence. float tensor with shape [batch_size, target_length, vocab_size] - If target is none, then generate output sequence one token at a time. - returns a dictionary { - outputs: [batch_size, decoded length] - scores: [batch_size, float]} + If targets is defined: + Logits for each word in the target sequence: + float tensor with shape [batch_size, target_length, vocab_size] + Self-attention weights for encoder part: + a dictionary of float tensors { + "layer_0": [batch_size, number_of_heads, source_length, source_length], + "layer_1": [batch_size, number_of_heads, source_length, source_length], + ... + } + Weights for decoder part: + a dictionary of dictionary of float tensors { + "self": { + "layer_0": [batch_size, number_of_heads, target_length, target_length], + "layer_1": [batch_size, number_of_heads, target_length, target_length], + ... + } + "enc_dec": { + "layer_0": [batch_size, number_of_heads, source_length, target_length], + "layer_1": [batch_size, number_of_heads, source_length, target_length], + ... 
+ } + } + + If target is none: + Auto-regressive beam-search decoding to generate output each one time step: + a dictionary { + outputs: [batch_size, decoded length] + scores: [batch_size, float]} + } + Weights for decoder part: + a dictionary of dictionary of float tensors { + "self": { + "layer_0": [batch_size, number_of_heads, target_length, target_length], + "layer_1": [batch_size, number_of_heads, target_length, target_length], + ... + } + "enc_dec": { + "layer_0": [batch_size, number_of_heads, source_length, target_length], + "layer_1": [batch_size, number_of_heads, source_length, target_length], + ... + } + } + Self-attention weights for encoder part: + a dictionary of float tensors { + "layer_0": [batch_size, number_of_heads, source_length, source_length], + "layer_1": [batch_size, number_of_heads, source_length, source_length], + ... + } + """ # # Variance scaling is used here because it seems to work in many problems. # # Other reasonable initializers may also work just as well. @@ -118,6 +161,7 @@ def encode(self, inputs, attention_bias): Returns: float tensor with shape [batch_size, input_length, hidden_size] + """ # Prepare inputs to the layer stack by adding positional encodings and @@ -223,7 +267,12 @@ def symbols_to_logits_fn(ids, i, cache): return symbols_to_logits_fn, weights def predict(self, encoder_outputs, encoder_decoder_attention_bias): - """Return predicted sequence.""" + """ + + Return predicted sequence, and decoder attention weights. + + + """ batch_size = tf.shape(encoder_outputs)[0] input_length = tf.shape(encoder_outputs)[1] max_decode_length = input_length + self.params.extra_decode_length @@ -263,7 +312,15 @@ def predict(self, encoder_outputs, encoder_decoder_attention_bias): top_decoded_ids = decoded_ids[:, 0, 1:] top_scores = scores[:, 0] - return {"outputs": top_decoded_ids, "scores": top_scores}, weights + # post-process the weight attention + for i, weight in enumerate(weights): + if (i == 0): + w = weight + else: + for k in range(len(w['self'])): + w['self']['layer_%d' % k] = tf.concat([w['self']['layer_%d' % k], weight['self']['layer_%d' % k]], 3) + w['enc_dec']['layer_%d' % k] = tf.concat([w['enc_dec']['layer_%d' % k], weight['enc_dec']['layer_%d' % k]], 2) + return {"outputs": top_decoded_ids, "scores": top_scores}, w class LayerNormalization(tl.layers.Layer): diff --git a/tensorlayer/models/transformer/utils/__init__.py b/tensorlayer/models/transformer/utils/__init__.py index 1786a4e0f..830f64ecd 100644 --- a/tensorlayer/models/transformer/utils/__init__.py +++ b/tensorlayer/models/transformer/utils/__init__.py @@ -1,4 +1,3 @@ from .model_utils import * from .metrics import * -from .subtokenizer import * from .attention_visualisation import * \ No newline at end of file diff --git a/tests/models/test_transformer.py b/tests/models/test_transformer.py index 27ffcdfc1..ef0470ef3 100644 --- a/tests/models/test_transformer.py +++ b/tests/models/test_transformer.py @@ -41,7 +41,7 @@ class TINY_PARAMS(object): # Default prediction params extra_decode_length=5 - beam_size=2 + beam_size=1 alpha=0.6 # used to calculate length normalization in beam search @@ -64,7 +64,7 @@ def setUpClass(cls): assert cls.src_len == cls.tgt_len - cls.num_epochs = 20 + cls.num_epochs = 100 cls.n_step = cls.src_len // cls.batch_size @classmethod @@ -108,25 +108,34 @@ def test_basic_simpleSeq2Seq(self): model_.eval() [prediction, weights_decoder], weights_encoder = model_(inputs = test_sample) + print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, 
:], "\n\n") print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter)) - # visualise the self-attention weights at encoder + # visualise the self-attention weights at encoder during training trainX, trainY = shuffle(self.trainX, self.trainY) X = [trainX[0]] Y = [trainY[0]] logits, weights_encoder, weights_decoder = model_(inputs = X, targets = Y) attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], X[0].numpy(), X[0].numpy()) - # visualise the self-attention weights at encoder + # visualise the encoder-decoder-attention weights at decoder during training trainX, trainY = shuffle(self.trainX, self.trainY) X = [trainX[0]] Y = [trainY[0]] logits, weights_encoder, weights_decoder = model_(inputs = X, targets = Y) attention_visualisation.plot_attention_weights(weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), Y[0]) + # visualise the encoder-decoder-attention weights at decoder during inference + trainX, trainY = shuffle(self.trainX, self.trainY) + X = [trainX[0]] + # Y = [trainY[0]] + model_.eval() + [prediction, weights_decoder], weights_encoder = model_(inputs = X) + # print(X[0].numpy(), prediction["outputs"][0].numpy()) + attention_visualisation.plot_attention_weights(weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), prediction["outputs"][0].numpy()) if __name__ == '__main__': unittest.main() From 4d2e19e465bf1ccc0b186f682ad95f5bc0596782 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Mon, 2 Sep 2019 10:40:52 +0100 Subject: [PATCH 11/22] documentation --- .../models/transformer/attention_layer.py | 19 +- tensorlayer/models/transformer/transformer.py | 649 +++++++++--------- tests/models/test_transformer.py | 47 +- 3 files changed, 353 insertions(+), 362 deletions(-) diff --git a/tensorlayer/models/transformer/attention_layer.py b/tensorlayer/models/transformer/attention_layer.py index 36e27b380..24b76cc67 100644 --- a/tensorlayer/models/transformer/attention_layer.py +++ b/tensorlayer/models/transformer/attention_layer.py @@ -23,16 +23,21 @@ class MultiHeadAttentionLayer(tl.layers.Layer): - """Multi-headed attention layer.""" + """The :class:`MultiHeadAttentionLayer` layer is for multi-head attention computation. + The weight computation is between "key" and "query", which will then matmul with "value" to generate information + that selectively focuses on the "query" messages. + Parameters + ----------- + num_heads : int + The number of heads which allow attention computation for different features + hidden_size : int + Out dim for the layer + keep_prob : float + Keep probablity for drop-out mechanism between 0 and 1 + """ def __init__(self, num_heads, hidden_size, keep_prob): - """Initialize Attention. - Args: - hidden_size: int, output dim of hidden layer. - num_heads: int, number of heads to repeat the same attention structure. - keep_prob: float, keep rate for dropout mechanism inside attention for training. - """ if hidden_size % num_heads: raise ValueError( "Hidden size ({}) must be divisible by the number of heads ({}).".format(hidden_size, num_heads) diff --git a/tensorlayer/models/transformer/transformer.py b/tensorlayer/models/transformer/transformer.py index fed6bb480..7d7fc1d0a 100644 --- a/tensorlayer/models/transformer/transformer.py +++ b/tensorlayer/models/transformer/transformer.py @@ -33,41 +33,51 @@ import tensorlayer.models.transformer.beamsearchHelper.beam_search as beam_search - - - class Transformer(Model): - """Transformer model with weights visualisation. + """Transformer model. 
- Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf - - The Transformer model consists of an encoder and decoder. The input is an int - sequence (or a batch of sequences). The encoder produces a continuous - representation, and the decoder uses the encoder output to generate - probabilities for the output sequence. - """ + Parameters + ---------- + params: class + Hyper-parameters of the model including vocab_size, encoder_num_layers, decoder_num_layers, + hidden_size, ff_size, num_heads and keep_prob for training; + and extra_decode_length, beam_size and alpha for inference. + + Examples + --------- + >>> class TINY_PARAMS(object): + >>> vocab_size = 50 + >>> encoder_num_layers = 2 + >>> decoder_num_layers = 2 + >>> hidden_size = 64 + >>> ff_size = 16 + >>> num_heads = 4 + >>> keep_prob = 0.9 + >>> extra_decode_length = 5 + >>> beam_size = 1 + >>> alpha = 0.6 + >>> model = Transformer(TINY_PARAMS) + + Returns + ------- + Stacked-layer transformer model. + """ - def __init__(self, params, name=None): - """Initialize layers to build Transformer model. + def __init__(self, params, name=None): - Args: - params: hyperparameter object defining layer sizes, dropout values, etc. - name: name of the model. - """ - super(Transformer, self).__init__(name=name) - self.params = params - self.embedding_softmax_layer = embedding_layer.EmbeddingLayer( - params.vocab_size, params.hidden_size) - self.encoder_stack = EncoderStack(params) - self.decoder_stack = DecoderStack(params) + super(Transformer, self).__init__(name=name) + self.params = params + self.embedding_softmax_layer = embedding_layer.EmbeddingLayer(params.vocab_size, params.hidden_size) + self.encoder_stack = EncoderStack(params) + self.decoder_stack = DecoderStack(params) - def get_config(self): - return { - "params": self.params, - } + def get_config(self): + return { + "params": self.params, + } - def forward(self, inputs, targets=None): - """Calculate target logits or inferred target sequences. + def forward(self, inputs, targets=None): + """Calculate target logits or inferred target sequences. Args: inputs: input tensor list of size 1 or 2. @@ -127,32 +137,31 @@ def forward(self, inputs, targets=None): } """ - # # Variance scaling is used here because it seems to work in many problems. - # # Other reasonable initializers may also work just as well. - - # Calculate attention bias for encoder self-attention and decoder - # multi-headed attention layers. - attention_bias = get_input_mask(inputs) - - # Run the inputs through the encoder layer to map the symbol - # representations to continuous representations. - # Prepare inputs to the layer stack by adding positional encodings and - # applying dropout. - embedded_inputs = self.embedding_softmax_layer(inputs) - inputs_padding = get_input_mask(inputs) - - - encoder_outputs, weights_encoder = self.encode(inputs, inputs_padding) - # Generate output sequence if targets is None, or return logits if target - # sequence is known. - if targets is None: - return self.predict(encoder_outputs, attention_bias), weights_encoder - else: - logits, weights_decoder = self.decode(targets, encoder_outputs, attention_bias) - return logits, weights_encoder, weights_decoder + # # Variance scaling is used here because it seems to work in many problems. + # # Other reasonable initializers may also work just as well. + + # Calculate attention bias for encoder self-attention and decoder + # multi-headed attention layers. 
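As a usage sketch mirroring tests/models/test_transformer.py, the two call modes handled below look roughly like this (X and Y are batches of integer token ids, TINY_PARAMS as defined in the tests):

    import numpy as np

    X = np.random.randint(low=2, high=50, size=(4, 11))
    Y = np.random.randint(low=2, high=50, size=(4, 10))
    model = Transformer(TINY_PARAMS)

    model.train()   # teacher forcing: logits plus encoder/decoder attention weights
    logits, weights_encoder, weights_decoder = model(inputs=X, targets=Y)

    model.eval()    # inference: beam-search outputs plus attention weights
    [prediction, weights_decoder], weights_encoder = model(inputs=X)
    print(prediction["outputs"])   # [batch_size, decoded_length] token ids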
+ attention_bias = get_input_mask(inputs) + + # Run the inputs through the encoder layer to map the symbol + # representations to continuous representations. + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. + embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + encoder_outputs, weights_encoder = self.encode(inputs, inputs_padding) + # Generate output sequence if targets is None, or return logits if target + # sequence is known. + if targets is None: + return self.predict(encoder_outputs, attention_bias), weights_encoder + else: + logits, weights_decoder = self.decode(targets, encoder_outputs, attention_bias) + return logits, weights_encoder, weights_decoder - def encode(self, inputs, attention_bias): - """Generate continuous representation for inputs. + def encode(self, inputs, attention_bias): + """Generate continuous representation for inputs. Args: inputs: int tensor with shape [batch_size, input_length]. @@ -161,28 +170,29 @@ def encode(self, inputs, attention_bias): Returns: float tensor with shape [batch_size, input_length, hidden_size] - + Self-attention weights for encoder part: + a dictionary of float tensors { + "layer_0": [batch_size, number_of_heads, source_length, source_length], + "layer_1": [batch_size, number_of_heads, source_length, source_length], + ... + } """ - - # Prepare inputs to the layer stack by adding positional encodings and - # applying dropout. - embedded_inputs = self.embedding_softmax_layer(inputs) - inputs_padding = get_input_mask(inputs) - - length = tf.shape(embedded_inputs)[1] - pos_encoding = positional_encoding( - length, self.params.hidden_size) - encoder_inputs = embedded_inputs + pos_encoding - - if self.is_train: - encoder_inputs = tf.nn.dropout( - encoder_inputs, rate=1-self.params.keep_prob) - return self.encoder_stack( - encoder_inputs, input_mask=attention_bias) + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. + embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + length = tf.shape(embedded_inputs)[1] + pos_encoding = positional_encoding(length, self.params.hidden_size) + encoder_inputs = embedded_inputs + pos_encoding - def decode(self, targets, encoder_outputs, attention_bias): - """Generate logits for each value in the target sequence. + if self.is_train: + encoder_inputs = tf.nn.dropout(encoder_inputs, rate=1 - self.params.keep_prob) + return self.encoder_stack(encoder_inputs, input_mask=attention_bias) + + def decode(self, targets, encoder_outputs, attention_bias): + """Generate logits for each value in the target sequence. Args: targets: target values for the output sequence. int tensor with shape @@ -194,44 +204,53 @@ def decode(self, targets, encoder_outputs, attention_bias): Returns: float32 tensor with shape [batch_size, target_length, vocab_size] + Weights for decoder part: + a dictionary of dictionary of float tensors { + "self": { + "layer_0": [batch_size, number_of_heads, target_length, target_length], + "layer_1": [batch_size, number_of_heads, target_length, target_length], + ... + } + "enc_dec": { + "layer_0": [batch_size, number_of_heads, source_length, target_length], + "layer_1": [batch_size, number_of_heads, source_length, target_length], + ... + } + } """ - with tf.name_scope("decode"): - # Prepare inputs to decoder layers by shifting targets, adding positional - # encoding and applying dropout. 
- decoder_inputs = self.embedding_softmax_layer(targets) - with tf.name_scope("shift_targets"): - # Shift targets to the right, and remove the last element - decoder_inputs = tf.pad(decoder_inputs, - [[0, 0], [1, 0], [0, 0]])[:, :-1, :] - with tf.name_scope("add_pos_encoding"): - length = tf.shape(decoder_inputs)[1] - decoder_inputs += positional_encoding( - length, self.params.hidden_size) - if self.is_train: - decoder_inputs = tf.nn.dropout( - decoder_inputs, rate=1-self.params.keep_prob) - - # Run values - decoder_self_attention_bias = get_target_mask( - length) - outputs, weights = self.decoder_stack( - decoder_inputs, - features=encoder_outputs, - input_mask=attention_bias, - target_mask=decoder_self_attention_bias,) - logits = self.embedding_softmax_layer(outputs, mode="linear") - return logits, weights - - def _get_symbols_to_logits_fn(self, max_decode_length): - """Returns a decoding function that calculates logits of the next tokens.""" - - timing_signal = positional_encoding( - max_decode_length + 1, self.params.hidden_size) - decoder_self_attention_bias = get_target_mask( - max_decode_length) - weights = [] - def symbols_to_logits_fn(ids, i, cache): - """Generate logits for next potential IDs. + with tf.name_scope("decode"): + # Prepare inputs to decoder layers by shifting targets, adding positional + # encoding and applying dropout. + decoder_inputs = self.embedding_softmax_layer(targets) + with tf.name_scope("shift_targets"): + # Shift targets to the right, and remove the last element + decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] + with tf.name_scope("add_pos_encoding"): + length = tf.shape(decoder_inputs)[1] + decoder_inputs += positional_encoding(length, self.params.hidden_size) + if self.is_train: + decoder_inputs = tf.nn.dropout(decoder_inputs, rate=1 - self.params.keep_prob) + + # Run values + decoder_self_attention_bias = get_target_mask(length) + outputs, weights = self.decoder_stack( + decoder_inputs, + features=encoder_outputs, + input_mask=attention_bias, + target_mask=decoder_self_attention_bias, + ) + logits = self.embedding_softmax_layer(outputs, mode="linear") + return logits, weights + + def _get_symbols_to_logits_fn(self, max_decode_length): + """Returns a decoding function that calculates logits of the next tokens.""" + + timing_signal = positional_encoding(max_decode_length + 1, self.params.hidden_size) + decoder_self_attention_bias = get_target_mask(max_decode_length) + weights = [] + + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs. Args: ids: Current decoded sequences. int tensor with shape [batch_size * @@ -245,82 +264,74 @@ def symbols_to_logits_fn(ids, i, cache): (logits with shape [batch_size * beam_size, vocab_size], updated cache values) """ - # Set decoder input to the last generated IDs - decoder_input = ids[:, -1:] - - # Preprocess decoder input by getting embeddings and adding timing signal. 
- decoder_input = self.embedding_softmax_layer(decoder_input) - decoder_input += timing_signal[i:i + 1] - - self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1] - decoder_outputs, weight = self.decoder_stack( - decoder_input, - features=cache.get("encoder_outputs"), - target_mask=self_attention_bias, - input_mask=cache.get("encoder_decoder_attention_bias"), - cache=cache) - weights.append(weight) - logits = self.embedding_softmax_layer(decoder_outputs, mode="linear") - logits = tf.squeeze(logits, axis=[1]) - return logits, cache - - return symbols_to_logits_fn, weights - - def predict(self, encoder_outputs, encoder_decoder_attention_bias): - """ - + # Set decoder input to the last generated IDs + decoder_input = ids[:, -1:] + + # Preprocess decoder input by getting embeddings and adding timing signal. + decoder_input = self.embedding_softmax_layer(decoder_input) + decoder_input += timing_signal[i:i + 1] + + self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1] + decoder_outputs, weight = self.decoder_stack( + decoder_input, features=cache.get("encoder_outputs"), target_mask=self_attention_bias, + input_mask=cache.get("encoder_decoder_attention_bias"), cache=cache + ) + weights.append(weight) + logits = self.embedding_softmax_layer(decoder_outputs, mode="linear") + logits = tf.squeeze(logits, axis=[1]) + return logits, cache + + return symbols_to_logits_fn, weights + + def predict(self, encoder_outputs, encoder_decoder_attention_bias): + """ Return predicted sequence, and decoder attention weights. - - """ - batch_size = tf.shape(encoder_outputs)[0] - input_length = tf.shape(encoder_outputs)[1] - max_decode_length = input_length + self.params.extra_decode_length - - symbols_to_logits_fn, weights = self._get_symbols_to_logits_fn( - max_decode_length) - - # Create initial set of IDs that will be passed into symbols_to_logits_fn. - initial_ids = tf.zeros([batch_size], dtype=tf.int32) - - # Create cache storing decoder attention values for each layer. - # pylint: disable=g-complex-comprehension - cache = { - "layer_%d" % layer: { - "k": tf.zeros([batch_size, 0, self.params.hidden_size]), - "v": tf.zeros([batch_size, 0, self.params.hidden_size]) - } for layer in range(self.params.encoder_num_layers) - } - # pylint: enable=g-complex-comprehension - - # Add encoder output and attention bias to the cache. - cache["encoder_outputs"] = encoder_outputs - cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias - - # Use beam search to find the top beam_size sequences and scores. 
- decoded_ids, scores = beam_search.sequence_beam_search( - symbols_to_logits_fn=symbols_to_logits_fn, - initial_ids=initial_ids, - initial_cache=cache, - vocab_size=self.params.vocab_size, - beam_size=self.params.beam_size, - alpha=self.params.alpha, - max_decode_length=max_decode_length, - eos_id=1) - - # Get the top sequence for each batch element - top_decoded_ids = decoded_ids[:, 0, 1:] - top_scores = scores[:, 0] - - # post-process the weight attention - for i, weight in enumerate(weights): - if (i == 0): - w = weight - else: - for k in range(len(w['self'])): - w['self']['layer_%d' % k] = tf.concat([w['self']['layer_%d' % k], weight['self']['layer_%d' % k]], 3) - w['enc_dec']['layer_%d' % k] = tf.concat([w['enc_dec']['layer_%d' % k], weight['enc_dec']['layer_%d' % k]], 2) - return {"outputs": top_decoded_ids, "scores": top_scores}, w + batch_size = tf.shape(encoder_outputs)[0] + input_length = tf.shape(encoder_outputs)[1] + max_decode_length = input_length + self.params.extra_decode_length + + symbols_to_logits_fn, weights = self._get_symbols_to_logits_fn(max_decode_length) + + # Create initial set of IDs that will be passed into symbols_to_logits_fn. + initial_ids = tf.zeros([batch_size], dtype=tf.int32) + + # Create cache storing decoder attention values for each layer. + # pylint: disable=g-complex-comprehension + cache = { + "layer_%d" % layer: { + "k": tf.zeros([batch_size, 0, self.params.hidden_size]), + "v": tf.zeros([batch_size, 0, self.params.hidden_size]) + } for layer in range(self.params.encoder_num_layers) + } + # pylint: enable=g-complex-comprehension + + # Add encoder output and attention bias to the cache. + cache["encoder_outputs"] = encoder_outputs + cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias + + # Use beam search to find the top beam_size sequences and scores. 
+ decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, initial_ids=initial_ids, initial_cache=cache, + vocab_size=self.params.vocab_size, beam_size=self.params.beam_size, alpha=self.params.alpha, + max_decode_length=max_decode_length, eos_id=1 + ) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, 1:] + top_scores = scores[:, 0] + + # post-process the weight attention + for i, weight in enumerate(weights): + if (i == 0): + w = weight + else: + for k in range(len(w['self'])): + w['self']['layer_%d' % k + ] = tf.concat([w['self']['layer_%d' % k], weight['self']['layer_%d' % k]], 3) + w['enc_dec']['layer_%d' % k + ] = tf.concat([w['enc_dec']['layer_%d' % k], weight['enc_dec']['layer_%d' % k]], 2) + return {"outputs": top_decoded_ids, "scores": top_scores}, w class LayerNormalization(tl.layers.Layer): @@ -359,42 +370,43 @@ def __repr__(self): class PrePostProcessingWrapper(Model): - """Wrapper class that applies layer pre-processing and post-processing.""" - - def __init__(self, layer, params): - super(PrePostProcessingWrapper, self).__init__() - self.layer = layer - self.params = params - self.postprocess_dropout = 1-params.keep_prob - self.layer_norm = LayerNormalization(self.params.hidden_size) - - def get_config(self): - return { - "params": self.params, - } - - def forward(self, inputs, get_weight=False, *args, **kwargs): - """Calls wrapped layer with same parameters.""" - - x = inputs - y = self.layer_norm(x) - - # Get layer output - if (get_weight): - y, weight = self.layer(y, *args, **kwargs) - else: - y = self.layer(y, *args, **kwargs) - - # Postprocessing: apply dropout and residual connection - if self.is_train: - y = tf.nn.dropout(y, rate=self.postprocess_dropout) - if (get_weight): - return x + y, weight - else: - return x+y + """Wrapper class that applies layer pre-processing and post-processing.""" + + def __init__(self, layer, params): + super(PrePostProcessingWrapper, self).__init__() + self.layer = layer + self.params = params + self.postprocess_dropout = 1 - params.keep_prob + self.layer_norm = LayerNormalization(self.params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, get_weight=False, *args, **kwargs): + """Calls wrapped layer with same parameters.""" + + x = inputs + y = self.layer_norm(x) + + # Get layer output + if (get_weight): + y, weight = self.layer(y, *args, **kwargs) + else: + y = self.layer(y, *args, **kwargs) + + # Postprocessing: apply dropout and residual connection + if self.is_train: + y = tf.nn.dropout(y, rate=self.postprocess_dropout) + if (get_weight): + return x + y, weight + else: + return x + y + class EncoderStack(Model): - """Transformer encoder stack. + """Transformer encoder stack. The encoder stack is made up of N identical layers. Each layer is composed of the sublayers: @@ -402,33 +414,32 @@ class EncoderStack(Model): 2. Feedforward network (which is 2 fully-connected layers) """ - def __init__(self, params): - super(EncoderStack, self).__init__() - self.params = params - self.layers = [] - for _ in range(params.encoder_num_layers): - # Create sublayers for each layer. 
- self_attention_layer = SelfAttentionLayer( - params.num_heads, params.hidden_size, - params.keep_prob) - feed_forward_network = FeedForwardLayer( - params.hidden_size, params.ff_size, params.keep_prob) - - self.layers.append([ - PrePostProcessingWrapper(self_attention_layer, params), - PrePostProcessingWrapper(feed_forward_network, params) - ]) - - # Create final layer normalization layer. - self.output_normalization = LayerNormalization(params.hidden_size) - - def get_config(self): - return { - "params": self.params, - } - - def forward(self, inputs, input_mask): - """Return the output of the encoder layer stacks. + def __init__(self, params): + super(EncoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.encoder_num_layers): + # Create sublayers for each layer. + self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + feed_forward_network = FeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append( + [ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ] + ) + + # Create final layer normalization layer. + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, input_mask): + """Return the output of the encoder layer stacks. Args: encoder_inputs: tensor with shape [batch_size, input_length, hidden_size] @@ -442,27 +453,25 @@ def forward(self, inputs, input_mask): Output of encoder layer stack. float32 tensor with shape [batch_size, input_length, hidden_size] """ - encoder_inputs = inputs - weights = {} - for n, layer in enumerate(self.layers): - # Run inputs through the sublayers. - self_attention_layer = layer[0] - feed_forward_network = layer[1] + encoder_inputs = inputs + weights = {} + for n, layer in enumerate(self.layers): + # Run inputs through the sublayers. + self_attention_layer = layer[0] + feed_forward_network = layer[1] - with tf.name_scope("layer_%d" % n): - with tf.name_scope("self_attention"): - encoder_inputs, weight= self_attention_layer( - encoder_inputs, mask=input_mask, get_weight=True) - weights["layer_%d" % n] = weight - with tf.name_scope("ffn"): - encoder_inputs = feed_forward_network( - encoder_inputs) + with tf.name_scope("layer_%d" % n): + with tf.name_scope("self_attention"): + encoder_inputs, weight = self_attention_layer(encoder_inputs, mask=input_mask, get_weight=True) + weights["layer_%d" % n] = weight + with tf.name_scope("ffn"): + encoder_inputs = feed_forward_network(encoder_inputs) - return self.output_normalization(encoder_inputs), weights + return self.output_normalization(encoder_inputs), weights class DecoderStack(Model): - """Transformer decoder stack. + """Transformer decoder stack. Like the encoder stack, the decoder stack is made up of N identical layers. Each layer is composed of the sublayers: @@ -472,34 +481,31 @@ class DecoderStack(Model): 3. 
Feedforward network (2 fully-connected layers) """ - def __init__(self, params): - super(DecoderStack, self).__init__() - self.params = params - self.layers = [] - for _ in range(params.decoder_num_layers): - self_attention_layer = SelfAttentionLayer( - params.num_heads, params.hidden_size, - params.keep_prob) - enc_dec_attention_layer = MultiHeadAttentionLayer( - params.num_heads, params.hidden_size, - params.keep_prob) - feed_forward_network = FeedForwardLayer( - params.hidden_size, params.ff_size, params.keep_prob) - - self.layers.append([ - PrePostProcessingWrapper(self_attention_layer, params), - PrePostProcessingWrapper(enc_dec_attention_layer, params), - PrePostProcessingWrapper(feed_forward_network, params) - ]) - self.output_normalization = LayerNormalization(params.hidden_size) - - def get_config(self): - return { - "params": self.params, - } - - def forward(self, inputs, features, input_mask, target_mask, cache=None): - """Return the output of the decoder layer stacks. + def __init__(self, params): + super(DecoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.decoder_num_layers): + self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + enc_dec_attention_layer = MultiHeadAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + feed_forward_network = FeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append( + [ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(enc_dec_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ] + ) + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, features, input_mask, target_mask, cache=None): + """Return the output of the decoder layer stacks. Args: decoder_inputs: tensor with shape [batch_size, target_length, hidden_size] @@ -519,39 +525,32 @@ def forward(self, inputs, features, input_mask, target_mask, cache=None): Output of decoder layer stack. float32 tensor with shape [batch_size, target_length, hidden_size] """ - decoder_inputs = inputs - decoder_self_attention_bias = target_mask - encoder_outputs = features - attention_bias = input_mask - weights_all = {"self":{}, "enc_dec":{}} - for n, layer in enumerate(self.layers): - self_attention_layer = layer[0] - enc_dec_attention_layer = layer[1] - feed_forward_network = layer[2] - - # Run inputs through the sublayers. 
- layer_name = "layer_%d" % n - layer_cache = cache[layer_name] if cache is not None else None - - with tf.name_scope(layer_name): - with tf.name_scope("self_attention"): - decoder_inputs,weight_self = self_attention_layer( - decoder_inputs, get_weight=True, - mask=decoder_self_attention_bias, - cache=layer_cache) - weights_all['self']["layer_%d" % n] = weight_self - with tf.name_scope("encdec_attention"): - decoder_inputs, weight_enc_dec = enc_dec_attention_layer( - decoder_inputs, get_weight=True, - y=encoder_outputs, - mask=attention_bias) - weights_all['enc_dec']["layer_%d" % n] = weight_enc_dec - with tf.name_scope("ffn"): - decoder_inputs = feed_forward_network( - decoder_inputs) - - return self.output_normalization(decoder_inputs), weights_all - - - - + decoder_inputs = inputs + decoder_self_attention_bias = target_mask + encoder_outputs = features + attention_bias = input_mask + weights_all = {"self": {}, "enc_dec": {}} + for n, layer in enumerate(self.layers): + self_attention_layer = layer[0] + enc_dec_attention_layer = layer[1] + feed_forward_network = layer[2] + + # Run inputs through the sublayers. + layer_name = "layer_%d" % n + layer_cache = cache[layer_name] if cache is not None else None + + with tf.name_scope(layer_name): + with tf.name_scope("self_attention"): + decoder_inputs, weight_self = self_attention_layer( + decoder_inputs, get_weight=True, mask=decoder_self_attention_bias, cache=layer_cache + ) + weights_all['self']["layer_%d" % n] = weight_self + with tf.name_scope("encdec_attention"): + decoder_inputs, weight_enc_dec = enc_dec_attention_layer( + decoder_inputs, get_weight=True, y=encoder_outputs, mask=attention_bias + ) + weights_all['enc_dec']["layer_%d" % n] = weight_enc_dec + with tf.name_scope("ffn"): + decoder_inputs = feed_forward_network(decoder_inputs) + + return self.output_normalization(decoder_inputs), weights_all diff --git a/tests/models/test_transformer.py b/tests/models/test_transformer.py index ef0470ef3..95c289e4e 100644 --- a/tests/models/test_transformer.py +++ b/tests/models/test_transformer.py @@ -17,35 +17,22 @@ import time - - - class TINY_PARAMS(object): vocab_size = 50 encoder_num_layers = 2 decoder_num_layers = 2 - filter_number = 256 - R1 = 4 - R2 = 8 - n_channels = 2 - n_units = 128 - H = 32 - light_filter_size=(1,3) - filter_size = light_filter_size[-1] hidden_size = 64 ff_size = 16 num_heads = 4 keep_prob = 0.9 - - # Default prediction params - extra_decode_length=5 - beam_size=1 - alpha=0.6 # used to calculate length normalization in beam search + extra_decode_length = 5 + beam_size = 1 + alpha = 0.6 # used to calculate length normalization in beam search -class Model_SEQ2SEQ_Test(CustomTestCase): +class Model_Transformer_Test(CustomTestCase): @classmethod def setUpClass(cls): @@ -56,8 +43,8 @@ def setUpClass(cls): cls.trainX = np.random.randint(low=2, high=50, size=(50, 11)) cls.trainY = np.random.randint(low=2, high=50, size=(50, 10)) - cls.trainX[:,-1] = 1 - cls.trainY[:,-1] = 1 + cls.trainX[:, -1] = 1 + cls.trainY[:, -1] = 1 # Parameters cls.src_len = len(cls.trainX) cls.tgt_len = len(cls.trainY) @@ -91,41 +78,38 @@ def test_basic_simpleSeq2Seq(self): with tf.GradientTape() as tape: targets = Y - logits, weights_encoder, weights_decoder = model_(inputs = X, targets = Y) + logits, weights_encoder, weights_decoder = model_(inputs=X, targets=Y) logits = metrics.MetricLayer(self.vocab_size)([logits, targets]) logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets]) - + grad = tape.gradient(loss, 
model_.all_weights) optimizer.apply_gradients(zip(grad, model_.all_weights)) - total_loss += loss n_iter += 1 - print(time.time()-t) + print(time.time() - t) tl.files.save_npz(model_.all_weights, name='./model_v4.npz') model_.eval() test_sample = trainX[0:2, :] model_.eval() - [prediction, weights_decoder], weights_encoder = model_(inputs = test_sample) - + [prediction, weights_decoder], weights_encoder = model_(inputs=test_sample) print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, :], "\n\n") print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter)) - # visualise the self-attention weights at encoder during training trainX, trainY = shuffle(self.trainX, self.trainY) X = [trainX[0]] Y = [trainY[0]] - logits, weights_encoder, weights_decoder = model_(inputs = X, targets = Y) + logits, weights_encoder, weights_decoder = model_(inputs=X, targets=Y) attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], X[0].numpy(), X[0].numpy()) # visualise the encoder-decoder-attention weights at decoder during training trainX, trainY = shuffle(self.trainX, self.trainY) X = [trainX[0]] Y = [trainY[0]] - logits, weights_encoder, weights_decoder = model_(inputs = X, targets = Y) + logits, weights_encoder, weights_decoder = model_(inputs=X, targets=Y) attention_visualisation.plot_attention_weights(weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), Y[0]) # visualise the encoder-decoder-attention weights at decoder during inference @@ -133,9 +117,12 @@ def test_basic_simpleSeq2Seq(self): X = [trainX[0]] # Y = [trainY[0]] model_.eval() - [prediction, weights_decoder], weights_encoder = model_(inputs = X) + [prediction, weights_decoder], weights_encoder = model_(inputs=X) # print(X[0].numpy(), prediction["outputs"][0].numpy()) - attention_visualisation.plot_attention_weights(weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), prediction["outputs"][0].numpy()) + attention_visualisation.plot_attention_weights( + weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), prediction["outputs"][0].numpy() + ) + if __name__ == '__main__': unittest.main() From f5438a711864a3da6be4a145e183d1cb03e9fd60 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Mon, 2 Sep 2019 10:41:34 +0100 Subject: [PATCH 12/22] documentation --- .../transformer/utils/attention_visualisation.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tensorlayer/models/transformer/utils/attention_visualisation.py b/tensorlayer/models/transformer/utils/attention_visualisation.py index 376a5c5cd..e9bf73c12 100644 --- a/tensorlayer/models/transformer/utils/attention_visualisation.py +++ b/tensorlayer/models/transformer/utils/attention_visualisation.py @@ -1,7 +1,8 @@ import matplotlib.pyplot as plt import tensorflow as tf -def plot_attention_weights(attention, key, query): + +def plot_attention_weights(attention, key, query): '''Attention visualisation for Transformer Parameters @@ -17,25 +18,22 @@ def plot_attention_weights(attention, key, query): ''' - fig = plt.figure(figsize=(16, 8)) attention = tf.squeeze(attention, axis=0) - + for head in range(attention.shape[0]): - ax = fig.add_subplot(attention.shape[0]//2, 2, head+1) + ax = fig.add_subplot(attention.shape[0] // 2, 2, head + 1) ax.matshow(attention[head], cmap='viridis') fontdict = {'fontsize': 12} ax.set_xticks(range(len(key))) ax.set_yticks(range(len(query))) # ax.set_ylim(len(query)-1.5, -0.5) - ax.set_xticklabels( - [str(i) for i in key], - fontdict=fontdict, rotation=90) + 
ax.set_xticklabels([str(i) for i in key], fontdict=fontdict, rotation=90) ax.set_yticklabels([str(i) for i in query], fontdict=fontdict) - ax.set_xlabel('Head {}'.format(head+1), fontdict = fontdict) + ax.set_xlabel('Head {}'.format(head + 1), fontdict=fontdict) plt.tight_layout() - plt.show() \ No newline at end of file + plt.show() From a48e1d37eefb2b690b038baadb1f099ca8236334 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Mon, 2 Sep 2019 10:50:59 +0100 Subject: [PATCH 13/22] documentation --- docs/modules/models.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/modules/models.rst b/docs/modules/models.rst index 272f1d9c6..aaae2ca2d 100644 --- a/docs/modules/models.rst +++ b/docs/modules/models.rst @@ -16,6 +16,7 @@ TensorLayer provides many pretrained models, you can easily use the whole or a p ResNet50 Seq2seq Seq2seqLuongAttention + Transorformer Base Model From 90d536e5959ed38364c0a343eb682d8385a8d8be Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Fri, 13 Sep 2019 23:07:53 +0100 Subject: [PATCH 14/22] add examples --- docs/modules/models.rst | 5 + .../translation_task/tutorial_transformer.py | 168 ++++++++++++++++++ .../beamsearchHelper/beam_search.py | 10 +- .../models/transformer/feedforward_layer.py | 4 +- tensorlayer/models/transformer/transformer.py | 14 +- .../utils/attention_visualisation.py | 1 - tensorlayer/optimizers/__init__.py | 1 + tensorlayer/optimizers/lazy_adam.py | 76 ++++++++ tests/models/test_transformer.py | 14 +- 9 files changed, 273 insertions(+), 20 deletions(-) create mode 100644 examples/translation_task/tutorial_transformer.py create mode 100644 tensorlayer/optimizers/lazy_adam.py diff --git a/docs/modules/models.rst b/docs/modules/models.rst index aaae2ca2d..b8cb3f5f0 100644 --- a/docs/modules/models.rst +++ b/docs/modules/models.rst @@ -58,3 +58,8 @@ Seq2seq Luong Attention ------------------------ .. autoclass:: Seq2seqLuongAttention + +Transformer +------------------------ + +.. autoclass:: Transformer \ No newline at end of file diff --git a/examples/translation_task/tutorial_transformer.py b/examples/translation_task/tutorial_transformer.py new file mode 100644 index 000000000..d515f62a7 --- /dev/null +++ b/examples/translation_task/tutorial_transformer.py @@ -0,0 +1,168 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import tensorflow_datasets as tfds +import tensorflow as tf +import time +import numpy as np +import matplotlib.pyplot as plt +from tensorlayer.models.transformer import Transformer +from tensorlayer.models.transformer.utils import metrics +from tensorlayer.models.transformer.utils import attention_visualisation +import tensorlayer as tl + + +""" Translation from Portugese to English by Transformer model +This tutorial provides basic instructions on how to define and train Transformer model on Tensorlayer for +Translation task. You can also learn how to visualize the attention block via this tutorial. +""" + +def set_up_dataset(): + # Set up dataset for Portugese-English translation from the TED Talks Open Translation Project. + # This dataset contains approximately 50000 training examples, 1100 validation examples, and 2000 test examples. 
+ # https://www.ted.com/participate/translate + + examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, + as_supervised=True) + train_examples, val_examples = examples['train'], examples['validation'] + + # Set up tokenizer and save the tokenizer + tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus( + (en.numpy() and pt.numpy() for pt, en in train_examples), target_vocab_size=2**14) + + tokenizer.save_to_file("tokenizer") + tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file("tokenizer") + + return tokenizer, train_examples + + +def test_tokenizer_success(tokenizer): + sample_string = 'TensorLayer is awesome.' + + tokenized_string = tokenizer.encode(sample_string) + print ('Tokenized string is {}'.format(tokenized_string)) + + original_string = tokenizer.decode(tokenized_string) + print ('The original string: {}'.format(original_string)) + assert original_string == sample_string + + + +def generate_training_dataset(train_examples, tokenizer): + def encode(lang1, lang2): + lang1 = tokenizer.encode( + lang1.numpy()) + [tokenizer.vocab_size+1] + + lang2 = tokenizer.encode( + lang2.numpy()) + [tokenizer.vocab_size+1] + + return lang1, lang2 + MAX_LENGTH = 50 + def filter_max_length(x, y, max_length=MAX_LENGTH): + return tf.logical_and(tf.size(x) <= max_length, + tf.size(y) <= max_length) + def tf_encode(pt, en): + return tf.py_function(encode, [pt, en], [tf.int64, tf.int64]) + train_dataset = train_examples.map(tf_encode) + train_dataset = train_dataset.filter(filter_max_length) + # cache the dataset to memory to get a speedup while reading from it. + train_dataset = train_dataset.cache() + BUFFER_SIZE = 20000 + BATCH_SIZE = 64 + train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch( + BATCH_SIZE, padded_shapes=([-1], [-1])) + train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE) + + return train_dataset + + + + +def model_setup(tokenizer): + # define Hyper parameters for transformer + class HYPER_PARAMS(object): + vocab_size = tokenizer.vocab_size + 10 + encoder_num_layers = 4 + decoder_num_layers = 4 + hidden_size = 128 + ff_size = 512 + num_heads = 8 + keep_prob = 0.9 + + # Default prediction params + extra_decode_length = 50 + beam_size = 5 + alpha = 0.6 # used to calculate length normalization in beam search + + + label_smoothing=0.1 + learning_rate=2.0 + learning_rate_decay_rate=1.0 + learning_rate_warmup_steps=4000 + + sos_id = 0 + eos_id = tokenizer.vocab_size+1 + + + model = Transformer(HYPER_PARAMS) + + # Set the optimizer + learning_rate = CustomSchedule(HYPER_PARAMS.hidden_size, warmup_steps=HYPER_PARAMS.learning_rate_warmup_steps) + optimizer = tl.optimizers.LazyAdamOptimizer(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9) + return model, optimizer, HYPER_PARAMS + + +# Use the Adam optimizer with a custom learning rate scheduler according to the formula in the Paper "Attention is All you need" +class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): + def __init__(self, d_model, warmup_steps=5): + super(CustomSchedule, self).__init__() + + self.d_model = d_model + self.d_model = tf.cast(self.d_model, tf.float32) + + self.warmup_steps = warmup_steps + + def __call__(self, step): + arg1 = tf.math.rsqrt(step) + arg2 = step * (self.warmup_steps ** -1.5) + + return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2) + + + +def tutorial_transformer(): + tokenizer, train_examples = set_up_dataset() + train_dataset = generate_training_dataset(train_examples, tokenizer) + model, 
optimizer, HYPER_PARAMS = model_setup(tokenizer) + + num_epochs = 10 + for epoch in range(num_epochs): + model.train() + for (batch, (inp, tar)) in enumerate(train_dataset): + with tf.GradientTape() as tape: + logits, weights_encoder, weights_decoder = model(inputs=inp, targets=tar) + logits = metrics.MetricLayer(HYPER_PARAMS.vocab_size)([logits, tar]) + logits, loss = metrics.LossLayer(HYPER_PARAMS.vocab_size, 0.1)([logits, tar]) + grad = tape.gradient(loss, model.all_weights) + optimizer.apply_gradients(zip(grad, model.all_weights)) + if (batch % 50 == 0): + print('Batch ID {} at Epoch [{}/{}]: loss {:.4f}'.format(batch, epoch + 1, num_epochs, loss)) + + + + model.eval() + sentence_en = tokenizer.encode('TensorLayer is awesome.') + [prediction, weights_decoder], weights_encoder = model(inputs=[sentence_en]) + + predicted_sentence = tokenizer.decode([i for i in prediction["outputs"][0] + if i < tokenizer.vocab_size]) + print("Translated: ", predicted_sentence) + + + # visualize the self attention + tokenizer_str = [tokenizer.decode([ts]) for ts in (sentence_en)] + attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], tokenizer_str, tokenizer_str) + + + + +if __name__ == "__main__": + tutorial_transformer() diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py index 971e76fe0..cd2690e8f 100644 --- a/tensorlayer/models/transformer/beamsearchHelper/beam_search.py +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py @@ -39,11 +39,11 @@ def search(self, initial_ids, initial_cache): finished_scores = finished_state[_StateKeys.FINISHED_SCORES] finished_flags = finished_state[_StateKeys.FINISHED_FLAGS] - # Account for corner case where there are no finished sequences for a - # particular batch item. In that case, return alive sequences for that batch - # item. - finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq, alive_seq) - finished_scores = tf.where(tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs) + # # Account for corner case where there are no finished sequences for a + # # particular batch item. In that case, return alive sequences for that batch + # # item. + # finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq, alive_seq) + # finished_scores = tf.where(tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs) return finished_seq, finished_scores diff --git a/tensorlayer/models/transformer/feedforward_layer.py b/tensorlayer/models/transformer/feedforward_layer.py index 7ae6f5f68..fbf40e26f 100644 --- a/tensorlayer/models/transformer/feedforward_layer.py +++ b/tensorlayer/models/transformer/feedforward_layer.py @@ -22,7 +22,7 @@ import tensorlayer as tl -class FeedForwardLayer(tl.layers.Layer): +class TransformerFeedForwardLayer(tl.layers.Layer): """Fully connected feedforward network.""" def __init__(self, hidden_size, filter_size, keep_prob): @@ -33,7 +33,7 @@ def __init__(self, hidden_size, filter_size, keep_prob): filter_size: int, filter size for the inner (first) dense layer. relu_dropout: float, dropout rate for training. 
""" - super(FeedForwardLayer, self).__init__() + super(TransformerFeedForwardLayer, self).__init__() self.hidden_size = hidden_size self.filter_size = filter_size self.relu_dropout = 1 - keep_prob diff --git a/tensorlayer/models/transformer/transformer.py b/tensorlayer/models/transformer/transformer.py index 7d7fc1d0a..421a5d496 100644 --- a/tensorlayer/models/transformer/transformer.py +++ b/tensorlayer/models/transformer/transformer.py @@ -26,7 +26,7 @@ from tensorlayer.models import Model import tensorlayer.models.transformer.embedding_layer as embedding_layer from tensorlayer.models.transformer.attention_layer import SelfAttentionLayer, MultiHeadAttentionLayer -from tensorlayer.models.transformer.feedforward_layer import FeedForwardLayer +from tensorlayer.models.transformer.feedforward_layer import TransformerFeedForwardLayer from tensorlayer.models.transformer.utils.model_utils import positional_encoding from tensorlayer.models.transformer.utils.model_utils import get_decoder_self_attention_bias as get_target_mask from tensorlayer.models.transformer.utils.model_utils import get_padding_bias as get_input_mask @@ -56,6 +56,8 @@ class Transformer(Model): >>> extra_decode_length = 5 >>> beam_size = 1 >>> alpha = 0.6 + >>> eos_id = 1 + >>> sos_id = 0 >>> model = Transformer(TINY_PARAMS) Returns @@ -224,7 +226,7 @@ def decode(self, targets, encoder_outputs, attention_bias): decoder_inputs = self.embedding_softmax_layer(targets) with tf.name_scope("shift_targets"): # Shift targets to the right, and remove the last element - decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] + decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]], constant_values=self.params.sos_id)[:, :-1, :] with tf.name_scope("add_pos_encoding"): length = tf.shape(decoder_inputs)[1] decoder_inputs += positional_encoding(length, self.params.hidden_size) @@ -294,7 +296,7 @@ def predict(self, encoder_outputs, encoder_decoder_attention_bias): symbols_to_logits_fn, weights = self._get_symbols_to_logits_fn(max_decode_length) # Create initial set of IDs that will be passed into symbols_to_logits_fn. - initial_ids = tf.zeros([batch_size], dtype=tf.int32) + initial_ids = tf.ones([batch_size], dtype=tf.int32)*self.params.sos_id # Create cache storing decoder attention values for each layer. # pylint: disable=g-complex-comprehension @@ -314,7 +316,7 @@ def predict(self, encoder_outputs, encoder_decoder_attention_bias): decoded_ids, scores = beam_search.sequence_beam_search( symbols_to_logits_fn=symbols_to_logits_fn, initial_ids=initial_ids, initial_cache=cache, vocab_size=self.params.vocab_size, beam_size=self.params.beam_size, alpha=self.params.alpha, - max_decode_length=max_decode_length, eos_id=1 + max_decode_length=max_decode_length, eos_id=self.params.eos_id ) # Get the top sequence for each batch element @@ -421,7 +423,7 @@ def __init__(self, params): for _ in range(params.encoder_num_layers): # Create sublayers for each layer. 
self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) - feed_forward_network = FeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + feed_forward_network = TransformerFeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) self.layers.append( [ @@ -488,7 +490,7 @@ def __init__(self, params): for _ in range(params.decoder_num_layers): self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) enc_dec_attention_layer = MultiHeadAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) - feed_forward_network = FeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + feed_forward_network = TransformerFeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) self.layers.append( [ diff --git a/tensorlayer/models/transformer/utils/attention_visualisation.py b/tensorlayer/models/transformer/utils/attention_visualisation.py index e9bf73c12..e98775b4e 100644 --- a/tensorlayer/models/transformer/utils/attention_visualisation.py +++ b/tensorlayer/models/transformer/utils/attention_visualisation.py @@ -19,7 +19,6 @@ def plot_attention_weights(attention, key, query): ''' fig = plt.figure(figsize=(16, 8)) - attention = tf.squeeze(attention, axis=0) for head in range(attention.shape[0]): diff --git a/tensorlayer/optimizers/__init__.py b/tensorlayer/optimizers/__init__.py index e74b38801..0e9890929 100644 --- a/tensorlayer/optimizers/__init__.py +++ b/tensorlayer/optimizers/__init__.py @@ -10,3 +10,4 @@ """ from .amsgrad import AMSGrad +from .lazy_adam import LazyAdamOptimizer diff --git a/tensorlayer/optimizers/lazy_adam.py b/tensorlayer/optimizers/lazy_adam.py new file mode 100644 index 000000000..5cdbab982 --- /dev/null +++ b/tensorlayer/optimizers/lazy_adam.py @@ -0,0 +1,76 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Optimizer from addons and learning rate scheduler.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +class LazyAdamOptimizer(tf.optimizers.Adam): + """Variant of the Adam optimizer that handles sparse updates more efficiently. + + The original Adam algorithm maintains two moving-average accumulators for + each trainable variable; the accumulators are updated at every step. + This class provides lazier handling of gradient updates for sparse + variables. It only updates moving-average accumulators for sparse variable + indices that appear in the current batch, rather than updating the + accumulators for all indices. Compared with the original Adam optimizer, + it can provide large improvements in model training throughput for some + applications. 
However, it provides slightly different semantics than the + original Adam algorithm, and may lead to different empirical results. + Note, amsgrad is currently not supported and the argument can only be + False. + + This class is borrowed from: + https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py + """ + + def _resource_apply_sparse(self, grad, var, indices): + """Applies grad for one step.""" + var_dtype = var.dtype.base_dtype + lr_t = self._decayed_lr(var_dtype) + beta_1_t = self._get_hyper('beta_1', var_dtype) + beta_2_t = self._get_hyper('beta_2', var_dtype) + local_step = tf.cast(self.iterations + 1, var_dtype) + beta_1_power = tf.math.pow(beta_1_t, local_step) + beta_2_power = tf.math.pow(beta_2_t, local_step) + epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype) + lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power)) + + # \\(m := beta1 * m + (1 - beta1) * g_t\\) + m = self.get_slot(var, 'm') + m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad + + m_update_kwargs = {'resource': m.handle, 'indices': indices, 'updates': m_t_slice} + m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs) + + # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) + v = self.get_slot(var, 'v') + v_t_slice = (beta_2_t * tf.gather(v, indices) + (1 - beta_2_t) * tf.math.square(grad)) + + v_update_kwargs = {'resource': v.handle, 'indices': indices, 'updates': v_t_slice} + v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs) + + # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) + var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t) + + var_update_kwargs = {'resource': var.handle, 'indices': indices, 'updates': var_slice} + var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs) + + return tf.group(*[var_update_op, m_update_op, v_update_op]) diff --git a/tests/models/test_transformer.py b/tests/models/test_transformer.py index 95c289e4e..a7ee307ce 100644 --- a/tests/models/test_transformer.py +++ b/tests/models/test_transformer.py @@ -12,13 +12,12 @@ from tensorlayer.models.transformer import Transformer from tests.utils import CustomTestCase from tensorlayer.models.transformer.utils import metrics -from tensorlayer.optimizers import lazyAdam as optimizer from tensorlayer.models.transformer.utils import attention_visualisation import time class TINY_PARAMS(object): - vocab_size = 50 + vocab_size = 50+2 encoder_num_layers = 2 decoder_num_layers = 2 hidden_size = 64 @@ -31,6 +30,9 @@ class TINY_PARAMS(object): beam_size = 1 alpha = 0.6 # used to calculate length normalization in beam search + eos_id = 51 + sos_id = 0 + class Model_Transformer_Test(CustomTestCase): @@ -40,11 +42,11 @@ def setUpClass(cls): cls.embedding_size = 32 cls.dec_seq_length = 5 - cls.trainX = np.random.randint(low=2, high=50, size=(50, 11)) - cls.trainY = np.random.randint(low=2, high=50, size=(50, 10)) + cls.trainX = np.random.randint(low=0, high=50, size=(50, 11)) + cls.trainY = np.random.randint(low=0, high=50, size=(50, 10)) - cls.trainX[:, -1] = 1 - cls.trainY[:, -1] = 1 + cls.trainX[:, -1] = 51 + cls.trainY[:, -1] = 51 # Parameters cls.src_len = len(cls.trainX) cls.tgt_len = len(cls.trainY) From e2662c24a97a8c543781f2e34abd8e535faa6d36 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Fri, 13 Sep 2019 23:46:16 +0100 Subject: [PATCH 15/22] documentation --- CHANGELOG.md | 5 +- .../translation_task/tutorial_transformer.py | 85 ++++++------- .../models/transformer/attention_layer.py | 3 + 
.../beamsearchHelper/beam_search.py | 55 +++++---- .../beamsearchHelper/beam_search_v1.py | 113 +++++++++--------- .../models/transformer/embedding_layer.py | 15 ++- .../models/transformer/feedforward_layer.py | 7 +- tensorlayer/models/transformer/transformer.py | 58 ++++----- .../models/transformer/utils/metrics.py | 87 +++++++++----- .../models/transformer/utils/model_utils.py | 24 ++-- 10 files changed, 248 insertions(+), 204 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b22341f0..6e09387c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ To release a new version, please update the changelog as followed: - Support string dtype in InputLayer (#PR 1017) - Support Dynamic RNN in RNN (#PR 1023) - Add ResNet50 static model (#PR 1030) +- Add Transformer model (#PR 1027) ### Changed @@ -125,8 +126,8 @@ To release a new version, please update the changelog as followed: - @zsdonghao - @ChrisWu1997: #1010 #1015 #1025 #1030 - @warshallrho: #1017 #1021 #1026 #1029 #1032 -- @ArnoldLIULJ: #1023 -- @JingqingZ: #1023 +- @ArnoldLIULJ: #1023 #1027 +- @JingqingZ: #1023 #1027 ## [2.1.0] diff --git a/examples/translation_task/tutorial_transformer.py b/examples/translation_task/tutorial_transformer.py index d515f62a7..cc3cf4bd4 100644 --- a/examples/translation_task/tutorial_transformer.py +++ b/examples/translation_task/tutorial_transformer.py @@ -8,25 +8,24 @@ from tensorlayer.models.transformer.utils import metrics from tensorlayer.models.transformer.utils import attention_visualisation import tensorlayer as tl - - """ Translation from Portugese to English by Transformer model This tutorial provides basic instructions on how to define and train Transformer model on Tensorlayer for Translation task. You can also learn how to visualize the attention block via this tutorial. """ + def set_up_dataset(): # Set up dataset for Portugese-English translation from the TED Talks Open Translation Project. # This dataset contains approximately 50000 training examples, 1100 validation examples, and 2000 test examples. # https://www.ted.com/participate/translate - examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, - as_supervised=True) + examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True) train_examples, val_examples = examples['train'], examples['validation'] # Set up tokenizer and save the tokenizer tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus( - (en.numpy() and pt.numpy() for pt, en in train_examples), target_vocab_size=2**14) + (en.numpy() and pt.numpy() for pt, en in train_examples), target_vocab_size=2**14 + ) tokenizer.save_to_file("tokenizer") tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file("tokenizer") @@ -38,44 +37,42 @@ def test_tokenizer_success(tokenizer): sample_string = 'TensorLayer is awesome.' 
tokenized_string = tokenizer.encode(sample_string) - print ('Tokenized string is {}'.format(tokenized_string)) + print('Tokenized string is {}'.format(tokenized_string)) original_string = tokenizer.decode(tokenized_string) - print ('The original string: {}'.format(original_string)) + print('The original string: {}'.format(original_string)) assert original_string == sample_string - def generate_training_dataset(train_examples, tokenizer): + def encode(lang1, lang2): - lang1 = tokenizer.encode( - lang1.numpy()) + [tokenizer.vocab_size+1] + lang1 = tokenizer.encode(lang1.numpy()) + [tokenizer.vocab_size + 1] + + lang2 = tokenizer.encode(lang2.numpy()) + [tokenizer.vocab_size + 1] - lang2 = tokenizer.encode( - lang2.numpy()) + [tokenizer.vocab_size+1] - return lang1, lang2 + MAX_LENGTH = 50 + def filter_max_length(x, y, max_length=MAX_LENGTH): - return tf.logical_and(tf.size(x) <= max_length, - tf.size(y) <= max_length) + return tf.logical_and(tf.size(x) <= max_length, tf.size(y) <= max_length) + def tf_encode(pt, en): return tf.py_function(encode, [pt, en], [tf.int64, tf.int64]) + train_dataset = train_examples.map(tf_encode) train_dataset = train_dataset.filter(filter_max_length) # cache the dataset to memory to get a speedup while reading from it. train_dataset = train_dataset.cache() BUFFER_SIZE = 20000 BATCH_SIZE = 64 - train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch( - BATCH_SIZE, padded_shapes=([-1], [-1])) + train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1])) train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE) return train_dataset - - def model_setup(tokenizer): # define Hyper parameters for transformer class HYPER_PARAMS(object): @@ -91,16 +88,14 @@ class HYPER_PARAMS(object): extra_decode_length = 50 beam_size = 5 alpha = 0.6 # used to calculate length normalization in beam search - - - label_smoothing=0.1 - learning_rate=2.0 - learning_rate_decay_rate=1.0 - learning_rate_warmup_steps=4000 - - sos_id = 0 - eos_id = tokenizer.vocab_size+1 + label_smoothing = 0.1 + learning_rate = 2.0 + learning_rate_decay_rate = 1.0 + learning_rate_warmup_steps = 4000 + + sos_id = 0 + eos_id = tokenizer.vocab_size + 1 model = Transformer(HYPER_PARAMS) @@ -112,20 +107,20 @@ class HYPER_PARAMS(object): # Use the Adam optimizer with a custom learning rate scheduler according to the formula in the Paper "Attention is All you need" class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): - def __init__(self, d_model, warmup_steps=5): - super(CustomSchedule, self).__init__() - - self.d_model = d_model - self.d_model = tf.cast(self.d_model, tf.float32) - self.warmup_steps = warmup_steps - - def __call__(self, step): - arg1 = tf.math.rsqrt(step) - arg2 = step * (self.warmup_steps ** -1.5) - - return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2) + def __init__(self, d_model, warmup_steps=5): + super(CustomSchedule, self).__init__() + + self.d_model = d_model + self.d_model = tf.cast(self.d_model, tf.float32) + + self.warmup_steps = warmup_steps + def __call__(self, step): + arg1 = tf.math.rsqrt(step) + arg2 = step * (self.warmup_steps**-1.5) + + return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2) def tutorial_transformer(): @@ -146,23 +141,17 @@ def tutorial_transformer(): if (batch % 50 == 0): print('Batch ID {} at Epoch [{}/{}]: loss {:.4f}'.format(batch, epoch + 1, num_epochs, loss)) - - model.eval() sentence_en = tokenizer.encode('TensorLayer is awesome.') [prediction, 
weights_decoder], weights_encoder = model(inputs=[sentence_en]) - predicted_sentence = tokenizer.decode([i for i in prediction["outputs"][0] - if i < tokenizer.vocab_size]) + predicted_sentence = tokenizer.decode([i for i in prediction["outputs"][0] if i < tokenizer.vocab_size]) print("Translated: ", predicted_sentence) - - # visualize the self attention + # visualize the self attention tokenizer_str = [tokenizer.decode([ts]) for ts in (sentence_en)] attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], tokenizer_str, tokenizer_str) - - if __name__ == "__main__": tutorial_transformer() diff --git a/tensorlayer/models/transformer/attention_layer.py b/tensorlayer/models/transformer/attention_layer.py index 24b76cc67..25eca2ac6 100644 --- a/tensorlayer/models/transformer/attention_layer.py +++ b/tensorlayer/models/transformer/attention_layer.py @@ -83,6 +83,7 @@ def split_heads(self, x): x: A tensor with shape [batch_size, length, hidden_size] Returns: + ----------- A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads] """ with tf.name_scope("split_heads"): @@ -105,6 +106,7 @@ def combine_heads(self, x): x: A tensor [batch_size, num_heads, length, hidden_size/num_heads] Returns: +----------- A tensor with shape [batch_size, length, hidden_size] """ with tf.name_scope("combine_heads"): @@ -128,6 +130,7 @@ def forward(self, x, y, mask, cache=None): where i is the current decoded length. Returns: + ----------- Attention layer output with shape [batch_size, length_x, hidden_size] Attention weights with shape [batch_size, number_of_head, length_x, length_y] """ diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py index cd2690e8f..e5ffae949 100644 --- a/tensorlayer/models/transformer/beamsearchHelper/beam_search.py +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py @@ -39,11 +39,6 @@ def search(self, initial_ids, initial_cache): finished_scores = finished_state[_StateKeys.FINISHED_SCORES] finished_flags = finished_state[_StateKeys.FINISHED_FLAGS] - # # Account for corner case where there are no finished sequences for a - # # particular batch item. In that case, return alive sequences for that batch - # # item. - # finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq, alive_seq) - # finished_scores = tf.where(tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs) return finished_seq, finished_scores @@ -51,29 +46,38 @@ def sequence_beam_search( symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id ): """Search for sequence of subtoken ids with the largest probability. - - Args: - symbols_to_logits_fn: A function that takes in ids, index, and cache as - arguments. The passed in arguments will have shape: + + Parameters + ----------- + symbols_to_logits_fn : A function with ids, index, and cache as arguments. + The passed in arguments will have shape: ids -> [batch_size * beam_size, index] index -> [] (scalar) cache -> nested dictionary of tensors [batch_size * beam_size, ...] The function must return logits and new cache. logits -> [batch * beam_size, vocab_size] new cache -> same shape/structure as inputted cache - initial_ids: Starting ids for each batch item. 
- int32 tensor with shape [batch_size] - initial_cache: dict containing starting decoder variables information - vocab_size: int size of tokens - beam_size: int number of beams - alpha: float defining the strength of length normalization - max_decode_length: maximum length to decoded sequence - eos_id: int id of eos token, used to determine when a sequence has finished - - Returns: - Top decoded sequences [batch_size, beam_size, max_decode_length] - sequence scores [batch_size, beam_size] - """ + initial_ids : int with shape [batch_size] + Starting ids for each batch item. + initial_cache: dict + contain starting decoder variables information + vocab_size: int + size of tokens + beam_size: int + number of beams + alpha: float + strength of length normalization + max_decode_length: int + maximum length to decoded sequence + eos_id: int + id of eos token, used to determine when a sequence has finished + + Returns + ------- + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] + """ + batch_size = tf.shape(initial_ids)[0] sbs = SequenceBeamSearchV2( @@ -85,11 +89,14 @@ def sequence_beam_search( def _expand_to_same_rank(tensor, target): """Expands a given tensor to target's rank to be broadcastable. - Args: + Parameters + ----------- + tensor: input tensor to tile. Shape: [b, d1, ..., da] target: target tensor. Shape: [b, d1, ..., da, ..., dn] - Returns: + Returns: + ----------- Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target. Raises: diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py index bf3f85c3f..0fabe01b2 100644 --- a/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py @@ -94,13 +94,15 @@ def search(self, initial_ids, initial_cache): def _create_initial_state(self, initial_ids, initial_cache): """Return initial state dictionary and its shape invariants. - Args: + Parameters + ----------- initial_ids: initial ids to pass into the symbols_to_logits_fn. int tensor with shape [batch_size, 1] initial_cache: dictionary storing values to be passed into the symbols_to_logits_fn. Returns: + ----------- state and shape invariant dictionaries with keys from _StateKeys """ # Current loop index (starts at 0) @@ -165,10 +167,12 @@ def _continue_search(self, state): score in the alive sequences (i.e. the finished sequences are provably unchanging) - Args: + Parameters + ----------- state: A dictionary with the current loop state. Returns: + ----------- Bool tensor with value True if loop should continue, False if loop should terminate. """ @@ -212,10 +216,12 @@ def _search_step(self, state): by the length normalization factor. Without length normalization, the search is more likely to return shorter sequences. - Args: + Parameters + ----------- state: A dictionary with the current loop state. Returns: + ----------- new state dictionary. """ # Grow alive sequences by one token. @@ -240,9 +246,11 @@ def _grow_alive_seq(self, state): the EOS token. 2*beam_size ensures that at least beam_size sequences are still alive. - Args: + Parameters + ----------- state: A dictionary with the current loop state. 
Returns: + ----------- Tuple of (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1], Scores of returned sequences [batch_size, 2 * beam_size], @@ -292,19 +300,6 @@ def _grow_alive_seq(self, state): def _get_new_alive_state(self, new_seq, new_log_probs, new_cache): """Gather the top k sequences that are still alive. - - Args: - new_seq: New sequences generated by growing the current alive sequences - int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1] - new_log_probs: Log probabilities of new sequences - float32 tensor with shape [batch_size, beam_size] - new_cache: Dict of cached values for each sequence. - - Returns: - Dictionary with alive keys from _StateKeys: - {Top beam_size sequences that are still alive (don't end with eos_id) - Log probabilities of top alive sequences - Dict cache storing decoder states for top alive sequences} """ # To prevent finished sequences from being considered, set log probs to -INF new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id) @@ -322,19 +317,6 @@ def _get_new_alive_state(self, new_seq, new_log_probs, new_cache): def _get_new_finished_state(self, state, new_seq, new_log_probs): """Combine new and old finished sequences, and gather the top k sequences. - - Args: - state: A dictionary with the current loop state. - new_seq: New sequences generated by growing the current alive sequences - int32 tensor with shape [batch_size, beam_size, i + 1] - new_log_probs: Log probabilities of new sequences - float32 tensor with shape [batch_size, beam_size] - - Returns: - Dictionary with finished keys from _StateKeys: - {Top beam_size finished sequences based on score, - Scores of finished sequences, - Finished flags of finished sequences} """ i = state[_StateKeys.CUR_INDEX] finished_seq = state[_StateKeys.FINISHED_SEQ] @@ -376,26 +358,34 @@ def sequence_beam_search( symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id ): """Search for sequence of subtoken ids with the largest probability. - - Args: - symbols_to_logits_fn: A function that takes in ids, index, and cache as - arguments. The passed in arguments will have shape: - ids -> [batch_size * beam_size, index] - index -> [] (scalar) - cache -> nested dictionary of tensors [batch_size * beam_size, ...] - The function must return logits and new cache. - logits -> [batch * beam_size, vocab_size] - new cache -> same shape/structure as inputted cache - initial_ids: Starting ids for each batch item. - int32 tensor with shape [batch_size] - initial_cache: dict containing starting decoder variables information - vocab_size: int size of tokens - beam_size: int number of beams - alpha: float defining the strength of length normalization - max_decode_length: maximum length to decoded sequence - eos_id: int id of eos token, used to determine when a sequence has finished - - Returns: + + Parameters + ----------- + symbols_to_logits_fn : A function with ids, index, and cache as arguments. + The passed in arguments will have shape: + ids -> [batch_size * beam_size, index] + index -> [] (scalar) + cache -> nested dictionary of tensors [batch_size * beam_size, ...] + The function must return logits and new cache. + logits -> [batch * beam_size, vocab_size] + new cache -> same shape/structure as inputted cache + initial_ids : int with shape [batch_size] + Starting ids for each batch item. 
+ initial_cache: dict + contain starting decoder variables information + vocab_size: int + size of tokens + beam_size: int + number of beams + alpha: float + strength of length normalization + max_decode_length: int + maximum length to decoded sequence + eos_id: int + id of eos token, used to determine when a sequence has finished + + Returns + ------- Top decoded sequences [batch_size, beam_size, max_decode_length] sequence scores [batch_size, beam_size] """ @@ -416,11 +406,13 @@ def _length_normalization(alpha, length): def _expand_to_beam_size(tensor, beam_size): """Tiles a given tensor by beam_size. - Args: + Parameters + ----------- tensor: tensor to tile [batch_size, ...] beam_size: How much to tile the tensor by. - Returns: + Returns + ----------- Tiled tensor [batch_size, beam_size, ...] """ tensor = tf.expand_dims(tensor, axis=1) @@ -458,10 +450,12 @@ def _get_shape_keep_last_dim(tensor): def _flatten_beam_dim(tensor): """Reshapes first two dimensions in to single dimension. - Args: + Parameters + ----------- tensor: Tensor to reshape of shape [A, B, ...] - Returns: + Returns + ----------- Reshaped tensor of shape [A*B, ...] """ shape = _shape_list(tensor) @@ -473,12 +467,14 @@ def _flatten_beam_dim(tensor): def _unflatten_beam_dim(tensor, batch_size, beam_size): """Reshapes first dimension back to [batch_size, beam_size]. - Args: + Parameters + ----------- tensor: Tensor to reshape of shape [batch_size*beam_size, ...] batch_size: Tensor, original batch size. beam_size: int, original beam size. - Returns: + Returns + ----------- Reshaped tensor of shape [batch_size, beam_size, ...] """ shape = _shape_list(tensor) @@ -496,7 +492,8 @@ def _gather_beams(nested, beam_indices, batch_size, new_beam_size): This function is used to gather the top beams, specified by beam_indices, from the nested tensors. - Args: + Parameters + ----------- nested: Nested structure (tensor, list, tuple or dict) containing tensors with shape [batch_size, beam_size, ...]. beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each @@ -506,6 +503,8 @@ def _gather_beams(nested, beam_indices, batch_size, new_beam_size): new_beam_size: int number of beams to be pulled from the nested tensors. Returns: + ----------- + Nested structure containing tensors with shape [batch_size, new_beam_size, ...] """ diff --git a/tensorlayer/models/transformer/embedding_layer.py b/tensorlayer/models/transformer/embedding_layer.py index 5276ed48d..76ae97270 100644 --- a/tensorlayer/models/transformer/embedding_layer.py +++ b/tensorlayer/models/transformer/embedding_layer.py @@ -28,7 +28,8 @@ class EmbeddingLayer(tl.layers.Layer): def __init__(self, vocab_size, hidden_size): """Specify characteristic parameters of embedding layer. - Args: + Parameters + ----------- vocab_size: Number of tokens in the embedding. (Typically ~32,000) hidden_size: Dimensionality of the embedding. (Typically 512 or 1024) """ @@ -57,10 +58,12 @@ def get_config(self): def forward(self, inputs, mode="embedding"): """Get token embeddings of inputs. - Args: + Parameters + ----------- inputs: An int64 tensor with shape [batch_size, length] mode: string, a valid value is one of "embedding" and "linear". - Returns: + Returns: + ----------- outputs: (1) If mode == "embedding", output embedding tensor, float32 with shape [batch_size, length, embedding_size]; (2) mode == "linear", output linear tensor, float32 with shape [batch_size, length, vocab_size]. 
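A rough, self-contained sketch of the weight-tying idea behind the "linear" mode described above: hidden states are projected back to vocabulary logits with the same table used for embedding lookups. All names and sizes here are made up for illustration and are not taken from this layer's implementation:

import tensorflow as tf

batch_size, length, hidden_size, vocab_size = 2, 7, 64, 50
shared_weights = tf.random.normal([vocab_size, hidden_size])   # same table as the lookup
hidden = tf.random.normal([batch_size, length, hidden_size])    # e.g. decoder outputs
x = tf.reshape(hidden, [-1, hidden_size])
logits = tf.matmul(x, shared_weights, transpose_b=True)
logits = tf.reshape(logits, [batch_size, length, vocab_size])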
@@ -88,9 +91,11 @@ def _embedding(self, inputs): def _linear(self, inputs): """Computes logits by running inputs through a linear layer. - Args: + Parameters + ----------- inputs: A float32 tensor with shape [batch_size, length, hidden_size] - Returns: + Returns: + ----------- float32 tensor with shape [batch_size, length, vocab_size]. """ with tf.name_scope("presoftmax_linear"): diff --git a/tensorlayer/models/transformer/feedforward_layer.py b/tensorlayer/models/transformer/feedforward_layer.py index fbf40e26f..b37a88090 100644 --- a/tensorlayer/models/transformer/feedforward_layer.py +++ b/tensorlayer/models/transformer/feedforward_layer.py @@ -28,7 +28,8 @@ class TransformerFeedForwardLayer(tl.layers.Layer): def __init__(self, hidden_size, filter_size, keep_prob): """Initialize FeedForwardNetwork. - Args: + Parameters + ----------- hidden_size: int, output dim of hidden layer. filter_size: int, filter size for the inner (first) dense layer. relu_dropout: float, dropout rate for training. @@ -61,11 +62,13 @@ def get_config(self): def forward(self, inputs): """Return outputs of the feedforward network. - Args: + Parameters + ----------- x: tensor with shape [batch_size, length, hidden_size] training: boolean, whether in training mode or not. Returns: + ----------- Output of the feedforward network. tensor with shape [batch_size, length, hidden_size] """ diff --git a/tensorlayer/models/transformer/transformer.py b/tensorlayer/models/transformer/transformer.py index 421a5d496..28b59367b 100644 --- a/tensorlayer/models/transformer/transformer.py +++ b/tensorlayer/models/transformer/transformer.py @@ -45,20 +45,7 @@ class Transformer(Model): Examples --------- - >>> class TINY_PARAMS(object): - >>> vocab_size = 50 - >>> encoder_num_layers = 2 - >>> decoder_num_layers = 2 - >>> hidden_size = 64 - >>> ff_size = 16 - >>> num_heads = 4 - >>> keep_prob = 0.9 - >>> extra_decode_length = 5 - >>> beam_size = 1 - >>> alpha = 0.6 - >>> eos_id = 1 - >>> sos_id = 0 - >>> model = Transformer(TINY_PARAMS) + example/translation_task/tutorial_transformer Returns ------- @@ -81,14 +68,16 @@ def get_config(self): def forward(self, inputs, targets=None): """Calculate target logits or inferred target sequences. - Args: + Parameters + ---------- inputs: input tensor list of size 1 or 2. First item, inputs: int tensor with shape [batch_size, input_length]. Second item (optional), targets: None or int tensor with shape [batch_size, target_length]. training: boolean, whether in training mode or not. - Returns: + Returns + ------- If targets is defined: Logits for each word in the target sequence: float tensor with shape [batch_size, target_length, vocab_size] @@ -165,12 +154,14 @@ def forward(self, inputs, targets=None): def encode(self, inputs, attention_bias): """Generate continuous representation for inputs. - Args: + Parameters + ---------- inputs: int tensor with shape [batch_size, input_length]. attention_bias: float tensor with shape [batch_size, 1, 1, input_length]. training: boolean, whether in training mode or not. - Returns: + Returns + ------- float tensor with shape [batch_size, input_length, hidden_size] Self-attention weights for encoder part: a dictionary of float tensors { @@ -196,7 +187,8 @@ def encode(self, inputs, attention_bias): def decode(self, targets, encoder_outputs, attention_bias): """Generate logits for each value in the target sequence. - Args: + Parameters + ---------- targets: target values for the output sequence. 
int tensor with shape [batch_size, target_length] encoder_outputs: continuous representation of input sequence. float tensor @@ -204,7 +196,8 @@ def decode(self, targets, encoder_outputs, attention_bias): attention_bias: float tensor with shape [batch_size, 1, 1, input_length] training: boolean, whether in training mode or not. - Returns: + Returns + ------- float32 tensor with shape [batch_size, target_length, vocab_size] Weights for decoder part: a dictionary of dictionary of float tensors { @@ -226,7 +219,8 @@ def decode(self, targets, encoder_outputs, attention_bias): decoder_inputs = self.embedding_softmax_layer(targets) with tf.name_scope("shift_targets"): # Shift targets to the right, and remove the last element - decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]], constant_values=self.params.sos_id)[:, :-1, :] + decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]], + constant_values=self.params.sos_id)[:, :-1, :] with tf.name_scope("add_pos_encoding"): length = tf.shape(decoder_inputs)[1] decoder_inputs += positional_encoding(length, self.params.hidden_size) @@ -254,18 +248,20 @@ def _get_symbols_to_logits_fn(self, max_decode_length): def symbols_to_logits_fn(ids, i, cache): """Generate logits for next potential IDs. - Args: + Parameters + ---------- ids: Current decoded sequences. int tensor with shape [batch_size * beam_size, i + 1] i: Loop index cache: dictionary of values storing the encoder output, encoder-decoder attention bias, and previous decoder attention values. - Returns: + Returns + ------- Tuple of (logits with shape [batch_size * beam_size, vocab_size], updated cache values) - """ + """ # Set decoder input to the last generated IDs decoder_input = ids[:, -1:] @@ -296,7 +292,7 @@ def predict(self, encoder_outputs, encoder_decoder_attention_bias): symbols_to_logits_fn, weights = self._get_symbols_to_logits_fn(max_decode_length) # Create initial set of IDs that will be passed into symbols_to_logits_fn. - initial_ids = tf.ones([batch_size], dtype=tf.int32)*self.params.sos_id + initial_ids = tf.ones([batch_size], dtype=tf.int32) * self.params.sos_id # Create cache storing decoder attention values for each layer. # pylint: disable=g-complex-comprehension @@ -443,7 +439,8 @@ def get_config(self): def forward(self, inputs, input_mask): """Return the output of the encoder layer stacks. - Args: + Parameters + ----------- encoder_inputs: tensor with shape [batch_size, input_length, hidden_size] attention_bias: bias for the encoder self-attention layer. [batch_size, 1, 1, input_length] @@ -451,7 +448,8 @@ def forward(self, inputs, input_mask): zero paddings. training: boolean, whether in training mode or not. - Returns: + Returns: + ----------- Output of encoder layer stack. float32 tensor with shape [batch_size, input_length, hidden_size] """ @@ -509,7 +507,8 @@ def get_config(self): def forward(self, inputs, features, input_mask, target_mask, cache=None): """Return the output of the decoder layer stacks. - Args: + Parameters + ----------- decoder_inputs: tensor with shape [batch_size, target_length, hidden_size] encoder_outputs: tensor with shape [batch_size, input_length, hidden_size] decoder_self_attention_bias: bias for decoder self-attention layer. [1, 1, @@ -523,7 +522,8 @@ def forward(self, inputs, features, input_mask, target_mask, cache=None): "v": tensor with shape [batch_size, i, value_channels]}, ...} - Returns: + Returns: + ----------- Output of decoder layer stack. 
float32 tensor with shape [batch_size, target_length, hidden_size] """ diff --git a/tensorlayer/models/transformer/utils/metrics.py b/tensorlayer/models/transformer/utils/metrics.py index 25c4eaae4..6a5aa5d35 100644 --- a/tensorlayer/models/transformer/utils/metrics.py +++ b/tensorlayer/models/transformer/utils/metrics.py @@ -52,12 +52,14 @@ def _pad_tensors_to_same_length(x, y): def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size): """Calculate cross entropy loss while ignoring padding. - Args: + Parameters +----------- logits: Tensor of size [batch_size, length_logits, vocab_size] labels: Tensor of size [batch_size, length_labels] smoothing: Label smoothing constant, used to determine the on and off values vocab_size: int size of the vocabulary - Returns: + Returns: +----------- Returns the cross entropy loss and weight tensors: float32 tensors with shape [batch_size, max(length_logits, length_labels)] """ @@ -91,11 +93,13 @@ def _convert_to_eval_metric(metric_fn): The input metric_fn returns values for the current batch. The wrapper aggregates the return values collected over all of the batches evaluated. - Args: + Parameters +----------- metric_fn: function that returns scores and weights for the current batch's logits and predicted labels. - Returns: + Returns: +----------- function that aggregates the scores and weights from metric_fn. """ @@ -190,11 +194,13 @@ def bleu_score(logits, labels): decode the ids and tokenize the output. By default, we use ngram order of 4 and use brevity penalty. Also, this does not have beam search. - Args: + Parameters +----------- logits: Tensor of size [batch_size, length_logits, vocab_size] labels: Tensor of size [batch-size, length_labels] - Returns: + Returns: +----------- bleu: int, approx bleu score """ predictions = tf.to_int32(tf.argmax(logits, axis=-1)) @@ -206,12 +212,14 @@ def bleu_score(logits, labels): def _get_ngrams_with_counter(segment, max_order): """Extracts all n-grams up to a given maximum order from an input segment. - Args: + Parameters +----------- segment: text segment from which n-grams will be extracted. max_order: maximum length in tokens of the n-grams returned by this methods. - Returns: + Returns: +----------- The Counter containing all n-grams upto max_order in segment with a count of how many times each n-gram occurred. """ @@ -226,7 +234,8 @@ def _get_ngrams_with_counter(segment, max_order): def compute_bleu(reference_corpus, translation_corpus, max_order=4, use_bp=True): """Computes BLEU score of translated segments against one or more references. - Args: + Parameters +----------- reference_corpus: list of references for each translation. Each reference should be tokenized into a list of tokens. translation_corpus: list of translations to score. Each translation @@ -234,7 +243,8 @@ def compute_bleu(reference_corpus, translation_corpus, max_order=4, use_bp=True) max_order: Maximum n-gram order to use when computing BLEU score. use_bp: boolean, whether to apply brevity penalty. - Returns: + Returns: +----------- BLEU score. """ reference_length = 0 @@ -292,11 +302,13 @@ def rouge_2_fscore(logits, labels): This is an approximate ROUGE scoring method since we do not glue word pieces or decode the ids and tokenize the output. - Args: + Parameters +----------- logits: tensor, model predictions labels: tensor, gold output. - Returns: + Returns: +----------- rouge2_fscore: approx rouge-2 f1 score. 
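    As a rough illustration of what ROUGE-2 measures, independent of this
    function's tensor inputs (toy sentences, plain Python):

    ref = "the cat sat on the mat".split()
    hyp = "the cat sat on mat".split()
    ref_bigrams = set(zip(ref, ref[1:]))        # 5 distinct bigrams
    hyp_bigrams = set(zip(hyp, hyp[1:]))        # 4 distinct bigrams
    overlap = len(ref_bigrams & hyp_bigrams)    # 3 shared bigrams
    precision, recall = overlap / len(hyp_bigrams), overlap / len(ref_bigrams)
    f1 = 2 * precision * recall / (precision + recall)   # about 0.67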
""" predictions = tf.to_int32(tf.argmax(logits, axis=-1)) @@ -308,11 +320,13 @@ def rouge_2_fscore(logits, labels): def _get_ngrams(n, text): """Calculates n-grams. - Args: + Parameters +----------- n: which n-grams to calculate text: An array of tokens - Returns: + Returns: +----------- A set of n-grams """ ngram_set = set() @@ -329,12 +343,14 @@ def rouge_n(eval_sentences, ref_sentences, n=2): Source: https://www.microsoft.com/en-us/research/publication/ rouge-a-package-for-automatic-evaluation-of-summaries/ - Args: + Parameters +----------- eval_sentences: Predicted sentences. ref_sentences: Sentences from the reference set n: Size of ngram. Defaults to 2. - Returns: + Returns: +----------- f1 score for ROUGE-N """ f1_scores = [] @@ -369,11 +385,13 @@ def rouge_l_fscore(predictions, labels): This is an approximate ROUGE scoring method since we do not glue word pieces or decode the ids and tokenize the output. - Args: + Parameters +----------- predictions: tensor, model predictions labels: tensor, gold output. - Returns: + Returns: +----------- rouge_l_fscore: approx rouge-l f1 score. """ outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) @@ -398,11 +416,13 @@ def rouge_l_sentence_level(eval_sentences, ref_sentences): m = length of reference summary n = length of candidate summary - Args: + Parameters +----------- eval_sentences: The sentences that have been picked by the summarizer ref_sentences: The sentences from the reference set - Returns: + Returns: +----------- A float: F_lcs """ @@ -420,7 +440,8 @@ def _len_lcs(x, y): Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence - Args: + Parameters +----------- x: sequence of words y: sequence of words @@ -439,11 +460,13 @@ def _lcs(x, y): in O(nm) time where n = len(x) and m = len(y). Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence - Args: + Parameters +----------- x: collection of words y: collection of words - Returns: + Returns: +----------- Table of dictionary of coord and len lcs """ n, m = len(x), len(y) @@ -465,12 +488,14 @@ def _f_lcs(llcs, m, n): Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ rouge-working-note-v1.3.1.pdf - Args: + Parameters +----------- llcs: Length of LCS m: number of words in reference summary n: number of words in candidate summary - Returns: + Returns: +----------- Float. LCS-based F-measure score """ r_lcs = llcs / m @@ -498,13 +523,15 @@ def _pad_tensors_to_same_length(x, y): def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size): """Calculate cross entropy loss while ignoring padding. - Args: + Parameters +----------- logits: Tensor of size [batch_size, length_logits, vocab_size] labels: Tensor of size [batch_size, length_labels] smoothing: Label smoothing constant, used to determine the on and off values vocab_size: int size of the vocabulary - Returns: + Returns: +----------- Returns the cross entropy loss and weight tensors: float32 tensors with shape [batch_size, max(length_logits, length_labels)] """ @@ -617,13 +644,15 @@ def call(self, inputs): def transformer_loss(logits, labels, smoothing, vocab_size): """Calculates total loss containing cross entropy with padding ignored. - Args: + Parameters +----------- logits: Tensor of size [batch_size, length_logits, vocab_size] labels: Tensor of size [batch_size, length_labels] smoothing: Label smoothing constant, used to determine the on and off values vocab_size: int size of the vocabulary - Returns: + Returns: +----------- A scalar float tensor for loss. 
""" xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing, vocab_size) diff --git a/tensorlayer/models/transformer/utils/model_utils.py b/tensorlayer/models/transformer/utils/model_utils.py index 10c4a3e2c..63f21c7a2 100644 --- a/tensorlayer/models/transformer/utils/model_utils.py +++ b/tensorlayer/models/transformer/utils/model_utils.py @@ -32,13 +32,15 @@ def positional_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1. geometrically increasing wavelengths. Defined and formulized in Attention is All You Need, section 3.5. - Args: + Parameters +----------- length: Sequence length. hidden_size: Size of the min_timescale: Minimum scale that will be applied at each position max_timescale: Maximum scale that will be applied at each position - Returns: + Returns: +----------- Tensor with shape [length, hidden_size] """ position = tf.cast(tf.range(length), tf.float32) @@ -59,10 +61,12 @@ def get_decoder_self_attention_bias(length): connections, so prediction at position i cannot draw information from future positions. - Args: + Parameters +----------- length: int length of sequences in batch. - Returns: + Returns: +----------- float tensor of shape [1, 1, length, length] """ with tf.name_scope("decoder_self_attention_bias"): @@ -75,11 +79,13 @@ def get_decoder_self_attention_bias(length): def get_padding(x, padding_value=0): """Return float tensor representing the padding values in x. - Args: + Parameters +----------- x: int tensor with any shape padding_value: int value that - Returns: + Returns: +----------- float tensor with same shape as x containing values 0 or 1. 0 -> non-padding, 1 -> padding """ @@ -94,10 +100,12 @@ def get_padding_bias(x): which has shape [batch_size, num_heads, length, length]. The tensor is zero at non-padding locations, and -1e9 (negative infinity) at padding locations. - Args: + Parameters +----------- x: int tensor with shape [batch_size, length] - Returns: + Returns: +----------- Attention bias tensor of shape [batch_size, 1, 1, length]. """ with tf.name_scope("attention_bias"): From e0e81f0b28d04f07e86cb786526fa526710ab845 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Fri, 13 Sep 2019 23:48:59 +0100 Subject: [PATCH 16/22] documentation --- tensorlayer/models/transformer/attention_layer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorlayer/models/transformer/attention_layer.py b/tensorlayer/models/transformer/attention_layer.py index 25eca2ac6..17ebae27b 100644 --- a/tensorlayer/models/transformer/attention_layer.py +++ b/tensorlayer/models/transformer/attention_layer.py @@ -26,6 +26,7 @@ class MultiHeadAttentionLayer(tl.layers.Layer): """The :class:`MultiHeadAttentionLayer` layer is for multi-head attention computation. The weight computation is between "key" and "query", which will then matmul with "value" to generate information that selectively focuses on the "query" messages. + Parameters ----------- num_heads : int @@ -79,7 +80,9 @@ def split_heads(self, x): The tensor is transposed to insure the inner dimensions hold the correct values during the matrix multiplication. 
- Args: + Parameters + ----------- + x: A tensor with shape [batch_size, length, hidden_size] Returns: @@ -106,7 +109,7 @@ def combine_heads(self, x): x: A tensor [batch_size, num_heads, length, hidden_size/num_heads] Returns: ------------ + ----------- A tensor with shape [batch_size, length, hidden_size] """ with tf.name_scope("combine_heads"): From 80c985c8c76a0fda46e51fa874e09100675760e8 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sat, 14 Sep 2019 11:18:42 +0100 Subject: [PATCH 17/22] doc --- .../models/transformer/attention_layer.py | 41 +-------- .../beamsearchHelper/beam_search.py | 7 +- .../beamsearchHelper/beam_search_v1.py | 42 +--------- .../models/transformer/embedding_layer.py | 31 ++----- .../models/transformer/feedforward_layer.py | 22 ++--- tensorlayer/models/transformer/transformer.py | 83 ++++++------------- 6 files changed, 50 insertions(+), 176 deletions(-) diff --git a/tensorlayer/models/transformer/attention_layer.py b/tensorlayer/models/transformer/attention_layer.py index 17ebae27b..5d9e5cca7 100644 --- a/tensorlayer/models/transformer/attention_layer.py +++ b/tensorlayer/models/transformer/attention_layer.py @@ -60,6 +60,7 @@ def get_config(self): } def build(self, inputs_shape): + # Transformation for linearly projecting the queries, keys, and values. self.q_transformation = self._get_weights( "q_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') @@ -75,20 +76,7 @@ def build(self, inputs_shape): ) def split_heads(self, x): - """Split x into different heads, and transpose the resulting value. - - The tensor is transposed to insure the inner dimensions hold the correct - values during the matrix multiplication. - Parameters - ----------- - - x: A tensor with shape [batch_size, length, hidden_size] - - Returns: - ----------- - A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads] - """ with tf.name_scope("split_heads"): batch_size = tf.shape(x)[0] length = tf.shape(x)[1] @@ -103,15 +91,7 @@ def split_heads(self, x): return tf.transpose(x, [0, 2, 1, 3]) def combine_heads(self, x): - """Combine tensor that has been split. - - Args: - x: A tensor [batch_size, num_heads, length, hidden_size/num_heads] - Returns: - ----------- - A tensor with shape [batch_size, length, hidden_size] - """ with tf.name_scope("combine_heads"): batch_size = tf.shape(x)[0] length = tf.shape(x)[2] @@ -119,24 +99,7 @@ def combine_heads(self, x): return tf.reshape(x, [batch_size, length, self.hidden_size]) def forward(self, x, y, mask, cache=None): - """Apply attention mechanism to x and y. - - Args: - x: a tensor with shape [batch_size, length_x, hidden_size] - y: a tensor with shape [batch_size, length_y, hidden_size] - mask: attention bias that will be added to the result of the dot product. - training: boolean, whether in training mode or not. - cache: (Used during prediction) dictionary with tensors containing results - of previous attentions. The dictionary must have the items: - {"k": tensor with shape [batch_size, i, key_channels], - "v": tensor with shape [batch_size, i, value_channels]} - where i is the current decoded length. - - Returns: - ----------- - Attention layer output with shape [batch_size, length_x, hidden_size] - Attention weights with shape [batch_size, number_of_head, length_x, length_y] - """ + """Apply attention mechanism to x and y.""" # Linearly project the query (q), key (k) and value (v) using different # learned projections. This is in preparation of splitting them into # multiple heads. 
Multi-head attention uses multiple queries, keys, and diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py index e5ffae949..b1959f901 100644 --- a/tensorlayer/models/transformer/beamsearchHelper/beam_search.py +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py @@ -72,10 +72,11 @@ def sequence_beam_search( eos_id: int id of eos token, used to determine when a sequence has finished - Returns + Notes ------- - Top decoded sequences [batch_size, beam_size, max_decode_length] - sequence scores [batch_size, beam_size] + The function would return: + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] """ batch_size = tf.shape(initial_ids)[0] diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py index 0fabe01b2..db3166366 100644 --- a/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py @@ -166,15 +166,6 @@ def _continue_search(self, state): 2) when the worst score in the finished sequences is better than the best score in the alive sequences (i.e. the finished sequences are provably unchanging) - - Parameters - ----------- - state: A dictionary with the current loop state. - - Returns: - ----------- - Bool tensor with value True if loop should continue, False if loop should - terminate. """ i = state[_StateKeys.CUR_INDEX] alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] @@ -216,13 +207,6 @@ def _search_step(self, state): by the length normalization factor. Without length normalization, the search is more likely to return shorter sequences. - Parameters - ----------- - state: A dictionary with the current loop state. - - Returns: - ----------- - new state dictionary. """ # Grow alive sequences by one token. new_seq, new_log_probs, new_cache = self._grow_alive_seq(state) @@ -241,20 +225,9 @@ def _search_step(self, state): def _grow_alive_seq(self, state): """Grow alive sequences by one token, and collect top 2*beam_size sequences. - 2*beam_size sequences are collected because some sequences may have reached the EOS token. 2*beam_size ensures that at least beam_size sequences are still alive. - - Parameters - ----------- - state: A dictionary with the current loop state. - Returns: - ----------- - Tuple of - (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1], - Scores of returned sequences [batch_size, 2 * beam_size], - New alive cache, for each of the 2 * beam_size sequences) """ i = state[_StateKeys.CUR_INDEX] alive_seq = state[_StateKeys.ALIVE_SEQ] @@ -384,10 +357,11 @@ def sequence_beam_search( eos_id: int id of eos token, used to determine when a sequence has finished - Returns + Notes ------- - Top decoded sequences [batch_size, beam_size, max_decode_length] - sequence scores [batch_size, beam_size] + The function would return: + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] """ batch_size = tf.shape(initial_ids)[0] sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id) @@ -449,14 +423,6 @@ def _get_shape_keep_last_dim(tensor): def _flatten_beam_dim(tensor): """Reshapes first two dimensions in to single dimension. - - Parameters - ----------- - tensor: Tensor to reshape of shape [A, B, ...] 
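Collapsing and restoring the beam axis, as these helpers do, keeps the decoder unaware of the beam: it always sees a plain batch of size batch_size * beam_size. A plain-TensorFlow sketch of the round trip with illustrative sizes (not the module's own code):

    import tensorflow as tf

    batch_size, beam_size, length = 2, 4, 7
    seqs = tf.zeros([batch_size, beam_size, length], tf.int32)   # [A, B, ...]
    flat = tf.reshape(seqs, [batch_size * beam_size, length])    # [A*B, ...] fed to the decoder
    back = tf.reshape(flat, [batch_size, beam_size, length])     # beam axis restored afterwards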
- - Returns - ----------- - Reshaped tensor of shape [A*B, ...] """ shape = _shape_list(tensor) shape[0] *= shape[1] diff --git a/tensorlayer/models/transformer/embedding_layer.py b/tensorlayer/models/transformer/embedding_layer.py index 76ae97270..ee9d07273 100644 --- a/tensorlayer/models/transformer/embedding_layer.py +++ b/tensorlayer/models/transformer/embedding_layer.py @@ -30,8 +30,10 @@ def __init__(self, vocab_size, hidden_size): Parameters ----------- - vocab_size: Number of tokens in the embedding. (Typically ~32,000) - hidden_size: Dimensionality of the embedding. (Typically 512 or 1024) + vocab_size : int + Number of tokens in the embedding. (Typically ~32,000) + hidden_size : int + Dimensionality of the embedding. (Typically 512 or 1024) """ super(EmbeddingLayer, self).__init__() self.vocab_size = vocab_size @@ -56,20 +58,7 @@ def get_config(self): } def forward(self, inputs, mode="embedding"): - """Get token embeddings of inputs. - - Parameters - ----------- - inputs: An int64 tensor with shape [batch_size, length] - mode: string, a valid value is one of "embedding" and "linear". - Returns: - ----------- - outputs: (1) If mode == "embedding", output embedding tensor, float32 with - shape [batch_size, length, embedding_size]; (2) mode == "linear", output - linear tensor, float32 with shape [batch_size, length, vocab_size]. - Raises: - ValueError: if mode is not valid. - """ + """Get token embeddings of inputs.""" if mode == "embedding": return self._embedding(inputs) elif mode == "linear": @@ -89,15 +78,7 @@ def _embedding(self, inputs): return embeddings def _linear(self, inputs): - """Computes logits by running inputs through a linear layer. - - Parameters - ----------- - inputs: A float32 tensor with shape [batch_size, length, hidden_size] - Returns: - ----------- - float32 tensor with shape [batch_size, length, vocab_size]. - """ + """Computes logits by running inputs through a linear layer.""" with tf.name_scope("presoftmax_linear"): batch_size = tf.shape(inputs)[0] length = tf.shape(inputs)[1] diff --git a/tensorlayer/models/transformer/feedforward_layer.py b/tensorlayer/models/transformer/feedforward_layer.py index b37a88090..ecc9e5249 100644 --- a/tensorlayer/models/transformer/feedforward_layer.py +++ b/tensorlayer/models/transformer/feedforward_layer.py @@ -30,9 +30,12 @@ def __init__(self, hidden_size, filter_size, keep_prob): Parameters ----------- - hidden_size: int, output dim of hidden layer. - filter_size: int, filter size for the inner (first) dense layer. - relu_dropout: float, dropout rate for training. + hidden_size: int + output dim of hidden layer. + filter_size: int + filter size for the inner (first) dense layer. + relu_dropout: float + dropout rate for training. """ super(TransformerFeedForwardLayer, self).__init__() self.hidden_size = hidden_size @@ -60,18 +63,7 @@ def get_config(self): } def forward(self, inputs): - """Return outputs of the feedforward network. - - Parameters - ----------- - x: tensor with shape [batch_size, length, hidden_size] - training: boolean, whether in training mode or not. - - Returns: - ----------- - Output of the feedforward network. 
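TransformerFeedForwardLayer above is a two-dense-layer block: an inner projection to filter_size with ReLU and dropout, then a projection back to hidden_size. A plain-TensorFlow sketch of that computation with assumed sizes (hidden_size=512, filter_size=2048, keep_prob=0.9), not the layer's own implementation:

    import tensorflow as tf

    x = tf.random.uniform([8, 20, 512])                         # [batch, length, hidden_size]
    w1 = tf.random.normal([512, 2048])                          # inner (filter) projection
    w2 = tf.random.normal([2048, 512])                          # output projection
    inner = tf.nn.relu(tf.tensordot(x, w1, axes=[[2], [0]]))    # [8, 20, 2048]
    inner = tf.nn.dropout(inner, rate=1 - 0.9)                  # relu dropout, training only
    out = tf.tensordot(inner, w2, axes=[[2], [0]])              # back to [8, 20, 512]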
- tensor with shape [batch_size, length, hidden_size] - """ + """Return outputs of the feedforward network.""" # Retrieve dynamically known shapes x = inputs batch_size = tf.shape(x)[0] diff --git a/tensorlayer/models/transformer/transformer.py b/tensorlayer/models/transformer/transformer.py index 28b59367b..c03701eb4 100644 --- a/tensorlayer/models/transformer/transformer.py +++ b/tensorlayer/models/transformer/transformer.py @@ -74,10 +74,12 @@ def forward(self, inputs, targets=None): First item, inputs: int tensor with shape [batch_size, input_length]. Second item (optional), targets: None or int tensor with shape [batch_size, target_length]. - training: boolean, whether in training mode or not. + training: boolean + whether in training mode or not. - Returns + Notes ------- + The function would return: If targets is defined: Logits for each word in the target sequence: float tensor with shape [batch_size, target_length, vocab_size] @@ -162,13 +164,15 @@ def encode(self, inputs, attention_bias): Returns ------- - float tensor with shape [batch_size, input_length, hidden_size] - Self-attention weights for encoder part: - a dictionary of float tensors { + Float tensor with shape [batch_size, input_length, hidden_size]: + The output of encoder + + Dictionary of float tensors { "layer_0": [batch_size, number_of_heads, source_length, source_length], "layer_1": [batch_size, number_of_heads, source_length, source_length], ... - } + }: + Self-attention weights for encoder part """ # Prepare inputs to the layer stack by adding positional encodings and @@ -198,9 +202,10 @@ def decode(self, targets, encoder_outputs, attention_bias): Returns ------- - float32 tensor with shape [batch_size, target_length, vocab_size] - Weights for decoder part: - a dictionary of dictionary of float tensors { + Float32 tensor with shape [batch_size, target_length, vocab_size]: + Output of decoder part + + Dictionary of dictionary of float tensors { "self": { "layer_0": [batch_size, number_of_heads, target_length, target_length], "layer_1": [batch_size, number_of_heads, target_length, target_length], @@ -211,7 +216,8 @@ def decode(self, targets, encoder_outputs, attention_bias): "layer_1": [batch_size, number_of_heads, source_length, target_length], ... } - } + }: + Weights for decoder part """ with tf.name_scope("decode"): # Prepare inputs to decoder layers by shifting targets, adding positional @@ -246,22 +252,8 @@ def _get_symbols_to_logits_fn(self, max_decode_length): weights = [] def symbols_to_logits_fn(ids, i, cache): - """Generate logits for next potential IDs. - - Parameters - ---------- - ids: Current decoded sequences. int tensor with shape [batch_size * - beam_size, i + 1] - i: Loop index - cache: dictionary of values storing the encoder output, encoder-decoder - attention bias, and previous decoder attention values. - - Returns - ------- - Tuple of - (logits with shape [batch_size * beam_size, vocab_size], - updated cache values) - """ + """Generate logits for next potential IDs.""" + # Set decoder input to the last generated IDs decoder_input = ids[:, -1:] @@ -282,9 +274,7 @@ def symbols_to_logits_fn(ids, i, cache): return symbols_to_logits_fn, weights def predict(self, encoder_outputs, encoder_decoder_attention_bias): - """ - Return predicted sequence, and decoder attention weights. 
- """ + batch_size = tf.shape(encoder_outputs)[0] input_length = tf.shape(encoder_outputs)[1] max_decode_length = input_length + self.params.extra_decode_length @@ -338,11 +328,10 @@ class LayerNormalization(tl.layers.Layer): Parameters ---------- - hidden_size: + hidden_size: int hidden size of features - epsilon: + epsilon: float value to prevent division by zero - """ def __init__(self, hidden_size, epsilon=1e-6): @@ -437,22 +426,7 @@ def get_config(self): } def forward(self, inputs, input_mask): - """Return the output of the encoder layer stacks. - - Parameters - ----------- - encoder_inputs: tensor with shape [batch_size, input_length, hidden_size] - attention_bias: bias for the encoder self-attention layer. [batch_size, 1, - 1, input_length] - inputs_padding: tensor with shape [batch_size, input_length], inputs with - zero paddings. - training: boolean, whether in training mode or not. - - Returns: - ----------- - Output of encoder layer stack. - float32 tensor with shape [batch_size, input_length, hidden_size] - """ + """Return the output of the encoder layer stacks.""" encoder_inputs = inputs weights = {} for n, layer in enumerate(self.layers): @@ -509,23 +483,20 @@ def forward(self, inputs, features, input_mask, target_mask, cache=None): Parameters ----------- - decoder_inputs: tensor with shape [batch_size, target_length, hidden_size] - encoder_outputs: tensor with shape [batch_size, input_length, hidden_size] + decoder_inputs : tensor with shape [batch_size, target_length, hidden_size] + encoder_outputs : tensor with shape [batch_size, input_length, hidden_size] decoder_self_attention_bias: bias for decoder self-attention layer. [1, 1, target_len, target_length] - attention_bias: bias for encoder-decoder attention layer. [batch_size, 1, + attention_bias : bias for encoder-decoder attention layer. [batch_size, 1, 1, input_length] - training: boolean, whether in training mode or not. + training : boolean + whether in training mode or not. cache: (Used for fast decoding) A nested dictionary storing previous decoder self-attention values. The items are: {layer_n: {"k": tensor with shape [batch_size, i, key_channels], "v": tensor with shape [batch_size, i, value_channels]}, ...} - Returns: - ----------- - Output of decoder layer stack. - float32 tensor with shape [batch_size, target_length, hidden_size] """ decoder_inputs = inputs decoder_self_attention_bias = target_mask From 990e01493b36368c8403da32bc71c5be39793df1 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sat, 14 Sep 2019 11:28:43 +0100 Subject: [PATCH 18/22] reverse change --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 124fb1f9e..c883bd878 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -133,8 +133,8 @@ This release is compatible with TensorFlow 2 RC1. ### Contributors - @zsdonghao -- @ChrisWu1997: #1010 #1015 #1025 #1030 -- @warshallrho: #1017 #1021 #1026 #1029 #1032 +- @ChrisWu1997: #1010 #1015 #1025 #1030 #1040 +- @warshallrho: #1017 #1021 #1026 #1029 #1032 #1041 - @ArnoldLIULJ: #1023 #1027 - @JingqingZ: #1023 #1027 From 2c1ced856677316a1bf0e304a00136bde673d702 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sat, 14 Sep 2019 11:29:22 +0100 Subject: [PATCH 19/22] reverse change --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c883bd878..286856370 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -133,6 +133,7 @@ This release is compatible with TensorFlow 2 RC1. 
### Contributors - @zsdonghao +- @luomai - @ChrisWu1997: #1010 #1015 #1025 #1030 #1040 - @warshallrho: #1017 #1021 #1026 #1029 #1032 #1041 - @ArnoldLIULJ: #1023 #1027 From 91441656b291eefa3621123124aa5b056ddc5a29 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Sat, 14 Sep 2019 11:33:28 +0100 Subject: [PATCH 20/22] doc --- .../models/transformer/utils/model_utils.py | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/tensorlayer/models/transformer/utils/model_utils.py b/tensorlayer/models/transformer/utils/model_utils.py index 63f21c7a2..5410a97e2 100644 --- a/tensorlayer/models/transformer/utils/model_utils.py +++ b/tensorlayer/models/transformer/utils/model_utils.py @@ -33,15 +33,16 @@ def positional_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1. Defined and formulized in Attention is All You Need, section 3.5. Parameters ------------ - length: Sequence length. - hidden_size: Size of the - min_timescale: Minimum scale that will be applied at each position - max_timescale: Maximum scale that will be applied at each position - - Returns: ------------ - Tensor with shape [length, hidden_size] +``----------- + length : int + Sequence length. + hidden_size : int + channel number of input + min_timescale : float + Minimum scale that will be applied at each position + max_timescale : float + Maximum scale that will be applied at each position + """ position = tf.cast(tf.range(length), tf.float32) num_timescales = hidden_size // 2 @@ -62,12 +63,11 @@ def get_decoder_self_attention_bias(length): positions. Parameters ------------ - length: int length of sequences in batch. + ----------- + length: int + length of sequences in batch. + - Returns: ------------ - float tensor of shape [1, 1, length, length] """ with tf.name_scope("decoder_self_attention_bias"): valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0) @@ -80,14 +80,10 @@ def get_padding(x, padding_value=0): """Return float tensor representing the padding values in x. Parameters ------------ + ----------- x: int tensor with any shape - padding_value: int value that + padding_value: int - Returns: ------------ - float tensor with same shape as x containing values 0 or 1. - 0 -> non-padding, 1 -> padding """ with tf.name_scope("padding"): return tf.cast(tf.equal(x, padding_value), tf.float32) @@ -101,12 +97,9 @@ def get_padding_bias(x): non-padding locations, and -1e9 (negative infinity) at padding locations. Parameters ------------ + ----------- x: int tensor with shape [batch_size, length] - Returns: ------------ - Attention bias tensor of shape [batch_size, 1, 1, length]. """ with tf.name_scope("attention_bias"): padding = get_padding(x) From 576af52b01fc206a075831620b6dd8002910f537 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Wed, 18 Sep 2019 14:02:16 +0100 Subject: [PATCH 21/22] optimizer --- tensorlayer/optimizers/lazyAdam.py | 147 ----------------------------- 1 file changed, 147 deletions(-) delete mode 100644 tensorlayer/optimizers/lazyAdam.py diff --git a/tensorlayer/optimizers/lazyAdam.py b/tensorlayer/optimizers/lazyAdam.py deleted file mode 100644 index 75ae77f65..000000000 --- a/tensorlayer/optimizers/lazyAdam.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Optimizer from addons and learning rate scheduler.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import tensorflow as tf -K = tf.keras.backend - - -class LazyAdam(tf.optimizers.Adam): - """Variant of the Adam optimizer that handles sparse updates more efficiently. - - The original Adam algorithm maintains two moving-average accumulators for - each trainable variable; the accumulators are updated at every step. - This class provides lazier handling of gradient updates for sparse - variables. It only updates moving-average accumulators for sparse variable - indices that appear in the current batch, rather than updating the - accumulators for all indices. Compared with the original Adam optimizer, - it can provide large improvements in model training throughput for some - applications. However, it provides slightly different semantics than the - original Adam algorithm, and may lead to different empirical results. - Note, amsgrad is currently not supported and the argument can only be - False. - - This class is borrowed from: - https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py - """ - - def _resource_apply_sparse(self, grad, var, indices): - """Applies grad for one step.""" - var_dtype = var.dtype.base_dtype - lr_t = self._decayed_lr(var_dtype) - beta_1_t = self._get_hyper('beta_1', var_dtype) - beta_2_t = self._get_hyper('beta_2', var_dtype) - local_step = tf.cast(self.iterations + 1, var_dtype) - beta_1_power = tf.math.pow(beta_1_t, local_step) - beta_2_power = tf.math.pow(beta_2_t, local_step) - epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype) - lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power)) - - # \\(m := beta1 * m + (1 - beta1) * g_t\\) - m = self.get_slot(var, 'm') - m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad - - m_update_kwargs = {'resource': m.handle, 'indices': indices, 'updates': m_t_slice} - m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs) - - # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) - v = self.get_slot(var, 'v') - v_t_slice = (beta_2_t * tf.gather(v, indices) + (1 - beta_2_t) * tf.math.square(grad)) - - v_update_kwargs = {'resource': v.handle, 'indices': indices, 'updates': v_t_slice} - v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs) - - # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) - var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t) - - var_update_kwargs = {'resource': var.handle, 'indices': indices, 'updates': var_slice} - var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs) - - return tf.group(*[var_update_op, m_update_op, v_update_op]) - - -class LearningRateFn(object): - """Creates learning rate function.""" - - def __init__(self, learning_rate, hidden_size, warmup_steps): - self.learning_rate = learning_rate - self.hidden_size = hidden_size - self.warmup_steps = float(warmup_steps) - - def __call__(self, 
global_step): - """Calculate learning rate with linear warmup and rsqrt decay.""" - step = float(global_step) - learning_rate = self.learning_rate - learning_rate *= (self.hidden_size**-0.5) - # Apply linear warmup - learning_rate *= np.minimum(1.0, step / self.warmup_steps) - # Apply rsqrt decay - learning_rate /= np.sqrt(np.maximum(step, self.warmup_steps)) - return learning_rate - - -class LearningRateScheduler(tf.keras.callbacks.Callback): - """Keras callback to schedule learning rate. - - TODO(tianlin): Refactor this scheduler and LearningRateBatchScheduler in - official/resnet/keras/keras_common.py. - """ - - def __init__(self, schedule, init_steps=None, verbose=False): - super(LearningRateScheduler, self).__init__() - self.schedule = schedule - self.verbose = verbose - if init_steps is None: - init_steps = 0.0 - self.steps = float(init_steps) # Total steps during training. - - def on_epoch_begin(self, epoch, logs=None): - if not hasattr(self.model.optimizer, 'lr'): - raise ValueError('Optimizer must have a "lr" attribute.') - if not hasattr(self.model.optimizer, 'iterations'): - raise ValueError('Optimizer must have a "iterations" attribute.') - - def on_train_batch_begin(self, batch, logs=None): - """Adjusts learning rate for each train batch.""" - if self.verbose > 0: - iterations = K.get_value(self.model.optimizer.iterations) - print('Original iteration %d' % iterations) - - self.steps += 1.0 - try: # new API - lr = float(K.get_value(self.model.optimizer.lr)) - lr = self.schedule(self.steps, lr) - except TypeError: # Support for old API for backward compatibility - lr = self.schedule(self.steps) - if not isinstance(lr, (float, np.float32, np.float64)): - raise ValueError('The output of the "schedule" function ' 'should be float.') - K.set_value(self.model.optimizer.lr, lr) - K.set_value(self.model.optimizer.iterations, self.steps) - - if self.verbose > 0: - print( - 'Batch %05d Step %05d: LearningRateScheduler setting learning ' - 'rate to %s.' % (batch + 1, self.steps, lr) - ) - - def on_epoch_end(self, epoch, logs=None): - logs = logs or {} - logs['lr'] = K.get_value(self.model.optimizer.lr) - logs['steps'] = self.steps From a2a1cbf6dc0eff68e8d44afd53cca78c50b004a3 Mon Sep 17 00:00:00 2001 From: Lingjun Liu Date: Wed, 18 Sep 2019 14:06:23 +0100 Subject: [PATCH 22/22] doc --- tensorlayer/models/transformer/embedding_layer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tensorlayer/models/transformer/embedding_layer.py b/tensorlayer/models/transformer/embedding_layer.py index ee9d07273..1897b0a22 100644 --- a/tensorlayer/models/transformer/embedding_layer.py +++ b/tensorlayer/models/transformer/embedding_layer.py @@ -30,10 +30,16 @@ def __init__(self, vocab_size, hidden_size): Parameters ----------- - vocab_size : int + vocab_size : int Number of tokens in the embedding. (Typically ~32,000) - hidden_size : int + hidden_size : int Dimensionality of the embedding. (Typically 512 or 1024) + + Examples + --------- + with TensorLayer + + """ super(EmbeddingLayer, self).__init__() self.vocab_size = vocab_size
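The "Examples" section added above is still a stub. A possible usage sketch for the two modes of this layer (illustrative sizes; assumes the layer can be called eagerly with the mode keyword, as its forward() signature suggests):

    import tensorflow as tf
    from tensorlayer.models.transformer.embedding_layer import EmbeddingLayer

    embed = EmbeddingLayer(vocab_size=32000, hidden_size=512)
    ids = tf.constant([[11, 27, 5]])             # [batch_size, length]
    vectors = embed(ids, mode="embedding")       # [1, 3, 512] token embeddings
    logits = embed(vectors, mode="linear")       # [1, 3, 32000] pre-softmax logits

Letting one layer serve both as the input embedding lookup and the pre-softmax projection is the usual motivation for exposing the two modes from a single weight table.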