diff --git a/CHANGELOG.md b/CHANGELOG.md
index b3b398bfa..9b24fcf68 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -102,6 +102,7 @@ This release is compatible with TensorFlow 2 RC1.
 - Support string dtype in InputLayer (#PR 1017)
 - Support Dynamic RNN in RNN (#PR 1023)
 - Add ResNet50 static model (#PR 1030)
+- Add Transformer model (#PR 1027)
 - Add performance test code in static model (#PR 1041)
 
 ### Changed
@@ -139,8 +140,8 @@ This release is compatible with TensorFlow 2 RC1.
 - @luomai
 - @ChrisWu1997: #1010 #1015 #1025 #1030 #1040
 - @warshallrho: #1017 #1021 #1026 #1029 #1032 #1041
-- @ArnoldLIULJ: #1023
-- @JingqingZ: #1023
+- @ArnoldLIULJ: #1023 #1027
+- @JingqingZ: #1023 #1027
 
 ## [2.1.0]
diff --git a/docs/modules/models.rst b/docs/modules/models.rst
index 272f1d9c6..b8cb3f5f0 100644
--- a/docs/modules/models.rst
+++ b/docs/modules/models.rst
@@ -16,6 +16,7 @@ TensorLayer provides many pretrained models, you can easily use the whole or a p
    ResNet50
    Seq2seq
    Seq2seqLuongAttention
+   Transformer
 
 
 Base Model
@@ -57,3 +58,8 @@ Seq2seq Luong Attention
 ------------------------
 
 .. autoclass:: Seq2seqLuongAttention
+
+Transformer
+------------------------
+
+.. autoclass:: Transformer
\ No newline at end of file
diff --git a/examples/translation_task/tutorial_transformer.py b/examples/translation_task/tutorial_transformer.py
new file mode 100644
index 000000000..cc3cf4bd4
--- /dev/null
+++ b/examples/translation_task/tutorial_transformer.py
@@ -0,0 +1,157 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+import tensorflow_datasets as tfds
+import tensorflow as tf
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+from tensorlayer.models.transformer import Transformer
+from tensorlayer.models.transformer.utils import metrics
+from tensorlayer.models.transformer.utils import attention_visualisation
+import tensorlayer as tl
+"""Translation from Portuguese to English with the Transformer model.
+This tutorial provides basic instructions on how to define and train a Transformer model in TensorLayer for
+a translation task. It also shows how to visualise the attention weights of the model.
+"""
+
+
+def set_up_dataset():
+    # Set up the dataset for Portuguese-English translation from the TED Talks Open Translation Project.
+    # This dataset contains approximately 50000 training examples, 1100 validation examples, and 2000 test examples.
+    # https://www.ted.com/participate/translate
+
+    examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
+    train_examples, val_examples = examples['train'], examples['validation']
+
+    # Build a shared sub-word tokenizer from both languages and save it, so that a
+    # single vocabulary covers the Portuguese inputs and the English targets.
+    tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
+        (pt.numpy() + b' ' + en.numpy() for pt, en in train_examples), target_vocab_size=2**14
+    )
+
+    tokenizer.save_to_file("tokenizer")
+    tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file("tokenizer")
+
+    return tokenizer, train_examples
+
+
+def test_tokenizer_success(tokenizer):
+    sample_string = 'TensorLayer is awesome.'
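+    # The sub-word tokenizer is reversible: decoding the encoded ids should give
+    # back the original string exactly, which is what the assert below checks.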
+ + tokenized_string = tokenizer.encode(sample_string) + print('Tokenized string is {}'.format(tokenized_string)) + + original_string = tokenizer.decode(tokenized_string) + print('The original string: {}'.format(original_string)) + assert original_string == sample_string + + +def generate_training_dataset(train_examples, tokenizer): + + def encode(lang1, lang2): + lang1 = tokenizer.encode(lang1.numpy()) + [tokenizer.vocab_size + 1] + + lang2 = tokenizer.encode(lang2.numpy()) + [tokenizer.vocab_size + 1] + + return lang1, lang2 + + MAX_LENGTH = 50 + + def filter_max_length(x, y, max_length=MAX_LENGTH): + return tf.logical_and(tf.size(x) <= max_length, tf.size(y) <= max_length) + + def tf_encode(pt, en): + return tf.py_function(encode, [pt, en], [tf.int64, tf.int64]) + + train_dataset = train_examples.map(tf_encode) + train_dataset = train_dataset.filter(filter_max_length) + # cache the dataset to memory to get a speedup while reading from it. + train_dataset = train_dataset.cache() + BUFFER_SIZE = 20000 + BATCH_SIZE = 64 + train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1])) + train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE) + + return train_dataset + + +def model_setup(tokenizer): + # define Hyper parameters for transformer + class HYPER_PARAMS(object): + vocab_size = tokenizer.vocab_size + 10 + encoder_num_layers = 4 + decoder_num_layers = 4 + hidden_size = 128 + ff_size = 512 + num_heads = 8 + keep_prob = 0.9 + + # Default prediction params + extra_decode_length = 50 + beam_size = 5 + alpha = 0.6 # used to calculate length normalization in beam search + + label_smoothing = 0.1 + learning_rate = 2.0 + learning_rate_decay_rate = 1.0 + learning_rate_warmup_steps = 4000 + + sos_id = 0 + eos_id = tokenizer.vocab_size + 1 + + model = Transformer(HYPER_PARAMS) + + # Set the optimizer + learning_rate = CustomSchedule(HYPER_PARAMS.hidden_size, warmup_steps=HYPER_PARAMS.learning_rate_warmup_steps) + optimizer = tl.optimizers.LazyAdamOptimizer(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9) + return model, optimizer, HYPER_PARAMS + + +# Use the Adam optimizer with a custom learning rate scheduler according to the formula in the Paper "Attention is All you need" +class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): + + def __init__(self, d_model, warmup_steps=5): + super(CustomSchedule, self).__init__() + + self.d_model = d_model + self.d_model = tf.cast(self.d_model, tf.float32) + + self.warmup_steps = warmup_steps + + def __call__(self, step): + arg1 = tf.math.rsqrt(step) + arg2 = step * (self.warmup_steps**-1.5) + + return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2) + + +def tutorial_transformer(): + tokenizer, train_examples = set_up_dataset() + train_dataset = generate_training_dataset(train_examples, tokenizer) + model, optimizer, HYPER_PARAMS = model_setup(tokenizer) + + num_epochs = 10 + for epoch in range(num_epochs): + model.train() + for (batch, (inp, tar)) in enumerate(train_dataset): + with tf.GradientTape() as tape: + logits, weights_encoder, weights_decoder = model(inputs=inp, targets=tar) + logits = metrics.MetricLayer(HYPER_PARAMS.vocab_size)([logits, tar]) + logits, loss = metrics.LossLayer(HYPER_PARAMS.vocab_size, 0.1)([logits, tar]) + grad = tape.gradient(loss, model.all_weights) + optimizer.apply_gradients(zip(grad, model.all_weights)) + if (batch % 50 == 0): + print('Batch ID {} at Epoch [{}/{}]: loss {:.4f}'.format(batch, epoch + 1, num_epochs, loss)) + + 
model.eval() + sentence_en = tokenizer.encode('TensorLayer is awesome.') + [prediction, weights_decoder], weights_encoder = model(inputs=[sentence_en]) + + predicted_sentence = tokenizer.decode([i for i in prediction["outputs"][0] if i < tokenizer.vocab_size]) + print("Translated: ", predicted_sentence) + + # visualize the self attention + tokenizer_str = [tokenizer.decode([ts]) for ts in (sentence_en)] + attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], tokenizer_str, tokenizer_str) + + +if __name__ == "__main__": + tutorial_transformer() diff --git a/tensorlayer/models/__init__.py b/tensorlayer/models/__init__.py index 19f5bb665..6241bce97 100644 --- a/tensorlayer/models/__init__.py +++ b/tensorlayer/models/__init__.py @@ -10,3 +10,4 @@ from .vgg import * from .seq2seq import Seq2seq from .seq2seq_with_attention import Seq2seqLuongAttention +from .transformer.transformer import Transformer diff --git a/tensorlayer/models/transformer/__init__.py b/tensorlayer/models/transformer/__init__.py new file mode 100644 index 000000000..28c174abc --- /dev/null +++ b/tensorlayer/models/transformer/__init__.py @@ -0,0 +1,6 @@ +from .attention_layer import * +from .transformer import Transformer +from .beamsearchHelper import * +from .feedforward_layer import * +from .embedding_layer import * +from .utils import * \ No newline at end of file diff --git a/tensorlayer/models/transformer/attention_layer.py b/tensorlayer/models/transformer/attention_layer.py new file mode 100644 index 000000000..5d9e5cca7 --- /dev/null +++ b/tensorlayer/models/transformer/attention_layer.py @@ -0,0 +1,156 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of multiheaded attention and self-attention layers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl + + +class MultiHeadAttentionLayer(tl.layers.Layer): + """The :class:`MultiHeadAttentionLayer` layer is for multi-head attention computation. + The weight computation is between "key" and "query", which will then matmul with "value" to generate information + that selectively focuses on the "query" messages. 
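+    Concretely, each head computes softmax(Q K^T / sqrt(d_k)) V with
+    d_k = hidden_size / num_heads, and the per-head outputs are concatenated and
+    passed through a final linear projection back to hidden_size.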
+ + Parameters + ----------- + num_heads : int + The number of heads which allow attention computation for different features + hidden_size : int + Out dim for the layer + keep_prob : float + Keep probablity for drop-out mechanism between 0 and 1 + """ + + def __init__(self, num_heads, hidden_size, keep_prob): + + if hidden_size % num_heads: + raise ValueError( + "Hidden size ({}) must be divisible by the number of heads ({}).".format(hidden_size, num_heads) + ) + + super(MultiHeadAttentionLayer, self).__init__() + self.hidden_size = hidden_size + self.num_heads = num_heads + self.attention_dropout = 1 - keep_prob + + self.build(None) + self._built = True + + def get_config(self): + return { + "hidden_size": self.hidden_size, + "num_heads": self.num_heads, + "attention_dropout": self.attention_dropout, + } + + def build(self, inputs_shape): + + # Transformation for linearly projecting the queries, keys, and values. + self.q_transformation = self._get_weights( + "q_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') + ) + self.v_transformation = self._get_weights( + "v_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') + ) + self.k_transformation = self._get_weights( + "k_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') + ) + self.out_transformation = self._get_weights( + "out_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform') + ) + + def split_heads(self, x): + + with tf.name_scope("split_heads"): + batch_size = tf.shape(x)[0] + length = tf.shape(x)[1] + + # Calculate depth of last dimension after it has been split. + depth = (self.hidden_size // self.num_heads) + + # Split the last dimension + x = tf.reshape(x, [batch_size, length, self.num_heads, depth]) + + # Transpose the result + return tf.transpose(x, [0, 2, 1, 3]) + + def combine_heads(self, x): + + with tf.name_scope("combine_heads"): + batch_size = tf.shape(x)[0] + length = tf.shape(x)[2] + x = tf.transpose(x, [0, 2, 1, 3]) # --> [batch, length, num_heads, depth] + return tf.reshape(x, [batch_size, length, self.hidden_size]) + + def forward(self, x, y, mask, cache=None): + """Apply attention mechanism to x and y.""" + # Linearly project the query (q), key (k) and value (v) using different + # learned projections. This is in preparation of splitting them into + # multiple heads. Multi-head attention uses multiple queries, keys, and + # values rather than regular attention (which uses a single q, k, v). + + v = k = y + q = x + + q = tf.tensordot(q, self.q_transformation, axes=[[2], [0]]) + k = tf.tensordot(k, self.k_transformation, axes=[[2], [0]]) + v = tf.tensordot(v, self.v_transformation, axes=[[2], [0]]) + + if cache is not None: + + # Combine cached keys and values with new keys and values. + k = tf.concat([cache["k"], k], axis=1) + v = tf.concat([cache["v"], v], axis=1) + + # Update cache + cache["k"] = k + cache["v"] = v + + # Split q, k, v into heads. + q = self.split_heads(q) + k = self.split_heads(k) + v = self.split_heads(v) #(Batch, num_head, length_v, dk) + + # Scale q to prevent the dot product between q and k from growing too large. 
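+        # (dot products grow with the key dimension d_k = hidden_size / num_heads,
+        # so q is multiplied by 1 / sqrt(d_k), as in "Attention Is All You Need")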
+ depth = (self.hidden_size // self.num_heads) + q *= depth**-0.5 + + # Calculate dot product attention + logits = tf.matmul(q, k, transpose_b=True) #(Batch, num_head, length_q, length_k) + logits += mask + weights = tf.nn.softmax(logits, name="attention_weights") #(Batch, num_head, length_q, length_k) + weights_store = weights + if self.is_train: + weights = tf.nn.dropout(weights, rate=self.attention_dropout) + + attention_output = tf.matmul(weights, v) + + # Recombine heads --> [batch_size, length, hidden_size] + attention_output = self.combine_heads(attention_output) + + # Run the combined outputs through another linear projection layer. + attention_output = tf.tensordot(attention_output, self.out_transformation, axes=[[2], [0]]) + return attention_output, weights_store + + +class SelfAttentionLayer(MultiHeadAttentionLayer): + """Multiheaded self-attention layer.""" + + def forward(self, inputs, mask, cache=None): + return super(SelfAttentionLayer, self).forward(x=inputs, y=inputs, mask=mask, cache=cache) diff --git a/tensorlayer/models/transformer/beamsearchHelper/__init__.py b/tensorlayer/models/transformer/beamsearchHelper/__init__.py new file mode 100644 index 000000000..83c248180 --- /dev/null +++ b/tensorlayer/models/transformer/beamsearchHelper/__init__.py @@ -0,0 +1 @@ +from .beam_search import * diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py new file mode 100644 index 000000000..b1959f901 --- /dev/null +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search.py @@ -0,0 +1,115 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Beam search in TF v2. 
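+
+The functional entry point is `sequence_beam_search` below, which wraps
+`SequenceBeamSearchV2` and returns the top decoded sequences together with their
+scores.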
+""" + +import tensorflow as tf +import tensorlayer.models.transformer.beamsearchHelper.beam_search_v1 as v1 + +_StateKeys = v1._StateKeys # pylint: disable=protected-access + + +class SequenceBeamSearchV2(v1.SequenceBeamSearch): + """Implementation of beam search loop in v2.""" + + def search(self, initial_ids, initial_cache): + """Beam search for sequences with highest scores.""" + state, state_shapes = self._create_initial_state(initial_ids, initial_cache) + finished_state = tf.while_loop( + self._continue_search, self._search_step, loop_vars=[state], shape_invariants=[state_shapes], + parallel_iterations=1, back_prop=False + ) + finished_state = finished_state[0] + + alive_seq = finished_state[_StateKeys.ALIVE_SEQ] + alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS] + finished_seq = finished_state[_StateKeys.FINISHED_SEQ] + finished_scores = finished_state[_StateKeys.FINISHED_SCORES] + finished_flags = finished_state[_StateKeys.FINISHED_FLAGS] + + return finished_seq, finished_scores + + +def sequence_beam_search( + symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id +): + """Search for sequence of subtoken ids with the largest probability. + + Parameters + ----------- + symbols_to_logits_fn : A function with ids, index, and cache as arguments. + The passed in arguments will have shape: + ids -> [batch_size * beam_size, index] + index -> [] (scalar) + cache -> nested dictionary of tensors [batch_size * beam_size, ...] + The function must return logits and new cache. + logits -> [batch * beam_size, vocab_size] + new cache -> same shape/structure as inputted cache + initial_ids : int with shape [batch_size] + Starting ids for each batch item. + initial_cache: dict + contain starting decoder variables information + vocab_size: int + size of tokens + beam_size: int + number of beams + alpha: float + strength of length normalization + max_decode_length: int + maximum length to decoded sequence + eos_id: int + id of eos token, used to determine when a sequence has finished + + Notes + ------- + The function would return: + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] + """ + + batch_size = tf.shape(initial_ids)[0] + + sbs = SequenceBeamSearchV2( + symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id + ) + return sbs.search(initial_ids, initial_cache) + + +def _expand_to_same_rank(tensor, target): + """Expands a given tensor to target's rank to be broadcastable. + + Parameters + ----------- + + tensor: input tensor to tile. Shape: [b, d1, ..., da] + target: target tensor. Shape: [b, d1, ..., da, ..., dn] + + Returns: + ----------- + Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target. + + Raises: + ValueError, if the shape rank of rank tensor/target is None. 
+ """ + if tensor.shape.rank is None: + raise ValueError("Expect rank for tensor shape, but got None.") + if target.shape.rank is None: + raise ValueError("Expect rank for target shape, but got None.") + + with tf.name_scope("expand_rank"): + diff_rank = target.shape.rank - tensor.shape.rank + for _ in range(diff_rank): + tensor = tf.expand_dims(tensor, -1) + return tensor diff --git a/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py new file mode 100644 index 000000000..db3166366 --- /dev/null +++ b/tensorlayer/models/transformer/beamsearchHelper/beam_search_v1.py @@ -0,0 +1,493 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Beam search to find the translated sequence with the highest probability. + +Source implementation from Tensor2Tensor: +https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/beam_search.py +""" + +import tensorflow as tf +from tensorflow.python.util import nest + +# Default value for INF +INF = 1. * 1e7 + + +class _StateKeys(object): + """Keys to dictionary storing the state of the beam search loop.""" + + # Variable storing the loop index. + CUR_INDEX = "CUR_INDEX" + + # Top sequences that are alive for each batch item. Alive sequences are ones + # that have not generated an EOS token. Sequences that reach EOS are marked as + # finished and moved to the FINISHED_SEQ tensor. + # Has shape [batch_size, beam_size, CUR_INDEX + 1] + ALIVE_SEQ = "ALIVE_SEQ" + # Log probabilities of each alive sequence. Shape [batch_size, beam_size] + ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS" + # Dictionary of cached values for each alive sequence. The cache stores + # the encoder output, attention bias, and the decoder attention output from + # the previous iteration. + ALIVE_CACHE = "ALIVE_CACHE" + + # Top finished sequences for each batch item. + # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are + # shorter than CUR_INDEX + 1 are padded with 0s. + FINISHED_SEQ = "FINISHED_SEQ" + # Scores for each finished sequence. Score = log probability / length norm + # Shape [batch_size, beam_size] + FINISHED_SCORES = "FINISHED_SCORES" + # Flags indicating which sequences in the finished sequences are finished. + # At the beginning, all of the sequences in FINISHED_SEQ are filler values. + # True -> finished sequence, False -> filler. 
Shape [batch_size, beam_size] + FINISHED_FLAGS = "FINISHED_FLAGS" + + +class SequenceBeamSearch(object): + """Implementation of beam search loop.""" + + def __init__(self, symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id): + self.symbols_to_logits_fn = symbols_to_logits_fn + self.vocab_size = vocab_size + self.batch_size = batch_size + self.beam_size = beam_size + self.alpha = alpha + self.max_decode_length = max_decode_length + self.eos_id = eos_id + + def search(self, initial_ids, initial_cache): + """Beam search for sequences with highest scores.""" + state, state_shapes = self._create_initial_state(initial_ids, initial_cache) + + finished_state = tf.while_loop( + self._continue_search, self._search_step, loop_vars=[state], shape_invariants=[state_shapes], + parallel_iterations=1, back_prop=False + ) + finished_state = finished_state[0] + + alive_seq = finished_state[_StateKeys.ALIVE_SEQ] + alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS] + finished_seq = finished_state[_StateKeys.FINISHED_SEQ] + finished_scores = finished_state[_StateKeys.FINISHED_SCORES] + finished_flags = finished_state[_StateKeys.FINISHED_FLAGS] + + # Account for corner case where there are no finished sequences for a + # particular batch item. In that case, return alive sequences for that batch + # item. + finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq, alive_seq) + finished_scores = tf.where(tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs) + return finished_seq, finished_scores + + def _create_initial_state(self, initial_ids, initial_cache): + """Return initial state dictionary and its shape invariants. + + Parameters + ----------- + initial_ids: initial ids to pass into the symbols_to_logits_fn. + int tensor with shape [batch_size, 1] + initial_cache: dictionary storing values to be passed into the + symbols_to_logits_fn. + + Returns: + ----------- + state and shape invariant dictionaries with keys from _StateKeys + """ + # Current loop index (starts at 0) + cur_index = tf.constant(0) + + # Create alive sequence with shape [batch_size, beam_size, 1] + alive_seq = _expand_to_beam_size(initial_ids, self.beam_size) + alive_seq = tf.expand_dims(alive_seq, axis=2) + + # Create tensor for storing initial log probabilities. + # Assume initial_ids are prob 1.0 + initial_log_probs = tf.constant([[0.] + [-float("inf")] * (self.beam_size - 1)]) + alive_log_probs = tf.tile(initial_log_probs, [self.batch_size, 1]) + + # Expand all values stored in the dictionary to the beam size, so that each + # beam has a separate cache. + alive_cache = nest.map_structure(lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache) + + # Initialize tensor storing finished sequences with filler values. + finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32) + + # Set scores of the initial finished seqs to negative infinity. + finished_scores = tf.ones([self.batch_size, self.beam_size]) * -INF + + # Initialize finished flags with all False values. + finished_flags = tf.zeros([self.batch_size, self.beam_size], tf.bool) + + # Create state dictionary + state = { + _StateKeys.CUR_INDEX: cur_index, + _StateKeys.ALIVE_SEQ: alive_seq, + _StateKeys.ALIVE_LOG_PROBS: alive_log_probs, + _StateKeys.ALIVE_CACHE: alive_cache, + _StateKeys.FINISHED_SEQ: finished_seq, + _StateKeys.FINISHED_SCORES: finished_scores, + _StateKeys.FINISHED_FLAGS: finished_flags + } + + # Create state invariants for each value in the state dictionary. 
Each + # dimension must be a constant or None. A None dimension means either: + # 1) the dimension's value is a tensor that remains the same but may + # depend on the input sequence to the model (e.g. batch size). + # 2) the dimension may have different values on different iterations. + state_shape_invariants = { + _StateKeys.CUR_INDEX: tf.TensorShape([]), + _StateKeys.ALIVE_SEQ: tf.TensorShape([None, self.beam_size, None]), + _StateKeys.ALIVE_LOG_PROBS: tf.TensorShape([None, self.beam_size]), + _StateKeys.ALIVE_CACHE: nest.map_structure(_get_shape_keep_last_dim, alive_cache), + _StateKeys.FINISHED_SEQ: tf.TensorShape([None, self.beam_size, None]), + _StateKeys.FINISHED_SCORES: tf.TensorShape([None, self.beam_size]), + _StateKeys.FINISHED_FLAGS: tf.TensorShape([None, self.beam_size]) + } + + return state, state_shape_invariants + + def _continue_search(self, state): + """Return whether to continue the search loop. + + The loops should terminate when + 1) when decode length has been reached, or + 2) when the worst score in the finished sequences is better than the best + score in the alive sequences (i.e. the finished sequences are provably + unchanging) + """ + i = state[_StateKeys.CUR_INDEX] + alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] + finished_scores = state[_StateKeys.FINISHED_SCORES] + finished_flags = state[_StateKeys.FINISHED_FLAGS] + + not_at_max_decode_length = tf.less(i, self.max_decode_length) + + # Calculate largest length penalty (the larger penalty, the better score). + max_length_norm = _length_normalization(self.alpha, self.max_decode_length) + # Get the best possible scores from alive sequences. + best_alive_scores = alive_log_probs[:, 0] / max_length_norm + + # Compute worst score in finished sequences for each batch element + finished_scores *= tf.cast(finished_flags, tf.float32) # set filler scores to zero + lowest_finished_scores = tf.reduce_min(finished_scores, axis=1) + + # If there are no finished sequences in a batch element, then set the lowest + # finished score to -INF for that element. + finished_batches = tf.reduce_any(finished_flags, 1) + lowest_finished_scores += (1.0 - tf.cast(finished_batches, tf.float32)) * -INF + + worst_finished_score_better_than_best_alive_score = tf.reduce_all( + tf.greater(lowest_finished_scores, best_alive_scores) + ) + + return tf.logical_and( + not_at_max_decode_length, tf.logical_not(worst_finished_score_better_than_best_alive_score) + ) + + def _search_step(self, state): + """Beam search loop body. + + Grow alive sequences by a single ID. Sequences that have reached the EOS + token are marked as finished. The alive and finished sequences with the + highest log probabilities and scores are returned. + + A sequence's finished score is calculating by dividing the log probability + by the length normalization factor. Without length normalization, the + search is more likely to return shorter sequences. + + """ + # Grow alive sequences by one token. + new_seq, new_log_probs, new_cache = self._grow_alive_seq(state) + # Collect top beam_size alive sequences + alive_state = self._get_new_alive_state(new_seq, new_log_probs, new_cache) + + # Combine newly finished sequences with existing finished sequences, and + # collect the top k scoring sequences. 
+ finished_state = self._get_new_finished_state(state, new_seq, new_log_probs) + + # Increment loop index and create new state dictionary + new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1} + new_state.update(alive_state) + new_state.update(finished_state) + return [new_state] + + def _grow_alive_seq(self, state): + """Grow alive sequences by one token, and collect top 2*beam_size sequences. + 2*beam_size sequences are collected because some sequences may have reached + the EOS token. 2*beam_size ensures that at least beam_size sequences are + still alive. + """ + i = state[_StateKeys.CUR_INDEX] + alive_seq = state[_StateKeys.ALIVE_SEQ] + alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] + alive_cache = state[_StateKeys.ALIVE_CACHE] + + beams_to_keep = 2 * self.beam_size + + # Get logits for the next candidate IDs for the alive sequences. Get the new + # cache values at the same time. + flat_ids = _flatten_beam_dim(alive_seq) # [batch_size * beam_size] + flat_cache = nest.map_structure(_flatten_beam_dim, alive_cache) + + flat_logits, flat_cache = self.symbols_to_logits_fn(flat_ids, i, flat_cache) + + # Unflatten logits to shape [batch_size, beam_size, vocab_size] + logits = _unflatten_beam_dim(flat_logits, self.batch_size, self.beam_size) + new_cache = nest.map_structure(lambda t: _unflatten_beam_dim(t, self.batch_size, self.beam_size), flat_cache) + + # Convert logits to normalized log probs + candidate_log_probs = _log_prob_from_logits(logits) + + # Calculate new log probabilities if each of the alive sequences were + # extended # by the the candidate IDs. + # Shape [batch_size, beam_size, vocab_size] + log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2) + + # Each batch item has beam_size * vocab_size candidate sequences. For each + # batch item, get the k candidates with the highest log probabilities. + flat_log_probs = tf.reshape(log_probs, [-1, self.beam_size * self.vocab_size]) + topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs, k=beams_to_keep) + + # Extract the alive sequences that generate the highest log probabilities + # after being extended. + topk_beam_indices = topk_indices // self.vocab_size + topk_seq, new_cache = _gather_beams([alive_seq, new_cache], topk_beam_indices, self.batch_size, beams_to_keep) + + # Append the most probable IDs to the topk sequences + topk_ids = topk_indices % self.vocab_size + topk_ids = tf.expand_dims(topk_ids, axis=2) + topk_seq = tf.concat([topk_seq, topk_ids], axis=2) + return topk_seq, topk_log_probs, new_cache + + def _get_new_alive_state(self, new_seq, new_log_probs, new_cache): + """Gather the top k sequences that are still alive. + """ + # To prevent finished sequences from being considered, set log probs to -INF + new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id) + new_log_probs += tf.cast(new_finished_flags, tf.float32) * -INF + + top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams( + [new_seq, new_log_probs, new_cache], new_log_probs, self.batch_size, self.beam_size + ) + + return { + _StateKeys.ALIVE_SEQ: top_alive_seq, + _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs, + _StateKeys.ALIVE_CACHE: top_alive_cache + } + + def _get_new_finished_state(self, state, new_seq, new_log_probs): + """Combine new and old finished sequences, and gather the top k sequences. 
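+
+        A finished sequence's score is its log probability divided by the length
+        normalization factor ((5 + length) / 6) ** alpha, so longer sequences are
+        not unduly penalized relative to shorter ones.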
+ """ + i = state[_StateKeys.CUR_INDEX] + finished_seq = state[_StateKeys.FINISHED_SEQ] + finished_scores = state[_StateKeys.FINISHED_SCORES] + finished_flags = state[_StateKeys.FINISHED_FLAGS] + + # First append a column of 0-ids to finished_seq to increment the length. + # New shape of finished_seq: [batch_size, beam_size, i + 1] + finished_seq = tf.concat([finished_seq, tf.zeros([self.batch_size, self.beam_size, 1], tf.int32)], axis=2) + + # Calculate new seq scores from log probabilities. + length_norm = _length_normalization(self.alpha, i + 1) + new_scores = new_log_probs / length_norm + + # Set the scores of the still-alive seq in new_seq to large negative values. + new_finished_flags = tf.equal(new_seq[:, :, -1], self.eos_id) + new_scores += (1. - tf.cast(new_finished_flags, tf.float32)) * -INF + + # Combine sequences, scores, and flags. + finished_seq = tf.concat([finished_seq, new_seq], axis=1) + finished_scores = tf.concat([finished_scores, new_scores], axis=1) + finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1) + + # Return the finished sequences with the best scores. + top_finished_seq, top_finished_scores, top_finished_flags = ( + _gather_topk_beams( + [finished_seq, finished_scores, finished_flags], finished_scores, self.batch_size, self.beam_size + ) + ) + + return { + _StateKeys.FINISHED_SEQ: top_finished_seq, + _StateKeys.FINISHED_SCORES: top_finished_scores, + _StateKeys.FINISHED_FLAGS: top_finished_flags + } + + +def sequence_beam_search( + symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id +): + """Search for sequence of subtoken ids with the largest probability. + + Parameters + ----------- + symbols_to_logits_fn : A function with ids, index, and cache as arguments. + The passed in arguments will have shape: + ids -> [batch_size * beam_size, index] + index -> [] (scalar) + cache -> nested dictionary of tensors [batch_size * beam_size, ...] + The function must return logits and new cache. + logits -> [batch * beam_size, vocab_size] + new cache -> same shape/structure as inputted cache + initial_ids : int with shape [batch_size] + Starting ids for each batch item. + initial_cache: dict + contain starting decoder variables information + vocab_size: int + size of tokens + beam_size: int + number of beams + alpha: float + strength of length normalization + max_decode_length: int + maximum length to decoded sequence + eos_id: int + id of eos token, used to determine when a sequence has finished + + Notes + ------- + The function would return: + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] + """ + batch_size = tf.shape(initial_ids)[0] + sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha, max_decode_length, eos_id) + return sbs.search(initial_ids, initial_cache) + + +def _log_prob_from_logits(logits): + return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True) + + +def _length_normalization(alpha, length): + """Return length normalization factor.""" + return tf.pow(((5. + tf.cast(length, tf.float32)) / 6.), alpha) + + +def _expand_to_beam_size(tensor, beam_size): + """Tiles a given tensor by beam_size. + + Parameters + ----------- + tensor: tensor to tile [batch_size, ...] + beam_size: How much to tile the tensor by. + + Returns + ----------- + Tiled tensor [batch_size, beam_size, ...] 
+ """ + tensor = tf.expand_dims(tensor, axis=1) + tile_dims = [1] * tensor.shape.ndims + tile_dims[1] = beam_size + + return tf.tile(tensor, tile_dims) + + +def _shape_list(tensor): + """Return a list of the tensor's shape, and ensure no None values in list.""" + # Get statically known shape (may contain None's for unknown dimensions) + shape = tensor.get_shape().as_list() + + # Ensure that the shape values are not None + dynamic_shape = tf.shape(tensor) + for i in range(len(shape)): # pylint: disable=consider-using-enumerate + if shape[i] is None: + shape[i] = dynamic_shape[i] + return shape + + +def _get_shape_keep_last_dim(tensor): + shape_list = _shape_list(tensor) + + # Only the last + for i in range(len(shape_list) - 1): + shape_list[i] = None + + if isinstance(shape_list[-1], tf.Tensor): + shape_list[-1] = None + return tf.TensorShape(shape_list) + + +def _flatten_beam_dim(tensor): + """Reshapes first two dimensions in to single dimension. + """ + shape = _shape_list(tensor) + shape[0] *= shape[1] + shape.pop(1) # Remove beam dim + return tf.reshape(tensor, shape) + + +def _unflatten_beam_dim(tensor, batch_size, beam_size): + """Reshapes first dimension back to [batch_size, beam_size]. + + Parameters + ----------- + tensor: Tensor to reshape of shape [batch_size*beam_size, ...] + batch_size: Tensor, original batch size. + beam_size: int, original beam size. + + Returns + ----------- + Reshaped tensor of shape [batch_size, beam_size, ...] + """ + shape = _shape_list(tensor) + new_shape = [batch_size, beam_size] + shape[1:] + return tf.reshape(tensor, new_shape) + + +def _gather_beams(nested, beam_indices, batch_size, new_beam_size): + """Gather beams from nested structure of tensors. + + Each tensor in nested represents a batch of beams, where beam refers to a + single search state (beam search involves searching through multiple states + in parallel). + + This function is used to gather the top beams, specified by + beam_indices, from the nested tensors. + + Parameters + ----------- + nested: Nested structure (tensor, list, tuple or dict) containing tensors + with shape [batch_size, beam_size, ...]. + beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each + value in beam_indices must be between [0, beam_size), and are not + necessarily unique. + batch_size: int size of batch + new_beam_size: int number of beams to be pulled from the nested tensors. + + Returns: + ----------- + + Nested structure containing tensors with shape + [batch_size, new_beam_size, ...] + """ + # Computes the i'th coodinate that contains the batch index for gather_nd. + # Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..]. + batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size + batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size]) + + # Create coordinates to be passed to tf.gather_nd. Stacking creates a tensor + # with shape [batch_size, beam_size, 2], where the last dimension contains + # the (i, j) gathering coordinates. 
+ coordinates = tf.stack([batch_pos, beam_indices], axis=2) + + return nest.map_structure(lambda state: tf.gather_nd(state, coordinates), nested) + + +def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size): + """Gather top beams from nested structure.""" + _, topk_indexes = tf.nn.top_k(score_or_log_prob, k=beam_size) + return _gather_beams(nested, topk_indexes, batch_size, beam_size) diff --git a/tensorlayer/models/transformer/embedding_layer.py b/tensorlayer/models/transformer/embedding_layer.py new file mode 100644 index 000000000..1897b0a22 --- /dev/null +++ b/tensorlayer/models/transformer/embedding_layer.py @@ -0,0 +1,95 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of embedding layer with shared weights.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl + + +class EmbeddingLayer(tl.layers.Layer): + """Calculates input embeddings and pre-softmax linear with shared weights.""" + + def __init__(self, vocab_size, hidden_size): + """Specify characteristic parameters of embedding layer. + + Parameters + ----------- + vocab_size : int + Number of tokens in the embedding. (Typically ~32,000) + hidden_size : int + Dimensionality of the embedding. (Typically 512 or 1024) + + Examples + --------- + with TensorLayer + + + """ + super(EmbeddingLayer, self).__init__() + self.vocab_size = vocab_size + self.hidden_size = hidden_size + + self.build(tuple()) + self._built = True + + def build(self, inputs_shape): + with tf.name_scope("embedding_and_softmax"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. 
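+            # (stddev is hidden_size ** -0.5, which matches the sqrt(hidden_size)
+            # scaling applied to the embedding output in _embedding below)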
+ self.W = self._get_weights( + 'weights', shape=(self.vocab_size, self.hidden_size), + init=tf.random_normal_initializer(mean=0., stddev=self.hidden_size**-0.5) + ) + + def get_config(self): + return { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + } + + def forward(self, inputs, mode="embedding"): + """Get token embeddings of inputs.""" + if mode == "embedding": + return self._embedding(inputs) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs): + """Applies embedding based on inputs tensor.""" + with tf.name_scope("embedding"): + # Create binary mask of size [batch_size, length] + mask = tf.cast(tf.not_equal(inputs, 0), tf.float32) + embeddings = tf.gather(self.W, inputs) + embeddings *= tf.expand_dims(mask, -1) + # Scale embedding by the sqrt of the hidden size + embeddings *= self.hidden_size**0.5 + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer.""" + with tf.name_scope("presoftmax_linear"): + batch_size = tf.shape(inputs)[0] + length = tf.shape(inputs)[1] + + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.W, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) diff --git a/tensorlayer/models/transformer/feedforward_layer.py b/tensorlayer/models/transformer/feedforward_layer.py new file mode 100644 index 000000000..ecc9e5249 --- /dev/null +++ b/tensorlayer/models/transformer/feedforward_layer.py @@ -0,0 +1,81 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of fully connected network.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl + + +class TransformerFeedForwardLayer(tl.layers.Layer): + """Fully connected feedforward network.""" + + def __init__(self, hidden_size, filter_size, keep_prob): + """Initialize FeedForwardNetwork. + + Parameters + ----------- + hidden_size: int + output dim of hidden layer. + filter_size: int + filter size for the inner (first) dense layer. + relu_dropout: float + dropout rate for training. 
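+
+        Note: the constructor takes keep_prob; the dropout rate actually applied
+        between the two dense layers is relu_dropout = 1 - keep_prob.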
+ """ + super(TransformerFeedForwardLayer, self).__init__() + self.hidden_size = hidden_size + self.filter_size = filter_size + self.relu_dropout = 1 - keep_prob + self.filter_dense_layer = tl.layers.Dense( + self.filter_size, in_channels=self.hidden_size, W_init=tf.initializers.get('glorot_uniform'), + name="input_layer" + ) + self.output_dense_layer = tl.layers.Dense( + self.hidden_size, in_channels=self.filter_size, W_init=tf.initializers.get('glorot_uniform'), + name="output_layer" + ) + self.build(None) + self._built = True + + def build(self, inputs_shape): + pass + + def get_config(self): + return { + "hidden_size": self.hidden_size, + "filter_size": self.filter_size, + "relu_dropout": self.relu_dropout, + } + + def forward(self, inputs): + """Return outputs of the feedforward network.""" + # Retrieve dynamically known shapes + x = inputs + batch_size = tf.shape(x)[0] + length = tf.shape(x)[1] + x = tf.reshape(x, [-1, x.shape[-1]]) + output = self.filter_dense_layer(x) + output = tf.nn.relu(output) + output = tf.reshape(output, [batch_size, -1, output.shape[-1]]) + if self.is_train: + output = tf.nn.dropout(output, rate=self.relu_dropout) + output = tf.reshape(output, [-1, output.shape[-1]]) + output = self.output_dense_layer(output) + output = tf.reshape(output, [batch_size, -1, output.shape[-1]]) + + return output \ No newline at end of file diff --git a/tensorlayer/models/transformer/transformer.py b/tensorlayer/models/transformer/transformer.py new file mode 100644 index 000000000..c03701eb4 --- /dev/null +++ b/tensorlayer/models/transformer/transformer.py @@ -0,0 +1,529 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines the Transformer model in TF 2.0. + +Model paper: https://arxiv.org/pdf/1706.03762.pdf +Transformer model code source: https://github.com/tensorflow/tensor2tensor +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorlayer as tl +from tensorlayer.models import Model +import tensorlayer.models.transformer.embedding_layer as embedding_layer +from tensorlayer.models.transformer.attention_layer import SelfAttentionLayer, MultiHeadAttentionLayer +from tensorlayer.models.transformer.feedforward_layer import TransformerFeedForwardLayer +from tensorlayer.models.transformer.utils.model_utils import positional_encoding +from tensorlayer.models.transformer.utils.model_utils import get_decoder_self_attention_bias as get_target_mask +from tensorlayer.models.transformer.utils.model_utils import get_padding_bias as get_input_mask +import tensorlayer.models.transformer.beamsearchHelper.beam_search as beam_search + + +class Transformer(Model): + """Transformer model. 
+ + Parameters + ---------- + params: class + Hyper-parameters of the model including vocab_size, encoder_num_layers, decoder_num_layers, + hidden_size, ff_size, num_heads and keep_prob for training; + and extra_decode_length, beam_size and alpha for inference. + + Examples + --------- + example/translation_task/tutorial_transformer + + Returns + ------- + Stacked-layer transformer model. + """ + + def __init__(self, params, name=None): + + super(Transformer, self).__init__(name=name) + self.params = params + self.embedding_softmax_layer = embedding_layer.EmbeddingLayer(params.vocab_size, params.hidden_size) + self.encoder_stack = EncoderStack(params) + self.decoder_stack = DecoderStack(params) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, targets=None): + """Calculate target logits or inferred target sequences. + + Parameters + ---------- + inputs: input tensor list of size 1 or 2. + First item, inputs: int tensor with shape [batch_size, input_length]. + Second item (optional), targets: None or int tensor with shape + [batch_size, target_length]. + training: boolean + whether in training mode or not. + + Notes + ------- + The function would return: + If targets is defined: + Logits for each word in the target sequence: + float tensor with shape [batch_size, target_length, vocab_size] + Self-attention weights for encoder part: + a dictionary of float tensors { + "layer_0": [batch_size, number_of_heads, source_length, source_length], + "layer_1": [batch_size, number_of_heads, source_length, source_length], + ... + } + Weights for decoder part: + a dictionary of dictionary of float tensors { + "self": { + "layer_0": [batch_size, number_of_heads, target_length, target_length], + "layer_1": [batch_size, number_of_heads, target_length, target_length], + ... + } + "enc_dec": { + "layer_0": [batch_size, number_of_heads, source_length, target_length], + "layer_1": [batch_size, number_of_heads, source_length, target_length], + ... + } + } + + If target is none: + Auto-regressive beam-search decoding to generate output each one time step: + a dictionary { + outputs: [batch_size, decoded length] + scores: [batch_size, float]} + } + Weights for decoder part: + a dictionary of dictionary of float tensors { + "self": { + "layer_0": [batch_size, number_of_heads, target_length, target_length], + "layer_1": [batch_size, number_of_heads, target_length, target_length], + ... + } + "enc_dec": { + "layer_0": [batch_size, number_of_heads, source_length, target_length], + "layer_1": [batch_size, number_of_heads, source_length, target_length], + ... + } + } + Self-attention weights for encoder part: + a dictionary of float tensors { + "layer_0": [batch_size, number_of_heads, source_length, source_length], + "layer_1": [batch_size, number_of_heads, source_length, source_length], + ... + } + + """ + # # Variance scaling is used here because it seems to work in many problems. + # # Other reasonable initializers may also work just as well. + + # Calculate attention bias for encoder self-attention and decoder + # multi-headed attention layers. + attention_bias = get_input_mask(inputs) + + # Run the inputs through the encoder layer to map the symbol + # representations to continuous representations. + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. 
+ embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + encoder_outputs, weights_encoder = self.encode(inputs, inputs_padding) + # Generate output sequence if targets is None, or return logits if target + # sequence is known. + if targets is None: + return self.predict(encoder_outputs, attention_bias), weights_encoder + else: + logits, weights_decoder = self.decode(targets, encoder_outputs, attention_bias) + return logits, weights_encoder, weights_decoder + + def encode(self, inputs, attention_bias): + """Generate continuous representation for inputs. + + Parameters + ---------- + inputs: int tensor with shape [batch_size, input_length]. + attention_bias: float tensor with shape [batch_size, 1, 1, input_length]. + training: boolean, whether in training mode or not. + + Returns + ------- + Float tensor with shape [batch_size, input_length, hidden_size]: + The output of encoder + + Dictionary of float tensors { + "layer_0": [batch_size, number_of_heads, source_length, source_length], + "layer_1": [batch_size, number_of_heads, source_length, source_length], + ... + }: + Self-attention weights for encoder part + """ + + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. + embedded_inputs = self.embedding_softmax_layer(inputs) + inputs_padding = get_input_mask(inputs) + + length = tf.shape(embedded_inputs)[1] + pos_encoding = positional_encoding(length, self.params.hidden_size) + encoder_inputs = embedded_inputs + pos_encoding + + if self.is_train: + encoder_inputs = tf.nn.dropout(encoder_inputs, rate=1 - self.params.keep_prob) + return self.encoder_stack(encoder_inputs, input_mask=attention_bias) + + def decode(self, targets, encoder_outputs, attention_bias): + """Generate logits for each value in the target sequence. + + Parameters + ---------- + targets: target values for the output sequence. int tensor with shape + [batch_size, target_length] + encoder_outputs: continuous representation of input sequence. float tensor + with shape [batch_size, input_length, hidden_size] + attention_bias: float tensor with shape [batch_size, 1, 1, input_length] + training: boolean, whether in training mode or not. + + Returns + ------- + Float32 tensor with shape [batch_size, target_length, vocab_size]: + Output of decoder part + + Dictionary of dictionary of float tensors { + "self": { + "layer_0": [batch_size, number_of_heads, target_length, target_length], + "layer_1": [batch_size, number_of_heads, target_length, target_length], + ... + } + "enc_dec": { + "layer_0": [batch_size, number_of_heads, source_length, target_length], + "layer_1": [batch_size, number_of_heads, source_length, target_length], + ... + } + }: + Weights for decoder part + """ + with tf.name_scope("decode"): + # Prepare inputs to decoder layers by shifting targets, adding positional + # encoding and applying dropout. 
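+            # Teacher forcing: the ground-truth target tokens, shifted right below,
+            # are used as the decoder inputs during training.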
+ decoder_inputs = self.embedding_softmax_layer(targets) + with tf.name_scope("shift_targets"): + # Shift targets to the right, and remove the last element + decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]], + constant_values=self.params.sos_id)[:, :-1, :] + with tf.name_scope("add_pos_encoding"): + length = tf.shape(decoder_inputs)[1] + decoder_inputs += positional_encoding(length, self.params.hidden_size) + if self.is_train: + decoder_inputs = tf.nn.dropout(decoder_inputs, rate=1 - self.params.keep_prob) + + # Run values + decoder_self_attention_bias = get_target_mask(length) + outputs, weights = self.decoder_stack( + decoder_inputs, + features=encoder_outputs, + input_mask=attention_bias, + target_mask=decoder_self_attention_bias, + ) + logits = self.embedding_softmax_layer(outputs, mode="linear") + return logits, weights + + def _get_symbols_to_logits_fn(self, max_decode_length): + """Returns a decoding function that calculates logits of the next tokens.""" + + timing_signal = positional_encoding(max_decode_length + 1, self.params.hidden_size) + decoder_self_attention_bias = get_target_mask(max_decode_length) + weights = [] + + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs.""" + + # Set decoder input to the last generated IDs + decoder_input = ids[:, -1:] + + # Preprocess decoder input by getting embeddings and adding timing signal. + decoder_input = self.embedding_softmax_layer(decoder_input) + decoder_input += timing_signal[i:i + 1] + + self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1] + decoder_outputs, weight = self.decoder_stack( + decoder_input, features=cache.get("encoder_outputs"), target_mask=self_attention_bias, + input_mask=cache.get("encoder_decoder_attention_bias"), cache=cache + ) + weights.append(weight) + logits = self.embedding_softmax_layer(decoder_outputs, mode="linear") + logits = tf.squeeze(logits, axis=[1]) + return logits, cache + + return symbols_to_logits_fn, weights + + def predict(self, encoder_outputs, encoder_decoder_attention_bias): + + batch_size = tf.shape(encoder_outputs)[0] + input_length = tf.shape(encoder_outputs)[1] + max_decode_length = input_length + self.params.extra_decode_length + + symbols_to_logits_fn, weights = self._get_symbols_to_logits_fn(max_decode_length) + + # Create initial set of IDs that will be passed into symbols_to_logits_fn. + initial_ids = tf.ones([batch_size], dtype=tf.int32) * self.params.sos_id + + # Create cache storing decoder attention values for each layer. + # pylint: disable=g-complex-comprehension + cache = { + "layer_%d" % layer: { + "k": tf.zeros([batch_size, 0, self.params.hidden_size]), + "v": tf.zeros([batch_size, 0, self.params.hidden_size]) + } for layer in range(self.params.encoder_num_layers) + } + # pylint: enable=g-complex-comprehension + + # Add encoder output and attention bias to the cache. + cache["encoder_outputs"] = encoder_outputs + cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias + + # Use beam search to find the top beam_size sequences and scores. 
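+        # decoded_ids has shape [batch_size, beam_size, max_decode_length] and
+        # scores has shape [batch_size, beam_size]; the best beam is taken below.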
+ decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, initial_ids=initial_ids, initial_cache=cache, + vocab_size=self.params.vocab_size, beam_size=self.params.beam_size, alpha=self.params.alpha, + max_decode_length=max_decode_length, eos_id=self.params.eos_id + ) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, 1:] + top_scores = scores[:, 0] + + # post-process the weight attention + for i, weight in enumerate(weights): + if (i == 0): + w = weight + else: + for k in range(len(w['self'])): + w['self']['layer_%d' % k + ] = tf.concat([w['self']['layer_%d' % k], weight['self']['layer_%d' % k]], 3) + w['enc_dec']['layer_%d' % k + ] = tf.concat([w['enc_dec']['layer_%d' % k], weight['enc_dec']['layer_%d' % k]], 2) + return {"outputs": top_decoded_ids, "scores": top_scores}, w + + +class LayerNormalization(tl.layers.Layer): + """ + Layer normalization + + Parameters + ---------- + hidden_size: int + hidden size of features + epsilon: float + value to prevent division by zero + """ + + def __init__(self, hidden_size, epsilon=1e-6): + super(LayerNormalization, self).__init__() + self.hidden_size = hidden_size + self.epsilon = epsilon + + self.build(tuple()) + self._built = True + + def build(self, inputs_shape): + self.scale = self._get_weights('scale', shape=(self.hidden_size), init=tl.initializers.Ones()) + self.bias = self._get_weights('bias', shape=(self.hidden_size), init=tl.initializers.Zeros()) + + def forward(self, inputs): + mean = tf.reduce_mean(inputs, axis=[-1], keepdims=True) + var = tf.reduce_mean(tf.square(inputs - mean), axis=[-1], keepdims=True) + norm_inputs = (inputs - mean) * tf.math.rsqrt(var + self.epsilon) + return norm_inputs * self.scale + self.bias + + def __repr__(self): + return "layer normalization" + + +class PrePostProcessingWrapper(Model): + """Wrapper class that applies layer pre-processing and post-processing.""" + + def __init__(self, layer, params): + super(PrePostProcessingWrapper, self).__init__() + self.layer = layer + self.params = params + self.postprocess_dropout = 1 - params.keep_prob + self.layer_norm = LayerNormalization(self.params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, get_weight=False, *args, **kwargs): + """Calls wrapped layer with same parameters.""" + + x = inputs + y = self.layer_norm(x) + + # Get layer output + if (get_weight): + y, weight = self.layer(y, *args, **kwargs) + else: + y = self.layer(y, *args, **kwargs) + + # Postprocessing: apply dropout and residual connection + if self.is_train: + y = tf.nn.dropout(y, rate=self.postprocess_dropout) + if (get_weight): + return x + y, weight + else: + return x + y + + +class EncoderStack(Model): + """Transformer encoder stack. + + The encoder stack is made up of N identical layers. Each layer is composed + of the sublayers: + 1. Self-attention layer + 2. Feedforward network (which is 2 fully-connected layers) + """ + + def __init__(self, params): + super(EncoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.encoder_num_layers): + # Create sublayers for each layer. 
+ self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + feed_forward_network = TransformerFeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append( + [ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ] + ) + + # Create final layer normalization layer. + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, input_mask): + """Return the output of the encoder layer stacks.""" + encoder_inputs = inputs + weights = {} + for n, layer in enumerate(self.layers): + # Run inputs through the sublayers. + self_attention_layer = layer[0] + feed_forward_network = layer[1] + + with tf.name_scope("layer_%d" % n): + with tf.name_scope("self_attention"): + encoder_inputs, weight = self_attention_layer(encoder_inputs, mask=input_mask, get_weight=True) + weights["layer_%d" % n] = weight + with tf.name_scope("ffn"): + encoder_inputs = feed_forward_network(encoder_inputs) + + return self.output_normalization(encoder_inputs), weights + + +class DecoderStack(Model): + """Transformer decoder stack. + + Like the encoder stack, the decoder stack is made up of N identical layers. + Each layer is composed of the sublayers: + 1. Self-attention layer + 2. Multi-headed attention layer combining encoder outputs with results from + the previous self-attention layer. + 3. Feedforward network (2 fully-connected layers) + """ + + def __init__(self, params): + super(DecoderStack, self).__init__() + self.params = params + self.layers = [] + for _ in range(params.decoder_num_layers): + self_attention_layer = SelfAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + enc_dec_attention_layer = MultiHeadAttentionLayer(params.num_heads, params.hidden_size, params.keep_prob) + feed_forward_network = TransformerFeedForwardLayer(params.hidden_size, params.ff_size, params.keep_prob) + + self.layers.append( + [ + PrePostProcessingWrapper(self_attention_layer, params), + PrePostProcessingWrapper(enc_dec_attention_layer, params), + PrePostProcessingWrapper(feed_forward_network, params) + ] + ) + self.output_normalization = LayerNormalization(params.hidden_size) + + def get_config(self): + return { + "params": self.params, + } + + def forward(self, inputs, features, input_mask, target_mask, cache=None): + """Return the output of the decoder layer stacks. + + Parameters + ----------- + decoder_inputs : tensor with shape [batch_size, target_length, hidden_size] + encoder_outputs : tensor with shape [batch_size, input_length, hidden_size] + decoder_self_attention_bias: bias for decoder self-attention layer. [1, 1, + target_len, target_length] + attention_bias : bias for encoder-decoder attention layer. [batch_size, 1, + 1, input_length] + training : boolean + whether in training mode or not. + cache: (Used for fast decoding) A nested dictionary storing previous + decoder self-attention values. 
The items are: + {layer_n: {"k": tensor with shape [batch_size, i, key_channels], + "v": tensor with shape [batch_size, i, value_channels]}, + ...} + + """ + decoder_inputs = inputs + decoder_self_attention_bias = target_mask + encoder_outputs = features + attention_bias = input_mask + weights_all = {"self": {}, "enc_dec": {}} + for n, layer in enumerate(self.layers): + self_attention_layer = layer[0] + enc_dec_attention_layer = layer[1] + feed_forward_network = layer[2] + + # Run inputs through the sublayers. + layer_name = "layer_%d" % n + layer_cache = cache[layer_name] if cache is not None else None + + with tf.name_scope(layer_name): + with tf.name_scope("self_attention"): + decoder_inputs, weight_self = self_attention_layer( + decoder_inputs, get_weight=True, mask=decoder_self_attention_bias, cache=layer_cache + ) + weights_all['self']["layer_%d" % n] = weight_self + with tf.name_scope("encdec_attention"): + decoder_inputs, weight_enc_dec = enc_dec_attention_layer( + decoder_inputs, get_weight=True, y=encoder_outputs, mask=attention_bias + ) + weights_all['enc_dec']["layer_%d" % n] = weight_enc_dec + with tf.name_scope("ffn"): + decoder_inputs = feed_forward_network(decoder_inputs) + + return self.output_normalization(decoder_inputs), weights_all diff --git a/tensorlayer/models/transformer/utils/__init__.py b/tensorlayer/models/transformer/utils/__init__.py new file mode 100644 index 000000000..830f64ecd --- /dev/null +++ b/tensorlayer/models/transformer/utils/__init__.py @@ -0,0 +1,3 @@ +from .model_utils import * +from .metrics import * +from .attention_visualisation import * \ No newline at end of file diff --git a/tensorlayer/models/transformer/utils/attention_visualisation.py b/tensorlayer/models/transformer/utils/attention_visualisation.py new file mode 100644 index 000000000..e98775b4e --- /dev/null +++ b/tensorlayer/models/transformer/utils/attention_visualisation.py @@ -0,0 +1,38 @@ +import matplotlib.pyplot as plt +import tensorflow as tf + + +def plot_attention_weights(attention, key, query): + '''Attention visualisation for Transformer + + Parameters + ---------- + attention : attention weights + shape of (1, number of head, length of key, length of query). + + key : key for attention computation + a list of values which would be shown as xtick labels + + value : value for attention computation + a list of values which would be shown as ytick labels + + ''' + + fig = plt.figure(figsize=(16, 8)) + attention = tf.squeeze(attention, axis=0) + + for head in range(attention.shape[0]): + ax = fig.add_subplot(attention.shape[0] // 2, 2, head + 1) + ax.matshow(attention[head], cmap='viridis') + fontdict = {'fontsize': 12} + ax.set_xticks(range(len(key))) + ax.set_yticks(range(len(query))) + + # ax.set_ylim(len(query)-1.5, -0.5) + ax.set_xticklabels([str(i) for i in key], fontdict=fontdict, rotation=90) + + ax.set_yticklabels([str(i) for i in query], fontdict=fontdict) + + ax.set_xlabel('Head {}'.format(head + 1), fontdict=fontdict) + plt.tight_layout() + plt.show() diff --git a/tensorlayer/models/transformer/utils/metrics.py b/tensorlayer/models/transformer/utils/metrics.py new file mode 100644 index 000000000..6a5aa5d35 --- /dev/null +++ b/tensorlayer/models/transformer/utils/metrics.py @@ -0,0 +1,680 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
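[Editor's aside, not part of the patch] A hedged usage sketch for the plot_attention_weights helper added above; per its docstring the attention tensor has shape (1, num_heads, key_length, query_length), and the key/query arguments are only used as axis tick labels. The token ids below are made up for illustration.

import tensorflow as tf
from tensorlayer.models.transformer.utils import attention_visualisation

key = [12, 7, 33, 4]    # e.g. source token ids
query = [5, 18, 9, 2]   # e.g. target token ids
attention = tf.nn.softmax(tf.random.normal([1, 4, len(key), len(query)]), axis=-1)
attention_visualisation.plot_attention_weights(attention, key, query)  # opens a matplotlib figure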
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functions for calculating loss, accuracy, and other model metrics. + +Metrics: + - Padded loss, accuracy, and negative log perplexity. Source: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py + - BLEU approximation. Source: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py + - ROUGE score. Source: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import math + +import numpy as np +import six +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + + +def _pad_tensors_to_same_length(x, y): + """Pad x and y so that the results have the same length (second dimension).""" + with tf.name_scope("pad_to_same_length"): + x_length = tf.shape(x)[1] + y_length = tf.shape(y)[1] + + max_length = tf.maximum(x_length, y_length) + + x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]]) + y = tf.pad(y, [[0, 0], [0, max_length - y_length]]) + return x, y + + +def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size): + """Calculate cross entropy loss while ignoring padding. + + Parameters +----------- + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + Returns: +----------- + Returns the cross entropy loss and weight tensors: float32 tensors with + shape [batch_size, max(length_logits, length_labels)] + """ + with tf.name_scope("loss", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + + # Calculate smoothing cross entropy + with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]): + confidence = 1.0 - smoothing + low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1) + soft_targets = tf.one_hot( + tf.cast(labels, tf.int32), depth=vocab_size, on_value=confidence, off_value=low_confidence + ) + xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=soft_targets) + + # Calculate the best (lowest) possible value of cross entropy, and + # subtract from the cross entropy loss. + normalizing_constant = -( + confidence * tf.log(confidence) + + tf.to_float(vocab_size - 1) * low_confidence * tf.log(low_confidence + 1e-20) + ) + xentropy -= normalizing_constant + + weights = tf.to_float(tf.not_equal(labels, 0)) + return xentropy * weights, weights + + +def _convert_to_eval_metric(metric_fn): + """Wrap a metric fn that returns scores and weights as an eval metric fn. + + The input metric_fn returns values for the current batch. The wrapper + aggregates the return values collected over all of the batches evaluated. + + Parameters +----------- + metric_fn: function that returns scores and weights for the current batch's + logits and predicted labels. 
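[Editor's aside, not part of the patch] A small numeric sketch of the label smoothing used in padded_cross_entropy_loss above: the true class receives probability 1 - smoothing, the remaining mass is spread evenly over the other vocab_size - 1 classes, and padding positions (label 0) are removed later through the returned weights.

import tensorflow as tf

vocab_size, smoothing = 5, 0.1
labels = tf.constant([[2, 0]])                    # second position is padding
confidence = 1.0 - smoothing                      # 0.9 for the true class
low_confidence = smoothing / (vocab_size - 1)     # 0.025 for every other class
soft_targets = tf.one_hot(labels, depth=vocab_size,
                          on_value=confidence, off_value=low_confidence)
print(soft_targets.numpy()[0, 0])                 # [0.025 0.025 0.9 0.025 0.025]
weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
print(weights.numpy())                            # [[1. 0.]] -> padding excluded from the loss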
+ + Returns: +----------- + function that aggregates the scores and weights from metric_fn. + """ + + def problem_metric_fn(*args): + """Returns an aggregation of the metric_fn's returned values.""" + (scores, weights) = metric_fn(*args) + + # The tf.metrics.mean function assures correct aggregation. + return tf.metrics.mean(scores, weights) + + return problem_metric_fn + + +def get_eval_metrics(logits, labels, params): + """Return dictionary of model evaluation metrics.""" + metrics = { + "accuracy": _convert_to_eval_metric(padded_accuracy)(logits, labels), + "accuracy_top5": _convert_to_eval_metric(padded_accuracy_top5)(logits, labels), + "accuracy_per_sequence": _convert_to_eval_metric(padded_sequence_accuracy)(logits, labels), + "neg_log_perplexity": _convert_to_eval_metric(padded_neg_log_perplexity)(logits, labels, params["vocab_size"]), + } + + if not params["use_tpu"]: + # TPU does not support tf.py_func + metrics.update( + { + "approx_bleu_score": _convert_to_eval_metric(bleu_score)(logits, labels), + "rouge_2_fscore": _convert_to_eval_metric(rouge_2_fscore)(logits, labels), + "rouge_L_fscore": _convert_to_eval_metric(rouge_l_fscore)(logits, labels), + } + ) + + # Prefix each of the metric names with "metrics/". This allows the metric + # graphs to display under the "metrics" category in TensorBoard. + metrics = {"metrics/%s" % k: v for k, v in six.iteritems(metrics)} + return metrics + + +def padded_accuracy(logits, labels): + """Percentage of times that predictions matches labels on non-0s.""" + with tf.variable_scope("padded_accuracy", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.to_float(tf.not_equal(labels, 0)) + outputs = tf.to_int32(tf.argmax(logits, axis=-1)) + padded_labels = tf.to_int32(labels) + return tf.to_float(tf.equal(outputs, padded_labels)), weights + + +def padded_accuracy_topk(logits, labels, k): + """Percentage of times that top-k predictions matches labels on non-0s.""" + with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.to_float(tf.not_equal(labels, 0)) + effective_k = tf.minimum(k, tf.shape(logits)[-1]) + _, outputs = tf.nn.top_k(logits, k=effective_k) + outputs = tf.to_int32(outputs) + padded_labels = tf.to_int32(labels) + padded_labels = tf.expand_dims(padded_labels, axis=-1) + padded_labels += tf.zeros_like(outputs) # Pad to same shape. + same = tf.to_float(tf.equal(outputs, padded_labels)) + same_topk = tf.reduce_sum(same, axis=-1) + return same_topk, weights + + +def padded_accuracy_top5(logits, labels): + return padded_accuracy_topk(logits, labels, 5) + + +def padded_sequence_accuracy(logits, labels): + """Percentage of times that predictions matches labels everywhere (non-0).""" + with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.to_float(tf.not_equal(labels, 0)) + outputs = tf.to_int32(tf.argmax(logits, axis=-1)) + padded_labels = tf.to_int32(labels) + not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights + axis = list(range(1, len(outputs.get_shape()))) + correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis)) + return correct_seq, tf.constant(1.0) + + +def padded_neg_log_perplexity(logits, labels, vocab_size): + """Average log-perplexity excluding padding 0s. 
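[Editor's aside, not part of the patch] A hedged sketch of the masking that padded_accuracy above relies on: positions whose label is 0 get weight 0, so they count neither as correct nor as incorrect when the scores are aggregated.

import tensorflow as tf

labels = tf.constant([[3, 1, 0, 0]])          # last two positions are padding
predictions = tf.constant([[3, 2, 5, 0]])     # stand-in for argmax of some logits
weights = tf.cast(tf.not_equal(labels, 0), tf.float32)        # [1, 1, 0, 0]
correct = tf.cast(tf.equal(predictions, labels), tf.float32)  # [1, 0, 0, 1]
accuracy = tf.reduce_sum(correct * weights) / tf.reduce_sum(weights)
print(accuracy.numpy())  # 0.5 -> only the two real tokens are scored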
No smoothing.""" + num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size) + return -num, den + + +def bleu_score(logits, labels): + """Approximate BLEU score computation between labels and predictions. + + An approximate BLEU scoring method since we do not glue word pieces or + decode the ids and tokenize the output. By default, we use ngram order of 4 + and use brevity penalty. Also, this does not have beam search. + + Parameters +----------- + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch-size, length_labels] + + Returns: +----------- + bleu: int, approx bleu score + """ + predictions = tf.to_int32(tf.argmax(logits, axis=-1)) + # TODO: Look into removing use of py_func + bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32) + return bleu, tf.constant(1.0) + + +def _get_ngrams_with_counter(segment, max_order): + """Extracts all n-grams up to a given maximum order from an input segment. + + Parameters +----------- + segment: text segment from which n-grams will be extracted. + max_order: maximum length in tokens of the n-grams returned by this + methods. + + Returns: +----------- + The Counter containing all n-grams upto max_order in segment + with a count of how many times each n-gram occurred. + """ + ngram_counts = collections.Counter() + for order in xrange(1, max_order + 1): + for i in xrange(0, len(segment) - order + 1): + ngram = tuple(segment[i:i + order]) + ngram_counts[ngram] += 1 + return ngram_counts + + +def compute_bleu(reference_corpus, translation_corpus, max_order=4, use_bp=True): + """Computes BLEU score of translated segments against one or more references. + + Parameters +----------- + reference_corpus: list of references for each translation. Each + reference should be tokenized into a list of tokens. + translation_corpus: list of translations to score. Each translation + should be tokenized into a list of tokens. + max_order: Maximum n-gram order to use when computing BLEU score. + use_bp: boolean, whether to apply brevity penalty. + + Returns: +----------- + BLEU score. + """ + reference_length = 0 + translation_length = 0 + bp = 1.0 + geo_mean = 0 + + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + precisions = [] + + for (references, translations) in zip(reference_corpus, translation_corpus): + reference_length += len(references) + translation_length += len(translations) + ref_ngram_counts = _get_ngrams_with_counter(references, max_order) + translation_ngram_counts = _get_ngrams_with_counter(translations, max_order) + + overlap = dict( + (ngram, min(count, translation_ngram_counts[ngram])) for ngram, count in ref_ngram_counts.items() + ) + + for ngram in overlap: + matches_by_order[len(ngram) - 1] += overlap[ngram] + for ngram in translation_ngram_counts: + possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[ngram] + + precisions = [0] * max_order + smooth = 1.0 + + for i in xrange(0, max_order): + if possible_matches_by_order[i] > 0: + precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i] + if matches_by_order[i] > 0: + precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i] + else: + smooth *= 2 + precisions[i] = 1.0 / (smooth * possible_matches_by_order[i]) + else: + precisions[i] = 0.0 + + if max(precisions) > 0: + p_log_sum = sum(math.log(p) for p in precisions if p) + geo_mean = math.exp(p_log_sum / max_order) + + if use_bp: + ratio = translation_length / reference_length + bp = math.exp(1 - 1. 
/ ratio) if ratio < 1.0 else 1.0 + bleu = geo_mean * bp + return np.float32(bleu) + + +def rouge_2_fscore(logits, labels): + """ROUGE-2 F1 score computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. + + Parameters +----------- + logits: tensor, model predictions + labels: tensor, gold output. + + Returns: +----------- + rouge2_fscore: approx rouge-2 f1 score. + """ + predictions = tf.to_int32(tf.argmax(logits, axis=-1)) + # TODO: Look into removing use of py_func + rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32) + return rouge_2_f_score, tf.constant(1.0) + + +def _get_ngrams(n, text): + """Calculates n-grams. + + Parameters +----------- + n: which n-grams to calculate + text: An array of tokens + + Returns: +----------- + A set of n-grams + """ + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i + n])) + return ngram_set + + +def rouge_n(eval_sentences, ref_sentences, n=2): + """Computes ROUGE-N f1 score of two text collections of sentences. + + Source: https://www.microsoft.com/en-us/research/publication/ + rouge-a-package-for-automatic-evaluation-of-summaries/ + + Parameters +----------- + eval_sentences: Predicted sentences. + ref_sentences: Sentences from the reference set + n: Size of ngram. Defaults to 2. + + Returns: +----------- + f1 score for ROUGE-N + """ + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + eval_ngrams = _get_ngrams(n, eval_sentence) + ref_ngrams = _get_ngrams(n, ref_sentence) + ref_count = len(ref_ngrams) + eval_count = len(eval_ngrams) + + # Count the overlapping ngrams between evaluated and reference + overlapping_ngrams = eval_ngrams.intersection(ref_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case. This isn't mathematically correct, but it's good enough + if eval_count == 0: + precision = 0.0 + else: + precision = float(overlapping_count) / eval_count + if ref_count == 0: + recall = 0.0 + else: + recall = float(overlapping_count) / ref_count + f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8))) + + # return overlapping_count / reference_count + return np.mean(f1_scores, dtype=np.float32) + + +def rouge_l_fscore(predictions, labels): + """ROUGE scores computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. + + Parameters +----------- + predictions: tensor, model predictions + labels: tensor, gold output. + + Returns: +----------- + rouge_l_fscore: approx rouge-l f1 score. + """ + outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) + rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (outputs, labels), tf.float32) + return rouge_l_f_score, tf.constant(1.0) + + +def rouge_l_sentence_level(eval_sentences, ref_sentences): + """Computes ROUGE-L (sentence level) of two collections of sentences. 
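[Editor's aside, not part of the patch] Hedged usage sketches for the n-gram metrics defined above; both take batches of already tokenised sequences (ids or strings) and return a numpy float32. The toy sequences are made up for illustration.

from tensorlayer.models.transformer.utils.metrics import compute_bleu, rouge_n

references = [[1, 2, 3, 4, 5, 6]]
translations = [[1, 2, 3, 4, 0, 6]]
print(compute_bleu(references, translations))   # below 1.0, one token differs
print(compute_bleu(references, references))     # 1.0 for an exact match

hypotheses = [[1, 2, 3, 4, 5]]
targets = [[1, 2, 3, 7, 5]]
print(rouge_n(hypotheses, targets, n=2))        # bigram F1, roughly 0.5 here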
+ + Source: https://www.microsoft.com/en-us/research/publication/ + rouge-a-package-for-automatic-evaluation-of-summaries/ + + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + + Parameters +----------- + eval_sentences: The sentences that have been picked by the summarizer + ref_sentences: The sentences from the reference set + + Returns: +----------- + A float: F_lcs + """ + + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + m = float(len(ref_sentence)) + n = float(len(eval_sentence)) + lcs = _len_lcs(eval_sentence, ref_sentence) + f1_scores.append(_f_lcs(lcs, m, n)) + return np.mean(f1_scores, dtype=np.float32) + + +def _len_lcs(x, y): + """Returns the length of the Longest Common Subsequence between two seqs. + + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Parameters +----------- + x: sequence of words + y: sequence of words + + Returns + integer: Length of LCS between x and y + """ + table = _lcs(x, y) + n, m = len(x), len(y) + return table[n, m] + + +def _lcs(x, y): + """Computes the length of the LCS between two seqs. + + The implementation below uses a DP programming algorithm and runs + in O(nm) time where n = len(x) and m = len(y). + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Parameters +----------- + x: collection of words + y: collection of words + + Returns: +----------- + Table of dictionary of coord and len lcs + """ + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table + + +def _f_lcs(llcs, m, n): + """Computes the LCS-based F-measure score. + + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Parameters +----------- + llcs: Length of LCS + m: number of words in reference summary + n: number of words in candidate summary + + Returns: +----------- + Float. LCS-based F-measure score + """ + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / (r_lcs + 1e-12) + num = (1 + (beta**2)) * r_lcs * p_lcs + denom = r_lcs + ((beta**2) * p_lcs) + f_lcs = num / (denom + 1e-12) + return f_lcs + + +def _pad_tensors_to_same_length(x, y): + """Pad x and y so that the results have the same length (second dimension).""" + with tf.name_scope("pad_to_same_length"): + x_length = tf.shape(x)[1] + y_length = tf.shape(y)[1] + + max_length = tf.maximum(x_length, y_length) + + x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]]) + y = tf.pad(y, [[0, 0], [0, max_length - y_length]]) + return x, y + + +def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size): + """Calculate cross entropy loss while ignoring padding. 
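[Editor's aside, not part of the patch] A hedged sketch of the LCS helpers above, imported here only for illustration since they are private, underscore-prefixed functions: the dynamic-programming table yields the length of the longest common subsequence, which feeds the ROUGE-L F-measure.

from tensorlayer.models.transformer.utils.metrics import _len_lcs, _f_lcs

candidate = [1, 3, 5, 7, 9]   # "evaluated" sequence
reference = [1, 5, 7, 8]
lcs = _len_lcs(candidate, reference)   # longest common subsequence [1, 5, 7] -> 3
print(lcs, _f_lcs(lcs, m=len(reference), n=len(candidate)))  # 3 and an F_lcs of about 0.65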
+ + Parameters +----------- + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + + Returns: +----------- + Returns the cross entropy loss and weight tensors: float32 tensors with + shape [batch_size, max(length_logits, length_labels)] + """ + with tf.name_scope("loss"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + + # Calculate smoothing cross entropy + with tf.name_scope("smoothing_cross_entropy"): + confidence = 1.0 - smoothing + low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32) + soft_targets = tf.one_hot( + tf.cast(labels, tf.int32), depth=vocab_size, on_value=confidence, off_value=low_confidence + ) + xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=soft_targets) + + # Calculate the best (lowest) possible value of cross entropy, and + # subtract from the cross entropy loss. + normalizing_constant = -( + confidence * tf.math.log(confidence) + + tf.cast(vocab_size - 1, tf.float32) * low_confidence * tf.math.log(low_confidence + 1e-20) + ) + xentropy -= normalizing_constant + + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + return xentropy * weights, weights + + +def padded_accuracy(logits, labels): + """Percentage of times that predictions matches labels on non-0s.""" + with tf.name_scope("padded_accuracy"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32) + padded_labels = tf.cast(labels, tf.int32) + return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights + + +def padded_accuracy_topk(logits, labels, k): + """Percentage of times that top-k predictions matches labels on non-0s.""" + with tf.name_scope("padded_accuracy_topk"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + effective_k = tf.minimum(k, tf.shape(logits)[-1]) + _, outputs = tf.nn.top_k(logits, k=effective_k) + outputs = tf.cast(outputs, tf.int32) + padded_labels = tf.cast(labels, tf.int32) + padded_labels = tf.expand_dims(padded_labels, axis=-1) + padded_labels += tf.zeros_like(outputs) # Pad to same shape. + same = tf.cast(tf.equal(outputs, padded_labels), tf.float32) + same_topk = tf.reduce_sum(same, axis=-1) + return same_topk, weights + + +def padded_accuracy_top5(logits, labels): + return padded_accuracy_topk(logits, labels, 5) + + +def padded_sequence_accuracy(logits, labels): + """Percentage of times that predictions matches labels everywhere (non-0).""" + with tf.name_scope("padded_sequence_accuracy"): + logits, labels = _pad_tensors_to_same_length(logits, labels) + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32) + padded_labels = tf.cast(labels, tf.int32) + not_correct = tf.cast(tf.not_equal(outputs, padded_labels), tf.float32) * weights + axis = list(range(1, len(outputs.get_shape()))) + correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis)) + return correct_seq, tf.constant(1.0) + + +def padded_neg_log_perplexity(logits, labels, vocab_size): + """Average log-perplexity excluding padding 0s. 
No smoothing.""" + num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size) + return -num, den + + +import functools + + +class MetricLayer(tf.keras.layers.Layer): + """Custom a layer of metrics for Transformer model.""" + + def __init__(self, vocab_size): + super(MetricLayer, self).__init__() + self.vocab_size = vocab_size + self.metric_mean_fns = [] + + def build(self, input_shape): + """"Builds metric layer.""" + neg_log_perplexity = functools.partial(padded_neg_log_perplexity, vocab_size=self.vocab_size) + self.metric_mean_fns = [ + (tf.keras.metrics.Mean("accuracy"), padded_accuracy), + (tf.keras.metrics.Mean("accuracy_top5"), padded_accuracy_top5), + (tf.keras.metrics.Mean("accuracy_per_sequence"), padded_sequence_accuracy), + (tf.keras.metrics.Mean("neg_log_perplexity"), neg_log_perplexity), + ] + super(MetricLayer, self).build(input_shape) + + def get_config(self): + return {"vocab_size": self.vocab_size} + + def call(self, inputs): + logits, targets = inputs[0], inputs[1] + for mean, fn in self.metric_mean_fns: + m = mean(*fn(logits, targets)) + self.add_metric(m, name="metric", aggregation='mean') + return logits + + +def transformer_loss(logits, labels, smoothing, vocab_size): + """Calculates total loss containing cross entropy with padding ignored. + + Parameters +----------- + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + + Returns: +----------- + A scalar float tensor for loss. + """ + xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing, vocab_size) + return tf.reduce_sum(xentropy) / tf.reduce_sum(weights) + + +class LossLayer(tf.keras.layers.Layer): + """Custom a layer of transformer loss for Transformer model.""" + + def __init__(self, vocab_size, label_smoothing): + super(LossLayer, self).__init__() + self.vocab_size = vocab_size + self.label_smoothing = label_smoothing + + def get_config(self): + return { + "vocab_size": self.vocab_size, + "label_smoothing": self.label_smoothing, + } + + def call(self, inputs): + logits, targets = inputs[0], inputs[1] + loss = transformer_loss(logits, targets, self.label_smoothing, self.vocab_size) + self.add_loss(loss) + return logits, loss diff --git a/tensorlayer/models/transformer/utils/model_utils.py b/tensorlayer/models/transformer/utils/model_utils.py new file mode 100644 index 000000000..5410a97e2 --- /dev/null +++ b/tensorlayer/models/transformer/utils/model_utils.py @@ -0,0 +1,108 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
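[Editor's aside, not part of the patch] Because transformer_loss above reduces the padded, label-smoothed cross entropy to a scalar, it can also be called directly without going through LossLayer; a hedged sketch with random logits:

import tensorflow as tf
from tensorlayer.models.transformer.utils.metrics import transformer_loss

vocab_size = 12
logits = tf.random.normal([2, 7, vocab_size])   # [batch, length, vocab]
labels = tf.constant([[1, 4, 2, 9, 0, 0, 0],
                      [3, 3, 8, 1, 5, 0, 0]])   # 0 marks padding
loss = transformer_loss(logits, labels, smoothing=0.1, vocab_size=vocab_size)
print(float(loss))                              # scalar; padded positions carry zero weight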
+# ============================================================================== +"""Transformer model helper methods.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import tensorflow as tf + +_NEG_INF = -1e9 + + +def positional_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1.0e4): + """Return positional encoding. + + Calculates the position encoding as a mix of sine and cosine functions with + geometrically increasing wavelengths. + Defined and formulized in Attention is All You Need, section 3.5. + + Parameters +``----------- + length : int + Sequence length. + hidden_size : int + channel number of input + min_timescale : float + Minimum scale that will be applied at each position + max_timescale : float + Maximum scale that will be applied at each position + + """ + position = tf.cast(tf.range(length), tf.float32) + num_timescales = hidden_size // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / (tf.cast(num_timescales, tf.float32) - 1) + ) + inv_timescales = min_timescale * tf.exp(tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment) + scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) + signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) + return signal + + +def get_decoder_self_attention_bias(length): + """Calculate bias for decoder that maintains model's autoregressive property. + + Creates a tensor that masks out locations that correspond to illegal + connections, so prediction at position i cannot draw information from future + positions. + + Parameters + ----------- + length: int + length of sequences in batch. + + + """ + with tf.name_scope("decoder_self_attention_bias"): + valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0) + valid_locs = tf.reshape(valid_locs, [1, 1, length, length]) + decoder_bias = _NEG_INF * (1.0 - valid_locs) + return decoder_bias + + +def get_padding(x, padding_value=0): + """Return float tensor representing the padding values in x. + + Parameters + ----------- + x: int tensor with any shape + padding_value: int + + """ + with tf.name_scope("padding"): + return tf.cast(tf.equal(x, padding_value), tf.float32) + + +def get_padding_bias(x): + """Calculate bias tensor from padding values in tensor. + + Bias tensor that is added to the pre-softmax multi-headed attention logits, + which has shape [batch_size, num_heads, length, length]. The tensor is zero at + non-padding locations, and -1e9 (negative infinity) at padding locations. + + Parameters + ----------- + x: int tensor with shape [batch_size, length] + + """ + with tf.name_scope("attention_bias"): + padding = get_padding(x) + attention_bias = padding * _NEG_INF + attention_bias = tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1) + return attention_bias diff --git a/tensorlayer/optimizers/__init__.py b/tensorlayer/optimizers/__init__.py index e74b38801..0e9890929 100644 --- a/tensorlayer/optimizers/__init__.py +++ b/tensorlayer/optimizers/__init__.py @@ -10,3 +10,4 @@ """ from .amsgrad import AMSGrad +from .lazy_adam import LazyAdamOptimizer diff --git a/tensorlayer/optimizers/lazy_adam.py b/tensorlayer/optimizers/lazy_adam.py new file mode 100644 index 000000000..5cdbab982 --- /dev/null +++ b/tensorlayer/optimizers/lazy_adam.py @@ -0,0 +1,76 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
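[Editor's aside, not part of the patch] A hedged sketch of the model_utils helpers defined above: the positional encoding has shape [length, hidden_size], the decoder self-attention bias is a lower-triangular 0 / -1e9 pattern that enforces autoregressive decoding, and the padding bias marks padded ids with -1e9 before the softmax.

import tensorflow as tf
from tensorlayer.models.transformer.utils.model_utils import (
    positional_encoding, get_decoder_self_attention_bias, get_padding_bias)

print(positional_encoding(length=6, hidden_size=8).shape)              # (6, 8)
print(get_decoder_self_attention_bias(3)[0, 0].numpy())                # strict upper triangle is -1e9
print(get_padding_bias(tf.constant([[7, 4, 0, 0]]))[0, 0, 0].numpy())  # [0. 0. -1e9 -1e9]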
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Optimizer from addons and learning rate scheduler.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +class LazyAdamOptimizer(tf.optimizers.Adam): + """Variant of the Adam optimizer that handles sparse updates more efficiently. + + The original Adam algorithm maintains two moving-average accumulators for + each trainable variable; the accumulators are updated at every step. + This class provides lazier handling of gradient updates for sparse + variables. It only updates moving-average accumulators for sparse variable + indices that appear in the current batch, rather than updating the + accumulators for all indices. Compared with the original Adam optimizer, + it can provide large improvements in model training throughput for some + applications. However, it provides slightly different semantics than the + original Adam algorithm, and may lead to different empirical results. + Note, amsgrad is currently not supported and the argument can only be + False. + + This class is borrowed from: + https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py + """ + + def _resource_apply_sparse(self, grad, var, indices): + """Applies grad for one step.""" + var_dtype = var.dtype.base_dtype + lr_t = self._decayed_lr(var_dtype) + beta_1_t = self._get_hyper('beta_1', var_dtype) + beta_2_t = self._get_hyper('beta_2', var_dtype) + local_step = tf.cast(self.iterations + 1, var_dtype) + beta_1_power = tf.math.pow(beta_1_t, local_step) + beta_2_power = tf.math.pow(beta_2_t, local_step) + epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype) + lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power)) + + # \\(m := beta1 * m + (1 - beta1) * g_t\\) + m = self.get_slot(var, 'm') + m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad + + m_update_kwargs = {'resource': m.handle, 'indices': indices, 'updates': m_t_slice} + m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs) + + # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) + v = self.get_slot(var, 'v') + v_t_slice = (beta_2_t * tf.gather(v, indices) + (1 - beta_2_t) * tf.math.square(grad)) + + v_update_kwargs = {'resource': v.handle, 'indices': indices, 'updates': v_t_slice} + v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs) + + # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) + var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t) + + var_update_kwargs = {'resource': var.handle, 'indices': indices, 'updates': var_slice} + var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs) + + return tf.group(*[var_update_op, m_update_op, v_update_op]) diff --git a/tests/models/test_transformer.py b/tests/models/test_transformer.py new file mode 100644 index 000000000..a7ee307ce --- /dev/null +++ 
b/tests/models/test_transformer.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import unittest + +import numpy as np +import tensorflow as tf +import tensorlayer as tl +from tqdm import tqdm +from sklearn.utils import shuffle +from tensorlayer.models.transformer import Transformer +from tests.utils import CustomTestCase +from tensorlayer.models.transformer.utils import metrics +from tensorlayer.models.transformer.utils import attention_visualisation +import time + + +class TINY_PARAMS(object): + vocab_size = 50+2 + encoder_num_layers = 2 + decoder_num_layers = 2 + hidden_size = 64 + ff_size = 16 + num_heads = 4 + keep_prob = 0.9 + + # Default prediction params + extra_decode_length = 5 + beam_size = 1 + alpha = 0.6 # used to calculate length normalization in beam search + + eos_id = 51 + sos_id = 0 + + +class Model_Transformer_Test(CustomTestCase): + + @classmethod + def setUpClass(cls): + cls.batch_size = 50 + + cls.embedding_size = 32 + cls.dec_seq_length = 5 + cls.trainX = np.random.randint(low=0, high=50, size=(50, 11)) + cls.trainY = np.random.randint(low=0, high=50, size=(50, 10)) + + cls.trainX[:, -1] = 51 + cls.trainY[:, -1] = 51 + # Parameters + cls.src_len = len(cls.trainX) + cls.tgt_len = len(cls.trainY) + + assert cls.src_len == cls.tgt_len + + cls.num_epochs = 100 + cls.n_step = cls.src_len // cls.batch_size + + @classmethod + def tearDownClass(cls): + pass + + def test_basic_simpleSeq2Seq(self): + + model_ = Transformer(TINY_PARAMS) + + # print(", ".join(x for x in [t.name for t in model_.trainable_weights])) + + self.vocab_size = TINY_PARAMS.vocab_size + optimizer = tf.optimizers.Adam(learning_rate=0.01) + for epoch in range(self.num_epochs): + model_.train() + t = time.time() + trainX, trainY = shuffle(self.trainX, self.trainY) + total_loss, n_iter = 0, 0 + for X, Y in tqdm(tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=self.batch_size, + shuffle=False), total=self.n_step, + desc='Epoch[{}/{}]'.format(epoch + 1, self.num_epochs), leave=False): + + with tf.GradientTape() as tape: + + targets = Y + logits, weights_encoder, weights_decoder = model_(inputs=X, targets=Y) + logits = metrics.MetricLayer(self.vocab_size)([logits, targets]) + logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets]) + + grad = tape.gradient(loss, model_.all_weights) + optimizer.apply_gradients(zip(grad, model_.all_weights)) + + total_loss += loss + n_iter += 1 + print(time.time() - t) + tl.files.save_npz(model_.all_weights, name='./model_v4.npz') + model_.eval() + test_sample = trainX[0:2, :] + model_.eval() + [prediction, weights_decoder], weights_encoder = model_(inputs=test_sample) + + print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, :], "\n\n") + + print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter)) + + # visualise the self-attention weights at encoder during training + trainX, trainY = shuffle(self.trainX, self.trainY) + X = [trainX[0]] + Y = [trainY[0]] + logits, weights_encoder, weights_decoder = model_(inputs=X, targets=Y) + attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], X[0].numpy(), X[0].numpy()) + + # visualise the encoder-decoder-attention weights at decoder during training + trainX, trainY = shuffle(self.trainX, self.trainY) + X = [trainX[0]] + Y = [trainY[0]] + logits, weights_encoder, weights_decoder = model_(inputs=X, targets=Y) + 
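[Editor's aside, not part of the patch] Based on EncoderStack.forward and DecoderStack.forward earlier in this patch, weights_encoder is keyed "layer_0".."layer_{N-1}" and weights_decoder is nested as {"self": {...}, "enc_dec": {...}}; the per-layer tensors are presumed to have shape (batch, num_heads, key_length, query_length). A quick, step-level shape check that could precede the plots, using the names already in scope in this test:

for name, w in weights_encoder.items():
    print("encoder", name, w.shape)
for name, w in weights_decoder["enc_dec"].items():
    print("enc-dec", name, w.shape)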
attention_visualisation.plot_attention_weights(weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), Y[0]) + + # visualise the encoder-decoder-attention weights at decoder during inference + trainX, trainY = shuffle(self.trainX, self.trainY) + X = [trainX[0]] + # Y = [trainY[0]] + model_.eval() + [prediction, weights_decoder], weights_encoder = model_(inputs=X) + # print(X[0].numpy(), prediction["outputs"][0].numpy()) + attention_visualisation.plot_attention_weights( + weights_decoder["enc_dec"]["layer_0"], X[0].numpy(), prediction["outputs"][0].numpy() + ) + + +if __name__ == '__main__': + unittest.main()
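[Editor's closing aside, not part of the patch] A hedged usage sketch for the LazyAdamOptimizer registered under tl.optimizers earlier in this diff: gradients flowing out of an embedding lookup arrive as IndexedSlices, so only the gathered rows have their Adam moment accumulators updated.

import tensorflow as tf
import tensorlayer as tl

embedding = tf.Variable(tf.random.normal([100, 8]))      # toy embedding table
optimizer = tl.optimizers.LazyAdamOptimizer(learning_rate=0.01)

with tf.GradientTape() as tape:
    rows = tf.gather(embedding, [3, 17])                  # only rows 3 and 17 are used
    loss = tf.reduce_sum(tf.square(rows))
grads = tape.gradient(loss, [embedding])                  # sparse IndexedSlices gradient
optimizer.apply_gradients(zip(grads, [embedding]))        # lazily updates rows 3 and 17 only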