From c58607119daa95c9bd5a301dfdd10dec3abd3fca Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Fri, 30 Jun 2017 20:53:18 +0300 Subject: [PATCH 01/12] Update to support TensorFlow 1.2.0 --- README.md | 2 +- src/model/seq2seq_model.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 90d3a3bd..b432ee12 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Visual Attention based OCR. The model first runs a sliding CNN on the image (ima # Prerequsites Most of our code is written based on Tensorflow, but we also use Keras for the convolution part of our model. Besides, we use python package distance to calculate edit distance for evaluation. (However, that is not mandatory, if distance is not installed, we will do exact match). -### Tensorflow: [Installation Instructions](https://www.tensorflow.org/get_started/os_setup#download-and-setup) (tested on 0.12.1) +### Tensorflow: [Installation Instructions](https://www.tensorflow.org/install/) (tested on 1.2.0) ### Distance (Optional): diff --git a/src/model/seq2seq_model.py b/src/model/seq2seq_model.py index 98b2bea0..97df3eac 100644 --- a/src/model/seq2seq_model.py +++ b/src/model/seq2seq_model.py @@ -84,22 +84,22 @@ def __init__(self, encoder_masks, encoder_inputs_tensor, self.encoder_masks = encoder_masks # Create the internal multi-layer cell for our RNN. - single_cell = tf.contrib.rnn.core_rnn_cell.BasicLSTMCell(attn_num_hidden, forget_bias=0.0, state_is_tuple=False) + single_cell = tf.contrib.rnn.BasicLSTMCell(attn_num_hidden, forget_bias=0.0, state_is_tuple=False) if use_gru: print("using GRU CELL in decoder") - single_cell = tf.contrib.rnn.core_rnn_cell.GRUCell(attn_num_hidden) + single_cell = tf.contrib.rnn.GRUCell(attn_num_hidden) cell = single_cell if attn_num_layers > 1: - cell = tf.contrib.rnn.core_rnn_cell.MultiRNNCell([single_cell] * attn_num_layers, state_is_tuple=False) + cell = tf.contrib.rnn.MultiRNNCell([single_cell] * attn_num_layers, state_is_tuple=False) # The seq2seq function: we use embedding for the input and attention. 
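For readers migrating their own forks from TensorFlow 0.12 to 1.2, the hunk above boils down to dropping the core_rnn_cell namespace: the cell classes now live directly under tf.contrib.rnn. Below is a minimal sketch of the decoder-cell construction under the 1.2 API; the helper name and the example sizes are illustrative and not part of the patch.

import tensorflow as tf  # assumes TensorFlow 1.2.x, as targeted by this patch

def build_decoder_cell(attn_num_hidden, attn_num_layers, use_gru=False):
    # TF 1.2: tf.contrib.rnn.BasicLSTMCell / GRUCell / MultiRNNCell replace
    # the tf.contrib.rnn.core_rnn_cell.* names used with TF 0.12.
    if use_gru:
        single_cell = tf.contrib.rnn.GRUCell(attn_num_hidden)
    else:
        single_cell = tf.contrib.rnn.BasicLSTMCell(
            attn_num_hidden, forget_bias=0.0, state_is_tuple=False)
    cell = single_cell
    if attn_num_layers > 1:
        # state_is_tuple=False keeps the concatenated-state layout that the
        # rest of the model was written against.
        cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell] * attn_num_layers, state_is_tuple=False)
    return cell

# e.g. cell = build_decoder_cell(attn_num_hidden=128, attn_num_layers=2)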
def seq2seq_f(lstm_inputs, decoder_inputs, seq_length, do_decode): num_hidden = attn_num_layers * attn_num_hidden - lstm_fw_cell = tf.contrib.rnn.core_rnn_cell.BasicLSTMCell(num_hidden, forget_bias=0.0, state_is_tuple=False) + lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden, forget_bias=0.0, state_is_tuple=False) # Backward direction cell - lstm_bw_cell = tf.contrib.rnn.core_rnn_cell.BasicLSTMCell(num_hidden, forget_bias=0.0, state_is_tuple=False) + lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden, forget_bias=0.0, state_is_tuple=False) pre_encoder_inputs, output_state_fw, output_state_bw = tf.contrib.rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, lstm_inputs, initial_state_fw=None, initial_state_bw=None, From ebc0a8468cd08c704617c739bb5d52147da50067 Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Sat, 1 Jul 2017 10:39:09 +0200 Subject: [PATCH 02/12] Remove the temp files and local references --- .gitignore | 7 ++++++- run.sh | 23 ----------------------- test_demo.sh | 16 ---------------- tmp.py | 17 ----------------- train_demo.sh | 17 ----------------- 5 files changed, 6 insertions(+), 74 deletions(-) delete mode 100644 run.sh delete mode 100644 test_demo.sh delete mode 100644 tmp.py delete mode 100644 train_demo.sh diff --git a/.gitignore b/.gitignore index fb9cdef3..bdeec827 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# ML data +datasets/ +checkpoints/ +models/ + ### Python template # Byte-compiled / optimized / DLL files __pycache__/ @@ -107,4 +112,4 @@ crashlytics-build.properties # Created by .ignore support plugin (hsz.mobi) misc/ -data/evaluation_data \ No newline at end of file +data/evaluation_data diff --git a/run.sh b/run.sh deleted file mode 100644 index 618c44a6..00000000 --- a/run.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash - -# train on iam (handwritten) -python src/launcher.py --data-base-dir=/ --data-path=/home/sivankeret/wolf_dir/Dev2/Datasets/iam-words/images/tmp_images_lists/trainset.txt --model-dir=Workplace --log-path=Workplace/log.txt --steps-per-checkpoint=200 --phase=train - -# train on Synth90k subset toy example -python src/launcher.py --data-base-dir=data/sample --data-path=data/sample/sample.txt --model-dir=Workplace/model --log-path=Workplace/model_log.txt --steps-per-checkpoint=200 --phase=train --no-load-model - -# train with load model -python src/launcher.py --data-base-dir=data/sample --data-path=data/sample/sample.txt --model-dir=Workplace --log-path=Workplace/log.txt --phase=train --load-model - -python src/train.py --phase=train --train-data-path=data/sample/sample.txt --val-data-path=data/sample/sample.txt --train-data-base-dir=data/sample --val-data-base-dir=data/sample --log-path=Workplace/log_test.txt --model-dir=Workplace - - -# test on same subset toy example -python src/launcher.py --phase=test --data-path=data/sample/sample.txt --data-base-dir=data/sample --log-path=Workplace/log_test.txt --load-model --model-dir=Workplace --output-dir=Workplace/results - - - -python src/test.py --phase=test --data-path=data/sample/sample.txt --data-base-dir=data/sample --log-path=Workplace/log_test.txt --model-dir=Workplace --output-dir=Workplace/results - - -python src/launcher.py --phase=train --data-path=/media/data2/sivankeret/Datasets/mnt/ramdisk/max/90kDICT32px/annotation_train_words.txt --data-base-dir=/media/data2/sivankeret/Datasets/mnt/ramdisk/max/90kDICT32px --log-path=Workplace/log_before_refactor.txt --model-dir=Workplace diff --git a/test_demo.sh b/test_demo.sh deleted file mode 100644 index 
4a8cfc94..00000000 --- a/test_demo.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -python src/launcher.py \ - --phase=test \ - --data-path=/media/data2/sivankeret/Datasets/mnt/ramdisk/max/90kDICT32px/annotation_train_words.txt \ - --data-base-dir=/media/data2/sivankeret/Datasets/mnt/ramdisk/max/90kDICT32px \ - --log-path=log_01_16_test.txt \ - --attn-num-hidden 256 \ - --batch-size 64 \ - --model-dir=model_01_16 \ - --load-model \ - --num-epoch=3 \ - --gpu-id=1 \ - --output-dir=model_01_16/synth90 \ - --use-gru \ - --target-embedding-size=10 diff --git a/tmp.py b/tmp.py deleted file mode 100644 index cefc2972..00000000 --- a/tmp.py +++ /dev/null @@ -1,17 +0,0 @@ -import sys - -input_path = '/media/data2/sivankeret/Datasets/mnt/ramdisk/max/90kDICT32px/annotation_train.txt' -lex_path = '/media/data2/sivankeret/Datasets/mnt/ramdisk/max/90kDICT32px/lexicon.txt' -output_path = '/media/data2/sivankeret/Datasets/mnt/ramdisk/max/90kDICT32px/annotation_train_words.txt' - -with open(lex_path,'r') as lex_f: - all_words = lex_f.readlines() -word_dict = dict(enumerate(all_words)) - -with open(input_path,'r') as input_f: - all_lines = input_f.readlines() - -new_lines = [line.split(' ')[0] + ' ' + word_dict[int(line.split(' ')[1])] for line in all_lines] - -with open(output_path, 'w') as out_f: - out_f.writelines(new_lines) diff --git a/train_demo.sh b/train_demo.sh deleted file mode 100644 index ec0fb6c8..00000000 --- a/train_demo.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -python src/launcher.py \ - --phase=train \ - --data-path=/media/data2/sivankeret/Datasets/mnt/ramdisk/max/90kDICT32px/annotation_train_words.txt \ - --data-base-dir=/media/data2/sivankeret/Datasets/mnt/ramdisk/max/90kDICT32px \ - --log-path=log_01_16.txt \ - --attn-num-hidden 256 \ - --batch-size 64 \ - --model-dir=model_01_16 \ - --initial-learning-rate=1.0 \ - --no-load-model \ - --num-epoch=3 \ - --gpu-id=0 \ - --use-gru \ - --steps-per-checkpoint=2000 \ - --target-embedding-size=10 From b1d87a8f96fb6a019bc7ebab71fe8e0c5921d80f Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Mon, 3 Jul 2017 10:46:33 +0200 Subject: [PATCH 03/12] Include setup.py --- setup.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..81ef710d --- /dev/null +++ b/setup.py @@ -0,0 +1,16 @@ +from setuptools import find_packages +from setuptools import setup + +REQUIRED_PACKAGES = ['distance', 'tensorflow', 'numpy', 'six'] + +setup( + name='attentionocr', + url='https://github.com/emedvedev/attention-ocr', + author_name='Ed Medvedev', + version='0.1', + install_requires=REQUIRED_PACKAGES, + packages=find_packages(), + include_package_data=True, + description='''Optical character recognition model + for Tensorflow based on Visual Attention.''' +) From ffa5ed5f9493961b293d525488c037734eab4390 Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Tue, 4 Jul 2017 11:48:45 +0200 Subject: [PATCH 04/12] Read from TFRecords in data_gen and bucketdata --- src/data_util/bucketdata.py | 20 ++-- src/data_util/data_gen.py | 192 +++++++++++++++++------------------- 2 files changed, 97 insertions(+), 115 deletions(-) diff --git a/src/data_util/bucketdata.py b/src/data_util/bucketdata.py index 51857711..d71fa803 100644 --- a/src/data_util/bucketdata.py +++ b/src/data_util/bucketdata.py @@ -1,13 +1,8 @@ __author__ = 'moonkey' -import os -import numpy as np -from PIL import Image -# from keras.preprocessing.sequence import pad_sequences -from 
collections import Counter -import pickle as cPickle -import random import math +import numpy as np + class BucketData(object): def __init__(self): @@ -18,13 +13,13 @@ def __init__(self): self.label_list = [] self.file_list = [] - def append(self, datum, label, filename): + def append(self, datum, width, label, filename): self.data_list.append(datum) - self.data_len_list.append(int(math.floor(datum.shape[-1] / 4)) - 1) + self.data_len_list.append(int(math.floor(float(width) / 4)) - 1) self.label_list.append(label) self.file_list.append(filename) - self.max_width = max(datum.shape[-1], self.max_width) + self.max_width = max(width, self.max_width) self.max_label_len = max(len(label), self.max_label_len) return len(self.data_list) @@ -53,7 +48,7 @@ def get_bucket_id(): # ENCODER PART res['data_len'] = [a.astype(np.int32) for a in - np.array(self.data_len_list)] + np.array(self.data_len_list)] res['data'] = np.array(self.data_list) real_len = max(int(math.floor(self.max_width / 4)) - 1, 0) padd_len = int(encoder_input_len) - real_len @@ -89,8 +84,7 @@ def get_bucket_id(): np.array(self.label_list).T] res['target_weights'] = [a.astype(np.float32) for a in np.array(target_weights).T] - #print (res['decoder_inputs'][0]) - #assert False + assert len(res['decoder_inputs']) == len(res['target_weights']) res['filenames'] = self.file_list diff --git a/src/data_util/data_gen.py b/src/data_util/data_gen.py index 747e89b6..50737937 100644 --- a/src/data_util/data_gen.py +++ b/src/data_util/data_gen.py @@ -1,35 +1,47 @@ -__author__ = 'moonkey' +__author__ = 'moonkey, emedvedev' import os -import numpy as np +import math + +from StringIO import StringIO from PIL import Image -from collections import Counter -import pickle as cPickle -import random, math -from data_util.bucketdata import BucketData +import numpy as np +import tensorflow as tf + +from .bucketdata import BucketData class DataGen(object): - GO = 1 - EOS = 2 + SYMBOLS = { + "GO": 1, + "EOS": 2 + } + IMG_HEIGHT = 32 def __init__(self, data_root, annotation_fn, - evaluate = False, - valid_target_len = float('inf'), - img_width_range = (12, 320), - word_len = 30): + evaluate=False, + valid_target_len=float('inf'), + img_width_range=(12, 320), + word_len=30, + epochs=1000): """ :param data_root: :param annotation_fn: :param lexicon_fn: + :param valid_target_len: :param img_width_range: only needed for training set + :param word_len: + :param epochs: :return: """ - - img_height = 32 self.data_root = data_root + self.epochs = epochs + self.image_height = self.IMG_HEIGHT + self.valid_target_len = valid_target_len + self.bucket_min_width, self.bucket_max_width = img_width_range + if os.path.exists(annotation_fn): self.annotation_path = annotation_fn else: @@ -41,108 +53,84 @@ def __init__(self, (int(math.floor(img_width_range[1] / 4)), int(word_len + 2))] else: self.bucket_specs = [(int(64 / 4), 9 + 2), (int(108 / 4), 15 + 2), - (int(140 / 4), 17 + 2), (int(256 / 4), 20 + 2), - (int(math.ceil(img_width_range[1] / 4)), word_len + 2)] - - self.bucket_min_width, self.bucket_max_width = img_width_range - self.image_height = img_height - self.valid_target_len = valid_target_len + (int(140 / 4), 17 + 2), (int(256 / 4), 20 + 2), + (int(math.ceil(img_width_range[1] / 4)), word_len + 2)] self.bucket_data = {i: BucketData() for i in range(self.bucket_max_width + 1)} + filename_queue = tf.train.string_input_producer([self.annotation_path], num_epochs=self.epochs) + self.images, self.labels = parse_tfrecords(filename_queue) + def clear(self): self.bucket_data = 
{i: BucketData() for i in range(self.bucket_max_width + 1)} - def get_size(self): - with open(self.annotation_path, 'r') as ann_file: - return len(ann_file.readlines()) - def gen(self, batch_size): valid_target_len = self.valid_target_len - with open(self.annotation_path, 'r') as ann_file: - lines = ann_file.readlines() - random.shuffle(lines) - for l in lines: - img_path, lex = l.strip().split() - try: - img_bw, word = self.read_data(img_path, lex) - if valid_target_len < float('inf'): - word = word[:valid_target_len + 1] - width = img_bw.shape[-1] - - # TODO:resize if > 320 - b_idx = min(width, self.bucket_max_width) - bs = self.bucket_data[b_idx].append(img_bw, word, os.path.join(self.data_root,img_path)) - if bs >= batch_size: - b = self.bucket_data[b_idx].flush_out( - self.bucket_specs, - valid_target_length=valid_target_len, - go_shift=1) - if b is not None: - yield b - else: - assert False, 'no valid bucket of width %d'%width - except IOError: - pass # ignore error images - #with open('error_img.txt', 'a') as ef: - # ef.write(img_path + '\n') + + images, labels = tf.train.shuffle_batch( + [self.images, self.labels], batch_size=batch_size, num_threads=2, + capacity=1000 + 3 * batch_size, min_after_dequeue=1000) + + with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: + sess.run([ + tf.local_variables_initializer(), + tf.global_variables_initializer(), + ]) + coord = tf.train.Coordinator() + threads = tf.train.start_queue_runners(sess=sess, coord=coord) + + try: + while not coord.should_stop(): + raw_images, raw_labels = sess.run([images, labels]) + for img, lex in zip(raw_images, raw_labels): + _, word = self.read_data(None, lex) + if valid_target_len < float('inf'): + word = word[:valid_target_len + 1] + + img_data = Image.open(StringIO(img)) + width, _ = img_data.size + + b_idx = min(width, self.bucket_max_width) + bucket_size = self.bucket_data[b_idx].append(img, width, word, lex) + if bucket_size >= batch_size: + bucket = self.bucket_data[b_idx].flush_out( + self.bucket_specs, + valid_target_length=valid_target_len, + go_shift=1) + if bucket is not None: + yield bucket + else: + assert False, 'no valid bucket of width %d' % width + + finally: + coord.request_stop() + coord.join(threads) + self.clear() - def read_data(self, img_path, lex): - assert 0 < len(lex) < self.bucket_specs[-1][1] - # L = R * 299/1000 + G * 587/1000 + B * 114/1000 - with open(os.path.join(self.data_root, img_path), 'rb') as img_file: - img = Image.open(img_file) - w, h = img.size - aspect_ratio = float(w) / float(h) - if aspect_ratio < float(self.bucket_min_width) / self.image_height: - img = img.resize( - (self.bucket_min_width, self.image_height), - Image.ANTIALIAS) - elif aspect_ratio > float( - self.bucket_max_width) / self.image_height: - img = img.resize( - (self.bucket_max_width, self.image_height), - Image.ANTIALIAS) - elif h != self.image_height: - img = img.resize( - (int(aspect_ratio * self.image_height), self.image_height), - Image.ANTIALIAS) - - img_bw = img.convert('L') - img_bw = np.asarray(img_bw, dtype=np.uint8) - img_bw = img_bw[np.newaxis, :] - - # 'a':97, '0':48 - word = [self.GO] - for c in lex: - assert 96 < ord(c) < 123 or 47 < ord(c) < 58 + def read_data(self, img, lex): + assert lex and len(lex) < self.bucket_specs[-1][1] + + word = [self.SYMBOLS['GO']] + for char in lex: + assert 96 < ord(char) < 123 or 47 < ord(char) < 58 word.append( - ord(c) - 97 + 13 if ord(c) > 96 else ord(c) - 48 + 3) - word.append(self.EOS) + ord(char) - 97 + 13 if ord(char) > 96 else 
ord(char) - 48 + 3) + word.append(self.SYMBOLS['EOS']) word = np.array(word, dtype=np.int32) - # word = np.array( [self.GO] + - # [ord(c) - 97 + 13 if ord(c) > 96 else ord(c) - 48 + 3 - # for c in lex] + [self.EOS], dtype=np.int32) - - return img_bw, word - -def test_gen(): - print('testing gen_valid') - # s_gen = EvalGen('../../data/evaluation_data/svt', 'test.txt') - # s_gen = EvalGen('../../data/evaluation_data/iiit5k', 'test.txt') - # s_gen = EvalGen('../../data/evaluation_data/icdar03', 'test.txt') - s_gen = EvalGen('../../data/evaluation_data/icdar13', 'test.txt') - count = 0 - for batch in s_gen.gen(1): - count += 1 - print(str(batch['bucket_id']) + ' ' + str(batch['data'].shape[2:])) - assert batch['data'].shape[2] == img_height - print(count) + return img, word -if __name__ == '__main__': - test_gen() +def parse_tfrecords(filename_queue): + reader = tf.TFRecordReader() + _, serialized_example = reader.read(filename_queue) + features = tf.parse_single_example( + serialized_example, + features={ + 'image': tf.FixedLenFeature([], tf.string), + 'label': tf.FixedLenFeature([], tf.string), + }) + return features['image'], features['label'] From 098d3d69089f39225a76b7414746bdb3a46e8d93 Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Wed, 5 Jul 2017 11:52:05 +0200 Subject: [PATCH 05/12] Minor lint fixes to datagen --- src/data_util/data_gen.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/data_util/data_gen.py b/src/data_util/data_gen.py index 50737937..f693a75d 100644 --- a/src/data_util/data_gen.py +++ b/src/data_util/data_gen.py @@ -13,10 +13,9 @@ class DataGen(object): - SYMBOLS = { - "GO": 1, - "EOS": 2 - } + _GO = 1 + _EOS = 2 + IMG_HEIGHT = 32 def __init__(self, @@ -48,12 +47,16 @@ def __init__(self, self.annotation_path = os.path.join(data_root, annotation_fn) if evaluate: - self.bucket_specs = [(int(math.floor(64 / 4)), int(word_len + 2)), (int(math.floor(108 / 4)), int(word_len + 2)), - (int(math.floor(140 / 4)), int(word_len + 2)), (int(math.floor(256 / 4)), int(word_len + 2)), + self.bucket_specs = [(int(math.floor(64 / 4)), int(word_len + 2)), + (int(math.floor(108 / 4)), int(word_len + 2)), + (int(math.floor(140 / 4)), int(word_len + 2)), + (int(math.floor(256 / 4)), int(word_len + 2)), (int(math.floor(img_width_range[1] / 4)), int(word_len + 2))] else: - self.bucket_specs = [(int(64 / 4), 9 + 2), (int(108 / 4), 15 + 2), - (int(140 / 4), 17 + 2), (int(256 / 4), 20 + 2), + self.bucket_specs = [(int(64 / 4), 9 + 2), + (int(108 / 4), 15 + 2), + (int(140 / 4), 17 + 2), + (int(256 / 4), 20 + 2), (int(math.ceil(img_width_range[1] / 4)), word_len + 2)] self.bucket_data = {i: BucketData() @@ -96,9 +99,9 @@ def gen(self, batch_size): bucket_size = self.bucket_data[b_idx].append(img, width, word, lex) if bucket_size >= batch_size: bucket = self.bucket_data[b_idx].flush_out( - self.bucket_specs, - valid_target_length=valid_target_len, - go_shift=1) + self.bucket_specs, + valid_target_length=valid_target_len, + go_shift=1) if bucket is not None: yield bucket else: @@ -113,12 +116,12 @@ def gen(self, batch_size): def read_data(self, img, lex): assert lex and len(lex) < self.bucket_specs[-1][1] - word = [self.SYMBOLS['GO']] + word = [self._GO] for char in lex: assert 96 < ord(char) < 123 or 47 < ord(char) < 58 word.append( ord(char) - 97 + 13 if ord(char) > 96 else ord(char) - 48 + 3) - word.append(self.SYMBOLS['EOS']) + word.append(self._EOS) word = np.array(word, dtype=np.int32) return img, word From 
52665acf876b5709100df04a2e17ce57c35756bb Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Thu, 20 Jul 2017 12:03:25 +0200 Subject: [PATCH 06/12] Lint the seq2seq files --- src/model/seq2seq.py | 1746 ++++++++++++++++++------------------ src/model/seq2seq_model.py | 68 +- 2 files changed, 906 insertions(+), 908 deletions(-) diff --git a/src/model/seq2seq.py b/src/model/seq2seq.py index bbff9c05..e354eb16 100644 --- a/src/model/seq2seq.py +++ b/src/model/seq2seq.py @@ -22,34 +22,34 @@ Before using this module, it is recommended to read the TensorFlow tutorial on sequence-to-sequence models. It explains the basic concepts of this module and shows an end-to-end example of how to build a translation model. - https://www.tensorflow.org/versions/master/tutorials/seq2seq/index.html + https://www.tensorflow.org/versions/master/tutorials/seq2seq/index.html Here is an overview of functions available in this module. They all use a very similar interface, so after reading the above tutorial and using one of them, others should be easy to substitute. * Full sequence-to-sequence models. - - basic_rnn_seq2seq: The most basic RNN-RNN model. - - tied_rnn_seq2seq: The basic model with tied encoder and decoder weights. - - embedding_rnn_seq2seq: The basic model with input embedding. - - embedding_tied_rnn_seq2seq: The tied model with input embedding. - - embedding_attention_seq2seq: Advanced model with input embedding and - the neural attention mechanism; recommended for complex tasks. + - basic_rnn_seq2seq: The most basic RNN-RNN model. + - tied_rnn_seq2seq: The basic model with tied encoder and decoder weights. + - embedding_rnn_seq2seq: The basic model with input embedding. + - embedding_tied_rnn_seq2seq: The tied model with input embedding. + - embedding_attention_seq2seq: Advanced model with input embedding and + the neural attention mechanism; recommended for complex tasks. * Multi-task sequence-to-sequence models. - - one2many_rnn_seq2seq: The embedding model with multiple decoders. + - one2many_rnn_seq2seq: The embedding model with multiple decoders. * Decoders (when you write your own encoder, you can use these to decode; - e.g., if you want to write a model that generates captions for images). - - rnn_decoder: The basic decoder based on a pure RNN. - - attention_decoder: A decoder that uses the attention mechanism. + e.g., if you want to write a model that generates captions for images). + - rnn_decoder: The basic decoder based on a pure RNN. + - attention_decoder: A decoder that uses the attention mechanism. * Losses. - - sequence_loss: Loss for a sequence model returning average log-perplexity. - - sequence_loss_by_example: As above, but not averaging over all examples. + - sequence_loss: Loss for a sequence model returning average log-perplexity. + - sequence_loss_by_example: As above, but not averaging over all examples. * model_with_buckets: A convenience function to create models with bucketing - (see the tutorial above for an explanation of why and how to use it). + (see the tutorial above for an explanation of why and how to use it). 
""" from __future__ import absolute_import @@ -70,202 +70,204 @@ from tensorflow.python.ops import nn_ops from tensorflow.contrib.rnn.python.ops import rnn, rnn_cell from tensorflow.python.ops import variable_scope -linear = rnn_cell._linear # pylint: disable=protected-access + +linear = rnn_cell._linear # pylint: disable=protected-access + def _extract_argmax_and_embed(embedding, output_projection=None, update_embedding=True): - """Get a loop_function that extracts the previous symbol and embeds it. - - Args: - embedding: embedding tensor for symbols. - output_projection: None or a pair (W, B). If provided, each fed previous - output will first be multiplied by W and added B. - update_embedding: Boolean; if False, the gradients will not propagate - through the embeddings. - - Returns: - A loop function. - """ - def loop_function(prev, _): - if output_projection is not None: - prev = nn_ops.xw_plus_b( - prev, output_projection[0], output_projection[1]) - prev_symbol = math_ops.argmax(prev, 1) - # Note that gradients will not propagate through the second parameter of - # embedding_lookup. - emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) - if not update_embedding: - emb_prev = array_ops.stop_gradient(emb_prev) - return emb_prev - return loop_function + """Get a loop_function that extracts the previous symbol and embeds it. + + Args: + embedding: embedding tensor for symbols. + output_projection: None or a pair (W, B). If provided, each fed previous + output will first be multiplied by W and added B. + update_embedding: Boolean; if False, the gradients will not propagate + through the embeddings. + + Returns: + A loop function. + """ + def loop_function(prev, _): + if output_projection is not None: + prev = nn_ops.xw_plus_b( + prev, output_projection[0], output_projection[1]) + prev_symbol = math_ops.argmax(prev, 1) + # Note that gradients will not propagate through the second parameter of + # embedding_lookup. + emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) + if not update_embedding: + emb_prev = array_ops.stop_gradient(emb_prev) + return emb_prev + return loop_function def rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None, scope=None): - """RNN decoder for the sequence-to-sequence model. - - Args: - decoder_inputs: A list of 2D Tensors [batch_size x input_size]. - initial_state: 2D Tensor with shape [batch_size x cell.state_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - loop_function: If not None, this function will be applied to the i-th output - in order to generate the i+1-st input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol). This can be used for decoding, - but also for training to emulate http://arxiv.org/abs/1506.03099. - Signature -- loop_function(prev, i) = next - * prev is a 2D Tensor of shape [batch_size x output_size], - * i is an integer, the step number (when advanced control is needed), - * next is a 2D Tensor of shape [batch_size x input_size]. - scope: VariableScope for the created subgraph; defaults to "rnn_decoder". - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing generated outputs. - state: The state of each cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - (Note that in some cases, like basic RNN cell or GRU cell, outputs and - states can be the same. 
They are different for LSTM cells though.) - """ - with variable_scope.variable_scope(scope or "rnn_decoder"): - state = initial_state - outputs = [] - prev = None - for i, inp in enumerate(decoder_inputs): - if loop_function is not None and prev is not None: - with variable_scope.variable_scope("loop_function", reuse=True): - inp = loop_function(prev, i) - if i > 0: - variable_scope.get_variable_scope().reuse_variables() - output, state = cell(inp, state) - outputs.append(output) - if loop_function is not None: - prev = output - return outputs, state - - -def basic_rnn_seq2seq( - encoder_inputs, decoder_inputs, cell, dtype=dtypes.float32, scope=None): - """Basic RNN sequence-to-sequence model. - - This model first runs an RNN to encode encoder_inputs into a state vector, - then runs decoder, initialized with the last encoder state, on decoder_inputs. - Encoder and decoder use the same RNN cell type, but don't share parameters. - - Args: - encoder_inputs: A list of 2D Tensors [batch_size x input_size]. - decoder_inputs: A list of 2D Tensors [batch_size x input_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - dtype: The dtype of the initial state of the RNN cell (default: tf.float32). - scope: VariableScope for the created subgraph; default: "basic_rnn_seq2seq". - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing the generated outputs. - state: The state of each decoder cell in the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with variable_scope.variable_scope(scope or "basic_rnn_seq2seq"): - _, enc_state = rnn.rnn(cell, encoder_inputs, dtype=dtype) - return rnn_decoder(decoder_inputs, enc_state, cell) + """RNN decoder for the sequence-to-sequence model. + + Args: + decoder_inputs: A list of 2D Tensors [batch_size x input_size]. + initial_state: 2D Tensor with shape [batch_size x cell.state_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + loop_function: If not None, this function will be applied to the i-th output + in order to generate the i+1-st input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol). This can be used for decoding, + but also for training to emulate http://arxiv.org/abs/1506.03099. + Signature -- loop_function(prev, i) = next + * prev is a 2D Tensor of shape [batch_size x output_size], + * i is an integer, the step number (when advanced control is needed), + * next is a 2D Tensor of shape [batch_size x input_size]. + scope: VariableScope for the created subgraph; defaults to "rnn_decoder". + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing generated outputs. + state: The state of each cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + (Note that in some cases, like basic RNN cell or GRU cell, outputs and + states can be the same. They are different for LSTM cells though.) 
+ """ + with variable_scope.variable_scope(scope or "rnn_decoder"): + state = initial_state + outputs = [] + prev = None + for i, inp in enumerate(decoder_inputs): + if loop_function is not None and prev is not None: + with variable_scope.variable_scope("loop_function", reuse=True): + inp = loop_function(prev, i) + if i > 0: + variable_scope.get_variable_scope().reuse_variables() + output, state = cell(inp, state) + outputs.append(output) + if loop_function is not None: + prev = output + return outputs, state + + +def basic_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, + dtype=dtypes.float32, scope=None): + """Basic RNN sequence-to-sequence model. + + This model first runs an RNN to encode encoder_inputs into a state vector, + then runs decoder, initialized with the last encoder state, on decoder_inputs. + Encoder and decoder use the same RNN cell type, but don't share parameters. + + Args: + encoder_inputs: A list of 2D Tensors [batch_size x input_size]. + decoder_inputs: A list of 2D Tensors [batch_size x input_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + dtype: The dtype of the initial state of the RNN cell (default: tf.float32). + scope: VariableScope for the created subgraph; default: "basic_rnn_seq2seq". + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. + state: The state of each decoder cell in the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with variable_scope.variable_scope(scope or "basic_rnn_seq2seq"): + _, enc_state = rnn.rnn(cell, encoder_inputs, dtype=dtype) + return rnn_decoder(decoder_inputs, enc_state, cell) def tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, loop_function=None, dtype=dtypes.float32, scope=None): - """RNN sequence-to-sequence model with tied encoder and decoder parameters. - - This model first runs an RNN to encode encoder_inputs into a state vector, and - then runs decoder, initialized with the last encoder state, on decoder_inputs. - Encoder and decoder use the same RNN cell and share parameters. - - Args: - encoder_inputs: A list of 2D Tensors [batch_size x input_size]. - decoder_inputs: A list of 2D Tensors [batch_size x input_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - loop_function: If not None, this function will be applied to i-th output - in order to generate i+1-th input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol), see rnn_decoder for details. - dtype: The dtype of the initial state of the rnn cell (default: tf.float32). - scope: VariableScope for the created subgraph; default: "tied_rnn_seq2seq". - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing the generated outputs. - state: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. 
- """ - with variable_scope.variable_scope("combined_tied_rnn_seq2seq"): - scope = scope or "tied_rnn_seq2seq" - _, enc_state = rnn.rnn( - cell, encoder_inputs, dtype=dtype, scope=scope) - variable_scope.get_variable_scope().reuse_variables() - return rnn_decoder(decoder_inputs, enc_state, cell, - loop_function=loop_function, scope=scope) + """RNN sequence-to-sequence model with tied encoder and decoder parameters. + + This model first runs an RNN to encode encoder_inputs into a state vector, and + then runs decoder, initialized with the last encoder state, on decoder_inputs. + Encoder and decoder use the same RNN cell and share parameters. + + Args: + encoder_inputs: A list of 2D Tensors [batch_size x input_size]. + decoder_inputs: A list of 2D Tensors [batch_size x input_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + loop_function: If not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol), see rnn_decoder for details. + dtype: The dtype of the initial state of the rnn cell (default: tf.float32). + scope: VariableScope for the created subgraph; default: "tied_rnn_seq2seq". + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. + state: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with variable_scope.variable_scope("combined_tied_rnn_seq2seq"): + scope = scope or "tied_rnn_seq2seq" + _, enc_state = rnn.rnn( + cell, encoder_inputs, dtype=dtype, scope=scope) + variable_scope.get_variable_scope().reuse_variables() + return rnn_decoder(decoder_inputs, enc_state, cell, + loop_function=loop_function, scope=scope) def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols, embedding_size, output_projection=None, feed_previous=False, update_embedding_for_previous=True, scope=None): - """RNN decoder with embedding and a pure-decoding option. - - Args: - decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). - initial_state: 2D Tensor [batch_size x cell.state_size]. - cell: rnn_cell.RNNCell defining the cell function. - num_symbols: Integer, how many symbols come into the embedding. - embedding_size: Integer, the length of the embedding vector for each symbol. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_symbols] and B has - shape [num_symbols]; if provided and feed_previous=True, each fed - previous output will first be multiplied by W and added B. - feed_previous: Boolean; if True, only the first of decoder_inputs will be - used (the "GO" symbol), and all other decoder inputs will be generated by: - next = embedding_lookup(embedding, argmax(previous_output)), - In effect, this implements a greedy decoder. It can also be used - during training to emulate http://arxiv.org/abs/1506.03099. - If False, decoder_inputs are used as given (the standard decoder case). - update_embedding_for_previous: Boolean; if False and feed_previous=True, - only the embedding for the first symbol of decoder_inputs (the "GO" - symbol) will be updated by back propagation. Embeddings for the symbols - generated from the decoder itself remain unchanged. 
This parameter has - no effect if feed_previous=False. - scope: VariableScope for the created subgraph; defaults to - "embedding_rnn_decoder". - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing the generated outputs. - state: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: When output_projection has the wrong shape. - """ - if output_projection is not None: - proj_weights = ops.convert_to_tensor(output_projection[0], - dtype=dtypes.float32) - proj_weights.get_shape().assert_is_compatible_with([None, num_symbols]) - proj_biases = ops.convert_to_tensor( - output_projection[1], dtype=dtypes.float32) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with variable_scope.variable_scope(scope or "embedding_rnn_decoder"): - with ops.device("/cpu:0"): - embedding = variable_scope.get_variable("embedding", - [num_symbols, embedding_size]) - loop_function = _extract_argmax_and_embed( - embedding, output_projection, - update_embedding_for_previous) if feed_previous else None - emb_inp = ( - embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs) - return rnn_decoder(emb_inp, initial_state, cell, - loop_function=loop_function) + """RNN decoder with embedding and a pure-decoding option. + + Args: + decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). + initial_state: 2D Tensor [batch_size x cell.state_size]. + cell: rnn_cell.RNNCell defining the cell function. + num_symbols: Integer, how many symbols come into the embedding. + embedding_size: Integer, the length of the embedding vector for each symbol. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has + shape [num_symbols]; if provided and feed_previous=True, each fed + previous output will first be multiplied by W and added B. + feed_previous: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/abs/1506.03099. + If False, decoder_inputs are used as given (the standard decoder case). + update_embedding_for_previous: Boolean; if False and feed_previous=True, + only the embedding for the first symbol of decoder_inputs (the "GO" + symbol) will be updated by back propagation. Embeddings for the symbols + generated from the decoder itself remain unchanged. This parameter has + no effect if feed_previous=False. + scope: VariableScope for the created subgraph; defaults to + "embedding_rnn_decoder". + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. + state: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: When output_projection has the wrong shape. 
+ """ + if output_projection is not None: + proj_weights = ops.convert_to_tensor(output_projection[0], + dtype=dtypes.float32) + proj_weights.get_shape().assert_is_compatible_with([None, num_symbols]) + proj_biases = ops.convert_to_tensor( + output_projection[1], dtype=dtypes.float32) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with variable_scope.variable_scope(scope or "embedding_rnn_decoder"): + with ops.device("/cpu:0"): + embedding = variable_scope.get_variable("embedding", + [num_symbols, embedding_size]) + loop_function = _extract_argmax_and_embed( + embedding, output_projection, + update_embedding_for_previous) if feed_previous else None + emb_inp = ( + embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs) + return rnn_decoder(emb_inp, initial_state, cell, + loop_function=loop_function) def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, @@ -273,333 +275,333 @@ def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, embedding_size, output_projection=None, feed_previous=False, dtype=dtypes.float32, scope=None): - """Embedding RNN sequence-to-sequence model. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_encoder_symbols x input_size]). Then it runs an RNN to encode - embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs - by another newly created embedding (of shape [num_decoder_symbols x - input_size]). Then it runs RNN decoder, initialized with the last - encoder state, on embedded decoder_inputs. - - Args: - encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - num_encoder_symbols: Integer; number of symbols on the encoder side. - num_decoder_symbols: Integer; number of symbols on the decoder side. - embedding_size: Integer, the length of the embedding vector for each symbol. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_decoder_symbols] and B has - shape [num_decoder_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype of the initial state for both the encoder and encoder - rnn cells (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_rnn_seq2seq" - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated - outputs. - state: The state of each decoder cell in each time-step. This is a list - with length len(decoder_inputs) -- one item for each time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq"): - # Encoder. - encoder_cell = rnn_cell.EmbeddingWrapper( - cell, embedding_classes=num_encoder_symbols, - embedding_size=embedding_size) - _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) - - # Decoder. 
- if output_projection is None: - cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) - - if isinstance(feed_previous, bool): - return embedding_rnn_decoder( - decoder_inputs, encoder_state, cell, num_decoder_symbols, - embedding_size, output_projection=output_projection, - feed_previous=feed_previous) - - # If feed_previous is a Tensor, we construct 2 graphs and use cond. - def decoder(feed_previous_bool): - reuse = None if feed_previous_bool else True - with variable_scope.variable_scope(variable_scope.get_variable_scope(), - reuse=reuse): - outputs, state = embedding_rnn_decoder( - decoder_inputs, encoder_state, cell, num_decoder_symbols, - embedding_size, output_projection=output_projection, - feed_previous=feed_previous_bool, - update_embedding_for_previous=False) - return outputs + [state] - - outputs_and_state = control_flow_ops.cond(feed_previous, - lambda: decoder(True), - lambda: decoder(False)) - return outputs_and_state[:-1], outputs_and_state[-1] + """Embedding RNN sequence-to-sequence model. + + This model first embeds encoder_inputs by a newly created embedding (of shape + [num_encoder_symbols x input_size]). Then it runs an RNN to encode + embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs + by another newly created embedding (of shape [num_decoder_symbols x + input_size]). Then it runs RNN decoder, initialized with the last + encoder state, on embedded decoder_inputs. + + Args: + encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + num_encoder_symbols: Integer; number of symbols on the encoder side. + num_decoder_symbols: Integer; number of symbols on the decoder side. + embedding_size: Integer, the length of the embedding vector for each symbol. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. + feed_previous: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype of the initial state for both the encoder and encoder + rnn cells (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_rnn_seq2seq" + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated + outputs. + state: The state of each decoder cell in each time-step. This is a list + with length len(decoder_inputs) -- one item for each time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq"): + # Encoder. + encoder_cell = rnn_cell.EmbeddingWrapper( + cell, embedding_classes=num_encoder_symbols, + embedding_size=embedding_size) + _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) + + # Decoder. 
+ if output_projection is None: + cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) + + if isinstance(feed_previous, bool): + return embedding_rnn_decoder( + decoder_inputs, encoder_state, cell, num_decoder_symbols, + embedding_size, output_projection=output_projection, + feed_previous=feed_previous) + + # If feed_previous is a Tensor, we construct 2 graphs and use cond. + def decoder(feed_previous_bool): + reuse = None if feed_previous_bool else True + with variable_scope.variable_scope(variable_scope.get_variable_scope(), + reuse=reuse): + outputs, state = embedding_rnn_decoder( + decoder_inputs, encoder_state, cell, num_decoder_symbols, + embedding_size, output_projection=output_projection, + feed_previous=feed_previous_bool, + update_embedding_for_previous=False) + return outputs + [state] + + outputs_and_state = control_flow_ops.cond(feed_previous, + lambda: decoder(True), + lambda: decoder(False)) + return outputs_and_state[:-1], outputs_and_state[-1] def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_symbols, embedding_size, output_projection=None, feed_previous=False, dtype=dtypes.float32, scope=None): - """Embedding RNN sequence-to-sequence model with tied (shared) parameters. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_symbols x input_size]). Then it runs an RNN to encode embedded - encoder_inputs into a state vector. Next, it embeds decoder_inputs using - the same embedding. Then it runs RNN decoder, initialized with the last - encoder state, on embedded decoder_inputs. - - Args: - encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - num_symbols: Integer; number of symbols for both encoder and decoder. - embedding_size: Integer, the length of the embedding vector for each symbol. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_symbols] and B has - shape [num_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype to use for the initial RNN states (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_tied_rnn_seq2seq". - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated - outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: When output_projection has the wrong shape. 
- """ - if output_projection is not None: - proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype) - proj_weights.get_shape().assert_is_compatible_with([None, num_symbols]) - proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with variable_scope.variable_scope(scope or "embedding_tied_rnn_seq2seq"): - with ops.device("/cpu:0"): - embedding = variable_scope.get_variable("embedding", - [num_symbols, embedding_size]) - - emb_encoder_inputs = [embedding_ops.embedding_lookup(embedding, x) - for x in encoder_inputs] - emb_decoder_inputs = [embedding_ops.embedding_lookup(embedding, x) - for x in decoder_inputs] - - if output_projection is None: - cell = rnn_cell.OutputProjectionWrapper(cell, num_symbols) - - if isinstance(feed_previous, bool): - loop_function = _extract_argmax_and_embed( - embedding, output_projection, True) if feed_previous else None - return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell, - loop_function=loop_function, dtype=dtype) - - # If feed_previous is a Tensor, we construct 2 graphs and use cond. - def decoder(feed_previous_bool): - loop_function = _extract_argmax_and_embed( - embedding, output_projection, False) if feed_previous_bool else None - reuse = None if feed_previous_bool else True - with variable_scope.variable_scope(variable_scope.get_variable_scope(), - reuse=reuse): - outputs, state = tied_rnn_seq2seq( - emb_encoder_inputs, emb_decoder_inputs, cell, - loop_function=loop_function, dtype=dtype) - return outputs + [state] - - outputs_and_state = control_flow_ops.cond(feed_previous, - lambda: decoder(True), - lambda: decoder(False)) - return outputs_and_state[:-1], outputs_and_state[-1] + """Embedding RNN sequence-to-sequence model with tied (shared) parameters. + + This model first embeds encoder_inputs by a newly created embedding (of shape + [num_symbols x input_size]). Then it runs an RNN to encode embedded + encoder_inputs into a state vector. Next, it embeds decoder_inputs using + the same embedding. Then it runs RNN decoder, initialized with the last + encoder state, on embedded decoder_inputs. + + Args: + encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + num_symbols: Integer; number of symbols for both encoder and decoder. + embedding_size: Integer, the length of the embedding vector for each symbol. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has + shape [num_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. + feed_previous: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype to use for the initial RNN states (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_tied_rnn_seq2seq". + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated + outputs. 
+ state: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: When output_projection has the wrong shape. + """ + if output_projection is not None: + proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype) + proj_weights.get_shape().assert_is_compatible_with([None, num_symbols]) + proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with variable_scope.variable_scope(scope or "embedding_tied_rnn_seq2seq"): + with ops.device("/cpu:0"): + embedding = variable_scope.get_variable("embedding", + [num_symbols, embedding_size]) + + emb_encoder_inputs = [embedding_ops.embedding_lookup(embedding, x) + for x in encoder_inputs] + emb_decoder_inputs = [embedding_ops.embedding_lookup(embedding, x) + for x in decoder_inputs] + + if output_projection is None: + cell = rnn_cell.OutputProjectionWrapper(cell, num_symbols) + + if isinstance(feed_previous, bool): + loop_function = _extract_argmax_and_embed( + embedding, output_projection, True) if feed_previous else None + return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell, + loop_function=loop_function, dtype=dtype) + + # If feed_previous is a Tensor, we construct 2 graphs and use cond. + def decoder(feed_previous_bool): + loop_function = _extract_argmax_and_embed( + embedding, output_projection, False) if feed_previous_bool else None + reuse = None if feed_previous_bool else True + with variable_scope.variable_scope(variable_scope.get_variable_scope(), + reuse=reuse): + outputs, state = tied_rnn_seq2seq( + emb_encoder_inputs, emb_decoder_inputs, cell, + loop_function=loop_function, dtype=dtype) + return outputs + [state] + + outputs_and_state = control_flow_ops.cond(feed_previous, + lambda: decoder(True), + lambda: decoder(False)) + return outputs_and_state[:-1], outputs_and_state[-1] def attention_decoder(decoder_inputs, initial_state, attention_states, cell, output_size=None, num_heads=1, loop_function=None, dtype=dtypes.float32, scope=None, initial_state_attention=False, attn_num_hidden=128): - """RNN decoder with attention for the sequence-to-sequence model. - - In this context "attention" means that, during decoding, the RNN can look up - information in the additional tensor attention_states, and it does this by - focusing on a few entries from the tensor. This model has proven to yield - especially good results in a number of sequence-to-sequence tasks. This - implementation is based on http://arxiv.org/abs/1412.7449 (see below for - details). It is recommended for complex sequence-to-sequence tasks. - - Args: - decoder_inputs: A list of 2D Tensors [batch_size x input_size]. - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - output_size: Size of the output vectors; if None, we use cell.output_size. - num_heads: Number of attention heads that read from attention_states. - loop_function: If not None, this function will be applied to i-th output - in order to generate i+1-th input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol). This can be used for decoding, - but also for training to emulate http://arxiv.org/abs/1506.03099. 
- Signature -- loop_function(prev, i) = next - * prev is a 2D Tensor of shape [batch_size x output_size], - * i is an integer, the step number (when advanced control is needed), - * next is a 2D Tensor of shape [batch_size x input_size]. - dtype: The dtype to use for the RNN initial state (default: tf.float32). - scope: VariableScope for the created subgraph; default: "attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors of - shape [batch_size x output_size]. These represent the generated outputs. - Output i is computed from input i (which is either the i-th element - of decoder_inputs or loop_function(output {i-1}, i)) as follows. - First, we run the cell on a combination of the input and previous - attention masks: - cell_output, new_state = cell(linear(input, prev_attn), prev_state). - Then, we calculate new attention masks: - new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) - and then we calculate the output: - output = linear(cell_output, new_attn). - state: The state of each decoder cell the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: when num_heads is not positive, there are no inputs, or shapes - of attention_states are not set. - """ - # MODIFIED ADD START - assert num_heads == 1, 'We only consider the case where num_heads=1!' - # MODIFIED ADD END - if not decoder_inputs: - raise ValueError("Must provide at least 1 input to attention decoder.") - if num_heads < 1: - raise ValueError("With less than 1 heads, use a non-attention decoder.") - if not attention_states.get_shape()[1:2].is_fully_defined(): - raise ValueError("Shape[1] and [2] of attention_states must be known: %s" - % attention_states.get_shape()) - if output_size is None: - output_size = cell.output_size - - with variable_scope.variable_scope(scope or "attention_decoder"): - batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. - attn_length = attention_states.get_shape()[1].value - attn_size = attention_states.get_shape()[2].value - - # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. - hidden = array_ops.reshape( - attention_states, [-1, attn_length, 1, attn_size]) - hidden_features = [] - v = [] - attention_vec_size = attn_size # Size of query vectors for attention. - for a in xrange(num_heads): - k = variable_scope.get_variable("AttnW_%d" % a, - [1, 1, attn_size, attention_vec_size]) - hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) - v.append(variable_scope.get_variable("AttnV_%d" % a, - [attention_vec_size])) - - state = initial_state - - # MODIFIED: return both context vector and attention weights - def attention(query): - """Put attention masks on hidden using hidden_features and query.""" - # MODIFIED ADD START - ss = None # record attention weights - # MODIFIED ADD END - ds = [] # Results of attention reads will be stored here. - for a in xrange(num_heads): - with variable_scope.variable_scope("Attention_%d" % a): - y = linear(query, attention_vec_size, True) - y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) - # Attention mask is a softmax of v^T * tanh(...). 
- s = math_ops.reduce_sum(
- v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
- a = nn_ops.softmax(s)
- ss = a
- #a = tf.Print(a, [a], message="a: ",summarize=30)
- # Now calculate the attention-weighted vector d.
- d = math_ops.reduce_sum(
- array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
- [1, 2])
- ds.append(array_ops.reshape(d, [-1, attn_size]))
- # MODIFIED DELETED return ds
- # MODIFIED ADD START
- return ds, ss
- # MODIFIED ADD END
-
- outputs = []
+ """RNN decoder with attention for the sequence-to-sequence model.
+
+ In this context "attention" means that, during decoding, the RNN can look up
+ information in the additional tensor attention_states, and it does this by
+ focusing on a few entries from the tensor. This model has proven to yield
+ especially good results in a number of sequence-to-sequence tasks. This
+ implementation is based on http://arxiv.org/abs/1412.7449 (see below for
+ details). It is recommended for complex sequence-to-sequence tasks.
+
+ Args:
+ decoder_inputs: A list of 2D Tensors [batch_size x input_size].
+ initial_state: 2D Tensor [batch_size x cell.state_size].
+ attention_states: 3D Tensor [batch_size x attn_length x attn_size].
+ cell: rnn_cell.RNNCell defining the cell function and size.
+ output_size: Size of the output vectors; if None, we use cell.output_size.
+ num_heads: Number of attention heads that read from attention_states.
+ loop_function: If not None, this function will be applied to i-th output
+ in order to generate i+1-th input, and decoder_inputs will be ignored,
+ except for the first element ("GO" symbol). This can be used for decoding,
+ but also for training to emulate http://arxiv.org/abs/1506.03099.
+ Signature -- loop_function(prev, i) = next
+ * prev is a 2D Tensor of shape [batch_size x output_size],
+ * i is an integer, the step number (when advanced control is needed),
+ * next is a 2D Tensor of shape [batch_size x input_size].
+ dtype: The dtype to use for the RNN initial state (default: tf.float32).
+ scope: VariableScope for the created subgraph; default: "attention_decoder".
+ initial_state_attention: If False (default), initial attentions are zero.
+ If True, initialize the attentions from the initial state and attention
+ states -- useful when we wish to resume decoding from a previously
+ stored decoder state and attention states.
+
+ Returns:
+ A tuple of the form (outputs, state), where:
+ outputs: A list of the same length as decoder_inputs of 2D Tensors of
+ shape [batch_size x output_size]. These represent the generated outputs.
+ Output i is computed from input i (which is either the i-th element
+ of decoder_inputs or loop_function(output {i-1}, i)) as follows.
+ First, we run the cell on a combination of the input and previous
+ attention masks:
+ cell_output, new_state = cell(linear(input, prev_attn), prev_state).
+ Then, we calculate new attention masks:
+ new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
+ and then we calculate the output:
+ output = linear(cell_output, new_attn).
+ state: The state of each decoder cell at the final time-step.
+ It is a 2D Tensor of shape [batch_size x cell.state_size].
+
+ Raises:
+ ValueError: when num_heads is not positive, there are no inputs, or shapes
+ of attention_states are not set.
+ """
 # MODIFIED ADD START
- attention_weights_history = []
+ assert num_heads == 1, 'We only consider the case where num_heads=1!'
# MODIFIED ADD END - prev = None - batch_attn_size = array_ops.stack([batch_size, attn_size]) - attns = [array_ops.zeros(batch_attn_size, dtype=dtype) - for _ in xrange(num_heads)] - for a in attns: # Ensure the second shape of attention vectors is set. - a.set_shape([None, attn_size]) - if initial_state_attention: - # MODIFIED DELETED attns = attention(initial_state) - # MODIFIED ADD START - attns, attn_weights = attention(initial_state) - attention_weights_history.append(attn_weights) - # MODIFIED ADD END - for i, inp in enumerate(decoder_inputs): - if i > 0: - variable_scope.get_variable_scope().reuse_variables() - # If loop_function is set, we use it instead of decoder_inputs. - if loop_function is not None and prev is not None: - with variable_scope.variable_scope("loop_function", reuse=True): - inp = loop_function(prev, i) - # Merge input and previous attentions into one vector of the right size. - #input_size = inp.get_shape().with_rank(2)[1] - # TODO: use input_size - input_size = attn_num_hidden - x = linear([inp] + attns, input_size, True) - # Run the RNN. - cell_output, state = cell(x, state) - # Run the attention mechanism. - if i == 0 and initial_state_attention: - with variable_scope.variable_scope(variable_scope.get_variable_scope(), - reuse=True): - # MODIFIED DELETED attns = attention(state) - # MODIFIED ADD START - attns, attn_weights = attention(state) - # MODIFIED ADD END - else: - # MODIFIED DELETED attns = attention(state) + if not decoder_inputs: + raise ValueError("Must provide at least 1 input to attention decoder.") + if num_heads < 1: + raise ValueError("With less than 1 heads, use a non-attention decoder.") + if not attention_states.get_shape()[1:2].is_fully_defined(): + raise ValueError("Shape[1] and [2] of attention_states must be known: %s" + % attention_states.get_shape()) + if output_size is None: + output_size = cell.output_size + + with variable_scope.variable_scope(scope or "attention_decoder"): + batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. + attn_length = attention_states.get_shape()[1].value + attn_size = attention_states.get_shape()[2].value + + # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. + hidden = array_ops.reshape( + attention_states, [-1, attn_length, 1, attn_size]) + hidden_features = [] + v = [] + attention_vec_size = attn_size # Size of query vectors for attention. + for a in xrange(num_heads): + k = variable_scope.get_variable("AttnW_%d" % a, + [1, 1, attn_size, attention_vec_size]) + hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) + v.append(variable_scope.get_variable("AttnV_%d" % a, + [attention_vec_size])) + + state = initial_state + + # MODIFIED: return both context vector and attention weights + def attention(query): + """Put attention masks on hidden using hidden_features and query.""" + # MODIFIED ADD START + ss = None # record attention weights + # MODIFIED ADD END + ds = [] # Results of attention reads will be stored here. + for a in xrange(num_heads): + with variable_scope.variable_scope("Attention_%d" % a): + y = linear(query, attention_vec_size, True) + y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) + # Attention mask is a softmax of v^T * tanh(...). + s = math_ops.reduce_sum( + v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) + a = nn_ops.softmax(s) + ss = a + #a = tf.Print(a, [a], message="a: ",summarize=30) + # Now calculate the attention-weighted vector d. 
+ d = math_ops.reduce_sum( + array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, + [1, 2]) + ds.append(array_ops.reshape(d, [-1, attn_size])) + # MODIFIED DELETED return ds + # MODIFIED ADD START + return ds, ss + # MODIFIED ADD END + + outputs = [] # MODIFIED ADD START - attns, attn_weights = attention(state) - attention_weights_history.append(attn_weights) + attention_weights_history = [] # MODIFIED ADD END - - with variable_scope.variable_scope("AttnOutputProjection"): - output = linear([cell_output] + attns, output_size, True) - if loop_function is not None: - prev = output - outputs.append(output) - - # MODIFIED DELETED return outputs, state - # MODIFIED ADD START - return outputs, state, attention_weights_history - # MODIFIED ADD END + prev = None + batch_attn_size = array_ops.stack([batch_size, attn_size]) + attns = [array_ops.zeros(batch_attn_size, dtype=dtype) + for _ in xrange(num_heads)] + for a in attns: # Ensure the second shape of attention vectors is set. + a.set_shape([None, attn_size]) + if initial_state_attention: + # MODIFIED DELETED attns = attention(initial_state) + # MODIFIED ADD START + attns, attn_weights = attention(initial_state) + attention_weights_history.append(attn_weights) + # MODIFIED ADD END + for i, inp in enumerate(decoder_inputs): + if i > 0: + variable_scope.get_variable_scope().reuse_variables() + # If loop_function is set, we use it instead of decoder_inputs. + if loop_function is not None and prev is not None: + with variable_scope.variable_scope("loop_function", reuse=True): + inp = loop_function(prev, i) + # Merge input and previous attentions into one vector of the right size. + # input_size = inp.get_shape().with_rank(2)[1] + # TODO: use input_size + input_size = attn_num_hidden + x = linear([inp] + attns, input_size, True) + # Run the RNN. + cell_output, state = cell(x, state) + # Run the attention mechanism. + if i == 0 and initial_state_attention: + with variable_scope.variable_scope(variable_scope.get_variable_scope(), + reuse=True): + # MODIFIED DELETED attns = attention(state) + # MODIFIED ADD START + attns, attn_weights = attention(state) + # MODIFIED ADD END + else: + # MODIFIED DELETED attns = attention(state) + # MODIFIED ADD START + attns, attn_weights = attention(state) + attention_weights_history.append(attn_weights) + # MODIFIED ADD END + + with variable_scope.variable_scope("AttnOutputProjection"): + output = linear([cell_output] + attns, output_size, True) + if loop_function is not None: + prev = output + outputs.append(output) + + # MODIFIED DELETED return outputs, state + # MODIFIED ADD START + return outputs, state, attention_weights_history + # MODIFIED ADD END def embedding_attention_decoder(decoder_inputs, initial_state, attention_states, @@ -610,69 +612,69 @@ def embedding_attention_decoder(decoder_inputs, initial_state, attention_states, dtype=dtypes.float32, scope=None, initial_state_attention=False, attn_num_hidden=128): - """RNN decoder with embedding and attention and a pure-decoding option. - - Args: - decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: rnn_cell.RNNCell defining the cell function. - num_symbols: Integer, how many symbols come into the embedding. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. 
- output_size: Size of the output vectors; if None, use output_size. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_symbols] and B has shape - [num_symbols]; if provided and feed_previous=True, each fed previous - output will first be multiplied by W and added B. - feed_previous: Boolean; if True, only the first of decoder_inputs will be - used (the "GO" symbol), and all other decoder inputs will be generated by: - next = embedding_lookup(embedding, argmax(previous_output)), - In effect, this implements a greedy decoder. It can also be used - during training to emulate http://arxiv.org/abs/1506.03099. - If False, decoder_inputs are used as given (the standard decoder case). - update_embedding_for_previous: Boolean; if False and feed_previous=True, - only the embedding for the first symbol of decoder_inputs (the "GO" - symbol) will be updated by back propagation. Embeddings for the symbols - generated from the decoder itself remain unchanged. This parameter has - no effect if feed_previous=False. - dtype: The dtype to use for the RNN initial states (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing the generated outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: When output_projection has the wrong shape. - """ - if output_size is None: - output_size = cell.output_size - if output_projection is not None: - proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with variable_scope.variable_scope(scope or "embedding_attention_decoder"): - with ops.device("/cpu:0"): - embedding = variable_scope.get_variable("embedding", - [num_symbols, embedding_size]) - loop_function = _extract_argmax_and_embed( - embedding, output_projection, - update_embedding_for_previous) if feed_previous else None - emb_inp = [ - embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] - return attention_decoder( - emb_inp, initial_state, attention_states, cell, output_size=output_size, - num_heads=num_heads, loop_function=loop_function, - initial_state_attention=initial_state_attention, attn_num_hidden=attn_num_hidden) + """RNN decoder with embedding and attention and a pure-decoding option. + + Args: + decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). + initial_state: 2D Tensor [batch_size x cell.state_size]. + attention_states: 3D Tensor [batch_size x attn_length x attn_size]. + cell: rnn_cell.RNNCell defining the cell function. + num_symbols: Integer, how many symbols come into the embedding. + embedding_size: Integer, the length of the embedding vector for each symbol. + num_heads: Number of attention heads that read from attention_states. + output_size: Size of the output vectors; if None, use output_size. 
+ output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has shape + [num_symbols]; if provided and feed_previous=True, each fed previous + output will first be multiplied by W and added B. + feed_previous: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/abs/1506.03099. + If False, decoder_inputs are used as given (the standard decoder case). + update_embedding_for_previous: Boolean; if False and feed_previous=True, + only the embedding for the first symbol of decoder_inputs (the "GO" + symbol) will be updated by back propagation. Embeddings for the symbols + generated from the decoder itself remain unchanged. This parameter has + no effect if feed_previous=False. + dtype: The dtype to use for the RNN initial states (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_attention_decoder". + initial_state_attention: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states -- useful when we wish to resume decoding from a previously + stored decoder state and attention states. + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. + state: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: When output_projection has the wrong shape. + """ + if output_size is None: + output_size = cell.output_size + if output_projection is not None: + proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with variable_scope.variable_scope(scope or "embedding_attention_decoder"): + with ops.device("/cpu:0"): + embedding = variable_scope.get_variable("embedding", + [num_symbols, embedding_size]) + loop_function = _extract_argmax_and_embed( + embedding, output_projection, + update_embedding_for_previous) if feed_previous else None + emb_inp = [ + embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] + return attention_decoder( + emb_inp, initial_state, attention_states, cell, output_size=output_size, + num_heads=num_heads, loop_function=loop_function, + initial_state_attention=initial_state_attention, attn_num_hidden=attn_num_hidden) def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, @@ -681,329 +683,329 @@ def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, num_heads=1, output_projection=None, feed_previous=False, dtype=dtypes.float32, scope=None, initial_state_attention=False): - """Embedding sequence-to-sequence model with attention. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_encoder_symbols x input_size]). Then it runs an RNN to encode - embedded encoder_inputs into a state vector. It keeps the outputs of this - RNN at every step to use for attention later. Next, it embeds decoder_inputs - by another newly created embedding (of shape [num_decoder_symbols x - input_size]). 
Then it runs attention decoder, initialized with the last - encoder state, on embedded decoder_inputs and attending to encoder outputs. - - Args: - encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - cell: rnn_cell.RNNCell defining the cell function and size. - num_encoder_symbols: Integer; number of symbols on the encoder side. - num_decoder_symbols: Integer; number of symbols on the decoder side. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_decoder_symbols] and B has - shape [num_decoder_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype of the initial RNN state (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_seq2seq". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated - outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"): - # Encoder. - encoder_cell = rnn_cell.EmbeddingWrapper( - cell, embedding_classes=num_encoder_symbols, - embedding_size=embedding_size) - encoder_outputs, encoder_state = rnn.rnn( - encoder_cell, encoder_inputs, dtype=dtype) - - # First calculate a concatenation of encoder outputs to put attention on. - top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) - for e in encoder_outputs] - attention_states = array_ops.concat(1, top_states) - - # Decoder. - output_size = None - if output_projection is None: - cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) - output_size = num_decoder_symbols - - if isinstance(feed_previous, bool): - return embedding_attention_decoder( - decoder_inputs, encoder_state, attention_states, cell, - num_decoder_symbols, embedding_size, num_heads=num_heads, - output_size=output_size, output_projection=output_projection, - feed_previous=feed_previous, - initial_state_attention=initial_state_attention) - - # If feed_previous is a Tensor, we construct 2 graphs and use cond. 
- def decoder(feed_previous_bool): - reuse = None if feed_previous_bool else True - with variable_scope.variable_scope(variable_scope.get_variable_scope(), - reuse=reuse): - outputs, state = embedding_attention_decoder( - decoder_inputs, encoder_state, attention_states, cell, - num_decoder_symbols, embedding_size, num_heads=num_heads, - output_size=output_size, output_projection=output_projection, - feed_previous=feed_previous_bool, - update_embedding_for_previous=False, - initial_state_attention=initial_state_attention) - return outputs + [state] - - outputs_and_state = control_flow_ops.cond(feed_previous, - lambda: decoder(True), - lambda: decoder(False)) - return outputs_and_state[:-1], outputs_and_state[-1] + """Embedding sequence-to-sequence model with attention. + + This model first embeds encoder_inputs by a newly created embedding (of shape + [num_encoder_symbols x input_size]). Then it runs an RNN to encode + embedded encoder_inputs into a state vector. It keeps the outputs of this + RNN at every step to use for attention later. Next, it embeds decoder_inputs + by another newly created embedding (of shape [num_decoder_symbols x + input_size]). Then it runs attention decoder, initialized with the last + encoder state, on embedded decoder_inputs and attending to encoder outputs. + + Args: + encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + cell: rnn_cell.RNNCell defining the cell function and size. + num_encoder_symbols: Integer; number of symbols on the encoder side. + num_decoder_symbols: Integer; number of symbols on the decoder side. + embedding_size: Integer, the length of the embedding vector for each symbol. + num_heads: Number of attention heads that read from attention_states. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. + feed_previous: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype of the initial RNN state (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_attention_seq2seq". + initial_state_attention: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states. + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated + outputs. + state: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"): + # Encoder. + encoder_cell = rnn_cell.EmbeddingWrapper( + cell, embedding_classes=num_encoder_symbols, + embedding_size=embedding_size) + encoder_outputs, encoder_state = rnn.rnn( + encoder_cell, encoder_inputs, dtype=dtype) + + # First calculate a concatenation of encoder outputs to put attention on. 
+ top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) + for e in encoder_outputs] + attention_states = array_ops.concat(1, top_states) + + # Decoder. + output_size = None + if output_projection is None: + cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) + output_size = num_decoder_symbols + + if isinstance(feed_previous, bool): + return embedding_attention_decoder( + decoder_inputs, encoder_state, attention_states, cell, + num_decoder_symbols, embedding_size, num_heads=num_heads, + output_size=output_size, output_projection=output_projection, + feed_previous=feed_previous, + initial_state_attention=initial_state_attention) + + # If feed_previous is a Tensor, we construct 2 graphs and use cond. + def decoder(feed_previous_bool): + reuse = None if feed_previous_bool else True + with variable_scope.variable_scope(variable_scope.get_variable_scope(), + reuse=reuse): + outputs, state = embedding_attention_decoder( + decoder_inputs, encoder_state, attention_states, cell, + num_decoder_symbols, embedding_size, num_heads=num_heads, + output_size=output_size, output_projection=output_projection, + feed_previous=feed_previous_bool, + update_embedding_for_previous=False, + initial_state_attention=initial_state_attention) + return outputs + [state] + + outputs_and_state = control_flow_ops.cond(feed_previous, + lambda: decoder(True), + lambda: decoder(False)) + return outputs_and_state[:-1], outputs_and_state[-1] def one2many_rnn_seq2seq(encoder_inputs, decoder_inputs_dict, cell, num_encoder_symbols, num_decoder_symbols_dict, embedding_size, feed_previous=False, dtype=dtypes.float32, scope=None): - """One-to-many RNN sequence-to-sequence model (multi-task). - - This is a multi-task sequence-to-sequence model with one encoder and multiple - decoders. Reference to multi-task sequence-to-sequence learning can be found - here: http://arxiv.org/abs/1511.06114 - - Args: - encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - decoder_inputs_dict: A dictionany mapping decoder name (string) to - the corresponding decoder_inputs; each decoder_inputs is a list of 1D - Tensors of shape [batch_size]; num_decoders is defined as - len(decoder_inputs_dict). - cell: rnn_cell.RNNCell defining the cell function and size. - num_encoder_symbols: Integer; number of symbols on the encoder side. - num_decoder_symbols_dict: A dictionary mapping decoder name (string) to an - integer specifying number of symbols for the corresponding decoder; - len(num_decoder_symbols_dict) must be equal to num_decoders. - embedding_size: Integer, the length of the embedding vector for each symbol. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of - decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype of the initial state for both the encoder and encoder - rnn cells (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "one2many_rnn_seq2seq" - - Returns: - A tuple of the form (outputs_dict, state_dict), where: - outputs_dict: A mapping from decoder name (string) to a list of the same - length as decoder_inputs_dict[name]; each element in the list is a 2D - Tensors with shape [batch_size x num_decoder_symbol_list[name]] - containing the generated outputs. 
- state_dict: A mapping from decoder name (string) to the final state of the - corresponding decoder RNN; it is a 2D Tensor of shape - [batch_size x cell.state_size]. - """ - outputs_dict = {} - state_dict = {} - - with variable_scope.variable_scope(scope or "one2many_rnn_seq2seq"): - # Encoder. - encoder_cell = rnn_cell.EmbeddingWrapper( - cell, embedding_classes=num_encoder_symbols, - embedding_size=embedding_size) - _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) - - # Decoder. - for name, decoder_inputs in decoder_inputs_dict.items(): - num_decoder_symbols = num_decoder_symbols_dict[name] - - with variable_scope.variable_scope("one2many_decoder_" + str(name)): - decoder_cell = rnn_cell.OutputProjectionWrapper(cell, - num_decoder_symbols) - if isinstance(feed_previous, bool): - outputs, state = embedding_rnn_decoder( - decoder_inputs, encoder_state, decoder_cell, num_decoder_symbols, - embedding_size, feed_previous=feed_previous) - else: - # If feed_previous is a Tensor, we construct 2 graphs and use cond. - def filled_embedding_rnn_decoder(feed_previous): - # pylint: disable=cell-var-from-loop - reuse = None if feed_previous else True - vs = variable_scope.get_variable_scope() - with variable_scope.variable_scope(vs, reuse=reuse): - outputs, state = embedding_rnn_decoder( - decoder_inputs, encoder_state, decoder_cell, - num_decoder_symbols, embedding_size, - feed_previous=feed_previous) - # pylint: enable=cell-var-from-loop - return outputs + [state] - outputs_and_state = control_flow_ops.cond( - feed_previous, - lambda: filled_embedding_rnn_decoder(True), - lambda: filled_embedding_rnn_decoder(False)) - outputs = outputs_and_state[:-1] - state = outputs_and_state[-1] - - outputs_dict[name] = outputs - state_dict[name] = state - - return outputs_dict, state_dict + """One-to-many RNN sequence-to-sequence model (multi-task). + + This is a multi-task sequence-to-sequence model with one encoder and multiple + decoders. Reference to multi-task sequence-to-sequence learning can be found + here: http://arxiv.org/abs/1511.06114 + + Args: + encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + decoder_inputs_dict: A dictionany mapping decoder name (string) to + the corresponding decoder_inputs; each decoder_inputs is a list of 1D + Tensors of shape [batch_size]; num_decoders is defined as + len(decoder_inputs_dict). + cell: rnn_cell.RNNCell defining the cell function and size. + num_encoder_symbols: Integer; number of symbols on the encoder side. + num_decoder_symbols_dict: A dictionary mapping decoder name (string) to an + integer specifying number of symbols for the corresponding decoder; + len(num_decoder_symbols_dict) must be equal to num_decoders. + embedding_size: Integer, the length of the embedding vector for each symbol. + feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of + decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype of the initial state for both the encoder and encoder + rnn cells (default: tf.float32). 
+ scope: VariableScope for the created subgraph; defaults to + "one2many_rnn_seq2seq" + + Returns: + A tuple of the form (outputs_dict, state_dict), where: + outputs_dict: A mapping from decoder name (string) to a list of the same + length as decoder_inputs_dict[name]; each element in the list is a 2D + Tensors with shape [batch_size x num_decoder_symbol_list[name]] + containing the generated outputs. + state_dict: A mapping from decoder name (string) to the final state of the + corresponding decoder RNN; it is a 2D Tensor of shape + [batch_size x cell.state_size]. + """ + outputs_dict = {} + state_dict = {} + + with variable_scope.variable_scope(scope or "one2many_rnn_seq2seq"): + # Encoder. + encoder_cell = rnn_cell.EmbeddingWrapper( + cell, embedding_classes=num_encoder_symbols, + embedding_size=embedding_size) + _, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype) + + # Decoder. + for name, decoder_inputs in decoder_inputs_dict.items(): + num_decoder_symbols = num_decoder_symbols_dict[name] + + with variable_scope.variable_scope("one2many_decoder_" + str(name)): + decoder_cell = rnn_cell.OutputProjectionWrapper(cell, + num_decoder_symbols) + if isinstance(feed_previous, bool): + outputs, state = embedding_rnn_decoder( + decoder_inputs, encoder_state, decoder_cell, num_decoder_symbols, + embedding_size, feed_previous=feed_previous) + else: + # If feed_previous is a Tensor, we construct 2 graphs and use cond. + def filled_embedding_rnn_decoder(feed_previous): + # pylint: disable=cell-var-from-loop + reuse = None if feed_previous else True + vs = variable_scope.get_variable_scope() + with variable_scope.variable_scope(vs, reuse=reuse): + outputs, state = embedding_rnn_decoder( + decoder_inputs, encoder_state, decoder_cell, + num_decoder_symbols, embedding_size, + feed_previous=feed_previous) + # pylint: enable=cell-var-from-loop + return outputs + [state] + outputs_and_state = control_flow_ops.cond( + feed_previous, + lambda: filled_embedding_rnn_decoder(True), + lambda: filled_embedding_rnn_decoder(False)) + outputs = outputs_and_state[:-1] + state = outputs_and_state[-1] + + outputs_dict[name] = outputs + state_dict[name] = state + + return outputs_dict, state_dict def sequence_loss_by_example(logits, targets, weights, average_across_timesteps=True, softmax_loss_function=None, name=None): - """Weighted cross-entropy loss for a sequence of logits (per example). - - Args: - logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. - targets: List of 1D batch-sized int32 Tensors of the same length as logits. - weights: List of 1D batch-sized float-Tensors of the same length as logits. - average_across_timesteps: If set, divide the returned cost by the total - label weight. - softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch - to be used instead of the standard softmax (the default if this is None). - name: Optional name for this operation, default: "sequence_loss_by_example". - - Returns: - 1D batch-sized float Tensor: The log-perplexity for each sequence. - - Raises: - ValueError: If len(logits) is different from len(targets) or len(weights). - """ - if len(targets) != len(logits) or len(weights) != len(logits): - raise ValueError("Lengths of logits, weights, and targets must be the same " - "%d, %d, %d." 
% (len(logits), len(weights), len(targets))) - with ops.name_scope(name, "sequence_loss_by_example", - logits + targets + weights): - log_perp_list = [] - for logit, target, weight in zip(logits, targets, weights): - if softmax_loss_function is None: - # TODO(irving,ebrevdo): This reshape is needed because - # sequence_loss_by_example is called with scalars sometimes, which - # violates our general scalar strictness policy. - target = array_ops.reshape(target, [-1]) - crossent = nn_ops.sparse_softmax_cross_entropy_with_logits( - logits=logit, labels=target) - else: - crossent = softmax_loss_function(logits=logit, labels=target) - log_perp_list.append(crossent * weight) - log_perps = math_ops.add_n(log_perp_list) - if average_across_timesteps: - total_size = math_ops.add_n(weights) - total_size += 1e-12 # Just to avoid division by 0 for all-0 weights. - log_perps /= total_size - return log_perps + """Weighted cross-entropy loss for a sequence of logits (per example). + + Args: + logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. + targets: List of 1D batch-sized int32 Tensors of the same length as logits. + weights: List of 1D batch-sized float-Tensors of the same length as logits. + average_across_timesteps: If set, divide the returned cost by the total + label weight. + softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). + name: Optional name for this operation, default: "sequence_loss_by_example". + + Returns: + 1D batch-sized float Tensor: The log-perplexity for each sequence. + + Raises: + ValueError: If len(logits) is different from len(targets) or len(weights). + """ + if len(targets) != len(logits) or len(weights) != len(logits): + raise ValueError("Lengths of logits, weights, and targets must be the same " + "%d, %d, %d." % (len(logits), len(weights), len(targets))) + with ops.name_scope(name, "sequence_loss_by_example", + logits + targets + weights): + log_perp_list = [] + for logit, target, weight in zip(logits, targets, weights): + if softmax_loss_function is None: + # TODO(irving,ebrevdo): This reshape is needed because + # sequence_loss_by_example is called with scalars sometimes, which + # violates our general scalar strictness policy. + target = array_ops.reshape(target, [-1]) + crossent = nn_ops.sparse_softmax_cross_entropy_with_logits( + logits=logit, labels=target) + else: + crossent = softmax_loss_function(logits=logit, labels=target) + log_perp_list.append(crossent * weight) + log_perps = math_ops.add_n(log_perp_list) + if average_across_timesteps: + total_size = math_ops.add_n(weights) + total_size += 1e-12 # Just to avoid division by 0 for all-0 weights. + log_perps /= total_size + return log_perps def sequence_loss(logits, targets, weights, average_across_timesteps=True, average_across_batch=True, softmax_loss_function=None, name=None): - """Weighted cross-entropy loss for a sequence of logits, batch-collapsed. - - Args: - logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. - targets: List of 1D batch-sized int32 Tensors of the same length as logits. - weights: List of 1D batch-sized float-Tensors of the same length as logits. - average_across_timesteps: If set, divide the returned cost by the total - label weight. - average_across_batch: If set, divide the returned cost by the batch size. 
- softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch - to be used instead of the standard softmax (the default if this is None). - name: Optional name for this operation, defaults to "sequence_loss". - - Returns: - A scalar float Tensor: The average log-perplexity per symbol (weighted). - - Raises: - ValueError: If len(logits) is different from len(targets) or len(weights). - """ - with ops.name_scope(name, "sequence_loss", logits + targets + weights): - cost = math_ops.reduce_sum(sequence_loss_by_example( - logits, targets, weights, - average_across_timesteps=average_across_timesteps, - softmax_loss_function=softmax_loss_function)) - if average_across_batch: - batch_size = array_ops.shape(targets[0])[0] - return cost / math_ops.cast(batch_size, dtypes.float32) - else: - return cost + """Weighted cross-entropy loss for a sequence of logits, batch-collapsed. + + Args: + logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. + targets: List of 1D batch-sized int32 Tensors of the same length as logits. + weights: List of 1D batch-sized float-Tensors of the same length as logits. + average_across_timesteps: If set, divide the returned cost by the total + label weight. + average_across_batch: If set, divide the returned cost by the batch size. + softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). + name: Optional name for this operation, defaults to "sequence_loss". + + Returns: + A scalar float Tensor: The average log-perplexity per symbol (weighted). + + Raises: + ValueError: If len(logits) is different from len(targets) or len(weights). + """ + with ops.name_scope(name, "sequence_loss", logits + targets + weights): + cost = math_ops.reduce_sum(sequence_loss_by_example( + logits, targets, weights, + average_across_timesteps=average_across_timesteps, + softmax_loss_function=softmax_loss_function)) + if average_across_batch: + batch_size = array_ops.shape(targets[0])[0] + return cost / math_ops.cast(batch_size, dtypes.float32) + + return cost def model_with_buckets(encoder_inputs_tensor, decoder_inputs, targets, weights, buckets, seq2seq, softmax_loss_function=None, per_example_loss=False, name=None): - """Create a sequence-to-sequence model with support for bucketing. - - The seq2seq argument is a function that defines a sequence-to-sequence model, - e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) - - Args: - encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input. - decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input. - targets: A list of 1D batch-sized int32 Tensors (desired output sequence). - weights: List of 1D batch-sized float-Tensors to weight the targets. - buckets: A list of pairs of (input size, output size) for each bucket. - seq2seq: A sequence-to-sequence model function; it takes 2 input that - agree with encoder_inputs and decoder_inputs, and returns a pair - consisting of outputs and states (as, e.g., basic_rnn_seq2seq). - softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch - to be used instead of the standard softmax (the default if this is None). - per_example_loss: Boolean. If set, the returned loss will be a batch-sized - tensor of losses for each sequence in the batch. If unset, it will be - a scalar with the averaged loss from all examples. - name: Optional name for this operation, defaults to "model_with_buckets". 
- - Returns: - A tuple of the form (outputs, losses), where: - outputs: The outputs for each bucket. Its j'th element consists of a list - of 2D Tensors of shape [batch_size x num_decoder_symbols] (jth outputs). - losses: List of scalar Tensors, representing losses for each bucket, or, - if per_example_loss is set, a list of 1D batch-sized float Tensors. - - Raises: - ValueError: If length of encoder_inputsut, targets, or weights is smaller - than the largest (last) bucket. - """ - if len(targets) < buckets[-1][1]: - raise ValueError("Length of targets (%d) must be at least that of last" - "bucket (%d)." % (len(targets), buckets[-1][1])) - if len(weights) < buckets[-1][1]: - raise ValueError("Length of weights (%d) must be at least that of last" - "bucket (%d)." % (len(weights), buckets[-1][1])) - - all_inputs = [encoder_inputs_tensor] + decoder_inputs + targets + weights - losses = [] - outputs = [] - attention_weights_histories = [] - with ops.name_scope(name, "model_with_buckets", all_inputs): - for j, bucket in enumerate(buckets): - with variable_scope.variable_scope(variable_scope.get_variable_scope(), - reuse=True if j > 0 else None): - encoder_inputs = tf.split(encoder_inputs_tensor, bucket[0], 0) - encoder_inputs = [tf.squeeze(encoder_input,squeeze_dims=[0]) for encoder_input in encoder_inputs] - bucket_outputs, attention_weights_history = seq2seq(encoder_inputs[:int(bucket[0])], - decoder_inputs[:int(bucket[1])], int(bucket[0])) - #bucket_outputs[0] = tf.Print(bucket_outputs[0], [bucket_outputs[0]], message="This is a: ",summarize=30) - outputs.append(bucket_outputs) - attention_weights_histories.append(attention_weights_history) - if per_example_loss: - losses.append(sequence_loss_by_example( - outputs[-1], targets[:int(bucket[1])], weights[:int(bucket[1])], - average_across_timesteps=True, - softmax_loss_function=softmax_loss_function)) - else: - losses.append(sequence_loss( - outputs[-1], targets[:int(bucket[1])], weights[:int(bucket[1])], - average_across_timesteps=True, - softmax_loss_function=softmax_loss_function)) - #losses[0] = tf.Print(losses[0], [losses[0]], message="This is b: ",summarize=3) - - return outputs, losses, attention_weights_histories + """Create a sequence-to-sequence model with support for bucketing. + + The seq2seq argument is a function that defines a sequence-to-sequence model, + e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24)) + + Args: + encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input. + decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input. + targets: A list of 1D batch-sized int32 Tensors (desired output sequence). + weights: List of 1D batch-sized float-Tensors to weight the targets. + buckets: A list of pairs of (input size, output size) for each bucket. + seq2seq: A sequence-to-sequence model function; it takes 2 input that + agree with encoder_inputs and decoder_inputs, and returns a pair + consisting of outputs and states (as, e.g., basic_rnn_seq2seq). + softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch + to be used instead of the standard softmax (the default if this is None). + per_example_loss: Boolean. If set, the returned loss will be a batch-sized + tensor of losses for each sequence in the batch. If unset, it will be + a scalar with the averaged loss from all examples. + name: Optional name for this operation, defaults to "model_with_buckets". 
+
+ Returns:
+ A tuple of the form (outputs, losses), where:
+ outputs: The outputs for each bucket. Its j'th element consists of a list
+ of 2D Tensors of shape [batch_size x num_decoder_symbols] (jth outputs).
+ losses: List of scalar Tensors, representing losses for each bucket, or,
+ if per_example_loss is set, a list of 1D batch-sized float Tensors.
+
+ Raises:
+ ValueError: If length of encoder_inputs, targets, or weights is smaller
+ than the largest (last) bucket.
+ """
+ if len(targets) < buckets[-1][1]:
+ raise ValueError("Length of targets (%d) must be at least that of last"
+ "bucket (%d)." % (len(targets), buckets[-1][1]))
+ if len(weights) < buckets[-1][1]:
+ raise ValueError("Length of weights (%d) must be at least that of last"
+ "bucket (%d)." % (len(weights), buckets[-1][1]))
+
+ all_inputs = [encoder_inputs_tensor] + decoder_inputs + targets + weights
+ losses = []
+ outputs = []
+ attention_weights_histories = []
+ with ops.name_scope(name, "model_with_buckets", all_inputs):
+ for j, bucket in enumerate(buckets):
+ with variable_scope.variable_scope(variable_scope.get_variable_scope(),
+ reuse=True if j > 0 else None):
+ encoder_inputs = tf.split(encoder_inputs_tensor, bucket[0], 0)
+ encoder_inputs = [tf.squeeze(encoder_input, squeeze_dims=[0]) for encoder_input in encoder_inputs]
+ bucket_outputs, attention_weights_history = seq2seq(encoder_inputs[:int(bucket[0])],
+ decoder_inputs[:int(bucket[1])], int(bucket[0]))
+ # bucket_outputs[0] = tf.Print(bucket_outputs[0], [bucket_outputs[0]], message="This is a: ",summarize=30)
+ outputs.append(bucket_outputs)
+ attention_weights_histories.append(attention_weights_history)
+ if per_example_loss:
+ losses.append(sequence_loss_by_example(
+ outputs[-1], targets[:int(bucket[1])], weights[:int(bucket[1])],
+ average_across_timesteps=True,
+ softmax_loss_function=softmax_loss_function))
+ else:
+ losses.append(sequence_loss(
+ outputs[-1], targets[:int(bucket[1])], weights[:int(bucket[1])],
+ average_across_timesteps=True,
+ softmax_loss_function=softmax_loss_function))
+ # losses[0] = tf.Print(losses[0], [losses[0]], message="This is b: ",summarize=3)
+
+ return outputs, losses, attention_weights_histories
diff --git a/src/model/seq2seq_model.py b/src/model/seq2seq_model.py
index 97df3eac..4c100b11 100644
--- a/src/model/seq2seq_model.py
+++ b/src/model/seq2seq_model.py
@@ -19,20 +19,15 @@
 from __future__ import division
 from __future__ import print_function
 
-import random
-
-import numpy as np
 from six.moves import xrange # pylint: disable=redefined-builtin
 
 import tensorflow as tf
-#from tensorflow.models.rnn.translate import data_utils
-#from tensorflow.nn import rnn, rnn_cell
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variable_scope
 
 from .seq2seq import model_with_buckets
 from .seq2seq import embedding_attention_decoder
+
 class Seq2SeqModel(object):
 """Sequence-to-sequence model with attention and for multiple buckets.
This class implements a multi-layer recurrent neural network as encoder, @@ -47,16 +42,16 @@ class Seq2SeqModel(object): http://arxiv.org/abs/1412.2007 """ - def __init__(self, encoder_masks, encoder_inputs_tensor, - decoder_inputs, - target_weights, - target_vocab_size, - buckets, - target_embedding_size, - attn_num_layers, - attn_num_hidden, - forward_only, - use_gru): + def __init__(self, encoder_masks, encoder_inputs_tensor, + decoder_inputs, + target_weights, + target_vocab_size, + buckets, + target_embedding_size, + attn_num_layers, + attn_num_hidden, + forward_only, + use_gru): """Create the model. Args: @@ -101,41 +96,42 @@ def seq2seq_f(lstm_inputs, decoder_inputs, seq_length, do_decode): # Backward direction cell lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden, forget_bias=0.0, state_is_tuple=False) - pre_encoder_inputs, output_state_fw, output_state_bw = tf.contrib.rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, lstm_inputs, + pre_encoder_inputs, output_state_fw, output_state_bw = tf.contrib.rnn.static_bidirectional_rnn( + lstm_fw_cell, lstm_bw_cell, lstm_inputs, initial_state_fw=None, initial_state_bw=None, dtype=tf.float32, sequence_length=None, scope=None) - encoder_inputs = [e*f for e,f in zip(pre_encoder_inputs,encoder_masks[:seq_length])] + encoder_inputs = [e*f for e, f in zip(pre_encoder_inputs, encoder_masks[:seq_length])] top_states = [array_ops.reshape(e, [-1, 1, num_hidden*2]) - for e in encoder_inputs] + for e in encoder_inputs] attention_states = array_ops.concat(top_states, 1) initial_state = tf.concat(axis=1, values=[output_state_fw, output_state_bw]) outputs, _, attention_weights_history = embedding_attention_decoder( - decoder_inputs, initial_state, attention_states, cell, - num_symbols=target_vocab_size, - embedding_size=target_embedding_size, - num_heads=1, - output_size=target_vocab_size, - output_projection=None, - feed_previous=do_decode, - initial_state_attention=False, - attn_num_hidden = attn_num_hidden) + decoder_inputs, initial_state, attention_states, cell, + num_symbols=target_vocab_size, + embedding_size=target_embedding_size, + num_heads=1, + output_size=target_vocab_size, + output_projection=None, + feed_previous=do_decode, + initial_state_attention=False, + attn_num_hidden=attn_num_hidden) return outputs, attention_weights_history # Our targets are decoder inputs shifted by one. targets = [decoder_inputs[i + 1] - for i in xrange(len(decoder_inputs) - 1)] + for i in xrange(len(decoder_inputs) - 1)] - softmax_loss_function = None # default to tf.nn.sparse_softmax_cross_entropy_with_logits + softmax_loss_function = None # default to tf.nn.sparse_softmax_cross_entropy_with_logits # Training outputs and losses. 
if forward_only: self.outputs, self.losses, self.attention_weights_histories = model_with_buckets( - encoder_inputs_tensor, decoder_inputs, targets, - self.target_weights, buckets, lambda x, y, z: seq2seq_f(x, y, z, True), - softmax_loss_function=softmax_loss_function) + encoder_inputs_tensor, decoder_inputs, targets, + self.target_weights, buckets, lambda x, y, z: seq2seq_f(x, y, z, True), + softmax_loss_function=softmax_loss_function) else: self.outputs, self.losses, self.attention_weights_histories = model_with_buckets( - encoder_inputs_tensor, decoder_inputs, targets, - self.target_weights, buckets, lambda x, y, z: seq2seq_f(x, y, z, False), - softmax_loss_function=softmax_loss_function) + encoder_inputs_tensor, decoder_inputs, targets, + self.target_weights, buckets, lambda x, y, z: seq2seq_f(x, y, z, False), + softmax_loss_function=softmax_loss_function) From 65d73ac0a2939058a7eaac1662ebc67848029e13 Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Fri, 21 Jul 2017 16:47:36 +0200 Subject: [PATCH 07/12] Rename the module and fix the model training --- .gitignore | 1 + aocr/__init__.py | 1 + aocr/defaults.py | 35 ++ aocr/launcher.py | 175 ++++++++ aocr/model/__init__.py | 0 {src => aocr}/model/cnn.py | 19 +- aocr/model/model.py | 452 ++++++++++++++++++++ {src => aocr}/model/seq2seq.py | 0 {src => aocr}/model/seq2seq_model.py | 0 aocr/util/__init__.py | 0 {src/data_util => aocr/util}/bucketdata.py | 2 - {src/data_util => aocr/util}/data_gen.py | 39 +- setup.py | 18 +- src/__init__.py | 1 - src/data_util/__init__.py | 1 - src/exp_config.py | 38 -- src/launcher.py | 147 ------- src/model/__init__.py | 1 - src/model/model.py | 458 --------------------- 19 files changed, 704 insertions(+), 684 deletions(-) create mode 100644 aocr/__init__.py create mode 100644 aocr/defaults.py create mode 100644 aocr/launcher.py create mode 100644 aocr/model/__init__.py rename {src => aocr}/model/cnn.py (94%) create mode 100644 aocr/model/model.py rename {src => aocr}/model/seq2seq.py (100%) rename {src => aocr}/model/seq2seq_model.py (100%) create mode 100644 aocr/util/__init__.py rename {src/data_util => aocr/util}/bucketdata.py (99%) rename {src/data_util => aocr/util}/data_gen.py (85%) delete mode 100644 src/__init__.py delete mode 100644 src/data_util/__init__.py delete mode 100644 src/exp_config.py delete mode 100644 src/launcher.py delete mode 100644 src/model/__init__.py delete mode 100644 src/model/model.py diff --git a/.gitignore b/.gitignore index bdeec827..73d475de 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ datasets/ checkpoints/ models/ +results/ ### Python template # Byte-compiled / optimized / DLL files diff --git a/aocr/__init__.py b/aocr/__init__.py new file mode 100644 index 00000000..d619a01c --- /dev/null +++ b/aocr/__init__.py @@ -0,0 +1 @@ +__author__ = 'emedvedev' diff --git a/aocr/defaults.py b/aocr/defaults.py new file mode 100644 index 00000000..a40cf719 --- /dev/null +++ b/aocr/defaults.py @@ -0,0 +1,35 @@ +""" +Default parameters +""" + + +class Config: + + GPU_ID = 0 + VISUALIZE = False + + # I/O + DATA_PATH = 'data.tfrecords' + MODEL_DIR = 'models' + LOG_PATH = 'attentionocr.log' + OUTPUT_DIR = 'results' + STEPS_PER_CHECKPOINT = 500 + EXPORT_FORMAT = 'savedmodel' + EXPORT_PATH = 'exported' + + # Optimization + NUM_EPOCH = 1000 + BATCH_SIZE = 45 + INITIAL_LEARNING_RATE = 1.0 + + # Network parameters + CLIP_GRADIENTS = True # whether to perform gradient clipping + MAX_GRADIENT_NORM = 5.0 # Clip gradients to this norm + TARGET_EMBEDDING_SIZE = 10 # embedding 
dimension for each target + ATTN_USE_LSTM = True # whether or not use LSTM attention decoder cell + ATTN_NUM_HIDDEN = 128 # number of hidden units in attention decoder cell + ATTN_NUM_LAYERS = 2 # number of layers in attention decoder cell + # (Encoder number of hidden units will be ATTN_NUM_HIDDEN*ATTN_NUM_LAYERS) + LOAD_MODEL = True + OLD_MODEL_VERSION = False + TARGET_VOCAB_SIZE = 26+10+3 # 0: PADDING, 1: GO, 2: EOS, >2: 0-9, a-z diff --git a/aocr/launcher.py b/aocr/launcher.py new file mode 100644 index 00000000..85346b29 --- /dev/null +++ b/aocr/launcher.py @@ -0,0 +1,175 @@ +import sys +import argparse +import logging + +import tensorflow as tf + +from .model.model import Model +from .defaults import Config + +tf.logging.set_verbosity(tf.logging.ERROR) + + +def process_args(args, defaults): + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(help='Subcommands.') + + # Global arguments + parser.add_argument('--log-path', dest="log_path", + type=str, default=defaults.LOG_PATH, + help=('Log file path, default=%s' + % (defaults.LOG_PATH))) + parser.set_defaults(visualize=defaults.VISUALIZE) + parser.set_defaults(load_model=defaults.LOAD_MODEL) + + # Dataset generation + parser_dataset = subparsers.add_parser('dataset', help='Create a dataset in the .tfrecords format for training or testing.') + + # Training + parser_train = subparsers.add_parser('train', help='Train the model and save checkpoints.') + parser_train.set_defaults(phase='train') + parser_train.add_argument('dataset', + type=str, default=defaults.DATA_PATH, + help=('Path of the .tfrecords file containing the image/label pairs' + ', default=%s' + % (defaults.DATA_PATH))) + parser_train.add_argument('--no-resume', dest='load_model', action='store_false', + help=('Create an empty model even if checkpoints already exist.' 
+ ', default=%s' % (defaults.LOAD_MODEL)))
+
+ # Testing
+ parser_test = subparsers.add_parser('test', help='Test the saved model.')
+ parser_test.set_defaults(phase='test')
+ parser_test.add_argument('dataset',
+ type=str, default=defaults.DATA_PATH,
+ help=('Path of the .tfrecords file containing the image/label pairs'
+ ', default=%s'
+ % (defaults.DATA_PATH)))
+ parser_test.add_argument('--visualize', dest='visualize', action='store_true',
+ help=('Visualize attentions'
+ ', default=%s' % (defaults.VISUALIZE)))
+
+ # Exporting
+ parser_export = subparsers.add_parser('export', help='Export the saved checkpoints for production.')
+ parser_export.add_argument('export_path', metavar='path',
+ type=str, default=defaults.EXPORT_PATH,
+ help=('Path to export the model in the specified format,'
+ ' default=%s'
+ % (defaults.EXPORT_PATH)))
+ parser_export.add_argument('--format', dest="format",
+ type=str, default=defaults.EXPORT_FORMAT,
+ choices=['frozengraph', 'savedmodel'],
+ help=('Export format for the model: either '
+ 'a frozen GraphDef or a SavedModel '
+ '(default=%s)'
+ % (defaults.EXPORT_FORMAT)))
+
+
+
+
+ parser.add_argument('--gpu-id', dest="gpu_id",
+ type=int, default=defaults.GPU_ID)
+
+ parser.add_argument('--use-gru', dest='use_gru', action='store_true')
+
+ parser.add_argument('--batch-size', dest="batch_size",
+ type=int, default=defaults.BATCH_SIZE,
+ help=('Batch size, default = %s'
+ % (defaults.BATCH_SIZE)))
+ parser.add_argument('--initial-learning-rate', dest="initial_learning_rate",
+ type=float, default=defaults.INITIAL_LEARNING_RATE,
+ help=('Initial learning rate, default = %s'
+ % (defaults.INITIAL_LEARNING_RATE)))
+ parser.add_argument('--num-epoch', dest="num_epoch",
+ type=int, default=defaults.NUM_EPOCH,
+ help=('Number of epochs, default = %s'
+ % (defaults.NUM_EPOCH)))
+ parser.add_argument('--steps-per-checkpoint', dest="steps_per_checkpoint",
+ type=int, default=defaults.STEPS_PER_CHECKPOINT,
+ help=('Checkpointing (print perplexity, save model) per'
+ ' how many steps, default = %s'
+ % (defaults.STEPS_PER_CHECKPOINT)))
+ parser.add_argument('--target-vocab-size', dest="target_vocab_size",
+ type=int, default=defaults.TARGET_VOCAB_SIZE,
+ help=('Target vocabulary size, default=%s'
+ % (defaults.TARGET_VOCAB_SIZE)))
+ parser.add_argument('--model-dir', dest="model_dir",
+ type=str, default=defaults.MODEL_DIR,
+ help=('The directory for saving and loading model'
+ ', default=%s' % (defaults.MODEL_DIR)))
+ parser.add_argument('--target-embedding-size', dest="target_embedding_size",
+ type=int, default=defaults.TARGET_EMBEDDING_SIZE,
+ help=('Embedding dimension for each target, default=%s'
+ % (defaults.TARGET_EMBEDDING_SIZE)))
+ parser.add_argument('--attn-num-hidden', dest="attn_num_hidden",
+ type=int, default=defaults.ATTN_NUM_HIDDEN,
+ help=('number of hidden units in attention decoder cell'
+ ', default=%s'
+ % (defaults.ATTN_NUM_HIDDEN)))
+ parser.add_argument('--attn-num-layers', dest="attn_num_layers",
+ type=int, default=defaults.ATTN_NUM_LAYERS,
+ help=('number of hidden layers in attention decoder cell'
+ ', default=%s'
+ % (defaults.ATTN_NUM_LAYERS)))
+ parser.add_argument('--output-dir', dest="output_dir",
+ type=str, default=defaults.OUTPUT_DIR,
+ help=('Output directory, default=%s'
+ % (defaults.OUTPUT_DIR)))
+ parser.add_argument('--max_gradient_norm', dest="max_gradient_norm",
+ type=float, default=defaults.MAX_GRADIENT_NORM,
+ help=('Clip gradients to this norm.'
+ ', default=%s' + % (defaults.MAX_GRADIENT_NORM))) + parser.add_argument('--no-gradient_clipping', dest='clip_gradients', action='store_false', + help=('Do not perform gradient clipping, default for clip_gradients is %s' % + (defaults.CLIP_GRADIENTS))) + parser.set_defaults(clip_gradients=defaults.CLIP_GRADIENTS) + + parameters = parser.parse_args(args) + return parameters + + +def main(args): + parameters = process_args(args, Config) + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s', + filename=parameters.log_path) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s') + console.setFormatter(formatter) + logging.getLogger('').addHandler(console) + + with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: + model = Model( + phase=parameters.phase, + visualize=parameters.visualize, + data_path=parameters.dataset, + output_dir=parameters.output_dir, + batch_size=parameters.batch_size, + initial_learning_rate=parameters.initial_learning_rate, + num_epoch=parameters.num_epoch, + steps_per_checkpoint=parameters.steps_per_checkpoint, + target_vocab_size=parameters.target_vocab_size, + model_dir=parameters.model_dir, + target_embedding_size=parameters.target_embedding_size, + attn_num_hidden=parameters.attn_num_hidden, + attn_num_layers=parameters.attn_num_layers, + clip_gradients=parameters.clip_gradients, + max_gradient_norm=parameters.max_gradient_norm, + session=sess, + load_model=parameters.load_model, + gpu_id=parameters.gpu_id, + use_gru=parameters.use_gru, + ) + if parameters.phase == 'train': + model.train() + elif parameters.phase == 'test': + model.test() + else: + raise NotImplementedError + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/aocr/model/__init__.py b/aocr/model/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/model/cnn.py b/aocr/model/cnn.py similarity index 94% rename from src/model/cnn.py rename to aocr/model/cnn.py index aef80b55..16b4e9cd 100644 --- a/src/model/cnn.py +++ b/aocr/model/cnn.py @@ -1,11 +1,4 @@ -__author__ = 'moonkey' - -#from keras import models, layers -import logging import numpy as np -# from src.data_util.synth_prepare import SynthGen - -#import keras.backend as K import tensorflow as tf @@ -24,6 +17,7 @@ def var_random(name, shape, regularizable=False): tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.nn.l2_loss(v)) return v + def max_2x2pool(incoming, name): ''' max pooling on 2 dims. @@ -34,6 +28,7 @@ def max_2x2pool(incoming, name): with tf.variable_scope(name): return tf.nn.max_pool(incoming, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1), padding='VALID') + def max_2x1pool(incoming, name): ''' max pooling only on image width @@ -44,6 +39,7 @@ def max_2x1pool(incoming, name): with tf.variable_scope(name): return tf.nn.max_pool(incoming, ksize=(1, 2, 1, 1), strides=(1, 2, 1, 1), padding='VALID') + def ConvRelu(incoming, num_filters, filter_size, name): ''' Add a convolution layer followed by a Relu layer. 
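
For reference, a minimal sketch of how the subcommand layout in the launcher above resolves a training invocation, assuming the package is importable as aocr; the dataset path and batch size are hypothetical values. Note that options registered on the top-level parser have to precede the subcommand name, because argparse hands everything after the subcommand to the train/test subparser.

# Hypothetical example of resolving a command line with the parser built above.
from aocr.defaults import Config
from aocr.launcher import process_args

params = process_args(['--batch-size', '32', 'train', 'datasets/training.tfrecords'], Config)
assert params.phase == 'train'
assert params.dataset == 'datasets/training.tfrecords'
assert params.batch_size == 32
assert params.load_model            # True unless the train-only --no-resume flag is passed
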
@@ -87,15 +83,17 @@ def ConvReluBN(incoming, num_filters, filter_size, name, is_training, padding_ty with tf.variable_scope(name): conv_W = var_random('W', tuple(filter_size) + (num_filters_from, num_filters), regularizable=True) - after_conv = tf.nn.conv2d(incoming, conv_W, strides=(1,1,1,1), padding=padding_type) + after_conv = tf.nn.conv2d(incoming, conv_W, strides=(1, 1, 1, 1), padding=padding_type) after_bn = batch_norm(after_conv, is_training) return tf.nn.relu(after_bn) + def dropout(incoming, is_training, keep_prob=0.5): return tf.contrib.layers.dropout(incoming, keep_prob=keep_prob, is_training=is_training) + def tf_create_attention_map(incoming): ''' flatten hight and width into one dimention of size attn_length @@ -107,6 +105,7 @@ def tf_create_attention_map(incoming): print(shape) return tf.reshape(incoming, (-1, np.prod(shape[1:3]), shape[3])) + class CNN(object): """ Usage for tf tensor output: @@ -146,7 +145,7 @@ def _build_network(self, input_tensor, is_training): print('CNN outdim before squeeze: {}'.format(net.get_shape())) # 1x32x100 -> 24x512 - net = tf.squeeze(net,axis=1) + net = tf.squeeze(net, axis=1) print('CNN outdim: {}'.format(net.get_shape())) self.model = net @@ -160,5 +159,3 @@ def __call__(self, input_tensor): ''' def save(self): pass - - diff --git a/aocr/model/model.py b/aocr/model/model.py new file mode 100644 index 00000000..f0446acf --- /dev/null +++ b/aocr/model/model.py @@ -0,0 +1,452 @@ +"""Visual Attention Based OCR Model.""" + +from __future__ import absolute_import +from __future__ import division + +import time +import os +import math +import logging + +import distance +import numpy as np +import tensorflow as tf + +from PIL import Image +from six.moves import xrange # pylint: disable=redefined-builtin + +from .cnn import CNN +from .seq2seq_model import Seq2SeqModel +from ..util.data_gen import DataGen + +tf.reset_default_graph() + + +class Model(object): + SYMBOLS = ' 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' + + def __init__(self, + phase, + visualize, + data_path, + output_dir, + batch_size, + initial_learning_rate, + num_epoch, + steps_per_checkpoint, + target_vocab_size, + model_dir, + target_embedding_size, + attn_num_hidden, + attn_num_layers, + clip_gradients, + max_gradient_norm, + session, + load_model, + gpu_id, + use_gru, + evaluate=False, + valid_target_length=float('inf'), + reg_val=0): + + gpu_device_id = '/gpu:' + str(gpu_id) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + logging.info('loading data') + # load data + if phase == 'train': + self.s_gen = DataGen( + data_path, valid_target_len=valid_target_length, evaluate=False, + epochs=num_epoch) + else: + batch_size = 1 + self.s_gen = DataGen( + data_path, evaluate=True) + + logging.info('phase: %s' % phase) + logging.info('model_dir: %s' % (model_dir)) + logging.info('load_model: %s' % (load_model)) + logging.info('output_dir: %s' % (output_dir)) + logging.info('steps_per_checkpoint: %d' % (steps_per_checkpoint)) + logging.info('batch_size: %d' % (batch_size)) + logging.info('num_epoch: %d' % num_epoch) + logging.info('learning_rate: %d' % initial_learning_rate) + logging.info('reg_val: %d' % (reg_val)) + logging.info('max_gradient_norm: %f' % max_gradient_norm) + logging.info('clip_gradients: %s' % clip_gradients) + logging.info('valid_target_length %f' % valid_target_length) + logging.info('target_vocab_size: %d' % target_vocab_size) + logging.info('target_embedding_size: %f' % target_embedding_size) + logging.info('attn_num_hidden: %d' % attn_num_hidden) + 
logging.info('attn_num_layers: %d' % attn_num_layers) + logging.info('visualize: %s' % visualize) + + buckets = self.s_gen.bucket_specs + logging.info('buckets') + logging.info(buckets) + if use_gru: + logging.info('using GRU in the decoder.') + + # TODO: rename answer to label + + # variables + + self.zero_paddings = tf.placeholder(tf.float32, shape=(None, None, 512), name='zero_paddings') + + self.decoder_inputs = [] + self.encoder_masks = [] + self.target_weights = [] + for i in xrange(int(buckets[-1][0] + 1)): + self.encoder_masks.append(tf.placeholder(tf.float32, shape=[None, 1], + name="encoder_mask{0}".format(i))) + for i in xrange(buckets[-1][1] + 1): + self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], + name="decoder{0}".format(i))) + self.target_weights.append(tf.placeholder(tf.float32, shape=[None], + name="weight{0}".format(i))) + + + self.reg_val = reg_val + self.sess = session + self.evaluate = evaluate + self.steps_per_checkpoint = steps_per_checkpoint + self.model_dir = model_dir + self.output_dir = output_dir + self.buckets = buckets + self.batch_size = batch_size + self.num_epoch = num_epoch + self.global_step = tf.Variable(0, trainable=False) + self.valid_target_length = valid_target_length + self.phase = phase + self.visualize = visualize + self.learning_rate = initial_learning_rate + self.clip_gradients = clip_gradients + + if phase == 'train': + self.forward_only = False + elif phase == 'test': + self.forward_only = True + else: + assert False, phase + + + # TODO: [32, 85] -- proportional resizing + # TODO: one or many images + + # self.img_pl = tf.placeholder(tf.string, shape=None, name='input_image_as_bytes') + # self.imgs_pl = tf.expand_dims(self.img_pl, 0, name='input_images_as_bytes') + + self.img_pl = tf.placeholder(tf.string, name='input_image_as_bytes') + + self.img_data = tf.cond( + tf.less(tf.rank(self.img_pl), 1), + lambda: tf.expand_dims(self.img_pl, 0), + lambda: self.img_pl + ) + + self.img_data = tf.map_fn(lambda x: tf.image.decode_png(x, channels=1), self.img_data, dtype=tf.uint8) + + self.dims = tf.shape(self.img_data) + height_const = tf.constant(DataGen.IMAGE_HEIGHT, dtype=tf.float32) + new_height = tf.to_int32(height_const) + new_width = tf.to_int32(tf.ceil(tf.to_float(self.dims[2]) / tf.to_float(self.dims[1]) * height_const)) + self.new_dims = [new_height, new_width] # [32, 85] # + + with tf.control_dependencies(self.new_dims), tf.device(gpu_device_id): + self.img_data = tf.image.resize_images(self.img_data, self.new_dims, method=tf.image.ResizeMethod.BICUBIC) + self.img_data = tf.transpose(self.img_data, perm=[0, 3, 1, 2]) + + # with tf.device(gpu_device_id): + cnn_model = CNN(self.img_data, True) + self.conv_output = cnn_model.tf_output() + self.concat_conv_output = tf.concat(axis=1, values=[self.conv_output, self.zero_paddings]) + self.perm_conv_output = tf.transpose(self.concat_conv_output, perm=[1, 0, 2]) + self.attention_decoder_model = Seq2SeqModel( + encoder_masks = self.encoder_masks, + encoder_inputs_tensor = self.perm_conv_output, + decoder_inputs = self.decoder_inputs, + target_weights = self.target_weights, + target_vocab_size = target_vocab_size, + buckets = buckets, + target_embedding_size = target_embedding_size, + attn_num_layers = attn_num_layers, + attn_num_hidden = attn_num_hidden, + forward_only = self.forward_only, + use_gru = use_gru) + + if not self.forward_only: # train + self.updates = [] + self.summaries_by_bucket = [] + with tf.device(gpu_device_id): + params = tf.trainable_variables() + opt = 
tf.train.AdadeltaOptimizer(learning_rate=initial_learning_rate) + for b in xrange(len(buckets)): + if self.reg_val > 0: + reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) + logging.info('Adding %s regularization losses', len(reg_losses)) + logging.debug('REGULARIZATION_LOSSES: %s', reg_losses) + loss_op = self.reg_val * tf.reduce_sum(reg_losses) + self.attention_decoder_model.losses[b] + else: + loss_op = self.attention_decoder_model.losses[b] + + gradients, params = zip(*opt.compute_gradients(loss_op, params)) + if self.clip_gradients: + gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm) + # Add summaries for loss, variables, gradients, gradient norms and total gradient norm. + summaries = [] + summaries.append(tf.summary.scalar("loss", loss_op)) + summaries.append(tf.summary.scalar("total_gradient_norm", tf.global_norm(gradients))) + all_summaries = tf.summary.merge(summaries) + self.summaries_by_bucket.append(all_summaries) + # update op - apply gradients + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + self.updates.append(opt.apply_gradients(zip(gradients, params), global_step=self.global_step)) + + table = tf.contrib.lookup.MutableHashTable( + key_dtype=tf.int64, + value_dtype=tf.string, + default_value="", + checkpoint=True, + ) + + insert = table.insert( + tf.constant([i for i in xrange(len(self.SYMBOLS))], dtype=tf.int64), + tf.constant(list(self.SYMBOLS)), + ) + + with tf.control_dependencies([insert]): + + output_num = [] + output_feed = [] + + for b in xrange(len(buckets)): + + for l in xrange(len(self.attention_decoder_model.outputs[b])): + guess = tf.argmax(self.attention_decoder_model.outputs[b][l], axis=1) + output_num.append(guess) + output_feed.append(table.lookup(guess)) + + tf.concat(output_num, 0) + self.arr_prediction = tf.foldl(lambda a, x: a + x, output_feed) + self.prediction = tf.gather(self.arr_prediction, 0, name='prediction') + + self.saver_all = tf.train.Saver(tf.all_variables()) + + ckpt = tf.train.get_checkpoint_state(model_dir) + if ckpt and load_model: + logging.info("Reading model parameters from %s" % ckpt.model_checkpoint_path) + self.saver_all.restore(self.sess, ckpt.model_checkpoint_path) + else: + logging.info("Created model with fresh parameters.") + self.sess.run(tf.initialize_all_variables()) + + def test(self): + step_time = 0.0 + loss = 0.0 + current_step = 0 + num_correct = 0 + num_total = 0 + + for batch in self.s_gen.gen(self.batch_size): + # Get a batch and make a step. 
+ start_time = time.time() + result = self.step(batch, self.forward_only) + loss += result['loss'] / self.steps_per_checkpoint + grounds = [a for a in np.array([decoder_input.tolist() for decoder_input in batch['decoder_inputs']]).transpose()] + step_outputs = [b for b in np.array([np.argmax(logit, axis=1).tolist() for logit in result['logits']]).transpose()] + curr_step_time = (time.time() - start_time) + + logging.info('step_time: %f, loss: %f, step perplexity: %f'%(curr_step_time, result['loss'], math.exp(result['loss']) if result['loss'] < 300 else float('inf'))) + + if self.visualize: + step_attns = np.array([[a.tolist() for a in step_attn] for step_attn in result['attentions']]).transpose([1, 0, 2]) + + for idx, output, ground in zip(range(len(grounds)), step_outputs, grounds): + flag_ground, flag_out = True, True + num_total += 1 + output_valid = [] + ground_valid = [] + for j in range(1, len(ground)): + s1 = output[j-1] + s2 = ground[j] + if s2 != 2 and flag_ground: + ground_valid.append(s2) + else: + flag_ground = False + if s1 != 2 and flag_out: + output_valid.append(s1) + else: + flag_out = False + num_incorrect = distance.levenshtein(output_valid, ground_valid) + num_incorrect = float(num_incorrect) / len(ground_valid) + num_incorrect = min(1.0, num_incorrect) + num_correct += 1. - num_incorrect + + if self.visualize: + self.visualize_attention(batch['file_list'][idx], step_attns[idx], output_valid, ground_valid, num_incorrect>0, batch['real_len']) + + precision = num_correct / self.batch_size + logging.info('step %f - time: %f, loss: %f, perplexity: %f, precision: %f, batch_len: %f' + % (current_step, curr_step_time, result['loss'], math.exp(result['loss']) if result['loss'] < 300 else float('inf'), precision, batch['real_len'])) + current_step += 1 + + def train(self): + step_time = 0.0 + loss = 0.0 + current_step = 0 + writer = tf.summary.FileWriter(self.model_dir, self.sess.graph) + + logging.info('Starting the training process.') + for batch in self.s_gen.gen(self.batch_size): + start_time = time.time() + result = self.step(batch, self.forward_only) + loss += result['loss'] / self.steps_per_checkpoint + grounds = [a for a in np.array([decoder_input.tolist() for decoder_input in batch['decoder_inputs']]).transpose()] + step_outputs = [b for b in np.array([np.argmax(logit, axis=1).tolist() for logit in result['logits']]).transpose()] + curr_step_time = (time.time() - start_time) + step_time += curr_step_time / self.steps_per_checkpoint + + num_correct = 0 + for output, ground in zip(step_outputs, grounds): + flag_ground, flag_out = True, True + output_valid = [] + ground_valid = [] + for j in range(1, len(ground)): + s1 = output[j - 1] + s2 = ground[j] + if s2 != 2 and flag_ground: + ground_valid.append(s2) + else: + flag_ground = False + if s1 != 2 and flag_out: + output_valid.append(s1) + else: + flag_out = False + num_incorrect = distance.levenshtein(output_valid, ground_valid) + num_incorrect = float(num_incorrect) / len(ground_valid) + num_incorrect = min(1.0, num_incorrect) + num_correct += 1. - num_incorrect + + writer.add_summary(result['gradients'], current_step) + + precision = num_correct / self.batch_size + logging.info('step %f - time: %f, loss: %f, perplexity: %f, precision: %f, batch_len: %f' + % (current_step, curr_step_time, result['loss'], math.exp(result['loss']) if result['loss'] < 300 else float('inf'), precision, batch['real_len'])) + current_step += 1 + + # Once in a while, we save checkpoint, print statistics, and run evals. 
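
In both train() and test() above, a prediction is scored by cutting the output and the ground truth at the first EOS (id 2), taking the Levenshtein distance between them, and normalizing by the ground-truth length, so a partially correct word still earns partial credit. A standalone sketch of that scoring, with made-up id sequences:

# Standalone sketch of the per-sample scoring used above; the sequences are made up.
import distance

ground = [1, 20, 17, 24, 24, 27, 2]    # GO 'hello' EOS, in the data generator's id scheme
output = [20, 13, 24, 24, 27, 2, 2]    # predicted ids for 'hallo', then EOS

ground_valid = ground[1:ground.index(2)]     # [20, 17, 24, 24, 27]
output_valid = output[:output.index(2)]      # [20, 13, 24, 24, 27]

num_incorrect = min(1.0, float(distance.levenshtein(output_valid, ground_valid)) / len(ground_valid))
print(1.0 - num_incorrect)                   # 0.8: one substitution out of five symbols
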
+ if current_step % self.steps_per_checkpoint == 0: + # Print statistics for the previous epoch. + perplexity = math.exp(loss) if loss < 300 else float('inf') + logging.info("global step %d step-time %.2f loss %f perplexity " + "%.2f" % (self.global_step.eval(), step_time, loss, perplexity)) + # Save checkpoint and reset timer and loss. + checkpoint_path = os.path.join(self.model_dir, "model.ckpt") + logging.info("Saving model, current_step: %d"%current_step) + self.saver_all.save(self.sess, checkpoint_path, global_step=self.global_step) + step_time, loss = 0.0, 0.0 + + def to_savedmodel(self): + raise NotImplementedError + + def to_frozengraph(self): + raise NotImplementedError + + # step, read one batch, generate gradients + def step(self, batch, forward_only): + bucket_id = batch['bucket_id'] + img_data = batch['data'] + zero_paddings = batch['zero_paddings'] + decoder_inputs = batch['decoder_inputs'] + target_weights = batch['target_weights'] + encoder_masks = batch['encoder_mask'] + # Check if the sizes match. + encoder_size, decoder_size = self.buckets[bucket_id] + if len(decoder_inputs) != decoder_size: + raise ValueError("Decoder length must be equal to the one in bucket," + " %d != %d." % (len(decoder_inputs), decoder_size)) + if len(target_weights) != decoder_size: + raise ValueError("Weights length must be equal to the one in bucket," + " %d != %d." % (len(target_weights), decoder_size)) + + # Input feed: encoder inputs, decoder inputs, target_weights, as provided. + input_feed = {} + input_feed[self.img_pl.name] = img_data + input_feed[self.zero_paddings.name] = zero_paddings + for l in xrange(decoder_size): + input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] + input_feed[self.target_weights[l].name] = target_weights[l] + for l in xrange(int(encoder_size)): + try: + input_feed[self.encoder_masks[l].name] = encoder_masks[l] + except: + pass + + # Since our targets are decoder inputs shifted by one, we need one more. + last_target = self.decoder_inputs[decoder_size].name + input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) + + # TODO: merging into one op + + # Output feed: depends on whether we do a backward step or not. + output_feed = [self.attention_decoder_model.losses[bucket_id]] # Loss for this batch. + for l in xrange(decoder_size): # Output logits. 
+ output_feed.append(self.attention_decoder_model.outputs[bucket_id][l]) + + if not forward_only: # train + output_feed += [self.summaries_by_bucket[bucket_id], + self.updates[bucket_id]] + elif self.visualize: # test and visualize + output_feed += self.attention_decoder_model.attention_weights_histories[bucket_id] + + outputs = self.sess.run(output_feed, input_feed) + + res = { + 'loss': outputs[0], + 'logits': outputs[1:(1+decoder_size)], + } + + if not forward_only: + res['gradients'] = outputs[2+decoder_size] + elif self.visualize: + res['attentions'] = outputs[(2+decoder_size):] + + return res + + def visualize_attention(self, filename, attentions, output_valid, ground_valid, flag_incorrect, real_len): + if flag_incorrect: + output_dir = os.path.join(self.output_dir, 'incorrect') + else: + output_dir = os.path.join(self.output_dir, 'correct') + output_dir = os.path.join(output_dir, filename.replace('/', '_')) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + with open(os.path.join(output_dir, 'word.txt'), 'w') as fword: + fword.write(' '.join([chr(c-13+97) if c-13+97>96 else chr(c-3+48) for c in ground_valid])+'\n') + fword.write(' '.join([chr(c-13+97) if c-13+97>96 else chr(c-3+48) for c in output_valid])) + with open(filename, 'rb') as img_file: + img = Image.open(img_file) + w, h = img.size + h = 32 + img = img.resize( + (real_len, h), + Image.ANTIALIAS) + img_data = np.asarray(img, dtype=np.uint8) + for idx in range(len(output_valid)): + output_filename = os.path.join(output_dir, 'image_%d.jpg'%(idx)) + attention = attentions[idx][:(int(real_len/4)-1)] + attention_orig = np.zeros(real_len) + for i in range(real_len): + if 0 < i/4-1 and i/4-1 < len(attention): + attention_orig[i] = attention[int(i/4)-1] + attention_orig = np.convolve(attention_orig, [0.199547,0.200226,0.200454,0.200226,0.199547], mode='same') + attention_orig = np.maximum(attention_orig, 0.3) + attention_out = np.zeros((h, real_len)) + for i in range(real_len): + attention_out[:,i] = attention_orig[i] + if len(img_data.shape) == 3: + attention_out = attention_out[:,:,np.newaxis] + img_out_data = img_data * attention_out + img_out = Image.fromarray(img_out_data.astype(np.uint8)) + img_out.save(output_filename) diff --git a/src/model/seq2seq.py b/aocr/model/seq2seq.py similarity index 100% rename from src/model/seq2seq.py rename to aocr/model/seq2seq.py diff --git a/src/model/seq2seq_model.py b/aocr/model/seq2seq_model.py similarity index 100% rename from src/model/seq2seq_model.py rename to aocr/model/seq2seq_model.py diff --git a/aocr/util/__init__.py b/aocr/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/data_util/bucketdata.py b/aocr/util/bucketdata.py similarity index 99% rename from src/data_util/bucketdata.py rename to aocr/util/bucketdata.py index d71fa803..f9a8ece6 100644 --- a/src/data_util/bucketdata.py +++ b/aocr/util/bucketdata.py @@ -1,5 +1,3 @@ -__author__ = 'moonkey' - import math import numpy as np diff --git a/src/data_util/data_gen.py b/aocr/util/data_gen.py similarity index 85% rename from src/data_util/data_gen.py rename to aocr/util/data_gen.py index f693a75d..77557572 100644 --- a/src/data_util/data_gen.py +++ b/aocr/util/data_gen.py @@ -1,5 +1,3 @@ -__author__ = 'moonkey, emedvedev' - import os import math @@ -13,20 +11,18 @@ class DataGen(object): - _GO = 1 - _EOS = 2 - - IMG_HEIGHT = 32 + GO_ID = 1 + EOS_ID = 2 + IMAGE_HEIGHT = 32 def __init__(self, - data_root, annotation_fn, + annotation_fn, evaluate=False, valid_target_len=float('inf'), 
img_width_range=(12, 320), word_len=30, epochs=1000): """ - :param data_root: :param annotation_fn: :param lexicon_fn: :param valid_target_len: @@ -35,16 +31,14 @@ def __init__(self, :param epochs: :return: """ - self.data_root = data_root self.epochs = epochs - self.image_height = self.IMG_HEIGHT self.valid_target_len = valid_target_len self.bucket_min_width, self.bucket_max_width = img_width_range if os.path.exists(annotation_fn): self.annotation_path = annotation_fn else: - self.annotation_path = os.path.join(data_root, annotation_fn) + raise IOError("The .tfrecords file %s does not exist." % annotation_fn) if evaluate: self.bucket_specs = [(int(math.floor(64 / 4)), int(word_len + 2)), @@ -88,15 +82,16 @@ def gen(self, batch_size): while not coord.should_stop(): raw_images, raw_labels = sess.run([images, labels]) for img, lex in zip(raw_images, raw_labels): - _, word = self.read_data(None, lex) + word = self.convert_lex(lex) if valid_target_len < float('inf'): word = word[:valid_target_len + 1] img_data = Image.open(StringIO(img)) - width, _ = img_data.size + width, height = img_data.size + resized_width = math.floor(float(width) / height * self.IMAGE_HEIGHT) - b_idx = min(width, self.bucket_max_width) - bucket_size = self.bucket_data[b_idx].append(img, width, word, lex) + b_idx = min(resized_width, self.bucket_max_width) + bucket_size = self.bucket_data[b_idx].append(img, resized_width, word, lex) if bucket_size >= batch_size: bucket = self.bucket_data[b_idx].flush_out( self.bucket_specs, @@ -105,7 +100,7 @@ def gen(self, batch_size): if bucket is not None: yield bucket else: - assert False, 'no valid bucket of width %d' % width + assert False, 'no valid bucket of width %d' % resized_width finally: coord.request_stop() @@ -113,18 +108,18 @@ def gen(self, batch_size): self.clear() - def read_data(self, img, lex): + def convert_lex(self, lex): assert lex and len(lex) < self.bucket_specs[-1][1] - word = [self._GO] + word = [self.GO_ID] for char in lex: assert 96 < ord(char) < 123 or 47 < ord(char) < 58 word.append( ord(char) - 97 + 13 if ord(char) > 96 else ord(char) - 48 + 3) - word.append(self._EOS) + word.append(self.EOS_ID) word = np.array(word, dtype=np.int32) - return img, word + return word def parse_tfrecords(filename_queue): @@ -134,6 +129,6 @@ def parse_tfrecords(filename_queue): serialized_example, features={ 'image': tf.FixedLenFeature([], tf.string), - 'label': tf.FixedLenFeature([], tf.string), + 'answer': tf.FixedLenFeature([], tf.string), }) - return features['image'], features['label'] + return features['image'], features['answer'] diff --git a/setup.py b/setup.py index 81ef710d..f0ea56a4 100644 --- a/setup.py +++ b/setup.py @@ -3,14 +3,26 @@ REQUIRED_PACKAGES = ['distance', 'tensorflow', 'numpy', 'six'] + +def readme(): + with open('README.md') as file: + return file.read() + + setup( - name='attentionocr', + name='aocr', url='https://github.com/emedvedev/attention-ocr', - author_name='Ed Medvedev', + author='Ed Medvedev', + author_email='edward.medvedev@gmail.com', version='0.1', install_requires=REQUIRED_PACKAGES, packages=find_packages(), include_package_data=True, + license='MIT', description='''Optical character recognition model - for Tensorflow based on Visual Attention.''' + for Tensorflow based on Visual Attention.''', + long_description=readme(), + entry_points={ + 'console_scripts': ['aocr=aocr.launcher:main'], + } ) diff --git a/src/__init__.py b/src/__init__.py deleted file mode 100644 index bec5fe7a..00000000 --- a/src/__init__.py +++ /dev/null @@ -1 +0,0 
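
The convert_lex() change above keeps the original id scheme: 0 is padding, GO_ID is 1, EOS_ID is 2, digits occupy ids 3-12 and lowercase letters 13-38 (hence TARGET_VOCAB_SIZE = 26+10+3). A short worked example with a made-up label, using the same arithmetic as convert_lex() and the inverse mapping used when visualize_attention() writes word.txt:

# Worked example of the label id scheme; 'ab1' is a made-up label.
GO_ID, EOS_ID = 1, 2

def encode(lex):
    word = [GO_ID]
    for char in lex:
        word.append(ord(char) - 97 + 13 if ord(char) > 96 else ord(char) - 48 + 3)
    word.append(EOS_ID)
    return word

def decode(ids):
    return ''.join(chr(c - 13 + 97) if c - 13 + 97 > 96 else chr(c - 3 + 48) for c in ids)

assert encode('ab1') == [1, 13, 14, 4, 2]     # GO, 'a', 'b', '1', EOS
assert decode([13, 14, 4]) == 'ab1'
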
@@ -__author__ = 'moonkey' diff --git a/src/data_util/__init__.py b/src/data_util/__init__.py deleted file mode 100644 index bec5fe7a..00000000 --- a/src/data_util/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__author__ = 'moonkey' diff --git a/src/exp_config.py b/src/exp_config.py deleted file mode 100644 index ac73b304..00000000 --- a/src/exp_config.py +++ /dev/null @@ -1,38 +0,0 @@ -import platform - -""" -Default paramters for experiemnt -""" - - -class ExpConfig: - - GPU_ID = 0 - # phase - PHASE = 'test' - VISUALIZE = True - - # input and output - DATA_BASE_DIR = '/mnt/90kDICT32px' - DATA_PATH = '/mnt/train_shuffled_words.txt' # path containing data file names and labels. Format: - MODEL_DIR = 'train' # the directory for saving and loading model parameters (structure is not stored) - LOG_PATH = 'log.txt' - OUTPUT_DIR = 'results' # output directory - STEPS_PER_CHECKPOINT = 500 # checkpointing (print perplexity, save model) per how many steps - - # Optimization - NUM_EPOCH = 1000 - BATCH_SIZE = 64 - INITIAL_LEARNING_RATE = 1.0 # initial learning rate, note the we use AdaDelta, so the initial value doe not matter much - - # Network parameters - CLIP_GRADIENTS = True # whether to perform gradient clipping - MAX_GRADIENT_NORM = 5.0 # Clip gradients to this norm - TARGET_EMBEDDING_SIZE = 10 # embedding dimension for each target - ATTN_USE_LSTM = True # whether or not use LSTM attention decoder cell - ATTN_NUM_HIDDEN=128 # number of hidden units in attention decoder cell - ATTN_NUM_LAYERS = 2 # number of layers in attention decoder cell - # (Encoder number of hidden units will be ATTN_NUM_HIDDEN*ATTN_NUM_LAYERS) - LOAD_MODEL = False - OLD_MODEL_VERSION = False - TARGET_VOCAB_SIZE = 26+10+3 # 0: PADDING, 1: GO, 2: EOS, >2: 0-9, a-z diff --git a/src/launcher.py b/src/launcher.py deleted file mode 100644 index f3edcd53..00000000 --- a/src/launcher.py +++ /dev/null @@ -1,147 +0,0 @@ -__author__ = 'moonkey' - -import sys, argparse, logging - -import numpy as np -from PIL import Image -import tensorflow as tf -tf.logging.set_verbosity(tf.logging.ERROR) - - -from model.model import Model -import exp_config - -def process_args(args, defaults): - parser = argparse.ArgumentParser() - - parser.add_argument('--gpu-id', dest="gpu_id", - type=int, default=defaults.GPU_ID) - - parser.add_argument('--use-gru', dest='use_gru', action='store_true') - - parser.add_argument('--phase', dest="phase", - type=str, default=defaults.PHASE, - choices=['train', 'test'], - help=('Phase of experiment, can be either' - ' train or test, default=%s'%(defaults.PHASE))) - parser.add_argument('--data-path', dest="data_path", - type=str, default=defaults.DATA_PATH, - help=('Path of file containing the path and labels' - ' of training or testing data, default=%s' - %(defaults.DATA_PATH))) - parser.add_argument('--data-base-dir', dest="data_base_dir", - type=str, default=defaults.DATA_BASE_DIR, - help=('The base directory of the paths in the file ' - 'containing the path and labels, default=%s' - %(defaults.DATA_PATH))) - parser.add_argument('--visualize', dest='visualize', action='store_true', - help=('Visualize attentions or not' - ', default=%s' %(defaults.VISUALIZE))) - parser.add_argument('--no-visualize', dest='visualize', action='store_false') - parser.set_defaults(visualize=defaults.VISUALIZE) - parser.add_argument('--batch-size', dest="batch_size", - type=int, default=defaults.BATCH_SIZE, - help=('Batch size, default = %s' - %(defaults.BATCH_SIZE))) - parser.add_argument('--initial-learning-rate', 
dest="initial_learning_rate", - type=float, default=defaults.INITIAL_LEARNING_RATE, - help=('Initial learning rate, default = %s' - %(defaults.INITIAL_LEARNING_RATE))) - parser.add_argument('--num-epoch', dest="num_epoch", - type=int, default=defaults.NUM_EPOCH, - help=('Number of epochs, default = %s' - %(defaults.NUM_EPOCH))) - parser.add_argument('--steps-per-checkpoint', dest="steps_per_checkpoint", - type=int, default=defaults.STEPS_PER_CHECKPOINT, - help=('Checkpointing (print perplexity, save model) per' - ' how many steps, default = %s' - %(defaults.STEPS_PER_CHECKPOINT))) - parser.add_argument('--target-vocab-size', dest="target_vocab_size", - type=int, default=defaults.TARGET_VOCAB_SIZE, - help=('Target vocabulary size, default=%s' - %(defaults.TARGET_VOCAB_SIZE))) - parser.add_argument('--model-dir', dest="model_dir", - type=str, default=defaults.MODEL_DIR, - help=('The directory for saving and loading model ' - '(structure is not stored), ' - 'default=%s' %(defaults.MODEL_DIR))) - parser.add_argument('--target-embedding-size', dest="target_embedding_size", - type=int, default=defaults.TARGET_EMBEDDING_SIZE, - help=('Embedding dimension for each target, default=%s' - %(defaults.TARGET_EMBEDDING_SIZE))) - parser.add_argument('--attn-num-hidden', dest="attn_num_hidden", - type=int, default=defaults.ATTN_NUM_HIDDEN, - help=('number of hidden units in attention decoder cell' - ', default=%s' - %(defaults.ATTN_NUM_HIDDEN))) - parser.add_argument('--attn-num-layers', dest="attn_num_layers", - type=int, default=defaults.ATTN_NUM_LAYERS, - help=('number of hidden layers in attention decoder cell' - ', default=%s' - %(defaults.ATTN_NUM_LAYERS))) - parser.add_argument('--load-model', dest='load_model', action='store_true', - help=('Load model from model-dir or not' - ', default=%s' %(defaults.LOAD_MODEL))) - parser.add_argument('--no-load-model', dest='load_model', action='store_false') - parser.set_defaults(load_model=defaults.LOAD_MODEL) - parser.add_argument('--log-path', dest="log_path", - type=str, default=defaults.LOG_PATH, - help=('Log file path, default=%s' - %(defaults.LOG_PATH))) - parser.add_argument('--output-dir', dest="output_dir", - type=str, default=defaults.OUTPUT_DIR, - help=('Output directory, default=%s' - %(defaults.OUTPUT_DIR))) - parser.add_argument('--max_gradient_norm', dest="max_gradient_norm", - type=int, default=defaults.MAX_GRADIENT_NORM, - help=('Clip gradients to this norm.' 
- ', default=%s' - % (defaults.MAX_GRADIENT_NORM))) - parser.add_argument('--no-gradient_clipping', dest='clip_gradients', action='store_false', - help=('Do not perform gradient clipping, difault for clip_gradients is %s' % - (defaults.CLIP_GRADIENTS))) - parser.set_defaults(clip_gradients=defaults.CLIP_GRADIENTS) - - parameters = parser.parse_args(args) - return parameters - -def main(args, defaults): - parameters = process_args(args, defaults) - logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s', - filename=parameters.log_path) - console = logging.StreamHandler() - console.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s') - console.setFormatter(formatter) - logging.getLogger('').addHandler(console) - - with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: - model = Model( - phase = parameters.phase, - visualize = parameters.visualize, - data_path = parameters.data_path, - data_base_dir = parameters.data_base_dir, - output_dir = parameters.output_dir, - batch_size = parameters.batch_size, - initial_learning_rate = parameters.initial_learning_rate, - num_epoch = parameters.num_epoch, - steps_per_checkpoint = parameters.steps_per_checkpoint, - target_vocab_size = parameters.target_vocab_size, - model_dir = parameters.model_dir, - target_embedding_size = parameters.target_embedding_size, - attn_num_hidden = parameters.attn_num_hidden, - attn_num_layers = parameters.attn_num_layers, - clip_gradients = parameters.clip_gradients, - max_gradient_norm = parameters.max_gradient_norm, - load_model = parameters.load_model, - valid_target_length = float('inf'), - gpu_id=parameters.gpu_id, - use_gru=parameters.use_gru, - session = sess) - model.launch() - -if __name__ == "__main__": - main(sys.argv[1:], exp_config.ExpConfig) - diff --git a/src/model/__init__.py b/src/model/__init__.py deleted file mode 100644 index bec5fe7a..00000000 --- a/src/model/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__author__ = 'moonkey' diff --git a/src/model/model.py b/src/model/model.py deleted file mode 100644 index cd136b07..00000000 --- a/src/model/model.py +++ /dev/null @@ -1,458 +0,0 @@ -"""Visual Attention Based OCR Model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import random, time, os, shutil, math, sys, logging -#import ipdb -import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin -from PIL import Image -import tensorflow as tf -#import keras.backend as K -#from tensorflow.models.rnn.translate import data_utils - -from .cnn import CNN -from .seq2seq_model import Seq2SeqModel -from data_util.data_gen import DataGen -from tqdm import tqdm - -try: - import distance - distance_loaded = True -except ImportError: - distance_loaded = False - -class Model(object): - - def __init__(self, - phase, - visualize, - data_path, - data_base_dir, - output_dir, - batch_size, - initial_learning_rate, - num_epoch, - steps_per_checkpoint, - target_vocab_size, - model_dir, - target_embedding_size, - attn_num_hidden, - attn_num_layers, - clip_gradients, - max_gradient_norm, - session, - load_model, - gpu_id, - use_gru, - evaluate=False, - valid_target_length=float('inf'), - reg_val = 0 ): - - gpu_device_id = '/gpu:' + str(gpu_id) - if not os.path.exists(model_dir): - os.makedirs(model_dir) - logging.info('loading data') - # load data - if phase == 'train': - self.s_gen = DataGen( - 
data_base_dir, data_path, valid_target_len=valid_target_length, evaluate=False) - else: - batch_size = 1 - self.s_gen = DataGen( - data_base_dir, data_path, evaluate=True) - - - #logging.info('valid_target_length: %s' %(str(valid_target_length))) - logging.info('phase: %s' % phase) - logging.info('model_dir: %s' % (model_dir)) - logging.info('load_model: %s' % (load_model)) - logging.info('output_dir: %s' % (output_dir)) - logging.info('steps_per_checkpoint: %d' % (steps_per_checkpoint)) - logging.info('batch_size: %d' %(batch_size)) - logging.info('num_epoch: %d' %num_epoch) - logging.info('learning_rate: %d' % initial_learning_rate) - logging.info('reg_val: %d' % (reg_val)) - logging.info('max_gradient_norm: %f' % max_gradient_norm) - logging.info('clip_gradients: %s' % clip_gradients) - logging.info('valid_target_length %f' %valid_target_length) - logging.info('target_vocab_size: %d' %target_vocab_size) - logging.info('target_embedding_size: %f' % target_embedding_size) - logging.info('attn_num_hidden: %d' % attn_num_hidden) - logging.info('attn_num_layers: %d' % attn_num_layers) - logging.info('visualize: %s' % visualize) - - buckets = self.s_gen.bucket_specs - logging.info('buckets') - logging.info(buckets) - if use_gru: - logging.info('ues GRU in the decoder.') - - # variables - self.img_data = tf.placeholder(tf.float32, shape=(None, 1, 32, None), name='img_data') - self.zero_paddings = tf.placeholder(tf.float32, shape=(None, None, 512), name='zero_paddings') - - self.decoder_inputs = [] - self.encoder_masks = [] - self.target_weights = [] - for i in xrange(int(buckets[-1][0] + 1)): - self.encoder_masks.append(tf.placeholder(tf.float32, shape=[None, 1], - name="encoder_mask{0}".format(i))) - for i in xrange(buckets[-1][1] + 1): - self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], - name="decoder{0}".format(i))) - self.target_weights.append(tf.placeholder(tf.float32, shape=[None], - name="weight{0}".format(i))) - - self.reg_val = reg_val - self.sess = session - self.evaluate = evaluate - self.steps_per_checkpoint = steps_per_checkpoint - self.model_dir = model_dir - self.output_dir = output_dir - self.buckets = buckets - self.batch_size = batch_size - self.num_epoch = num_epoch - self.global_step = tf.Variable(0, trainable=False) - self.valid_target_length = valid_target_length - self.phase = phase - self.visualize = visualize - self.learning_rate = initial_learning_rate - self.clip_gradients = clip_gradients - - if phase == 'train': - self.forward_only = False - elif phase == 'test': - self.forward_only = True - else: - assert False, phase - - with tf.device(gpu_device_id): - cnn_model = CNN(self.img_data, True) #(not self.forward_only)) - self.conv_output = cnn_model.tf_output() - self.concat_conv_output = tf.concat(axis=1, values=[self.conv_output, self.zero_paddings]) - - self.perm_conv_output = tf.transpose(self.concat_conv_output, perm=[1, 0, 2]) - - with tf.device(gpu_device_id): - self.attention_decoder_model = Seq2SeqModel( - encoder_masks = self.encoder_masks, - encoder_inputs_tensor = self.perm_conv_output, - decoder_inputs = self.decoder_inputs, - target_weights = self.target_weights, - target_vocab_size = target_vocab_size, - buckets = buckets, - target_embedding_size = target_embedding_size, - attn_num_layers = attn_num_layers, - attn_num_hidden = attn_num_hidden, - forward_only = self.forward_only, - use_gru = use_gru) - - - - - if not self.forward_only: - - self.updates = [] - self.summaries_by_bucket = [] - with tf.device(gpu_device_id): - params = 
tf.trainable_variables() - # Gradients and SGD update operation for training the model. - opt = tf.train.AdadeltaOptimizer(learning_rate=initial_learning_rate) - for b in xrange(len(buckets)): - if self.reg_val > 0: - reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - logging.info('Adding %s regularization losses', len(reg_losses)) - logging.debug('REGULARIZATION_LOSSES: %s', reg_losses) - loss_op = self.reg_val * tf.reduce_sum(reg_losses) + self.attention_decoder_model.losses[b] - else: - loss_op = self.attention_decoder_model.losses[b] - - gradients, params = zip(*opt.compute_gradients(loss_op, params)) - if self.clip_gradients: - gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm) - # Add summaries for loss, variables, gradients, gradient norms and total gradient norm. - summaries = [] - ''' - for gradient, variable in gradients: - if isinstance(gradient, tf.IndexedSlices): - grad_values = gradient.values - else: - grad_values = gradient - summaries.append(tf.summary.histogram(variable.name, variable)) - summaries.append(tf.summary.histogram(variable.name + "/gradients", grad_values)) - summaries.append(tf.summary.scalar(variable.name + "/gradient_norm", - tf.global_norm([grad_values]))) - ''' - summaries.append(tf.summary.scalar("loss", loss_op)) - summaries.append(tf.summary.scalar("total_gradient_norm", tf.global_norm(gradients))) - all_summaries = tf.summary.merge(summaries) - self.summaries_by_bucket.append(all_summaries) - # update op - apply gradients - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(update_ops): - self.updates.append(opt.apply_gradients(zip(gradients, params), global_step=self.global_step)) - - self.saver_all = tf.train.Saver(tf.all_variables()) - - ckpt = tf.train.get_checkpoint_state(model_dir) - if ckpt and load_model: - logging.info("Reading model parameters from %s" % ckpt.model_checkpoint_path) - #self.saver.restore(self.sess, ckpt.model_checkpoint_path) - self.saver_all.restore(self.sess, ckpt.model_checkpoint_path) - else: - logging.info("Created model with fresh parameters.") - self.sess.run(tf.initialize_all_variables()) - #self.sess.run(init_new_vars_op) - - - # train or test as specified by phase - def launch(self): - step_time, loss = 0.0, 0.0 - current_step = 0 - previous_losses = [] - writer = tf.summary.FileWriter(self.model_dir, self.sess.graph) - if self.phase == 'test': - if not distance_loaded: - logging.info('Warning: distance module not installed. Do whole sequence comparison instead.') - else: - logging.info('Compare word based on edit distance.') - num_correct = 0 - num_total = 0 - for batch in self.s_gen.gen(self.batch_size): - # Get a batch and make a step. 
- start_time = time.time() - bucket_id = batch['bucket_id'] - img_data = batch['data'] - zero_paddings = batch['zero_paddings'] - decoder_inputs = batch['decoder_inputs'] - target_weights = batch['target_weights'] - encoder_masks = batch['encoder_mask'] - file_list = batch['filenames'] - real_len = batch['real_len'] - - grounds = [a for a in np.array([decoder_input.tolist() for decoder_input in decoder_inputs]).transpose()] - _, step_loss, step_logits, step_attns = self.step(encoder_masks, img_data, zero_paddings, decoder_inputs, target_weights, bucket_id, self.forward_only) - curr_step_time = (time.time() - start_time) - step_time += curr_step_time / self.steps_per_checkpoint - logging.info('step_time: %f, loss: %f, step perplexity: %f'%(curr_step_time, step_loss, math.exp(step_loss) if step_loss < 300 else float('inf'))) - loss += step_loss / self.steps_per_checkpoint - current_step += 1 - step_outputs = [b for b in np.array([np.argmax(logit, axis=1).tolist() for logit in step_logits]).transpose()] - if self.visualize: - step_attns = np.array([[a.tolist() for a in step_attn] for step_attn in step_attns]).transpose([1, 0, 2]) - #print (step_attns) - - for idx, output, ground in zip(range(len(grounds)), step_outputs, grounds): - flag_ground,flag_out = True, True - num_total += 1 - output_valid = [] - ground_valid = [] - for j in range(1,len(ground)): - s1 = output[j-1] - s2 = ground[j] - if s2 != 2 and flag_ground: - ground_valid.append(s2) - else: - flag_ground = False - if s1 != 2 and flag_out: - output_valid.append(s1) - else: - flag_out = False - if distance_loaded: - num_incorrect = distance.levenshtein(output_valid, ground_valid) - if self.visualize: - self.visualize_attention(file_list[idx], step_attns[idx], output_valid, ground_valid, num_incorrect>0, real_len) - num_incorrect = float(num_incorrect) / len(ground_valid) - num_incorrect = min(1.0, num_incorrect) - else: - if output_valid == ground_valid: - num_incorrect = 0 - else: - num_incorrect = 1 - if self.visualize: - self.visualize_attention(file_list[idx], step_attns[idx], output_valid, ground_valid, num_incorrect>0, real_len) - num_correct += 1. - num_incorrect - logging.info('%f out of %d correct' %(num_correct, num_total)) - elif self.phase == 'train': - total = (self.s_gen.get_size() // self.batch_size) - with tqdm(desc='Train: ', total=total) as pbar: - for epoch in range(self.num_epoch): - - logging.info('Generating first batch)') - for i,batch in enumerate(self.s_gen.gen(self.batch_size)): - # Get a batch and make a step. 
- num_total = 0 - num_correct = 0 - start_time = time.time() - batch_len = batch['real_len'] - bucket_id = batch['bucket_id'] - img_data = batch['data'] - zero_paddings = batch['zero_paddings'] - decoder_inputs = batch['decoder_inputs'] - target_weights = batch['target_weights'] - encoder_masks = batch['encoder_mask'] - #logging.info('current_step: %d'%current_step) - #logging.info(np.array([decoder_input.tolist() for decoder_input in decoder_inputs]).transpose()[0]) - #print (np.array([target_weight.tolist() for target_weight in target_weights]).transpose()[0]) - summaries, step_loss, step_logits, _ = self.step(encoder_masks, img_data, zero_paddings, decoder_inputs, target_weights, bucket_id, self.forward_only) - - grounds = [a for a in - np.array([decoder_input.tolist() for decoder_input in decoder_inputs]).transpose()] - step_outputs = [b for b in - np.array( - [np.argmax(logit, axis=1).tolist() for logit in step_logits]).transpose()] - - for idx, output, ground in zip(range(len(grounds)), step_outputs, grounds): - flag_ground, flag_out = True, True - num_total += 1 - output_valid = [] - ground_valid = [] - for j in range(1, len(ground)): - s1 = output[j - 1] - s2 = ground[j] - if s2 != 2 and flag_ground: - ground_valid.append(s2) - else: - flag_ground = False - if s1 != 2 and flag_out: - output_valid.append(s1) - else: - flag_out = False - if distance_loaded: - num_incorrect = distance.levenshtein(output_valid, ground_valid) - num_incorrect = float(num_incorrect) / len(ground_valid) - num_incorrect = min(1.0, num_incorrect) - else: - if output_valid == ground_valid: - num_incorrect = 0 - else: - num_incorrect = 1 - num_correct += 1. - num_incorrect - - writer.add_summary(summaries, current_step) - curr_step_time = (time.time() - start_time) - step_time += curr_step_time / self.steps_per_checkpoint - precision = num_correct / num_total - logging.info('step %f - time: %f, loss: %f, perplexity: %f, precision: %f, batch_len: %f'%(current_step, curr_step_time, step_loss, math.exp(step_loss) if step_loss < 300 else float('inf'), precision, batch_len)) - loss += step_loss / self.steps_per_checkpoint - pbar.set_description('Train, loss={:.8f}'.format(step_loss)) - pbar.update() - current_step += 1 - # If there is an EOS symbol in outputs, cut them at that point. - #if data_utils.EOS_ID in step_outputs: - # step_outputs = step_outputs[:step_outputs.index(data_utils.EOS_ID)] - #if data_utils.PAD_ID in decoder_inputs: - #decoder_inputs = decoder_inputs[:decoder_inputs.index(data_utils.PAD_ID)] - # print (step_outputs[0]) - - # Once in a while, we save checkpoint, print statistics, and run evals. - if current_step % self.steps_per_checkpoint == 0: - # Print statistics for the previous epoch. - perplexity = math.exp(loss) if loss < 300 else float('inf') - logging.info("global step %d step-time %.2f loss %f perplexity " - "%.2f" % (self.global_step.eval(), step_time, loss, perplexity)) - previous_losses.append(loss) - # Save checkpoint and zero timer and loss. - if not self.forward_only: - checkpoint_path = os.path.join(self.model_dir, "translate.ckpt") - logging.info("Saving model, current_step: %d"%current_step) - self.saver_all.save(self.sess, checkpoint_path, global_step=self.global_step) - step_time, loss = 0.0, 0.0 - #sys.stdout.flush() - - # step, read one batch, generate gradients - def step(self, encoder_masks, img_data, zero_paddings, decoder_inputs, target_weights, - bucket_id, forward_only): - # Check if the sizes match. 
- encoder_size, decoder_size = self.buckets[bucket_id] - if len(decoder_inputs) != decoder_size: - raise ValueError("Decoder length must be equal to the one in bucket," - " %d != %d." % (len(decoder_inputs), decoder_size)) - if len(target_weights) != decoder_size: - raise ValueError("Weights length must be equal to the one in bucket," - " %d != %d." % (len(target_weights), decoder_size)) - - # Input feed: encoder inputs, decoder inputs, target_weights, as provided. - input_feed = {} - input_feed[self.img_data.name] = img_data - input_feed[self.zero_paddings.name] = zero_paddings - for l in xrange(decoder_size): - input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] - input_feed[self.target_weights[l].name] = target_weights[l] - for l in xrange(int(encoder_size)): - try: - input_feed[self.encoder_masks[l].name] = encoder_masks[l] - except Exception as e: - pass - #ipdb.set_trace() - - # Since our targets are decoder inputs shifted by one, we need one more. - last_target = self.decoder_inputs[decoder_size].name - input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) - - # Output feed: depends on whether we do a backward step or not. - if not forward_only: - output_feed = [self.updates[bucket_id], # Update Op that does SGD. - #self.gradient_norms[bucket_id], # Gradient norm. - self.attention_decoder_model.losses[bucket_id], - self.summaries_by_bucket[bucket_id]] - for l in xrange(decoder_size): # Output logits. - output_feed.append(self.attention_decoder_model.outputs[bucket_id][l]) - else: - output_feed = [self.attention_decoder_model.losses[bucket_id]] # Loss for this batch. - for l in xrange(decoder_size): # Output logits. - output_feed.append(self.attention_decoder_model.outputs[bucket_id][l]) - if self.visualize: - output_feed += self.attention_decoder_model.attention_weights_histories[bucket_id] - - outputs = self.sess.run(output_feed, input_feed) - if not forward_only: - return outputs[2], outputs[1], outputs[3:(3+self.buckets[bucket_id][1])], None # Gradient norm summary, loss, no outputs, no attentions. - else: - return None, outputs[0], outputs[1:(1+self.buckets[bucket_id][1])], outputs[(1+self.buckets[bucket_id][1]):] # No gradient norm, loss, outputs, attentions. 
- - - def visualize_attention(self, filename, attentions, output_valid, ground_valid, flag_incorrect, real_len): - if flag_incorrect: - output_dir = os.path.join(self.output_dir, 'incorrect') - else: - output_dir = os.path.join(self.output_dir, 'correct') - output_dir = os.path.join(output_dir, filename.replace('/', '_')) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - with open(os.path.join(output_dir, 'word.txt'), 'w') as fword: - fword.write(' '.join([chr(c-13+97) if c-13+97>96 else chr(c-3+48) for c in ground_valid])+'\n') - fword.write(' '.join([chr(c-13+97) if c-13+97>96 else chr(c-3+48) for c in output_valid])) - with open(filename, 'rb') as img_file: - img = Image.open(img_file) - w, h = img.size - h = 32 - img = img.resize( - (real_len, h), - Image.ANTIALIAS) - img_data = np.asarray(img, dtype=np.uint8) - for idx in range(len(output_valid)): - output_filename = os.path.join(output_dir, 'image_%d.jpg'%(idx)) - attention = attentions[idx][:(int(real_len/4)-1)] - - # I have got the attention_orig here, which is of size 32*len(ground_truth), the only thing left is to visualize it and save it to output_filename - # TODO here - attention_orig = np.zeros(real_len) - for i in range(real_len): - if 0 < i/4-1 and i/4-1 < len(attention): - attention_orig[i] = attention[int(i/4)-1] - attention_orig = np.convolve(attention_orig, [0.199547,0.200226,0.200454,0.200226,0.199547], mode='same') - attention_orig = np.maximum(attention_orig, 0.3) - attention_out = np.zeros((h, real_len)) - for i in range(real_len): - attention_out[:,i] = attention_orig[i] - if len(img_data.shape) == 3: - attention_out = attention_out[:,:,np.newaxis] - img_out_data = img_data * attention_out - img_out = Image.fromarray(img_out_data.astype(np.uint8)) - img_out.save(output_filename) - #print (output_filename) - #assert False From f9c4058380ac454add0cd30186b9495dab1bfa4e Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Fri, 21 Jul 2017 16:48:46 +0200 Subject: [PATCH 08/12] Add the dataset util --- aocr/util/dataset.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 aocr/util/dataset.py diff --git a/aocr/util/dataset.py b/aocr/util/dataset.py new file mode 100644 index 00000000..ec6e5595 --- /dev/null +++ b/aocr/util/dataset.py @@ -0,0 +1,28 @@ +import tensorflow as tf + + +def _bytes_feature(value): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + +def _int64_feature(value): + return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) + + +def generate(annotations_path, output_path): + writer = tf.python_io.TFRecordWriter(output_path) + with open(annotations_path, 'r') as f: + pairs = [line.split() for line in f.readlines()] + + for img_path, label in pairs: + + with open(img_path, 'rb') as img_file: + img = img_file.read() + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image': _bytes_feature(img), + 'label': _bytes_feature(label)})) + + writer.write(example.SerializeToString()) + + writer.close() From 8a9d612d2f2e4390ec9f14d6c5dbc9958a22ebab Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Fri, 21 Jul 2017 20:20:30 +0200 Subject: [PATCH 09/12] Add utils for generating datasets --- aocr/defaults.py | 1 + aocr/launcher.py | 27 +++++++++++++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/aocr/defaults.py b/aocr/defaults.py index a40cf719..50886c25 100644 --- a/aocr/defaults.py +++ b/aocr/defaults.py @@ -9,6 +9,7 @@ class Config: VISUALIZE = False # I/O + NEW_DATASET_PATH 
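
generate() above expects a plain-text annotation file with one image per line, the image path and the label separated by whitespace; because each line is consumed with line.split(), a label cannot itself contain spaces. A minimal usage sketch, with hypothetical file names:

# Hypothetical usage of aocr.util.dataset.generate(); the file names are examples only.
# annotations-training.txt would contain lines such as:
#   datasets/images/word_001.png building
#   datasets/images/word_002.png station
from aocr.util import dataset

dataset.generate('annotations-training.txt', 'training.tfrecords')
# Every record in training.tfrecords now holds two bytes features:
# 'image' (the raw image file contents) and 'label' (the annotation text).
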
= 'dataset.tfrecords' DATA_PATH = 'data.tfrecords' MODEL_DIR = 'models' LOG_PATH = 'attentionocr.log' diff --git a/aocr/launcher.py b/aocr/launcher.py index 85346b29..fec710bc 100644 --- a/aocr/launcher.py +++ b/aocr/launcher.py @@ -6,6 +6,7 @@ from .model.model import Model from .defaults import Config +from .util import dataset tf.logging.set_verbosity(tf.logging.ERROR) @@ -23,14 +24,23 @@ def process_args(args, defaults): parser.set_defaults(load_model=defaults.LOAD_MODEL) # Dataset generation - parser_dataset = subparsers.add_parser('dataset', help='Create a dataset in the .tfrecords format for training or testing.') + parser_dataset = subparsers.add_parser('dataset', help='Create a dataset in the TFRecords format.') + parser_dataset.set_defaults(phase='dataset') + parser_dataset.add_argument('annotations_path', metavar='annotations', + type=str, + help=('Path to the annotation file')) + parser_dataset.add_argument('output_path', nargs='?', metavar='output', + type=str, default=defaults.NEW_DATASET_PATH, + help=('Output path' + ', default=%s' + % (defaults.NEW_DATASET_PATH))) # Training parser_train = subparsers.add_parser('train', help='Train the model and save checkpoints.') parser_train.set_defaults(phase='train') - parser_train.add_argument('dataset', + parser_train.add_argument('dataset_path', metavar='dataset', type=str, default=defaults.DATA_PATH, - help=('Path of the .tfrecords file containing the image/label pairs' + help=('Training dataset in the TFRecords format' ', default=%s' % (defaults.DATA_PATH))) parser_train.add_argument('--no-resume', dest='load_model', action='store_false', @@ -40,9 +50,9 @@ def process_args(args, defaults): # Testing parser_test = subparsers.add_parser('test', help='Test the saved model.') parser_test.set_defaults(phase='test') - parser_test.add_argument('dataset', + parser_test.add_argument('dataset_path', metavar='dataset', type=str, default=defaults.DATA_PATH, - help=('Path of the .tfrecords file containing the image/label pairs' + help=('Testing dataset in the TFRecords format' ', default=%s' % (defaults.DATA_PATH))) parser_test.add_argument('--visualize', dest='visualize', action='store_true', @@ -142,10 +152,15 @@ def main(args): logging.getLogger('').addHandler(console) with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: + + if parameters.phase == 'dataset': + dataset.generate(parameters.annotations_path, parameters.output_path) + return + model = Model( phase=parameters.phase, visualize=parameters.visualize, - data_path=parameters.dataset, + data_path=parameters.dataset_path, output_dir=parameters.output_dir, batch_size=parameters.batch_size, initial_learning_rate=parameters.initial_learning_rate, From 3a0c4f2bbff30571ecc43ac54bc5b2dc196a17ec Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Fri, 21 Jul 2017 20:20:39 +0200 Subject: [PATCH 10/12] Fix log messages --- aocr/model/model.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/aocr/model/model.py b/aocr/model/model.py index f0446acf..b3a391df 100644 --- a/aocr/model/model.py +++ b/aocr/model/model.py @@ -60,8 +60,7 @@ def __init__(self, epochs=num_epoch) else: batch_size = 1 - self.s_gen = DataGen( - data_path, evaluate=True) + self.s_gen = DataGen(data_path, evaluate=True, epochs=1) logging.info('phase: %s' % phase) logging.info('model_dir: %s' % (model_dir)) @@ -87,8 +86,6 @@ def __init__(self, if use_gru: logging.info('using GRU in the decoder.') - # TODO: rename answer to label - # variables self.zero_paddings = 
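
With the dataset subcommand wired into the launcher above, the intended workflow is to build a TFRecords file first and then point training at it, which setup.py exposes as the aocr console script ("aocr dataset ..." followed by "aocr train ..."). Calling the launcher directly, with hypothetical file names, looks like this:

# Hypothetical end-to-end run through the launcher introduced above.
from aocr import launcher

launcher.main(['dataset', 'annotations-training.txt', 'training.tfrecords'])  # writes the TFRecords file
launcher.main(['train', 'training.tfrecords'])                                # trains, checkpointing into the default models/ directory
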
tf.placeholder(tf.float32, shape=(None, None, 512), name='zero_paddings') @@ -244,7 +241,7 @@ def __init__(self, def test(self): step_time = 0.0 loss = 0.0 - current_step = 0 + current_step = 1 num_correct = 0 num_total = 0 @@ -294,7 +291,7 @@ def test(self): def train(self): step_time = 0.0 loss = 0.0 - current_step = 0 + current_step = 1 writer = tf.summary.FileWriter(self.model_dir, self.sess.graph) logging.info('Starting the training process.') @@ -333,7 +330,6 @@ def train(self): precision = num_correct / self.batch_size logging.info('step %f - time: %f, loss: %f, perplexity: %f, precision: %f, batch_len: %f' % (current_step, curr_step_time, result['loss'], math.exp(result['loss']) if result['loss'] < 300 else float('inf'), precision, batch['real_len'])) - current_step += 1 # Once in a while, we save checkpoint, print statistics, and run evals. if current_step % self.steps_per_checkpoint == 0: @@ -347,6 +343,8 @@ def train(self): self.saver_all.save(self.sess, checkpoint_path, global_step=self.global_step) step_time, loss = 0.0, 0.0 + current_step += 1 + def to_savedmodel(self): raise NotImplementedError From 6096c0e6ec4df6a902747059a47e0f6f1bec7d77 Mon Sep 17 00:00:00 2001 From: Edward Medvedev Date: Fri, 21 Jul 2017 20:20:51 +0200 Subject: [PATCH 11/12] Use tf.data (1.2 feature) --- aocr/util/data_gen.py | 51 +++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/aocr/util/data_gen.py b/aocr/util/data_gen.py index 77557572..ff90d457 100644 --- a/aocr/util/data_gen.py +++ b/aocr/util/data_gen.py @@ -56,8 +56,10 @@ def __init__(self, self.bucket_data = {i: BucketData() for i in range(self.bucket_max_width + 1)} - filename_queue = tf.train.string_input_producer([self.annotation_path], num_epochs=self.epochs) - self.images, self.labels = parse_tfrecords(filename_queue) + dataset = tf.contrib.data.TFRecordDataset([self.annotation_path]) + dataset = dataset.map(self._parse_record) + dataset = dataset.shuffle(buffer_size=10000) + self.dataset = dataset.repeat(self.epochs) def clear(self): self.bucket_data = {i: BucketData() @@ -66,20 +68,15 @@ def clear(self): def gen(self, batch_size): valid_target_len = self.valid_target_len - images, labels = tf.train.shuffle_batch( - [self.images, self.labels], batch_size=batch_size, num_threads=2, - capacity=1000 + 3 * batch_size, min_after_dequeue=1000) + dataset = self.dataset.batch(batch_size) + iterator = dataset.make_one_shot_iterator() + + images, labels = iterator.get_next() with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: - sess.run([ - tf.local_variables_initializer(), - tf.global_variables_initializer(), - ]) - coord = tf.train.Coordinator() - threads = tf.train.start_queue_runners(sess=sess, coord=coord) - - try: - while not coord.should_stop(): + + while True: + try: raw_images, raw_labels = sess.run([images, labels]) for img, lex in zip(raw_images, raw_labels): word = self.convert_lex(lex) @@ -101,10 +98,8 @@ def gen(self, batch_size): yield bucket else: assert False, 'no valid bucket of width %d' % resized_width - - finally: - coord.request_stop() - coord.join(threads) + except tf.errors.OutOfRangeError: + break self.clear() @@ -121,14 +116,12 @@ def convert_lex(self, lex): return word - -def parse_tfrecords(filename_queue): - reader = tf.TFRecordReader() - _, serialized_example = reader.read(filename_queue) - features = tf.parse_single_example( - serialized_example, - features={ - 'image': tf.FixedLenFeature([], tf.string), - 'answer': tf.FixedLenFeature([], 
tf.string),
-        })
-    return features['image'], features['answer']
+    @staticmethod
+    def _parse_record(example_proto):
+        features = tf.parse_single_example(
+            example_proto,
+            features={
+                'image': tf.FixedLenFeature([], tf.string),
+                'label': tf.FixedLenFeature([], tf.string),
+            })
+        return features["image"], features["label"]

From f060c79529d94be037ddcaf450f446f2623bd289 Mon Sep 17 00:00:00 2001
From: Edward Medvedev
Date: Fri, 21 Jul 2017 20:30:14 +0200
Subject: [PATCH 12/12] Add todos and small changes

---
 aocr/launcher.py    | 1 +
 aocr/model/model.py | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/aocr/launcher.py b/aocr/launcher.py
index fec710bc..0613ab41 100644
--- a/aocr/launcher.py
+++ b/aocr/launcher.py
@@ -61,6 +61,7 @@ def process_args(args, defaults):

     # Exporting
     parser_export = subparsers.add_parser('export', help='Export the saved checkpoints for production.')
+    parser_export.set_defaults(phase='export')
    parser_export.add_argument('export_path', metavar='path',
                               type=str, default=defaults.EXPORT_PATH,
                               help=('Path to export the model in the specified format,'
diff --git a/aocr/model/model.py b/aocr/model/model.py
index b3a391df..0886e76e 100644
--- a/aocr/model/model.py
+++ b/aocr/model/model.py
@@ -385,7 +385,9 @@ def step(self, batch, forward_only):
            last_target = self.decoder_inputs[decoder_size].name
            input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32)

-        # TODO: merging into one op
+        # TODO: one op for answer
+        # TODO: visualization
+        # TODO: cleanup
        # Output feed: depends on whether we do a backward step or not.
        output_feed = [self.attention_decoder_model.losses[bucket_id]]  # Loss for this batch.
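
The dataset utility from patches 08-09 and the tf.data reader from patch 11 share a single TFRecords schema: one raw `image` bytes feature and one `label` bytes feature per record. The sketch below exercises that round trip in isolation; it is not part of the patch series, assumes TensorFlow 1.2.x (matching the `tf.contrib.data` calls used in `aocr/util/data_gen.py`), and uses placeholder bytes and an illustrative file name instead of real image files.

    import tensorflow as tf

    RECORDS_PATH = 'example.tfrecords'  # illustrative path, not used by the patches


    def _bytes_feature(value):
        # Mirrors the helper in aocr/util/dataset.py.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


    # Write a couple of records with the 'image'/'label' schema. dataset.generate()
    # stores the raw bytes of each image file here; placeholder bytes keep the
    # sketch self-contained.
    writer = tf.python_io.TFRecordWriter(RECORDS_PATH)
    for image_bytes, label in [(b'\x00' * 16, b'hello'), (b'\xff' * 16, b'world')]:
        example = tf.train.Example(features=tf.train.Features(feature={
            'image': _bytes_feature(image_bytes),
            'label': _bytes_feature(label),
        }))
        writer.write(example.SerializeToString())
    writer.close()


    def _parse_record(example_proto):
        # Same feature map as DataGen._parse_record introduced in patch 11.
        features = tf.parse_single_example(
            example_proto,
            features={
                'image': tf.FixedLenFeature([], tf.string),
                'label': tf.FixedLenFeature([], tf.string),
            })
        return features['image'], features['label']


    # Read the records back with the TF 1.2 tf.contrib.data pipeline.
    dataset = tf.contrib.data.TFRecordDataset([RECORDS_PATH])
    dataset = dataset.map(_parse_record).batch(2)
    images, labels = dataset.make_one_shot_iterator().get_next()

    with tf.Session() as sess:
        while True:
            try:
                raw_images, raw_labels = sess.run([images, labels])
                print(raw_labels)  # e.g. [b'hello' b'world']
            except tf.errors.OutOfRangeError:
                break

In the patched code itself, `dataset.generate()` fills the `image` feature with the raw contents of each image file listed in the annotation file, and `DataGen.gen()` batches the records and buckets them by resized image width before feeding the model.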