Update to support TensorFlow 1.2.0 #47

Open
wants to merge 12 commits into base: master
8 changes: 7 additions & 1 deletion .gitignore
@@ -1,3 +1,9 @@
# ML data
datasets/
checkpoints/
models/
results/

### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -107,4 +113,4 @@ crashlytics-build.properties

# Created by .ignore support plugin (hsz.mobi)
misc/
data/evaluation_data
data/evaluation_data
2 changes: 1 addition & 1 deletion README.md
@@ -8,7 +8,7 @@ Visual Attention based OCR. The model first runs a sliding CNN on the image (ima
# Prerequisites
Most of our code is written on top of TensorFlow, but we also use Keras for the convolutional part of the model. In addition, we use the Python package `distance` to compute edit distances for evaluation. (That package is optional: if `distance` is not installed, we fall back to exact matching.)

### Tensorflow: [Installation Instructions](https://www.tensorflow.org/get_started/os_setup#download-and-setup) (tested on 0.12.1)
### Tensorflow: [Installation Instructions](https://www.tensorflow.org/install/) (tested on 1.2.0)

### Distance (Optional):
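A minimal sketch of the evaluation fallback described above, assuming the `distance` package from PyPI; the `ocr_error` helper is purely illustrative and not part of this PR:

```python
# Illustrative only: edit-distance metric with an exact-match fallback.
try:
    import distance

    def ocr_error(prediction, ground_truth):
        # Normalized Levenshtein distance when the optional package is available.
        return distance.levenshtein(prediction, ground_truth) / float(max(len(ground_truth), 1))
except ImportError:
    def ocr_error(prediction, ground_truth):
        # Exact-match fallback: 0.0 for a perfect prediction, 1.0 otherwise.
        return 0.0 if prediction == ground_truth else 1.0
```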

1 change: 1 addition & 0 deletions aocr/__init__.py
@@ -0,0 +1 @@
__author__ = 'emedvedev'
36 changes: 36 additions & 0 deletions aocr/defaults.py
@@ -0,0 +1,36 @@
"""
Default parameters
"""


class Config:

    GPU_ID = 0
    VISUALIZE = False

    # I/O
    NEW_DATASET_PATH = 'dataset.tfrecords'
    DATA_PATH = 'data.tfrecords'
    MODEL_DIR = 'models'
    LOG_PATH = 'attentionocr.log'
    OUTPUT_DIR = 'results'
    STEPS_PER_CHECKPOINT = 500
    EXPORT_FORMAT = 'savedmodel'
    EXPORT_PATH = 'exported'

    # Optimization
    NUM_EPOCH = 1000
    BATCH_SIZE = 45
    INITIAL_LEARNING_RATE = 1.0

    # Network parameters
    CLIP_GRADIENTS = True  # whether to perform gradient clipping
    MAX_GRADIENT_NORM = 5.0  # clip gradients to this norm
    TARGET_EMBEDDING_SIZE = 10  # embedding dimension for each target
    ATTN_USE_LSTM = True  # whether or not to use an LSTM attention decoder cell
    ATTN_NUM_HIDDEN = 128  # number of hidden units in the attention decoder cell
    ATTN_NUM_LAYERS = 2  # number of layers in the attention decoder cell
    # (the encoder's number of hidden units will be ATTN_NUM_HIDDEN * ATTN_NUM_LAYERS)
    LOAD_MODEL = True
    OLD_MODEL_VERSION = False
    TARGET_VOCAB_SIZE = 26+10+3  # 0: PADDING, 1: GO, 2: EOS, >2: 0-9, a-z
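The `TARGET_VOCAB_SIZE` comment spells out the label alphabet; a small sketch of the implied character/index table (the `CHARMAP` and `encode_label` names are illustrative, not part of this PR):

```python
import string

# 0: PADDING, 1: GO, 2: EOS, then the digits 0-9 and the letters a-z.
CHARMAP = ['', '', ''] + list(string.digits) + list(string.ascii_lowercase)
assert len(CHARMAP) == 26 + 10 + 3  # matches Config.TARGET_VOCAB_SIZE

def encode_label(text):
    # Indices 0-2 are reserved for the special tokens above.
    return [CHARMAP.index(char) for char in text.lower()]

print(encode_label('ab1'))  # [13, 14, 4]
```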
191 changes: 191 additions & 0 deletions aocr/launcher.py
@@ -0,0 +1,191 @@
import sys
import argparse
import logging

import tensorflow as tf

from .model.model import Model
from .defaults import Config
from .util import dataset

tf.logging.set_verbosity(tf.logging.ERROR)


def process_args(args, defaults):
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(help='Subcommands.')

    # Global arguments
    parser.add_argument('--log-path', dest="log_path",
                        type=str, default=defaults.LOG_PATH,
                        help=('Log file path, default=%s'
                              % (defaults.LOG_PATH)))
    parser.set_defaults(visualize=defaults.VISUALIZE)
    parser.set_defaults(load_model=defaults.LOAD_MODEL)

    # Dataset generation
    parser_dataset = subparsers.add_parser('dataset', help='Create a dataset in the TFRecords format.')
    parser_dataset.set_defaults(phase='dataset')
    parser_dataset.add_argument('annotations_path', metavar='annotations',
                                type=str,
                                help=('Path to the annotation file'))
    parser_dataset.add_argument('output_path', nargs='?', metavar='output',
                                type=str, default=defaults.NEW_DATASET_PATH,
                                help=('Output path'
                                      ', default=%s'
                                      % (defaults.NEW_DATASET_PATH)))

    # Training
    parser_train = subparsers.add_parser('train', help='Train the model and save checkpoints.')
    parser_train.set_defaults(phase='train')
    parser_train.add_argument('dataset_path', metavar='dataset',
                              type=str, default=defaults.DATA_PATH,
                              help=('Training dataset in the TFRecords format'
                                    ', default=%s'
                                    % (defaults.DATA_PATH)))
    parser_train.add_argument('--no-resume', dest='load_model', action='store_false',
                              help=('Create an empty model even if checkpoints already exist'
                                    ', default=%s' % (defaults.LOAD_MODEL)))

    # Testing
    parser_test = subparsers.add_parser('test', help='Test the saved model.')
    parser_test.set_defaults(phase='test')
    parser_test.add_argument('dataset_path', metavar='dataset',
                             type=str, default=defaults.DATA_PATH,
                             help=('Testing dataset in the TFRecords format'
                                   ', default=%s'
                                   % (defaults.DATA_PATH)))
    parser_test.add_argument('--visualize', dest='visualize', action='store_true',
                             help=('Visualize attentions'
                                   ', default=%s' % (defaults.VISUALIZE)))

    # Exporting
    parser_export = subparsers.add_parser('export', help='Export the saved checkpoints for production.')
    parser_export.set_defaults(phase='export')
    parser_export.add_argument('export_path', metavar='path',
                               type=str, default=defaults.EXPORT_PATH,
                               help=('Path to export the model in the specified format, '
                                     'default=%s'
                                     % (defaults.EXPORT_PATH)))
    parser_export.add_argument('--format', dest="format",
                               type=str, default=defaults.EXPORT_FORMAT,
                               choices=['frozengraph', 'savedmodel'],
                               help=('Export format for the model: either '
                                     'a frozen GraphDef or a SavedModel '
                                     '(default=%s)'
                                     % (defaults.EXPORT_FORMAT)))

    parser.add_argument('--gpu-id', dest="gpu_id",
                        type=int, default=defaults.GPU_ID)

    parser.add_argument('--use-gru', dest='use_gru', action='store_true')

    parser.add_argument('--batch-size', dest="batch_size",
                        type=int, default=defaults.BATCH_SIZE,
                        help=('Batch size, default = %s'
                              % (defaults.BATCH_SIZE)))
    parser.add_argument('--initial-learning-rate', dest="initial_learning_rate",
                        type=float, default=defaults.INITIAL_LEARNING_RATE,
                        help=('Initial learning rate, default = %s'
                              % (defaults.INITIAL_LEARNING_RATE)))
    parser.add_argument('--num-epoch', dest="num_epoch",
                        type=int, default=defaults.NUM_EPOCH,
                        help=('Number of epochs, default = %s'
                              % (defaults.NUM_EPOCH)))
    parser.add_argument('--steps-per-checkpoint', dest="steps_per_checkpoint",
                        type=int, default=defaults.STEPS_PER_CHECKPOINT,
                        help=('Checkpointing (print perplexity, save model) per'
                              ' how many steps, default = %s'
                              % (defaults.STEPS_PER_CHECKPOINT)))
    parser.add_argument('--target-vocab-size', dest="target_vocab_size",
                        type=int, default=defaults.TARGET_VOCAB_SIZE,
                        help=('Target vocabulary size, default=%s'
                              % (defaults.TARGET_VOCAB_SIZE)))
    parser.add_argument('--model-dir', dest="model_dir",
                        type=str, default=defaults.MODEL_DIR,
                        help=('The directory for saving and loading the model, '
                              'default=%s' % (defaults.MODEL_DIR)))
    parser.add_argument('--target-embedding-size', dest="target_embedding_size",
                        type=int, default=defaults.TARGET_EMBEDDING_SIZE,
                        help=('Embedding dimension for each target, default=%s'
                              % (defaults.TARGET_EMBEDDING_SIZE)))
    parser.add_argument('--attn-num-hidden', dest="attn_num_hidden",
                        type=int, default=defaults.ATTN_NUM_HIDDEN,
                        help=('Number of hidden units in the attention decoder cell'
                              ', default=%s'
                              % (defaults.ATTN_NUM_HIDDEN)))
    parser.add_argument('--attn-num-layers', dest="attn_num_layers",
                        type=int, default=defaults.ATTN_NUM_LAYERS,
                        help=('Number of hidden layers in the attention decoder cell'
                              ', default=%s'
                              % (defaults.ATTN_NUM_LAYERS)))
    parser.add_argument('--output-dir', dest="output_dir",
                        type=str, default=defaults.OUTPUT_DIR,
                        help=('Output directory, default=%s'
                              % (defaults.OUTPUT_DIR)))
    parser.add_argument('--max_gradient_norm', dest="max_gradient_norm",
                        type=int, default=defaults.MAX_GRADIENT_NORM,
                        help=('Clip gradients to this norm'
                              ', default=%s'
                              % (defaults.MAX_GRADIENT_NORM)))
    parser.add_argument('--no-gradient_clipping', dest='clip_gradients', action='store_false',
                        help=('Do not perform gradient clipping, default for clip_gradients is %s' %
                              (defaults.CLIP_GRADIENTS)))
    parser.set_defaults(clip_gradients=defaults.CLIP_GRADIENTS)

    parameters = parser.parse_args(args)
    return parameters


def main(args):
    parameters = process_args(args, Config)
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
        filename=parameters.log_path)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        if parameters.phase == 'dataset':
            dataset.generate(parameters.annotations_path, parameters.output_path)
            return

        model = Model(
            phase=parameters.phase,
            visualize=parameters.visualize,
            data_path=parameters.dataset_path,
            output_dir=parameters.output_dir,
            batch_size=parameters.batch_size,
            initial_learning_rate=parameters.initial_learning_rate,
            num_epoch=parameters.num_epoch,
            steps_per_checkpoint=parameters.steps_per_checkpoint,
            target_vocab_size=parameters.target_vocab_size,
            model_dir=parameters.model_dir,
            target_embedding_size=parameters.target_embedding_size,
            attn_num_hidden=parameters.attn_num_hidden,
            attn_num_layers=parameters.attn_num_layers,
            clip_gradients=parameters.clip_gradients,
            max_gradient_norm=parameters.max_gradient_norm,
            session=sess,
            load_model=parameters.load_model,
            gpu_id=parameters.gpu_id,
            use_gru=parameters.use_gru,
        )
        if parameters.phase == 'train':
            model.train()
        elif parameters.phase == 'test':
            model.test()
        else:
            raise NotImplementedError


if __name__ == "__main__":
    main(sys.argv[1:])
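For context, the subcommands wired up above would be driven roughly as follows; the file paths are hypothetical, and only the `dataset`, `train`, and `test` phases are handled by this launcher so far:

```python
# Rough usage sketch of the launcher above; the paths are made up.
from aocr.launcher import main

# Pack the images listed in an annotation file into a TFRecords dataset.
main(['dataset', 'annotations-training.txt', 'datasets/training.tfrecords'])

# Train, checkpointing into the default models/ directory every 500 steps.
main(['train', 'datasets/training.tfrecords', '--batch-size', '45'])

# Evaluate the saved checkpoints on a held-out set.
main(['test', 'datasets/testing.tfrecords'])
```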
Empty file added aocr/model/__init__.py
Empty file.
19 changes: 8 additions & 11 deletions src/model/cnn.py → aocr/model/cnn.py
@@ -1,11 +1,4 @@
__author__ = 'moonkey'

#from keras import models, layers
import logging
import numpy as np
# from src.data_util.synth_prepare import SynthGen

#import keras.backend as K
import tensorflow as tf


@@ -24,6 +17,7 @@ def var_random(name, shape, regularizable=False):
        tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.nn.l2_loss(v))
    return v


def max_2x2pool(incoming, name):
    '''
    max pooling on 2 dims.
@@ -34,6 +28,7 @@ def max_2x2pool(incoming, name):
    with tf.variable_scope(name):
        return tf.nn.max_pool(incoming, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1), padding='VALID')


def max_2x1pool(incoming, name):
    '''
    max pooling only on image width
@@ -44,6 +39,7 @@ def max_2x1pool(incoming, name):
    with tf.variable_scope(name):
        return tf.nn.max_pool(incoming, ksize=(1, 2, 1, 1), strides=(1, 2, 1, 1), padding='VALID')


def ConvRelu(incoming, num_filters, filter_size, name):
    '''
    Add a convolution layer followed by a Relu layer.
@@ -87,15 +83,17 @@ def ConvReluBN(incoming, num_filters, filter_size, name, is_training, padding_ty
    with tf.variable_scope(name):
        conv_W = var_random('W', tuple(filter_size) + (num_filters_from, num_filters), regularizable=True)

        after_conv = tf.nn.conv2d(incoming, conv_W, strides=(1,1,1,1), padding=padding_type)
        after_conv = tf.nn.conv2d(incoming, conv_W, strides=(1, 1, 1, 1), padding=padding_type)

        after_bn = batch_norm(after_conv, is_training)

        return tf.nn.relu(after_bn)


def dropout(incoming, is_training, keep_prob=0.5):
    return tf.contrib.layers.dropout(incoming, keep_prob=keep_prob, is_training=is_training)


def tf_create_attention_map(incoming):
    '''
    flatten height and width into one dimension of size attn_length
@@ -107,6 +105,7 @@ def tf_create_attention_map(incoming):
    print(shape)
    return tf.reshape(incoming, (-1, np.prod(shape[1:3]), shape[3]))
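A quick illustration of the reshape that `tf_create_attention_map` performs on an NHWC feature map; the shapes below are made up for the example:

```python
import numpy as np
import tensorflow as tf

# A batch of 2 feature maps of height 1, width 24, and 512 channels --
# roughly the shape the CNN prints before the squeeze further down.
incoming = tf.constant(np.zeros((2, 1, 24, 512), dtype=np.float32))
shape = incoming.get_shape().as_list()

# Height and width collapse into a single attn_length dimension.
attention_map = tf.reshape(incoming, (-1, np.prod(shape[1:3]), shape[3]))

with tf.Session() as sess:
    print(sess.run(attention_map).shape)  # (2, 24, 512)
```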


class CNN(object):
    """
    Usage for tf tensor output:
@@ -146,7 +145,7 @@ def _build_network(self, input_tensor, is_training):

        print('CNN outdim before squeeze: {}'.format(net.get_shape()))  # 1x32x100 -> 24x512

        net = tf.squeeze(net,axis=1)
        net = tf.squeeze(net, axis=1)

        print('CNN outdim: {}'.format(net.get_shape()))
        self.model = net
@@ -160,5 +159,3 @@ def __call__(self, input_tensor):
        '''
    def save(self):
        pass

