# attention_lstm_1.py.save

"""
A keras attention layer that wraps RNN layers.

Based on TensorFlow's [attention_decoder](https://github.com/tensorflow/tensorflow/blob/c8a45a8e236776bed1d14fd71f3b6755bd63cc58/tensorflow/python/ops/seq2seq.py#L506)
and [Grammar as a Foreign Language](https://arxiv.org/abs/1412.7449).

date: 20161101
author: wassname
url: https://gist.github.com/wassname/5292f95000e409e239b9dc973295327a
"""

# Tests like those in https://github.com/fchollet/keras/blob/master/tests/keras/layers/test_wrappers.py
import pytest
import numpy as np
from numpy.testing import assert_allclose
from keras.utils.test_utils import keras_test
from keras.layers import wrappers, Input, recurrent, InputLayer
from keras.layers import core, convolutional, recurrent
from keras.models import Sequential, Model, model_from_json
from attention_lstm_ import *
from load_sts_data import *

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

# Corpus years selecting the training and validation directories.
YEAR_TRAIN = "2013"
YEAR_VALID = "2017"

# `maxlen` handed to pad_sequences for every sentence.
MAX_SEQUENCE_LENGTH = 50
# Fraction of the training samples held out as a validation split.
VALIDATION_SPLIT = 0.30

# Word-vector family; it names both the vector directory and the file
# that is read from it.
representation = "fastText"
#representation = "word2vec"
VECTOR_DIR = "/almac/ignacio/data/" + representation
EMBEDDING_DIM = 50

# The corpora are addressed relative to the parent of the vector directory.
_DATA_ROOT = VECTOR_DIR.rsplit('/', 1)[0]

# Lists passed to load_train_dirs; the meaning of each tuple field is
# defined by that loader.
TRAIN_DIRS = [
    (_DATA_ROOT + "/sts_all/train-" + YEAR_TRAIN, None, False),
]
VALID_DIRS = [
    (_DATA_ROOT + "/sts_all/valid-" + YEAR_VALID, "validation", False),
]


# --------------------------
# Load the raw texts. Only the training loader's second return value
# (converted to `labels` below) is kept.
train_data_, gs_data = load_train_dirs(TRAIN_DIRS)
valid_data_, _ = load_train_dirs(VALID_DIRS)

# Odd positions feed side A and even positions feed side B.
# NOTE(review): presumably the loader interleaves the two sentences of each
# pair -- verify against load_train_dirs.
train_data_A, train_data_B = train_data_[1::2], train_data_[::2]
valid_data_A, valid_data_B = valid_data_[1::2], valid_data_[::2]

# --- side A: vocabulary fitted on its training + validation texts ---------
# NOTE(review): MAX_NB_WORDS is not defined in this file; it presumably
# arrives through one of the star imports -- confirm.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_data_A + valid_data_A)
sequences_A = tokenizer.texts_to_sequences(train_data_A)
sequences_Av = tokenizer.texts_to_sequences(valid_data_A)
word_index_A = tokenizer.word_index
data_A = pad_sequences(sequences_A, maxlen=MAX_SEQUENCE_LENGTH)
x_data_Av = pad_sequences(sequences_Av, maxlen=MAX_SEQUENCE_LENGTH)

labels = np.asarray(gs_data)

# A single permutation is reused for side A, side B and the labels so the
# three stay row-aligned after shuffling.
indices = np.arange(labels.shape[0])
np.random.shuffle(indices)
data_A = data_A[indices]

nb_validation_samples = int(VALIDATION_SPLIT * labels.shape[0])
# Split at an explicit position. The `[:-n]` / `[-n:]` idiom silently
# inverts when n == 0 (`[:-0]` is empty and `[-0:]` is everything), which
# would leave no training data for a tiny corpus or a zero split.
split_at = labels.shape[0] - nb_validation_samples
x_train_A = data_A[:split_at]
x_val_A = data_A[split_at:]

# --- side B: a second, independent tokenizer and vocabulary ---------------
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_data_B + valid_data_B)
sequences_B = tokenizer.texts_to_sequences(train_data_B)
sequences_Bv = tokenizer.texts_to_sequences(valid_data_B)
word_index_B = tokenizer.word_index
data_B = pad_sequences(sequences_B, maxlen=MAX_SEQUENCE_LENGTH)
x_data_Bv = pad_sequences(sequences_Bv, maxlen=MAX_SEQUENCE_LENGTH)

data_B = data_B[indices]
x_train_B = data_B[:split_at]
x_val_B = data_B[split_at:]

labels = labels[indices]

y_train = labels[:split_at]
y_val = labels[split_at:]

# File-name template of the pretrained vectors for each supported
# representation; %d is filled with EMBEDDING_DIM.
_VECTOR_FILES = {
    "glove": 'glove.6B.%dd.txt',
    "fastText": 'wikiEn_Full_H%d.model.vec',
    "word2vec": 'w2v_En_vector_space_H%d.vec',
}

# Fail with a clear message instead of a NameError on an unbound file handle.
if representation not in _VECTOR_FILES:
    raise ValueError("unknown representation %r; expected one of: %s"
                     % (representation, ", ".join(sorted(_VECTOR_FILES))))

# Maps each word to its float32 coefficient vector.
embeddings_index = {}
# `with` closes the file even when a line fails to parse.
with open(os.path.join(VECTOR_DIR,
                       _VECTOR_FILES[representation] % EMBEDDING_DIM)) as f:
    for line in f:
        values = line.split()
        # Keep only lines shaped "<word> <EMBEDDING_DIM numbers>". Anything
        # else would be stored as a wrong-length vector and later broadcast
        # into an embedding-matrix row.
        # NOTE(review): presumably this also drops the "<count> <dim>" header
        # that .vec text files can start with -- verify against the files.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))
#embeddings_index['###'] = np.zeros(100)

def _embedding_matrix_for(word_index):
    """Build the pretrained-vector matrix for one vocabulary.

    The result has ``len(word_index) + 1`` rows and ``EMBEDDING_DIM`` columns.
    Row ``i`` receives the vector stored in ``embeddings_index`` for the word
    whose index is ``i``; rows of words without a stored vector stay all-zero.
    """
    matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for token, row in word_index.items():
        vector = embeddings_index.get(token)
        if vector is None:
            # words not found in embedding index will be all-zeros.
            continue
        matrix[row] = vector
    return matrix


embedding_matrix_A = _embedding_matrix_for(word_index_A)
embedding_matrix_B = _embedding_matrix_for(word_index_B)

# --------------------------

def models(M, nb_samples, timesteps, embedding_dim, output_dim):
    """Build one of the attention test architectures, selected by name.

    Args:
        M: architecture name, one of:
            "base_line"        - one attention-wrapped LSTM returning sequences.
            "stck"             - stacked attention-wrapped LSTM, LSTM, GRU and
                                 SimpleRNN layers, all returning sequences.
            "simple_att"       - one attention-wrapped LSTM with
                                 return_sequences=False.
            "bidir_att"        - bidirectional LSTM followed by an
                                 attention-wrapped LSTM.
            "non_symbolic_att" - the "base_line" recurrent layer built with
                                 the functional API (no final activation).
        nb_samples: batch size baked into the model's batch input shape.
        timesteps: sequence length of the input.
        embedding_dim: size of each input timestep vector.
        output_dim: number of units of the last recurrent layer.

    Returns:
        The uncompiled Keras model whose input has batch shape
        (nb_samples, timesteps, embedding_dim).

    Raises:
        ValueError: if M is not one of the names listed above.
    """
    batch_shape = (nb_samples, timesteps, embedding_dim)

    if M == "base_line":
        model = Sequential()
        model.add(InputLayer(batch_input_shape=batch_shape))
        model.add(Attention(recurrent.LSTM(
            output_dim, input_dim=embedding_dim,
            return_sequences=True, consume_less='mem')))
        model.add(core.Activation('relu'))

    elif M == "stck":
        # Stacked variant covering every RNN type and the consume_less
        # options; consume_less='cpu' was marked "not supported" by the
        # original author and is therefore left out.
        # NOTE(review): the first LSTM emits output_dim features while the
        # next layer declares input_dim=embedding_dim -- confirm this only
        # works when the two sizes match.
        model = Sequential()
        model.add(InputLayer(batch_input_shape=batch_shape))
        model.add(Attention(recurrent.LSTM(
            output_dim, input_dim=embedding_dim,
            consume_less='gpu', return_sequences=True)))
        model.add(Attention(recurrent.LSTM(
            embedding_dim, input_dim=embedding_dim,
            consume_less='mem', return_sequences=True)))
        model.add(Attention(recurrent.GRU(
            embedding_dim, input_dim=embedding_dim,
            consume_less='mem', return_sequences=True)))
        model.add(Attention(recurrent.SimpleRNN(
            output_dim, input_dim=embedding_dim,
            consume_less='mem', return_sequences=True)))
        model.add(core.Activation('relu'))

    elif M == "simple_att":
        # Only the last step is returned (return_sequences=False).
        model = Sequential()
        model.add(InputLayer(batch_input_shape=batch_shape))
        model.add(Attention(recurrent.LSTM(
            output_dim, input_dim=embedding_dim,
            return_sequences=False, consume_less='mem')))
        model.add(core.Activation('relu'))

    elif M == "bidir_att":
        # Bidirectional encoder feeding the attention layer.
        model = Sequential()
        model.add(InputLayer(batch_input_shape=batch_shape))
        model.add(wrappers.Bidirectional(recurrent.LSTM(
            embedding_dim, input_dim=embedding_dim, return_sequences=True)))
        model.add(Attention(recurrent.LSTM(
            output_dim, input_dim=embedding_dim,
            return_sequences=True, consume_less='mem')))
        model.add(core.Activation('relu'))

    elif M == "non_symbolic_att":
        # Functional-API variant; local names avoid shadowing builtin input().
        inputs = Input(batch_shape=batch_shape)
        outputs = Attention(recurrent.LSTM(
            output_dim, input_dim=embedding_dim,
            return_sequences=True, consume_less='mem'))(inputs)
        model = Model(inputs, outputs)

    else:
        # Without this branch an unknown name reached `return model` with
        # `model` unbound and failed with an opaque UnboundLocalError.
        raise ValueError(
            "unknown model name %r; expected one of: base_line, stck, "
            "simple_att, bidir_att, non_symbolic_att" % (M,))

    return model

# Learning constants
outfile = "probabilities_bidir"
h_STATES, EPOCHS, DENSES = 10, 200, 100

# Shape arguments later handed to models(): batch size, sequence length,
# input vector size and recurrent output size.
nb_samples, timesteps = 20, h_STATES
embedding_dim, output_dim = EMBEDDING_DIM, 6
embedding_num = 12

# Short aliases for the two pretrained embedding matrices.
x_a, x_b = embedding_matrix_A, embedding_matrix_B
# NOTE(review): this span looks unfinished. The two commented-out
# np.random.random(...) calls appear to be placeholder inputs/targets that
# were being replaced -- TODO confirm what should take their place.
#np.random.random((nb_samples, timesteps, embedding_dim))
# NOTE(review): the assignment below has no right-hand side, so the file does
# not parse as written; a target array still has to be supplied here.
y = 
#np.random.random((nb_samples, timesteps, output_dim))

# NOTE(review): `x` is not assigned anywhere in the visible script (only x_a
# and x_b are); unless a star import provides it, the first print raises
# NameError -- verify.
print("Shape of X: ", x.shape)
print("Shape of Y: ", y.shape)

# One attention encoder per side of the sentence pair; "simple_att" returns
# only the last step of the recurrent layer.
sent_A_pool = models("simple_att", nb_samples, timesteps, embedding_dim, output_dim)
sent_B_pool = models("simple_att", nb_samples, timesteps, embedding_dim, output_dim)

# -----------------------------------------------------------------------
# merge() and Dense operate on tensors, so the encoders' output tensors
# (not the model objects) are concatenated.
pair = merge([sent_A_pool.output, sent_B_pool.output], mode='concat', concat_axis=-1)
degrees = Dense(6, activation="softmax")(pair)

# `similarity` was used below without ever being created: wrap the graph in
# a two-input model that maps a sentence pair to the 6-way softmax.
similarity = Model([sent_A_pool.input, sent_B_pool.input], degrees)

similarity.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

print(similarity.summary())

# happy learning!
# NOTE(review): x_train_* / x_val_* come from pad_sequences (maxlen =
# MAX_SEQUENCE_LENGTH) while the encoders declare a batch shape of
# (nb_samples, timesteps, embedding_dim) -- confirm whether an embedding
# lookup is still missing between the two.
# NOTE(review): categorical_crossentropy expects one-hot targets -- confirm
# how y_train / y_val are encoded.
# The batch size must equal the nb_samples baked into the encoders' batch
# input shape, so it is taken from that constant (same value, 20).
similarity.fit([x_train_A, x_train_B], y_train, validation_data=([x_val_A, x_val_B], y_val),
               nb_epoch=EPOCHS, batch_size=nb_samples)

# test config
similarity.get_config()

# test to and from json
similarity = model_from_json(similarity.to_json(), custom_objects=dict(Attention=Attention))
# Call form of print, matching the print(...) calls used earlier in the file.
print(similarity.summary())

# Echo the hyper-parameters of this run. Written as a print(...) call with a
# single pre-formatted string, so it matches the print calls used earlier in
# the file and behaves the same under Python 2 and 3.
print("\nParameters:\n---------------------\nh_STATES=%d\nEPOCHS=%d\nDENSES=%d\nRepresentation=%s\nEMBEDDING_DIM=%d\nMAX_SEQUENCE_LENGTH=%d" % (
    h_STATES,
    EPOCHS,
    DENSES,
    representation,
    EMBEDDING_DIM,
    MAX_SEQUENCE_LENGTH,
))