From 9ca152de2f57002f9813b73537023da6026af7b6 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Sat, 26 Aug 2017 20:23:27 +0200 Subject: [PATCH 1/8] Implementation of CTC in pure theano with custom gradient (which should hopefully be more robust to precision issues) --- .../ctc.py | 262 ++++++++++++++++++ .../test_ctc2.py | 123 ++++++++ 2 files changed, 385 insertions(+) create mode 100644 papers/connectionist_temporal_classification/ctc.py create mode 100644 papers/connectionist_temporal_classification/test_ctc2.py diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py new file mode 100644 index 0000000..70e5526 --- /dev/null +++ b/papers/connectionist_temporal_classification/ctc.py @@ -0,0 +1,262 @@ +# Author: Nicolas Granger +# +# Implements the connectionist temporal classification loss from: +# Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. (2006, June). +# Connectionist temporal classification: labelling unsegmented sequence data +# with recurrent neural networks. In Proceedings of the 23rd international +# conference on Machine learning (pp. 369-376). ACM. +# ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf + +import numpy as np +import theano +import theano.tensor as T +from theano.tensor import discrete_dtypes, continuous_dtypes + + +# Bits of the CTC algorithm --------------------------------------------------- + +def insert_alternating_blanks(labels, blank_label): + batch_size, label_size = labels.shape + blanked_labels = T.zeros((batch_size, 2 * label_size + 1), dtype=np.int32) + blanked_labels = T.set_subtensor(blanked_labels[:, 0::2], blank_label) + blanked_labels = T.set_subtensor(blanked_labels[:, 1:-1:2], labels) + return blanked_labels + + +def ctc_forward(log_odds, seq_sizes, + blanked_labels, label_sizes, not_repeated): + batch_dur, batch_sz, _ = log_odds.shape + batch_dur, batch_sz = T.cast(batch_dur, 'int32'), T.cast(batch_sz, 'int32') + label_size = blanked_labels.shape[1] + + def step(t, a_tm1, log_odds_, + seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): + y_t = log_odds_[t] + k = T.max(a_tm1, axis=-1, keepdims=True) + k = T.switch(T.isinf(k), 0, k) + a_tm1 = T.switch(T.isinf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space + a_t = a_tm1 + a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) + a_t = T.inc_subtensor(a_t[:, 2:], a_tm1[:, :-2] * not_repeated_) + + # stop after a_T(|l'|) + mask = T.ge(t, seq_sizes_)[:, None] \ + + T.ge(T.arange(label_size)[None, :], + 2 * label_sizes_[:, None] + 1) + + a_t = T.switch( # back to log space + T.eq(a_t, 0) + mask, -np.inf, + T.log(a_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + return a_t + + alpha_init = -np.inf * T.ones((batch_sz, label_size)) + alpha_init = T.set_subtensor(alpha_init[:, 0], 0) + + alphas, _ = theano.scan( + fn=step, + sequences=[T.arange(batch_dur)], + outputs_info=[alpha_init], + non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, + not_repeated], + name="ctc_forward", + profile=True) + + return alphas + + +def ctc_backward(log_odds, seq_sizes, + blanked_labels, label_sizes, not_repeated): + batch_dur, batch_sz, _ = log_odds.shape + label_size = blanked_labels.shape[1] + + def step(t, b_tp1, log_odds_, + seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): + y_t = log_odds_[t] + k = T.max(b_tp1, axis=-1, keepdims=True) + k = T.switch(T.isinf(k), 0, k) + b_tp1 = T.switch(T.isinf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space + + # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion + 
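+        # Because sequences in a batch have different lengths, the recursion
+        # cannot simply start at the last step of the scan: a unit of
+        # probability mass is injected at position 2*|l| (the trailing blank)
+        # when t reaches the last valid frame of each sequence, and after the
+        # shift below it also reaches position 2*|l| - 1. This reproduces the
+        # initialisation beta_T(|l'|) = y_T(blank) and
+        # beta_T(|l'| - 1) = y_T(l_|l|) from Graves et al. (2006).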
starter_t = T.eq(t, seq_sizes_ - 1)[:, None] \ + * T.eq((2 * label_sizes_)[:, None], + T.arange(label_size)[None, :]) * 1 + b_tp1 += starter_t # initialize recursion + + b_t = b_tp1 + b_t = T.inc_subtensor(b_t[:, :-1], b_tp1[:, 1:]) + b_t = T.inc_subtensor(b_t[:, :-2], b_tp1[:, 2:] * not_repeated_) + b_t = T.switch( # back to log space + T.eq(b_t, 0), -np.inf, + T.log(b_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + return b_t + + beta_init = - np.inf * T.ones((batch_sz, label_size)) + + betas, _ = theano.scan( + fn=step, + sequences=[T.arange(batch_dur)], + outputs_info=[beta_init], + non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, + not_repeated], + go_backwards=True, + name="ctc_backward", + profile=True) + betas = betas[::-1, :, :] + + return betas + + +# Theano Op ------------------------------------------------------------------- + +def ctc_perform_graph(preds, seq_sizes, labels, label_sizes, blank): + _, batch_size, voca_size = preds.shape + + log_preds = T.log(preds) + blanked_labels = insert_alternating_blanks(labels, blank) + not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) + betas = ctc_backward(log_preds, seq_sizes, + blanked_labels, label_sizes, not_repeated) + + loss = -T.switch(T.all(T.isinf(betas[0, :, :2]), axis=1), + -np.inf, # impossible sequences, eg: too short + T.log(T.exp(betas[0, :, 0]) + T.exp(betas[0, :, 1]))) + + return log_preds, blanked_labels, not_repeated, betas, loss + + +def ctc_grad_graph(inputs, output_gradients): + linear_out, seq_durations, labels, label_sizes, _ = inputs + seq_size, batch_size, voca_size = linear_out.shape + label_size = labels.shape[1] + + # TODO: will theano optimize this redundant call when both loss and + # gradient are requested separately? 
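+    # The gradient computed below follows Graves et al. (2006): for each
+    # class k and time step t, alpha_t(s) * beta_t(s) is summed over the
+    # positions s of the blanked label sequence that carry class k, and then
+    #   d(-ln p(l|x)) / dy_t(k) =
+    #       - (1 / (p(l|x) * y_t(k)^2)) * sum_{s in lab(l,k)} alpha_t(s) beta_t(s)
+    # where y_t(k) are the softmax outputs. Everything is kept in log space
+    # and only exponentiated at the very end.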
+ log_preds, blanked_labels, not_repeated, betas, loss = \ + ctc_perform_graph(*inputs) + + alphas = ctc_forward(log_preds, seq_durations, + blanked_labels, label_sizes, not_repeated) + + log_pl = - loss + + # sum_{s \in lab(l, k)} a_t(s) b_t(s) + def fwbw_sum_step(k, s, labels_, ab_): + s_view = s[:, T.arange(batch_size), labels_[:, k]] + ab_view = ab_[:, :, k] + next_sum = ab_view + T.switch(T.isinf(s_view), + 0, T.log(1 + T.exp(s_view - ab_view))) + next_sum = T.switch(T.isinf(ab_view), s_view, next_sum) + s = T.set_subtensor(s_view, next_sum) + return s + + ab = T.switch(T.isinf(alphas) + T.isinf(betas), -np.inf, alphas + betas) + fwbw_sum = theano.scan( + fn=fwbw_sum_step, + sequences=[T.arange(2 * label_size + 1)], + outputs_info=[-np.inf * T.ones((seq_size, batch_size, voca_size))], + non_sequences=[blanked_labels, ab], + name="fwbw_sum", + profile=True)[0][-1] + + # d(loss) / dy + dloss_dy = T.switch( + T.isinf(loss)[None, :, None], + 0, + - T.exp(fwbw_sum - log_pl[None, :, None] - 2 * log_preds)) + + return [dloss_dy * output_gradients[0][None, :, None], + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type()] + + +def make_ctc_op(): + linear_out_var = T.tensor3() + seq_durations_var = T.ivector() + labels_var = T.imatrix() + label_sizes_var = T.ivector() + blank_var = T.iscalar() + + _, _, _, _, loss = ctc_perform_graph( + linear_out_var, seq_durations_var, labels_var, + label_sizes_var, blank_var) + + return theano.OpFromGraph( + inputs=[linear_out_var, seq_durations_var, + labels_var, label_sizes_var, blank_var], + outputs=[loss], + grad_overrides=ctc_grad_graph, + inline=True, name="ctcLossOp") + + +CTCLossOp = make_ctc_op() + + +# ----------------------------------------------------------------------------- + +def ctc_loss(preds, durations, labels, label_sizes, blank=-1): + """Compute the Connectionnist Temporal Classification loss [#graves2006]_. + + .. math:: L = - ln\left( \sum_{\pi \in \mathcal{B}^{-1}(l)} P(\pi | y) + \right) + + where :math:`y` is the sequence of predictions, :math:`l` the target + label sequence without blanks or repetition, :math:`\pi` is taken from the + ensemble of possible label assignments over the observations and + :math:`\mathcal{B}` is a function that remove blanks and repetitions for a + sequence of labels. + + Parameters + ---------- + preds : Theano shared variable, expression or numpy array + The probabilities of each class (for example the output of a softmax + function) with shape duration x batch_size x nclasses. + durations: Theano shared variable, expression or numpy array + An _integer_ vector of size batch_size contining the actual length of + each sequence in preds. + labels: Theano shared variable, expression or numpy array + An _integer_ matrix of size batch_size x label_size containg the target + labels. + label_sizes: Theano shared variable, expression or numpy array + An _integer_ vector of size batch_size contining the actual length of + each sequence in labels. + blank: + The blank label class, by default the last one. + + Returns + ------- + Theano tensor + A vector expression with the CTC loss of each sequence. + + Reference + --------- + .. [#graves2006] Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. + (2006, June). Connectionist temporal classification: labelling + unsegmented sequence data with recurrent neural networks. In + Proceedings of the 23rd international conference on Machine learning + (pp. 369-376). 
ACM. ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf + + """ + preds = T.as_tensor_variable(preds) + durations = T.as_tensor_variable(durations) + labels = T.as_tensor_variable(labels) + label_sizes = T.as_tensor_variable(label_sizes) + blank = T.cast(T.as_tensor_variable(blank), 'int32') + + if not(preds.dtype in continuous_dtypes and preds.ndim == 3): + raise ValueError("preds must continuous with dimension 3") + if not (durations.dtype in discrete_dtypes and durations.ndim == 1): + raise ValueError("durations must be a integer vector") + if not (labels.dtype in discrete_dtypes and labels.ndim == 2): + raise ValueError("labels must be an integer matrix") + if not (label_sizes.dtype in discrete_dtypes and label_sizes.ndim == 1): + raise ValueError("label_sizes must be an integer vector") + if not (blank.dtype in discrete_dtypes and blank.ndim == 0): + raise ValueError("blank must be an integer value") + + voca_size = T.cast(preds.shape[2], 'int32') + labels = labels % voca_size + blank = blank % voca_size + + return CTCLossOp(preds, durations, labels, label_sizes, blank) diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py new file mode 100644 index 0000000..ff10c0b --- /dev/null +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -0,0 +1,123 @@ +import unittest +import numpy as np +import theano +import theano.tensor as T +from theano.tests import unittest_tools + +from ctc import ctc_loss + + +class TestCTC(unittest.TestCase): + def setUp(self): + unittest_tools.seed_rng() + + def test_simple_precomputed(self): + # Test obtained from Torch tutorial at: + # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md + + linear_out = np.asarray([ + [[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]], + [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]], + [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]] + ], dtype=np.float32) + + seq_sizes = np.asarray([1, 3, 3], dtype=np.int32) + + labels = np.asarray([[1, 0], [3, 3], [2, 3]], dtype=np.int32) + + label_sizes = np.asarray([1, 2, 2], dtype=np.int32) + + expected_losses = np.asarray([1.609437943, 7.355742931, 4.938849926], + dtype=np.float32) + + blank = 0 + + expected_grad = np.asarray([ + [[0.2, -0.8, 0.2, 0.2, 0.2], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, + 0.636408627], + [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, + 0.636408627]], + [[0, 0, 0, 0, 0], + [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, + 0.636408627], + [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, + 0.636408627]], + [[0, 0, 0, 0, 0], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, + 0.636408627], + [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, + 0.636408627]] + ], dtype=np.float32) + + seq_size, batch_size, voca_size = linear_out.shape + + linear_out_t = T.as_tensor_variable(linear_out) + seq_sizes_t = T.as_tensor_variable(seq_sizes) + labels_t = T.as_tensor_variable(labels) + label_sizes_t = T.as_tensor_variable(label_sizes) + blank_t = T.as_tensor_variable(blank) + + preds = T.nnet.softmax( + linear_out_t.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + losses = ctc_loss(preds, seq_sizes_t, labels_t, label_sizes_t, blank_t) + + assert np.allclose(losses.eval(), expected_losses) + + grad = theano.grad(losses.sum(), wrt=linear_out_t) + + assert np.allclose(grad.eval(), expected_grad) + + def test_random(self): 
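+        # Random batch exercising edge cases: a full-length label sequence,
+        # an empty one, a label made of a single repeated class, and a
+        # sequence too short for its labels. The test only checks that the
+        # gradient evaluates without NaNs and that it agrees with a finite
+        # difference approximation (verify_grad).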
+ batch_size = 16 + label_size = 5 + voca_size = 4 + seq_size = 20 + + label_sizes = np.random.randint( + 0, label_size, size=(batch_size,), dtype=np.int32) + label_sizes[0] = label_size + label_sizes[1] = 0 + label_sizes[2] = 5 + label_sizes[3] = 5 + + labels = np.random.randint( + 0, voca_size - 1, + size=(batch_size, label_size), dtype=np.int32) + labels[3] = 0 + + seq_sizes = np.array([ + np.random.randint(max(1, label_sizes[i]), seq_size) + for i in range(batch_size)], dtype=np.int32) + seq_sizes[2] = 4 + + linear_out = np.random.randn( + seq_size, batch_size, voca_size).astype(np.float32) + + # check edge cases + # TODO + + # check the gradient can be computed at all + linear_out_var = T.tensor3() + preds = T.nnet.softmax( + linear_out_var.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + + g = theano.grad(ctc_loss(preds, seq_sizes, + labels, label_sizes).sum(), + wrt=linear_out_var).eval({linear_out_var: linear_out}) + assert not np.any(np.isnan(g)) + + # check correctness against finite difference approximation + def f(linear_out_): + preds_ = T.nnet.softmax( + linear_out_.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) + # prevent finite differences from failing + loss = T.switch(T.isinf(loss), 0, loss) + return loss + + unittest_tools.verify_grad( + f, [linear_out], rel_tol=0.1) From 96b8d68f5c355e8c8b3ecd0f0824c71539a7291b Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Fri, 15 Dec 2017 16:23:22 +0100 Subject: [PATCH 2/8] fix error with empty sequences+added low-level test for forward backward passes --- .../ctc.py | 13 ++- .../test_ctc2.py | 86 ++++++++++++++++++- 2 files changed, 90 insertions(+), 9 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index 70e5526..22ff9fc 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -33,7 +33,7 @@ def step(t, a_tm1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] k = T.max(a_tm1, axis=-1, keepdims=True) - k = T.switch(T.isinf(k), 0, k) + k = T.switch(T.all(T.isinf(a_tm1), axis=-1, keepdims=True), 0, k) a_tm1 = T.switch(T.isinf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space a_t = a_tm1 a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) @@ -58,8 +58,7 @@ def step(t, a_tm1, log_odds_, outputs_info=[alpha_init], non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated], - name="ctc_forward", - profile=True) + name="ctc_forward") return alphas @@ -73,7 +72,7 @@ def step(t, b_tp1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] k = T.max(b_tp1, axis=-1, keepdims=True) - k = T.switch(T.isinf(k), 0, k) + k = T.switch(T.all(T.isinf(b_tp1), axis=-1, keepdims=True), 0, k) b_tp1 = T.switch(T.isinf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion @@ -99,8 +98,7 @@ def step(t, b_tp1, log_odds_, non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated], go_backwards=True, - name="ctc_backward", - profile=True) + name="ctc_backward") betas = betas[::-1, :, :] return betas @@ -155,8 +153,7 @@ def fwbw_sum_step(k, s, labels_, ab_): sequences=[T.arange(2 * label_size + 1)], outputs_info=[-np.inf * T.ones((seq_size, batch_size, voca_size))], non_sequences=[blanked_labels, ab], - name="fwbw_sum", - profile=True)[0][-1] + 
name="fwbw_sum")[0][-1] # d(loss) / dy dloss_dy = T.switch( diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index ff10c0b..b10e61b 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -4,13 +4,97 @@ import theano.tensor as T from theano.tests import unittest_tools -from ctc import ctc_loss +from papers.connectionist_temporal_classification.ctc import \ + ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks + + +def log_softmax(X): + k = T.max(X, axis=-1, keepdims=True) + norm_X = X - k + log_sum_exp_X = T.log(T.sum(T.exp(norm_X), axis=-1, keepdims=True)) + return norm_X - log_sum_exp_X class TestCTC(unittest.TestCase): def setUp(self): unittest_tools.seed_rng() + def test_forward_backward(self): + batch_size = 6 + label_size = 7 + voca_size = 5 + seq_size = 10 + + label_lengths = np.random.randint(0, label_size, + size=(batch_size,), dtype=np.int32) + label_lengths[0] = label_size # extremum case + label_lengths[1] = 0 # extremum case + labels = np.array( + [np.random.randint(0, voca_size - 1, size=label_size, dtype=np.int32) + for _ in range(batch_size)]) + for i in range(batch_size): + labels[i, label_lengths[i]:] = -1 + + seq_durations = np.array([ + np.random.randint(max(1, label_lengths[i]), seq_size) + for i in range(batch_size)], dtype=np.int32) + + linear_out = np.random.randn(seq_size, batch_size, voca_size) \ + .astype(np.float32) + + blank_class = -1 + blank_class = np.mod(blank_class, voca_size) + + labels = np.mod(labels, voca_size) + + log_odds = log_softmax(linear_out) + blanked_labels = insert_alternating_blanks(T.mod(labels, voca_size), + blank_class) + not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) + + alphas = ctc_forward(log_odds, seq_durations, + blanked_labels, label_lengths, not_repeated) + betas = ctc_backward(log_odds, seq_durations, + blanked_labels, label_lengths, not_repeated) + + preds = log_softmax(linear_out) + + y_blanks = preds[:, T.arange(batch_size)[:, None], blanked_labels] + p_l = T.sum(T.exp(alphas + betas - y_blanks), axis=2) + + alphas = alphas.eval() + betas = betas.eval() + preds = preds.eval() + + for i in range(batch_size): + assert np.allclose(alphas[0, i, 0], preds[0, i, -1]) + if label_lengths[i] > 0: + assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]]) + else: + assert np.isneginf(alphas[0, i, 1]) + assert np.all(np.isneginf(alphas[0, i, 2:])) + + for i in range(batch_size): + t = seq_durations[i] - 1 + l = label_lengths[i] + assert np.allclose(betas[t, i, 2 * l], preds[t, i, -1]) + if l > 0: + assert np.allclose(betas[t, i, 2 * l - 1], + preds[t, i, labels[i, l - 1]]) + assert np.all(np.isneginf(betas[t, i, :max(l - 2, 0)])) + else: + assert np.all(np.isneginf(betas[t, i, 1:])) + + p_l = p_l.eval() + + for i in range(batch_size): + assert (np.allclose(p_l[:seq_durations[i], i], p_l[0, i])) + a, b = max(0, 2 * label_lengths[i] - 1), 2 * label_lengths[i] + 1 + p_li = np.exp(alphas[seq_durations[i] - 1, i, a:b]).sum() + assert np.allclose(p_li, p_l[0, i]) + p_li = np.exp(betas[0, i, :2]).sum() + assert np.allclose(p_li, p_l[0, i]) + def test_simple_precomputed(self): # Test obtained from Torch tutorial at: # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md From afcf4b7ebacc5feb3a93366167750e1f56638d9a Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Sat, 16 Dec 2017 00:55:49 +0100 Subject: [PATCH 3/8] 
more fixes for precision issues --- .../ctc.py | 54 +++++++++++-------- .../test_ctc2.py | 21 ++++---- 2 files changed, 43 insertions(+), 32 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index 22ff9fc..a358726 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -11,6 +11,18 @@ import theano import theano.tensor as T from theano.tensor import discrete_dtypes, continuous_dtypes +from theano.printing import Print + + +def isneginf(x, neginf=-1e27): + return x < neginf + + +def logaddexp(x, y, inf=1e9): + x, y = T.minimum(x, y), T.maximum(x, y) + diff = T.minimum(y - x, T.log(inf) / T.log(10)) + res = x + T.log(1 + T.exp(diff)) + return T.switch((y - x > T.log(inf) / T.log(10)), y, res) # Bits of the CTC algorithm --------------------------------------------------- @@ -33,8 +45,8 @@ def step(t, a_tm1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] k = T.max(a_tm1, axis=-1, keepdims=True) - k = T.switch(T.all(T.isinf(a_tm1), axis=-1, keepdims=True), 0, k) - a_tm1 = T.switch(T.isinf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space + k = T.switch(T.all(isneginf(a_tm1), axis=-1, keepdims=True), 0, k) + a_tm1 = T.switch(isneginf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space a_t = a_tm1 a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) a_t = T.inc_subtensor(a_t[:, 2:], a_tm1[:, :-2] * not_repeated_) @@ -45,11 +57,11 @@ def step(t, a_tm1, log_odds_, 2 * label_sizes_[:, None] + 1) a_t = T.switch( # back to log space - T.eq(a_t, 0) + mask, -np.inf, + T.eq(a_t, 0) + mask, -1e30, T.log(a_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) return a_t - alpha_init = -np.inf * T.ones((batch_sz, label_size)) + alpha_init = -1e30 * T.ones((batch_sz, label_size)) alpha_init = T.set_subtensor(alpha_init[:, 0], 0) alphas, _ = theano.scan( @@ -72,8 +84,9 @@ def step(t, b_tp1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] k = T.max(b_tp1, axis=-1, keepdims=True) - k = T.switch(T.all(T.isinf(b_tp1), axis=-1, keepdims=True), 0, k) - b_tp1 = T.switch(T.isinf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space + k = T.switch(T.all(isneginf(b_tp1), axis=-1, keepdims=True), 0, k) + b_tp1 = T.switch(isneginf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space + b_tp1 = b_tp1 # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion starter_t = T.eq(t, seq_sizes_ - 1)[:, None] \ @@ -85,11 +98,11 @@ def step(t, b_tp1, log_odds_, b_t = T.inc_subtensor(b_t[:, :-1], b_tp1[:, 1:]) b_t = T.inc_subtensor(b_t[:, :-2], b_tp1[:, 2:] * not_repeated_) b_t = T.switch( # back to log space - T.eq(b_t, 0), -np.inf, + T.eq(b_t, 0), -1e30, T.log(b_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) return b_t - beta_init = - np.inf * T.ones((batch_sz, label_size)) + beta_init = -1e30 * T.ones((batch_sz, label_size)) betas, _ = theano.scan( fn=step, @@ -115,16 +128,14 @@ def ctc_perform_graph(preds, seq_sizes, labels, label_sizes, blank): betas = ctc_backward(log_preds, seq_sizes, blanked_labels, label_sizes, not_repeated) - loss = -T.switch(T.all(T.isinf(betas[0, :, :2]), axis=1), - -np.inf, # impossible sequences, eg: too short - T.log(T.exp(betas[0, :, 0]) + T.exp(betas[0, :, 1]))) + loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) return log_preds, blanked_labels, not_repeated, betas, loss def ctc_grad_graph(inputs, output_gradients): - linear_out, seq_durations, labels, label_sizes, _ = 
inputs - seq_size, batch_size, voca_size = linear_out.shape + preds, seq_durations, labels, label_sizes, _ = inputs + seq_size, batch_size, voca_size = preds.shape label_size = labels.shape[1] # TODO: will theano optimize this redundant call when both loss and @@ -141,23 +152,22 @@ def ctc_grad_graph(inputs, output_gradients): def fwbw_sum_step(k, s, labels_, ab_): s_view = s[:, T.arange(batch_size), labels_[:, k]] ab_view = ab_[:, :, k] - next_sum = ab_view + T.switch(T.isinf(s_view), - 0, T.log(1 + T.exp(s_view - ab_view))) - next_sum = T.switch(T.isinf(ab_view), s_view, next_sum) + next_sum = logaddexp(s_view, ab_view) s = T.set_subtensor(s_view, next_sum) return s - ab = T.switch(T.isinf(alphas) + T.isinf(betas), -np.inf, alphas + betas) + ab = alphas + betas fwbw_sum = theano.scan( fn=fwbw_sum_step, sequences=[T.arange(2 * label_size + 1)], - outputs_info=[-np.inf * T.ones((seq_size, batch_size, voca_size))], + outputs_info=[-1e30 * T.ones((seq_size, batch_size, voca_size))], non_sequences=[blanked_labels, ab], + strict=True, name="fwbw_sum")[0][-1] # d(loss) / dy dloss_dy = T.switch( - T.isinf(loss)[None, :, None], + isneginf(loss)[None, :, None], 0, - T.exp(fwbw_sum - log_pl[None, :, None] - 2 * log_preds)) @@ -169,18 +179,18 @@ def fwbw_sum_step(k, s, labels_, ab_): def make_ctc_op(): - linear_out_var = T.tensor3() + preds_var = T.tensor3() seq_durations_var = T.ivector() labels_var = T.imatrix() label_sizes_var = T.ivector() blank_var = T.iscalar() _, _, _, _, loss = ctc_perform_graph( - linear_out_var, seq_durations_var, labels_var, + preds_var, seq_durations_var, labels_var, label_sizes_var, blank_var) return theano.OpFromGraph( - inputs=[linear_out_var, seq_durations_var, + inputs=[preds_var, seq_durations_var, labels_var, label_sizes_var, blank_var], outputs=[loss], grad_overrides=ctc_grad_graph, diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index b10e61b..902c69f 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -5,7 +5,7 @@ from theano.tests import unittest_tools from papers.connectionist_temporal_classification.ctc import \ - ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks + ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks, isneginf def log_softmax(X): @@ -71,8 +71,8 @@ def test_forward_backward(self): if label_lengths[i] > 0: assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]]) else: - assert np.isneginf(alphas[0, i, 1]) - assert np.all(np.isneginf(alphas[0, i, 2:])) + assert isneginf(alphas[0, i, 1]) + assert np.all(isneginf(alphas[0, i, 2:])) for i in range(batch_size): t = seq_durations[i] - 1 @@ -81,9 +81,9 @@ def test_forward_backward(self): if l > 0: assert np.allclose(betas[t, i, 2 * l - 1], preds[t, i, labels[i, l - 1]]) - assert np.all(np.isneginf(betas[t, i, :max(l - 2, 0)])) + assert np.all(isneginf(betas[t, i, :max(l - 2, 0)])) else: - assert np.all(np.isneginf(betas[t, i, 1:])) + assert np.all(isneginf(betas[t, i, 1:])) p_l = p_l.eval() @@ -147,11 +147,11 @@ def test_simple_precomputed(self): ).reshape((seq_size, batch_size, voca_size)) losses = ctc_loss(preds, seq_sizes_t, labels_t, label_sizes_t, blank_t) - assert np.allclose(losses.eval(), expected_losses) + assert np.allclose(losses.eval(), expected_losses, atol=1) grad = theano.grad(losses.sum(), wrt=linear_out_t) - assert np.allclose(grad.eval(), expected_grad) + assert np.allclose(grad.eval(), 
expected_grad, rtol=.001, atol=1) def test_random(self): batch_size = 16 @@ -190,7 +190,8 @@ def test_random(self): g = theano.grad(ctc_loss(preds, seq_sizes, labels, label_sizes).sum(), - wrt=linear_out_var).eval({linear_out_var: linear_out}) + wrt=linear_out_var).eval( + {linear_out_var: linear_out.astype(np.float32)}) assert not np.any(np.isnan(g)) # check correctness against finite difference approximation @@ -200,8 +201,8 @@ def f(linear_out_): ).reshape((seq_size, batch_size, voca_size)) loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) # prevent finite differences from failing - loss = T.switch(T.isinf(loss), 0, loss) + loss = T.switch(isneginf(-loss), 0, loss) return loss unittest_tools.verify_grad( - f, [linear_out], rel_tol=0.1) + f, [linear_out], rel_tol=0.1, abs_tol=1) From c7ce022a99f9f41e34e64fa2e8b944fe02ad4545 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Mon, 18 Dec 2017 11:39:58 +0100 Subject: [PATCH 4/8] Improved ctc gradient stability The CTC loss function now takes predictions in log space (before softmax) to avoid precision issues. --- .../ctc.py | 71 ++++++++++++------- .../test_ctc2.py | 41 ++++------- 2 files changed, 59 insertions(+), 53 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index a358726..ac62703 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -11,18 +11,36 @@ import theano import theano.tensor as T from theano.tensor import discrete_dtypes, continuous_dtypes -from theano.printing import Print +# from theano.printing import Print def isneginf(x, neginf=-1e27): return x < neginf -def logaddexp(x, y, inf=1e9): +def logaddexp(x, y, magnitude=9): x, y = T.minimum(x, y), T.maximum(x, y) - diff = T.minimum(y - x, T.log(inf) / T.log(10)) + diff = T.minimum(y - x, magnitude) res = x + T.log(1 + T.exp(diff)) - return T.switch((y - x > T.log(inf) / T.log(10)), y, res) + return T.switch((y - x > magnitude), y, res) + + +def logsumexp(x, axis, keepdims=False): + k = T.max(x, axis=axis, keepdims=True) + return T.log(T.sum(T.exp(x - k), axis=axis, keepdims=keepdims)) + + +def log_softmax(X, axis=-1, clip=None): + k = T.max(X, axis=axis, keepdims=True) + norm_X = X - k + + if clip is not None: + mini = T.log((T.cast(X.shape[axis], 'floatX') - 1) * clip / (1 - clip)) + # norm_X *= - T.min(norm_X, axis=axis, keepdims=True) / mini + norm_X = T.maximum(norm_X, mini) + + log_sum_exp_X = logsumexp(norm_X, axis=axis, keepdims=True) + return norm_X - log_sum_exp_X # Bits of the CTC algorithm --------------------------------------------------- @@ -119,31 +137,31 @@ def step(t, b_tp1, log_odds_, # Theano Op ------------------------------------------------------------------- -def ctc_perform_graph(preds, seq_sizes, labels, label_sizes, blank): - _, batch_size, voca_size = preds.shape +def ctc_perform_graph(linout, seq_sizes, labels, label_sizes, blank): + _, batch_size, voca_size = linout.shape - log_preds = T.log(preds) + log_odds = log_softmax(linout) blanked_labels = insert_alternating_blanks(labels, blank) not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) - betas = ctc_backward(log_preds, seq_sizes, + betas = ctc_backward(log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated) loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) - return log_preds, blanked_labels, not_repeated, betas, loss + return log_odds, blanked_labels, not_repeated, betas, loss def ctc_grad_graph(inputs, 
output_gradients): - preds, seq_durations, labels, label_sizes, _ = inputs - seq_size, batch_size, voca_size = preds.shape + linout, seq_durations, labels, label_sizes, _ = inputs + seq_size, batch_size, voca_size = linout.shape label_size = labels.shape[1] # TODO: will theano optimize this redundant call when both loss and # gradient are requested separately? - log_preds, blanked_labels, not_repeated, betas, loss = \ + log_odds, blanked_labels, not_repeated, betas, loss = \ ctc_perform_graph(*inputs) - alphas = ctc_forward(log_preds, seq_durations, + alphas = ctc_forward(log_odds, seq_durations, blanked_labels, label_sizes, not_repeated) log_pl = - loss @@ -165,11 +183,14 @@ def fwbw_sum_step(k, s, labels_, ab_): strict=True, name="fwbw_sum")[0][-1] - # d(loss) / dy + A = fwbw_sum - log_pl[None, :, None] - 2 * log_odds + + dloss_dy = T.exp(2 * log_odds + logsumexp(A, axis=2, keepdims=True)) \ + - T.exp(log_odds + A) + dloss_dy = T.switch( - isneginf(loss)[None, :, None], - 0, - - T.exp(fwbw_sum - log_pl[None, :, None] - 2 * log_preds)) + (loss[None, :, None] > 1e10) + T.isinf(loss[None, :, None]), + 0, dloss_dy) return [dloss_dy * output_gradients[0][None, :, None], theano.gradient.disconnected_type(), @@ -202,7 +223,7 @@ def make_ctc_op(): # ----------------------------------------------------------------------------- -def ctc_loss(preds, durations, labels, label_sizes, blank=-1): +def ctc_loss(linout, durations, labels, label_sizes, blank=-1): """Compute the Connectionnist Temporal Classification loss [#graves2006]_. .. math:: L = - ln\left( \sum_{\pi \in \mathcal{B}^{-1}(l)} P(\pi | y) @@ -216,9 +237,9 @@ def ctc_loss(preds, durations, labels, label_sizes, blank=-1): Parameters ---------- - preds : Theano shared variable, expression or numpy array - The probabilities of each class (for example the output of a softmax - function) with shape duration x batch_size x nclasses. + linout : Theano shared variable, expression or numpy array + The input values for the softmax function with shape + duration x batch_size x nclasses. durations: Theano shared variable, expression or numpy array An _integer_ vector of size batch_size contining the actual length of each sequence in preds. @@ -245,13 +266,13 @@ def ctc_loss(preds, durations, labels, label_sizes, blank=-1): (pp. 369-376). ACM. 
ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf """ - preds = T.as_tensor_variable(preds) + linout = T.as_tensor_variable(linout) durations = T.as_tensor_variable(durations) labels = T.as_tensor_variable(labels) label_sizes = T.as_tensor_variable(label_sizes) blank = T.cast(T.as_tensor_variable(blank), 'int32') - if not(preds.dtype in continuous_dtypes and preds.ndim == 3): + if not(linout.dtype in continuous_dtypes and linout.ndim == 3): raise ValueError("preds must continuous with dimension 3") if not (durations.dtype in discrete_dtypes and durations.ndim == 1): raise ValueError("durations must be a integer vector") @@ -262,8 +283,8 @@ def ctc_loss(preds, durations, labels, label_sizes, blank=-1): if not (blank.dtype in discrete_dtypes and blank.ndim == 0): raise ValueError("blank must be an integer value") - voca_size = T.cast(preds.shape[2], 'int32') + voca_size = T.cast(linout.shape[2], 'int32') labels = labels % voca_size blank = blank % voca_size - return CTCLossOp(preds, durations, labels, label_sizes, blank) + return CTCLossOp(linout, durations, labels, label_sizes, blank) diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index 902c69f..7c66c90 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -117,39 +117,24 @@ def test_simple_precomputed(self): blank = 0 expected_grad = np.asarray([ - [[0.2, -0.8, 0.2, 0.2, 0.2], - [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, - 0.636408627], - [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, - 0.636408627]], - [[0, 0, 0, 0, 0], - [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, - 0.636408627], - [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, - 0.636408627]], - [[0, 0, 0, 0, 0], - [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, - 0.636408627], - [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, - 0.636408627]] + [[0.2, -0.8, 0.2, 0.2, 0.2], + [ 0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]], + [[0, 0, 0, 0, 0], + [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627], + [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]], + [[0, 0, 0, 0, 0], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]] ], dtype=np.float32) - seq_size, batch_size, voca_size = linear_out.shape - - linear_out_t = T.as_tensor_variable(linear_out) - seq_sizes_t = T.as_tensor_variable(seq_sizes) - labels_t = T.as_tensor_variable(labels) - label_sizes_t = T.as_tensor_variable(label_sizes) - blank_t = T.as_tensor_variable(blank) - - preds = T.nnet.softmax( - linear_out_t.reshape((-1, voca_size)) - ).reshape((seq_size, batch_size, voca_size)) - losses = ctc_loss(preds, seq_sizes_t, labels_t, label_sizes_t, blank_t) + linear_out_var = T.as_tensor_variable(linear_out) + losses = ctc_loss( + linear_out_var, seq_sizes, labels, label_sizes, blank) assert np.allclose(losses.eval(), expected_losses, atol=1) - grad = theano.grad(losses.sum(), wrt=linear_out_t) + grad = theano.grad(losses.sum(), wrt=linear_out_var) assert np.allclose(grad.eval(), expected_grad, rtol=.001, atol=1) From 0f739a8ca91957749c530a4315d50b41d9923655 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Tue, 9 Jan 2018 11:57:36 +0100 
Subject: [PATCH 5/8] test for more stability in computations --- .../ctc.py | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index ac62703..d7079a6 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -11,14 +11,14 @@ import theano import theano.tensor as T from theano.tensor import discrete_dtypes, continuous_dtypes -# from theano.printing import Print +from theano.printing import Print -def isneginf(x, neginf=-1e27): +def isneginf(x, neginf=-1e9): return x < neginf -def logaddexp(x, y, magnitude=9): +def logaddexp(x, y, magnitude=20): x, y = T.minimum(x, y), T.maximum(x, y) diff = T.minimum(y - x, magnitude) res = x + T.log(1 + T.exp(diff)) @@ -64,7 +64,7 @@ def step(t, a_tm1, log_odds_, y_t = log_odds_[t] k = T.max(a_tm1, axis=-1, keepdims=True) k = T.switch(T.all(isneginf(a_tm1), axis=-1, keepdims=True), 0, k) - a_tm1 = T.switch(isneginf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space + a_tm1 = T.switch(a_tm1 - k < - 88, 0, T.exp(a_tm1 - k)) a_t = a_tm1 a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) a_t = T.inc_subtensor(a_t[:, 2:], a_tm1[:, :-2] * not_repeated_) @@ -75,11 +75,11 @@ def step(t, a_tm1, log_odds_, 2 * label_sizes_[:, None] + 1) a_t = T.switch( # back to log space - T.eq(a_t, 0) + mask, -1e30, + T.eq(a_t, 0) + mask, -2e9, T.log(a_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) return a_t - alpha_init = -1e30 * T.ones((batch_sz, label_size)) + alpha_init = -2e9 * T.ones((batch_sz, label_size)) alpha_init = T.set_subtensor(alpha_init[:, 0], 0) alphas, _ = theano.scan( @@ -101,26 +101,32 @@ def ctc_backward(log_odds, seq_sizes, def step(t, b_tp1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] - k = T.max(b_tp1, axis=-1, keepdims=True) - k = T.switch(T.all(isneginf(b_tp1), axis=-1, keepdims=True), 0, k) - b_tp1 = T.switch(isneginf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space - b_tp1 = b_tp1 # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion starter_t = T.eq(t, seq_sizes_ - 1)[:, None] \ * T.eq((2 * label_sizes_)[:, None], - T.arange(label_size)[None, :]) * 1 - b_tp1 += starter_t # initialize recursion + T.arange(label_size)[None, :]) + b_tp1_2lp1 = b_tp1[T.arange(batch_sz), 2 * label_sizes_] + b_tp1 = T.set_subtensor( + b_tp1_2lp1, + T.switch(T.eq(t, seq_sizes_ - 1), 0, b_tp1_2lp1)) + b_tp1 = T.switch(starter_t, 0, b_tp1) # initialize recursion b_t = b_tp1 - b_t = T.inc_subtensor(b_t[:, :-1], b_tp1[:, 1:]) - b_t = T.inc_subtensor(b_t[:, :-2], b_tp1[:, 2:] * not_repeated_) - b_t = T.switch( # back to log space - T.eq(b_t, 0), -1e30, - T.log(b_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + b_t = T.set_subtensor( + b_t[:, :-1], + logaddexp(b_t[:, :-1], b_tp1[:, 1:])) + b_t = T.set_subtensor( + b_t[:, :-2], + logaddexp(b_t[:, :-2], T.switch(not_repeated_, b_tp1[:, 2:], -2e9))) + b_t += y_t[T.arange(batch_sz)[:, None], blanked_labels_] + # idx = Print("idx")(T.maximum(0, 2 * label_sizes_ + 1 + 2 * t - 2 * batch_dur)) + # m = Print("m")(T.max(b_t).sum()) + # b_t = b_t + (m - m) + b_t = T.switch(isneginf(b_t), -2e9, b_t) return b_t - beta_init = -1e30 * T.ones((batch_sz, label_size)) + beta_init = -2e9 * T.ones((batch_sz, label_size)) betas, _ = theano.scan( fn=step, @@ -178,7 +184,7 @@ def fwbw_sum_step(k, s, labels_, ab_): fwbw_sum = theano.scan( fn=fwbw_sum_step, sequences=[T.arange(2 * 
label_size + 1)], - outputs_info=[-1e30 * T.ones((seq_size, batch_size, voca_size))], + outputs_info=[-2e9 * T.ones((seq_size, batch_size, voca_size))], non_sequences=[blanked_labels, ab], strict=True, name="fwbw_sum")[0][-1] @@ -189,7 +195,7 @@ def fwbw_sum_step(k, s, labels_, ab_): - T.exp(log_odds + A) dloss_dy = T.switch( - (loss[None, :, None] > 1e10) + T.isinf(loss[None, :, None]), + (loss[None, :, None] > 1e9) + T.isinf(loss[None, :, None]), 0, dloss_dy) return [dloss_dy * output_gradients[0][None, :, None], From 51a35851b555bf67d002fa219d98143826d87557 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Fri, 12 Jan 2018 10:50:52 +0100 Subject: [PATCH 6/8] fixes errors and precision issues, adds demos and tests. --- .../ctc.py | 73 +-- .../experiments-tf.ipynb | 369 +++++++++++++ .../experiments.ipynb | 517 ++++++++++++++++++ .../test_ctc2.py | 2 +- .../tests.ipynb | 199 +++++++ 5 files changed, 1125 insertions(+), 35 deletions(-) create mode 100644 papers/connectionist_temporal_classification/experiments-tf.ipynb create mode 100644 papers/connectionist_temporal_classification/experiments.ipynb create mode 100644 papers/connectionist_temporal_classification/tests.ipynb diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index d7079a6..1a5c48d 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -35,9 +35,7 @@ def log_softmax(X, axis=-1, clip=None): norm_X = X - k if clip is not None: - mini = T.log((T.cast(X.shape[axis], 'floatX') - 1) * clip / (1 - clip)) - # norm_X *= - T.min(norm_X, axis=axis, keepdims=True) / mini - norm_X = T.maximum(norm_X, mini) + norm_X = T.maximum(norm_X, clip) log_sum_exp_X = logsumexp(norm_X, axis=axis, keepdims=True) return norm_X - log_sum_exp_X @@ -55,28 +53,28 @@ def insert_alternating_blanks(labels, blank_label): def ctc_forward(log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated): - batch_dur, batch_sz, _ = log_odds.shape - batch_dur, batch_sz = T.cast(batch_dur, 'int32'), T.cast(batch_sz, 'int32') + seqsize, batch_sz, _ = log_odds.shape label_size = blanked_labels.shape[1] def step(t, a_tm1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] - k = T.max(a_tm1, axis=-1, keepdims=True) - k = T.switch(T.all(isneginf(a_tm1), axis=-1, keepdims=True), 0, k) - a_tm1 = T.switch(a_tm1 - k < - 88, 0, T.exp(a_tm1 - k)) a_t = a_tm1 - a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) - a_t = T.inc_subtensor(a_t[:, 2:], a_tm1[:, :-2] * not_repeated_) + a_t = T.set_subtensor( + a_t[:, 1:], + logaddexp(a_t[:, 1:], a_tm1[:, :-1])) + a_t = T.set_subtensor( + a_t[:, 2:], + logaddexp(a_t[:, 2:], T.switch(not_repeated_, a_tm1[:, :-2], -2e9))) # stop after a_T(|l'|) mask = T.ge(t, seq_sizes_)[:, None] \ + T.ge(T.arange(label_size)[None, :], 2 * label_sizes_[:, None] + 1) - a_t = T.switch( # back to log space - T.eq(a_t, 0) + mask, -2e9, - T.log(a_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + a_t = T.switch( + isneginf(a_t) + mask, -2e9, + a_t + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) return a_t alpha_init = -2e9 * T.ones((batch_sz, label_size)) @@ -84,8 +82,8 @@ def step(t, a_tm1, log_odds_, alphas, _ = theano.scan( fn=step, - sequences=[T.arange(batch_dur)], - outputs_info=[alpha_init], + sequences=[T.arange(seqsize)], + outputs_info=alpha_init, non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated], name="ctc_forward") @@ -95,7 
+93,7 @@ def step(t, a_tm1, log_odds_, def ctc_backward(log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated): - batch_dur, batch_sz, _ = log_odds.shape + seqsize, batch_sz, _ = log_odds.shape label_size = blanked_labels.shape[1] def step(t, b_tp1, log_odds_, @@ -120,9 +118,6 @@ def step(t, b_tp1, log_odds_, b_t[:, :-2], logaddexp(b_t[:, :-2], T.switch(not_repeated_, b_tp1[:, 2:], -2e9))) b_t += y_t[T.arange(batch_sz)[:, None], blanked_labels_] - # idx = Print("idx")(T.maximum(0, 2 * label_sizes_ + 1 + 2 * t - 2 * batch_dur)) - # m = Print("m")(T.max(b_t).sum()) - # b_t = b_t + (m - m) b_t = T.switch(isneginf(b_t), -2e9, b_t) return b_t @@ -130,8 +125,8 @@ def step(t, b_tp1, log_odds_, betas, _ = theano.scan( fn=step, - sequences=[T.arange(batch_dur)], - outputs_info=[beta_init], + sequences=[T.arange(seqsize)], + outputs_info=beta_init, non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated], go_backwards=True, @@ -146,15 +141,20 @@ def step(t, b_tp1, log_odds_, def ctc_perform_graph(linout, seq_sizes, labels, label_sizes, blank): _, batch_size, voca_size = linout.shape - log_odds = log_softmax(linout) + logits = log_softmax(linout) blanked_labels = insert_alternating_blanks(labels, blank) not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) - betas = ctc_backward(log_odds, seq_sizes, + betas = ctc_backward(logits, seq_sizes, blanked_labels, label_sizes, not_repeated) - loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) - return log_odds, blanked_labels, not_repeated, betas, loss + # alphas = ctc_forward(logits, seq_sizes, + # blanked_labels, label_sizes, not_repeated) + # loss = - logaddexp( + # alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes - 1], + # alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes]) + + return logits, blanked_labels, not_repeated, betas, loss def ctc_grad_graph(inputs, output_gradients): @@ -164,10 +164,10 @@ def ctc_grad_graph(inputs, output_gradients): # TODO: will theano optimize this redundant call when both loss and # gradient are requested separately? 
- log_odds, blanked_labels, not_repeated, betas, loss = \ + logits, blanked_labels, not_repeated, betas, loss = \ ctc_perform_graph(*inputs) - alphas = ctc_forward(log_odds, seq_durations, + alphas = ctc_forward(logits, seq_durations, blanked_labels, label_sizes, not_repeated) log_pl = - loss @@ -184,19 +184,24 @@ def fwbw_sum_step(k, s, labels_, ab_): fwbw_sum = theano.scan( fn=fwbw_sum_step, sequences=[T.arange(2 * label_size + 1)], - outputs_info=[-2e9 * T.ones((seq_size, batch_size, voca_size))], + outputs_info=-2e9 * T.ones((seq_size, batch_size, voca_size)), non_sequences=[blanked_labels, ab], strict=True, name="fwbw_sum")[0][-1] - A = fwbw_sum - log_pl[None, :, None] - 2 * log_odds + A = fwbw_sum - log_pl[None, :, None] - logits + B = logits + logsumexp(A, axis=2, keepdims=True) + dloss_dy = T.exp(B) - T.exp(A) + # A = fwbw_sum - log_pl[None, :, None] - 2 * logits + # dloss_dy = T.exp(2 * logits + logsumexp(A, axis=2, keepdims=True)) \ + # - T.exp(logits + A) - dloss_dy = T.exp(2 * log_odds + logsumexp(A, axis=2, keepdims=True)) \ - - T.exp(log_odds + A) + dloss_dy = T.switch(T.all(isneginf(A), axis=2, keepdims=True), + 0, dloss_dy) - dloss_dy = T.switch( - (loss[None, :, None] > 1e9) + T.isinf(loss[None, :, None]), - 0, dloss_dy) + # dloss_dy = T.switch( + # (loss[None, :, None] > 1e9) + T.isinf(loss[None, :, None]), + # 0, dloss_dy) return [dloss_dy * output_gradients[0][None, :, None], theano.gradient.disconnected_type(), diff --git a/papers/connectionist_temporal_classification/experiments-tf.ipynb b/papers/connectionist_temporal_classification/experiments-tf.ipynb new file mode 100644 index 0000000..effc1cb --- /dev/null +++ b/papers/connectionist_temporal_classification/experiments-tf.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Requirements\n", + "\n", + "Please download the timit dataset at http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the TIMIT.zip file next to this file.\n", + "\n", + "The following python packages are required:\n", + "- lasagne\n", + "- matplotlib\n", + "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", + "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "% autoreload 2\n", + "#%matplotlib inline\n", + "# %env CUDA_VISIBLE_DEVICES=\"1\"\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = \"all\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pickle as pkl\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from zipfile import ZipFile\n", + "from sphfile import SPHFile\n", + "from python_speech_features import mfcc\n", + "import tensorflow as tf\n", + "import keras as K\n", + "from keras.models import Model\n", + "from keras.layers import Input, Dense, LSTM, Concatenate, Layer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", + " with ZipFile(\"TIMIT.zip\", 'r') as 
f:\n", + " f.extractall(path=\".\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = []\n", + "train_subset = []\n", + "\n", + "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " for f in filenames:\n", + " if f.endswith(\"WAV\"):\n", + " recording = SPHFile(dirpath + \"/\" + f).content\n", + " files.append(dirpath + \"/\" + f[:-4])\n", + " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", + "\n", + "files = np.array(files)\n", + "train_subset = np.array(train_subset, dtype=np.bool)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", + " features = []\n", + " labels = []\n", + "\n", + " for f in files:\n", + " recording = SPHFile(f + \".WAV\")\n", + " signal = recording.content\n", + " samplerate = recording.format['sample_rate']\n", + "\n", + " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", + " numcep=13, nfilt=26, appendEnergy=True)\n", + " derivatives = np.concatenate([\n", + " mfccfeats[1, None] - mfccfeats[0, None],\n", + " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", + " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", + "\n", + " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", + "\n", + " with open(f + \".PHN\") as phonem_file:\n", + " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", + "\n", + " m = np.mean(np.concatenate(features, axis=0))\n", + " s = np.std(np.concatenate(features, axis=0))\n", + "\n", + " for i in range(len(features)):\n", + " features[i] = (features[i] - m) / s\n", + "\n", + " vocabulary = set()\n", + " for lseq in labels:\n", + " vocabulary |= set(lseq)\n", + "\n", + " vocabulary = list(vocabulary)\n", + " vocabulary[-1], vocabulary[vocabulary.index('h#')] = vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", + "\n", + " for i in range(len(labels)):\n", + " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", + "\n", + " blank = 60\n", + " \n", + " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", + " pkl.dump((features, labels, vocabulary, blank), f)\n", + "\n", + "\n", + "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", + " features, labels, vocabulary, blank = pkl.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(20, 9))\n", + "plt.imshow(features[1].transpose(), clim=(-4, 4))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def zero_loss(y_true, y_pred):\n", + " return K.backend.zeros_like(y_pred)\n", + "\n", + "def dense_to_sparse(x):\n", + " idx = tf.where(tf.greater_equal(x, 0))\n", + " return tf.SparseTensor(idx, tf.gather_nd(x, idx), tf.shape(x, out_type=tf.int64))\n", + "\n", + "class CTCLossLayer(Layer):\n", + " def __init__(self, **kwargs):\n", + " super(CTCLossLayer, self).__init__(**kwargs)\n", + "\n", + " def call(self, x, mask=None):\n", + " linout = x[0]\n", + " targets = x[1]\n", + " durations = x[2]\n", + " loss = tf.nn.ctc_loss(\n", + " dense_to_sparse(targets), linout,\n", + " sequence_length=durations[:, 0],\n", + " 
time_major=False)\n", + " self.add_loss(tf.reduce_sum(loss), x)\n", + " return loss\n", + "\n", + " def compute_output_shape(self, input_shape):\n", + " return input_shape[0][0]\n", + "\n", + "a = Input(shape=(None, features[0].shape[1]), name=\"features\")\n", + "targets = Input(shape=[None], dtype='int32', name=\"targets\")\n", + "durations = Input(shape=[1], dtype='int32', name=\"durations\")\n", + "b1 = LSTM(100, return_sequences=True)(a)\n", + "b2 = LSTM(100, return_sequences=True, go_backwards=True)(a)\n", + "c = Concatenate(axis=2)([b1, b2])\n", + "d = Dense(len(vocabulary), activation=None)(c)\n", + "l = CTCLossLayer()([d, targets, durations])\n", + "model = Model(inputs=[a, targets, durations], outputs=[d, l])\n", + "sgd = K.optimizers.SGD(lr=1e-4, momentum=0.9, nesterov=True)\n", + "\n", + "model.summary()\n", + "\n", + "model.compile(\n", + " target_tensors=[targets, targets], \n", + " loss=[zero_loss, zero_loss], \n", + " optimizer=sgd)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# lasagne.layers.set_all_param_values(l_linout, params_backup[0])\n", + "\n", + "params_backup = []\n", + "running_loss = None\n", + "\n", + "for i in np.random.permutation(len(labels))[:300]:\n", + " f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", + "\n", + " batch_loss = model.train_on_batch(\n", + " x=[f, l, np.array([f.shape[1]], np.int32)],\n", + " y=[l, l])[0]\n", + "\n", + " if batch_loss > 10000:\n", + " print(\"\\nskipped i = {}\".format(i))\n", + " continue\n", + " else:\n", + " running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss\n", + " print(\"\\rloss = {:>5.0f}\".format(running_loss), end='', flush=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def argmax_decode(preds):\n", + " decoded = [preds[0]]\n", + " for v in preds:\n", + " if v != decoded[-1]:\n", + " decoded.append(v)\n", + " \n", + " return np.array(decoded, dtype=np.int32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features[i].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "i = 0\n", + "inputs = [features[i][None, :, :], labels[i][None, 1:-1], np.array([features[i].shape[0]], np.int32)]\n", + "logits = model.predict(inputs)[0][0]\n", + "# preds -= np.max(preds, axis=1, keepdims=True)\n", + "# preds = np.exp(preds)\n", + "# preds /= np.sum(preds, axis=1, keepdims=True)\n", + "lbl_preds = argmax_decode(np.argmax(preds, axis=-1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "o = np.argsort(np.mean(logits[:, :60], axis=0))\n", + "plt.figure(figsize=(10, 10))\n", + "for c in o:\n", + " plt.plot(np.arange(len(logits)), logits[:, c]);\n", + "\n", + "plt.plot(np.arange(len(logits)), logits[:, -1], linestyle=\":\");\n", + "plt.legend([vocabulary[o_] for o_ in o] + [vocabulary[-1]], bbox_to_anchor=(.6, 0, 1, 1), ncol=5)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds[:, -1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "model.layers[4].get_weights()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "weights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/papers/connectionist_temporal_classification/experiments.ipynb b/papers/connectionist_temporal_classification/experiments.ipynb new file mode 100644 index 0000000..4db5447 --- /dev/null +++ b/papers/connectionist_temporal_classification/experiments.ipynb @@ -0,0 +1,517 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Requirements\n", + "\n", + "Please download the timit dataset at http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the TIMIT.zip file next to this file.\n", + "\n", + "The following python packages are required:\n", + "- lasagne\n", + "- matplotlib\n", + "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", + "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "\n", + "import os\n", + "os.environ['THEANO_FLAGS'] = \"device=cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle as pkl\n", + "import numpy as np\n", + "from zipfile import ZipFile\n", + "from sphfile import SPHFile\n", + "from python_speech_features import mfcc\n", + "import lasagne\n", + "from lasagne.layers import InputLayer, GaussianNoiseLayer, LSTMLayer, DenseLayer, ConcatLayer, ReshapeLayer\n", + "import theano\n", + "import theano.tensor as T\n", + "from theano.compile.nanguardmode import NanGuardMode\n", + "import matplotlib.pyplot as plt\n", + "from ctc import ctc_loss, log_softmax, insert_alternating_blanks, ctc_backward" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.seterr(all='raise')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", + " with ZipFile(\"TIMIT.zip\", 'r') as f:\n", + " f.extractall(path=\".\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = []\n", + "train_subset = []\n", + "\n", + "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " for f in filenames:\n", + " if f.endswith(\"WAV\"):\n", + " recording = SPHFile(dirpath + \"/\" + f).content\n", + " files.append(dirpath + \"/\" + f[:-4])\n", + " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", + "\n", + "files = 
np.array(files)\n", + "train_subset = np.array(train_subset, dtype=np.bool)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", + " features = []\n", + " labels = []\n", + "\n", + " for f in files:\n", + " recording = SPHFile(f + \".WAV\")\n", + " signal = recording.content\n", + " samplerate = recording.format['sample_rate']\n", + "\n", + " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", + " numcep=13, nfilt=26, appendEnergy=True)\n", + " derivatives = np.concatenate([\n", + " mfccfeats[1, None] - mfccfeats[0, None],\n", + " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", + " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", + "\n", + " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", + "\n", + " with open(f + \".PHN\") as phonem_file:\n", + " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", + "\n", + " m = np.mean(np.concatenate(features, axis=0))\n", + " s = np.std(np.concatenate(features, axis=0))\n", + "\n", + " for i in range(len(features)):\n", + " features[i] = (features[i] - m) / s\n", + "\n", + " vocabulary = set()\n", + " for lseq in labels:\n", + " vocabulary |= set(lseq)\n", + "\n", + " vocabulary = list(vocabulary)\n", + " vocabulary[-1], vocabulary[vocabulary.index('h#')] = vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", + "\n", + " for i in range(len(labels)):\n", + " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", + "\n", + " blank = len(labels) - 1\n", + " \n", + " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", + " pkl.dump((features, labels, vocabulary, blank), f)\n", + "\n", + "\n", + "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", + " features, labels, vocabulary, blank = pkl.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n", + "\n", + "class SmallGaussianNoiseLayer(lasagne.layers.Layer):\n", + " \"\"\"Gaussian noise layer (clipped for safety)\"\"\"\n", + " def __init__(self, incoming, sigma=0.1, **kwargs):\n", + " super(SmallGaussianNoiseLayer, self).__init__(incoming, **kwargs)\n", + " self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))\n", + " self.sigma = sigma\n", + "\n", + " def get_output_for(self, input, deterministic=False, **kwargs):\n", + " if deterministic or self.sigma == 0:\n", + " return input\n", + " else:\n", + " noise = self._srng.normal(input.shape, avg=0.0, std=self.sigma)\n", + " return input + T.clip(noise, -3 * self.sigma, 3 * self.sigma)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "l_in = InputLayer(shape=(None, 1557, 26))\n", + "l_duration = InputLayer(input_var=T.ivector(name=\"duration\"), shape=(1,))\n", + "l_mask = lasagne.layers.ExpressionLayer(l_duration, lambda d: T.arange(1557)[None, :] < d[:, None])\n", + "l_noise = SmallGaussianNoiseLayer(l_in, sigma=0.6)\n", + "l_fwlstm = LSTMLayer(\n", + " l_noise, 100, mask_input=l_mask)\n", + "l_bwlstm = LSTMLayer(\n", + " l_noise, 100, mask_input=l_mask,\n", + " backwards=True)\n", + "l_cat = ConcatLayer([l_fwlstm, 
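The preprocessing cell above appends first-order time derivatives to the MFCC frames (forward difference at the first frame, central differences inside, backward difference at the last frame). Below is a small NumPy sketch of that delta computation on a hypothetical 100-frame utterance. Note also that this revision of the notebook computes the blank index as `len(labels) - 1`, while the later notebook uses `len(vocabulary) - 1`, which appears to be the intended value.

```python
import numpy as np

def delta_features(feats):
    # Forward difference for the first frame, central differences for the
    # interior frames, backward difference for the last frame -- the same
    # scheme as the preprocessing cell above.
    return np.concatenate([
        feats[1:2] - feats[0:1],
        .5 * feats[2:] - .5 * feats[:-2],
        feats[-1:] - feats[-2:-1]], axis=0)

mfccfeats = np.random.randn(100, 13).astype(np.float32)  # hypothetical 100-frame utterance
full_feats = np.concatenate([mfccfeats, delta_features(mfccfeats)], axis=1)
print(full_feats.shape)  # (100, 26)
```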
l_bwlstm], axis=2)\n", + "l_linout = DenseLayer(l_cat, len(vocabulary), nonlinearity=None, num_leading_axes=2)\n", + "\n", + "input_var = l_in.input_var\n", + "duration_var = l_duration.input_var\n", + "labels_var = T.imatrix()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_output = lasagne.layers.get_output(l_linout, deterministic=False).dimshuffle(1, 0, 2)\n", + "\n", + "loss = ctc_loss(\n", + " linout=train_output,\n", + " durations=duration_var,\n", + " labels=labels_var,\n", + " label_sizes=T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", + " blank=blank\n", + ")\n", + "\n", + "params = lasagne.layers.get_all_params(l_linout, trainable=True)\n", + "grads = theano.grad(loss.sum(), params)\n", + "updates = lasagne.updates.nesterov_momentum(grads, params, learning_rate=1e-4)\n", + "\n", + "update_fn = theano.function(\n", + " [input_var, duration_var, labels_var], \n", + " loss, \n", + " updates=updates,\n", + " # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params_history = []\n", + "loss_history = []\n", + "running_loss = None\n", + "failed = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for e in range(10):\n", + " for i in np.random.permutation(len(labels)):\n", + " f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", + " d = np.array([f.shape[1]], dtype=np.int32)\n", + " f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)\n", + "\n", + " batch_loss = float(update_fn(f, d, l))\n", + "\n", + " if batch_loss > 10000 or np.isnan(batch_loss):\n", + " print(\"\\nskipped i = {} because loss was {}\".format(i, batch_loss))\n", + " raise RuntimeError()\n", + " else:\n", + " running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss\n", + " print(\"\\r{:4d} loss = {:>5.0f} -> {:>5.0f}\".format(i, batch_loss, running_loss), end='', flush=True)\n", + "\n", + " if i % 25:\n", + " params_history.append(lasagne.layers.get_all_param_values(l_linout))\n", + " loss_history.append(running_loss)\n", + "\n", + " # batch_loss = loss_fn(f, l)\n", + " # if batch_loss > 5000:\n", + " # print('loss = {:>5.0f} > 5000 at element {:d}'.format(batch_loss, i))\n", + " # raise\n", + " # else:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.plot(loss_history)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lasagne.layers.set_all_param_values(l_linout, params_history[2000 // 25])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "i = 920\n", + "f, l = features[i][None, :, :], labels[i][None, 1:-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "blanked_labels = insert_alternating_blanks(labels_var, blank)\n", + "not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2])\n", + "betas = ctc_backward(\n", + " log_softmax(T.unbroadcast(train_output.dimshuffle(1, 0, 2), 
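The training cell above feeds `ctc_loss` the time-major network output (`dimshuffle(1, 0, 2)`), the sequence duration, the label matrix and the label sizes. As a hypothetical, untested sketch of calling `ctc_loss` directly on constant NumPy inputs (shapes are duration x batch x classes, and the `-1` padding in `labels` wraps around to the blank class, as in the test notebook):

```python
import numpy as np
from ctc import ctc_loss

# duration x batch x classes, classes include the blank (here class 3)
linout = np.random.randn(5, 2, 4).astype(np.float32)
durations = np.array([5, 3], dtype=np.int32)
labels = np.array([[0, 1, 2],
                   [2, 2, -1]], dtype=np.int32)   # -1 padding wraps around to the blank class
label_sizes = np.array([3, 2], dtype=np.int32)

loss = ctc_loss(linout, durations, labels, label_sizes, blank=-1)
print(loss.eval())  # one negative log-likelihood per batch element
```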
1)),\n", + " T.cast(T.reshape(train_output.shape[0], (1,)), 'int32'), \n", + " blanked_labels,\n", + " T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", + " not_repeated)\n", + "test_output = lasagne.layers.get_output(l_linout, deterministic=True)\n", + "\n", + "loss_fn = theano.function([input_var, duration_var, labels_var], loss)\n", + "beta_fn = theano.function([input_var, duration_var, labels_var], betas)\n", + "grads_fn = theano.function([input_var, duration_var, labels_var], grads)\n", + "predict_fn = theano.function([input_var, duration_var], T.exp(log_softmax(test_output[:, 0, :])))\n", + "logits_fn = theano.function([input_var, duration_var], test_output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logits = logits_fn(f, d)[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "o = np.argsort(np.mean(logits[:, :60], axis=0))\n", + "plt.figure(figsize=(10, 10))\n", + "for c in o:\n", + " plt.plot(np.arange(len(logits)), logits[:, c])\n", + "\n", + "plt.plot(np.arange(len(logits)), logits[:, -1], linestyle=\":\")\n", + "plt.legend([vocabulary[o_] for o_ in o] + [vocabulary[-1]], bbox_to_anchor=(.6, 0, 1, 1), ncol=5)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logits.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# beta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features[i].shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "betas = ctc_backward(\n", + " log_softmax(train_output),\n", + " T.cast(T.reshape(train_output.shape[0], (1,)), 'int32'), \n", + " blanked_labels,\n", + " T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", + " not_repeated)\n", + "beta_fn = theano.function([input_var, duration_var, labels_var], betas)\n", + "\n", + "b = beta_fn(f, d, l)\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.imshow(b[0:, 0, 0:], clim=(-1100, max(0, np.max(b))))\n", + "plt.gca().set_aspect(0.1)\n", + "plt.colorbar()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p = lasagne.layers.get_all_param_values(l_linout, trainable=True)\n", + "for p_ in p:\n", + " print((p_.min(), p_.max()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = theano.grad(loss.sum(), wrt=train_output).eval({\n", + " input_var: f,\n", + " duration_var: d,\n", + " labels_var: l\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplot(2, 1, 1)\n", + "plt.bar(np.arange(len(vocabulary)), g[:, 0, np.concatenate((o, [60]))].mean(axis=0))\n", + "plt.subplot(2, 1, 2)\n", + "plt.plot(g[:, 0, :].mean(axis=1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(vocabulary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def argmax_decode(preds):\n", + " decoded = [preds[0]]\n", + " for v in preds:\n", + " if v != decoded[-1]:\n", + " decoded.append(v)\n", + " \n", + " return np.array(decoded, dtype=np.int32)\n", + "\n", + "lbl_preds = 
argmax_decode(np.argmax(logits, axis=-1))\n", + "lbl_tgt = labels[i]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index 7c66c90..eca7be7 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -190,4 +190,4 @@ def f(linear_out_): return loss unittest_tools.verify_grad( - f, [linear_out], rel_tol=0.1, abs_tol=1) + f, [linear_out], rel_tol=.1, abs_tol=.1) diff --git a/papers/connectionist_temporal_classification/tests.ipynb b/papers/connectionist_temporal_classification/tests.ipynb new file mode 100644 index 0000000..538c39c --- /dev/null +++ b/papers/connectionist_temporal_classification/tests.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "\n", + "import sys\n", + "import os\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "\n", + "sys.path.insert(-1, os.getcwd())\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "os.environ['THEANO_FLAGS'] = \"device=cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import tensorflow as tf\n", + "import theano\n", + "import theano.tensor as T\n", + "\n", + "from ctc import ctc_loss as my_ctc_loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 16\n", + "max_labsize = 20\n", + "voca_size = 20 # excluding blank\n", + "max_seqsize = 100\n", + "blank = -1\n", + "\n", + "labsize = np.random.randint(\n", + " 1, max_labsize + 1, size=(batch_size,), dtype=np.int32)\n", + "labsize[0] = max_labsize\n", + "labsize[1] = 1\n", + "labsize[2] = max_labsize\n", + "labsize[3] = max_labsize\n", + "\n", + "labels = np.random.randint(\n", + " 0, voca_size,\n", + " size=(batch_size, max_labsize), dtype=np.int32)\n", + "for b in range(batch_size):\n", + " labels[b, labsize[b]:] = blank\n", + "\n", + "seqsize = np.array([\n", + " np.random.randint(labsize[i] + 1, max_seqsize + 1)\n", + " for i in range(batch_size)], dtype=np.int32)\n", + "\n", + "linout = np.random.randn(\n", + " max_seqsize, batch_size, voca_size + 1).astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "th_linout_var = T.tensor3()\n", + "th_seqsize_var = T.ivector()\n", + "th_labels_var = T.imatrix()\n", + "th_labsize_var = T.ivector()\n", + "th_loss = my_ctc_loss(th_linout_var, th_seqsize_var, th_labels_var, th_labsize_var)\n", + "\n", + "def dense_to_sparse(x):\n", + " idx = tf.where(tf.greater_equal(x, 0))\n", + " return tf.SparseTensor(idx, tf.gather_nd(x, idx), tf.shape(x, out_type=tf.int64))\n", + "\n", + "tf_linout_var = tf.placeholder(tf.float32, shape=[max_seqsize, batch_size, 
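`tf.nn.ctc_loss` expects the targets as a `tf.SparseTensor`, which `dense_to_sparse` builds from the `-1`-padded label matrix. A plain NumPy illustration of what that conversion extracts, on hypothetical label values:

```python
import numpy as np

# Hypothetical padded label batch: -1 marks padding, mirroring
# `labels[b, labsize[b]:] = blank` with blank = -1 above.
labels = np.array([[3, 1, 4, -1, -1],
                   [2, 7, -1, -1, -1]], dtype=np.int32)

# What dense_to_sparse hands to tf.nn.ctc_loss:
indices = np.argwhere(labels >= 0)   # coordinates of the valid entries
values = labels[labels >= 0]         # [3 1 4 2 7]
dense_shape = labels.shape           # (2, 5)
print(indices.tolist(), values.tolist(), dense_shape)
```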
voca_size + 1])\n", + "tf_seqsize_var = tf.placeholder(tf.int32, shape=[batch_size])\n", + "tf_labels_var = tf.placeholder(tf.int32, shape=[batch_size, max_labsize])\n", + "\n", + "tf_loss = tf.nn.ctc_loss(\n", + " dense_to_sparse(tf_labels_var), tf_linout_var,\n", + " sequence_length=tf_seqsize_var,\n", + " time_major=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with tf.Session() as sess:\n", + " tf_result = sess.run(\n", + " tf_loss, {\n", + " tf_linout_var: linout,\n", + " tf_seqsize_var: seqsize,\n", + " tf_labels_var: labels\n", + " })\n", + " \n", + " th_results = th_loss.eval({\n", + " th_linout_var: linout,\n", + " th_seqsize_var: seqsize,\n", + " th_labels_var: labels,\n", + " th_labsize_var: labsize\n", + " })\n", + " \n", + " print(np.abs(tf_result - th_results) / tf_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf_g = tf.gradients(xs=tf_linout_var, ys=tf.reduce_sum(tf_loss))[0]\n", + "\n", + "with tf.Session() as sess:\n", + " tf_grad = sess.run(\n", + " tf_g, {\n", + " tf_linout_var: linout,\n", + " tf_seqsize_var: seqsize,\n", + " tf_labels_var: labels\n", + " })\n", + " \n", + " th_grad = theano.grad(th_loss.sum(), wrt=th_linout_var).eval({\n", + " th_linout_var: linout,\n", + " th_seqsize_var: seqsize,\n", + " th_labels_var: labels,\n", + " th_labsize_var: labsize\n", + " })\n", + " \n", + " print(np.abs(tf_grad - th_grad) / (tf_grad + .000001))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "th_grad[:, 1, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf_grad[:, 1, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f58174f25ff2b609e63eb7435f730e36a2c7e09f Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Wed, 24 Jan 2018 10:32:24 +0100 Subject: [PATCH 7/8] fixed error in logsumexp, ctc gradient is now equal to tensorflow's --- .../ctc.py | 35 +++++------- .../experiments.ipynb | 53 ++++++++++--------- .../test_ctc2.py | 13 ++--- .../tests.ipynb | 19 ++----- 4 files changed, 47 insertions(+), 73 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index 1a5c48d..c6f8a54 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -11,7 +11,6 @@ import theano import theano.tensor as T from theano.tensor import discrete_dtypes, continuous_dtypes -from theano.printing import Print def isneginf(x, neginf=-1e9): @@ -27,7 +26,8 @@ def logaddexp(x, y, magnitude=20): def logsumexp(x, axis, keepdims=False): k = T.max(x, axis=axis, keepdims=True) - return T.log(T.sum(T.exp(x - k), axis=axis, keepdims=keepdims)) + res = T.log(T.sum(T.exp(x - k), axis=axis, 
keepdims=keepdims)) + k + return T.switch(isneginf(k), -2e9, res) def log_softmax(X, axis=-1, clip=None): @@ -162,17 +162,13 @@ def ctc_grad_graph(inputs, output_gradients): seq_size, batch_size, voca_size = linout.shape label_size = labels.shape[1] - # TODO: will theano optimize this redundant call when both loss and - # gradient are requested separately? logits, blanked_labels, not_repeated, betas, loss = \ ctc_perform_graph(*inputs) alphas = ctc_forward(logits, seq_durations, blanked_labels, label_sizes, not_repeated) - log_pl = - loss - - # sum_{s \in lab(l, k)} a_t(s) b_t(s) + # log(sum_{s \in lab(l, k)} a_t(s) b_t(s)) def fwbw_sum_step(k, s, labels_, ab_): s_view = s[:, T.arange(batch_size), labels_[:, k]] ab_view = ab_[:, :, k] @@ -189,20 +185,14 @@ def fwbw_sum_step(k, s, labels_, ab_): strict=True, name="fwbw_sum")[0][-1] - A = fwbw_sum - log_pl[None, :, None] - logits - B = logits + logsumexp(A, axis=2, keepdims=True) - dloss_dy = T.exp(B) - T.exp(A) - # A = fwbw_sum - log_pl[None, :, None] - 2 * logits - # dloss_dy = T.exp(2 * logits + logsumexp(A, axis=2, keepdims=True)) \ - # - T.exp(logits + A) + A = loss[None, :, None] + logits \ + + logsumexp(fwbw_sum - logits, axis=2, keepdims=True) + B = loss[None, :, None] + fwbw_sum - logits + dloss_dy = T.exp(A) - T.exp(B) - dloss_dy = T.switch(T.all(isneginf(A), axis=2, keepdims=True), + dloss_dy = T.switch(T.all(isneginf(fwbw_sum), axis=2, keepdims=True), 0, dloss_dy) - # dloss_dy = T.switch( - # (loss[None, :, None] > 1e9) + T.isinf(loss[None, :, None]), - # 0, dloss_dy) - return [dloss_dy * output_gradients[0][None, :, None], theano.gradient.disconnected_type(), theano.gradient.disconnected_type(), @@ -255,13 +245,13 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): An _integer_ vector of size batch_size contining the actual length of each sequence in preds. labels: Theano shared variable, expression or numpy array - An _integer_ matrix of size batch_size x label_size containg the target - labels. + An _integer_ matrix of size batch_size x label_size containing the + target labels. label_sizes: Theano shared variable, expression or numpy array - An _integer_ vector of size batch_size contining the actual length of + An _integer_ vector of size batch_size containing the actual length of each sequence in labels. blank: - The blank label class, by default the last one. + The blank label class, by default the last index. Returns ------- @@ -275,7 +265,6 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): unsegmented sequence data with recurrent neural networks. In Proceedings of the 23rd international conference on Machine learning (pp. 369-376). ACM. 
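The corrected `logsumexp` above re-adds the subtracted maximum `k` and pins slices that are entirely log-zero instead of letting them turn into NaN. A minimal NumPy sketch of the same computation, using `-inf` where the Theano code uses the `-2e9` sentinel:

```python
import numpy as np

def logsumexp(x, axis, keepdims=False):
    # Subtract the per-slice maximum for stability, then add it back
    # (the "+ k" term restored by this patch); slices that are entirely
    # log-zero stay at -inf instead of becoming NaN.
    k = np.max(x, axis=axis, keepdims=True)
    k = np.where(np.isinf(k), 0.0, k)
    s = np.sum(np.exp(x - k), axis=axis, keepdims=True)
    out = np.where(s > 0, np.log(np.maximum(s, 1e-300)) + k, -np.inf)
    return out if keepdims else np.squeeze(out, axis=axis)

x = np.array([[1000.0, 1000.0],
              [-np.inf, -np.inf]])
print(logsumexp(x, axis=1))  # [1000.6931..., -inf], no overflow and no NaN
```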
ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf - """ linout = T.as_tensor_variable(linout) durations = T.as_tensor_variable(durations) diff --git a/papers/connectionist_temporal_classification/experiments.ipynb b/papers/connectionist_temporal_classification/experiments.ipynb index 4db5447..1e8b626 100644 --- a/papers/connectionist_temporal_classification/experiments.ipynb +++ b/papers/connectionist_temporal_classification/experiments.ipynb @@ -49,15 +49,6 @@ "from ctc import ctc_loss, log_softmax, insert_alternating_blanks, ctc_backward" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.seterr(all='raise')" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -150,7 +141,7 @@ " blank = len(labels) - 1\n", " \n", " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", - " pkl.dump((features, labels, vocabulary, blank), f)\n", + " pkl.dump((features, labels, vocabulary, blank), f, -1)\n", "\n", "\n", "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", @@ -295,7 +286,8 @@ "metadata": {}, "outputs": [], "source": [ - "plt.plot(loss_history)" + "plt.plot(loss_history)\n", + "plt.yscale('log')" ] }, { @@ -304,7 +296,16 @@ "metadata": {}, "outputs": [], "source": [ - "lasagne.layers.set_all_param_values(l_linout, params_history[2000 // 25])" + "np.argmin(loss_history[::25])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lasagne.layers.set_all_param_values(l_linout, params_history[6000//25])" ] }, { @@ -320,8 +321,9 @@ "metadata": {}, "outputs": [], "source": [ - "i = 920\n", - "f, l = features[i][None, :, :], labels[i][None, 1:-1]" + "i = 0\n", + "f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", + "f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)" ] }, { @@ -372,15 +374,6 @@ "plt.show()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "logits.shape" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -414,7 +407,7 @@ "b = beta_fn(f, d, l)\n", "\n", "plt.figure(figsize=(10, 6))\n", - "plt.imshow(b[0:, 0, 0:], clim=(-1100, max(0, np.max(b))))\n", + "plt.imshow(b[0:, 0, 0:], clim=(-5000, max(0, np.max(b))))\n", "plt.gca().set_aspect(0.1)\n", "plt.colorbar()\n", "plt.show()" @@ -510,6 +503,18 @@ "display_name": "Python 3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" } }, "nbformat": 4, diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index eca7be7..10d3425 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -5,14 +5,8 @@ from theano.tests import unittest_tools from papers.connectionist_temporal_classification.ctc import \ - ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks, isneginf - - -def log_softmax(X): - k = T.max(X, axis=-1, keepdims=True) - norm_X = X - k - log_sum_exp_X = T.log(T.sum(T.exp(norm_X), axis=-1, keepdims=True)) - return norm_X - log_sum_exp_X + ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks, \ + isneginf, log_softmax class TestCTC(unittest.TestCase): @@ -189,5 +183,4 @@ def 
f(linear_out_): loss = T.switch(isneginf(-loss), 0, loss) return loss - unittest_tools.verify_grad( - f, [linear_out], rel_tol=.1, abs_tol=.1) + unittest_tools.verify_grad(f, [linear_out], abs_tol=0.05, rel_tol=0.05) diff --git a/papers/connectionist_temporal_classification/tests.ipynb b/papers/connectionist_temporal_classification/tests.ipynb index 538c39c..abd4061 100644 --- a/papers/connectionist_temporal_classification/tests.ipynb +++ b/papers/connectionist_temporal_classification/tests.ipynb @@ -61,6 +61,7 @@ "seqsize = np.array([\n", " np.random.randint(labsize[i] + 1, max_seqsize + 1)\n", " for i in range(batch_size)], dtype=np.int32)\n", + "seqsize[0] = max_seqsize\n", "\n", "linout = np.random.randn(\n", " max_seqsize, batch_size, voca_size + 1).astype(np.float32)" @@ -92,13 +93,6 @@ " time_major=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -155,7 +149,7 @@ "metadata": {}, "outputs": [], "source": [ - "th_grad[:, 1, :]" + "th_grad[:, 0, :]" ] }, { @@ -164,15 +158,8 @@ "metadata": {}, "outputs": [], "source": [ - "tf_grad[:, 1, :]" + "tf_grad[:, 0, :]" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 324f2a97ede4a2ed498b92d58982ced8a516fa80 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Sun, 24 Jun 2018 18:27:48 +0200 Subject: [PATCH 8/8] split ctc op in two to avoid redundant computation: no improvement though... --- .gitignore | 1 + ...onnectionist Temporal Classification.ipynb | 486 ++++++++++++++++ .../ctc.py | 159 ++++-- .../experiments-tf.ipynb | 369 ------------- .../experiments.ipynb | 522 ------------------ .../test_ctc.py | 181 ++++++ .../test_ctc2.py | 186 ------- .../tests.ipynb | 186 ------- 8 files changed, 774 insertions(+), 1316 deletions(-) create mode 100644 papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb delete mode 100644 papers/connectionist_temporal_classification/experiments-tf.ipynb delete mode 100644 papers/connectionist_temporal_classification/experiments.ipynb create mode 100644 papers/connectionist_temporal_classification/test_ctc.py delete mode 100644 papers/connectionist_temporal_classification/test_ctc2.py delete mode 100644 papers/connectionist_temporal_classification/tests.ipynb diff --git a/.gitignore b/.gitignore index db78efc..0053c11 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ __pycache__/ # datasets *.zip +papers/connectionist_temporal_classification/TIMIT # Distribution / packaging .Python diff --git a/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb b/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb new file mode 100644 index 0000000..350030a --- /dev/null +++ b/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb @@ -0,0 +1,486 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is an implementation the Connectionist Temporal Classification loss function:\n", + "\n", + "> Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. (2006, June). Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In Proceedings of the 23rd international conference on Machine learning (pp. 369-376). ACM. 
ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf\n", + "\n", + "This notebook only show the learning procedure, no thorough testing is performed and the prefix search decoding is not implemented (contributions are welcome!).\n", + "\n", + "The original paper seems to use size 1 minibatches instead of 16 here. There shouldn't be any significant variations otherwise.\n", + "\n", + "Please download the [TIMIT dataset](http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3) and place the `TIMIT.zip` file next to this one.\n", + "\n", + "The following python packages are required:\n", + "- scipy\n", + "- lasagne\n", + "- matplotlib\n", + "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", + "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib notebook\n", + "\n", + "import os\n", + "os.environ['THEANO_FLAGS'] = \"device=cuda\"\n", + "#os.environ['CUDA_LAUNCH_BLOCKING'] = \"1\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle as pkl\n", + "import numpy as np\n", + "from zipfile import ZipFile\n", + "from sphfile import SPHFile\n", + "from python_speech_features import mfcc\n", + "import theano\n", + "import theano.tensor as T\n", + "import lasagne\n", + "from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, GaussianNoiseLayer\n", + "from lasagne.init import Uniform\n", + "from lasagne.nonlinearities import tanh, sigmoid\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from ctc import ctc_loss, log_softmax, ctc_backward\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## small useful functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def smooth(x, w):\n", + " window = int(np.ceil(len(x) / 2 * (1000 ** w - 1) / 999))\n", + " window += 1 - window % 2\n", + " \n", + " if window < 3 or len(x) < window:\n", + " return x\n", + " \n", + " edge_weights = np.arange(1, window // 2 + 1)\n", + " return np.concatenate([\n", + " np.cumsum(x[:window // 2]) / edge_weights,\n", + " np.convolve(x, np.full([window], 1 / window), 'valid'),\n", + " np.cumsum(x[:-window // 2:-1])[::-1] / edge_weights[::-1]])\n", + "\n", + "def argmax_decode(preds, exclude=()):\n", + " preds = np.argmax(preds, axis=1)\n", + " decoded = [preds[0]]\n", + " for v in preds:\n", + " if v != decoded[-1]:\n", + " decoded.append(v)\n", + " \n", + " return np.array([v for v in decoded if v not in exclude])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", + " with ZipFile(\"TIMIT.zip\", 'r') as f:\n", + " f.extractall(path=\".\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = []\n", + "train_subset = []\n", + "\n", + "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " for f in filenames:\n", + " if f.endswith(\"WAV\"):\n", + " 
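To make the `argmax_decode` helper above concrete: it takes frame-wise class scores, collapses consecutive repeats of the best class and then drops the excluded labels (typically the blank). This is best-path decoding only, not the prefix search mentioned in the introduction. A self-contained toy example with hypothetical posteriors over three classes, class 2 acting as the blank:

```python
import numpy as np

def argmax_decode(preds, exclude=()):
    # Same helper as above: frame-wise argmax, collapse repeats, drop excluded labels.
    preds = np.argmax(preds, axis=1)
    decoded = [preds[0]]
    for v in preds:
        if v != decoded[-1]:
            decoded.append(v)
    return np.array([v for v in decoded if v not in exclude])

# Hypothetical posteriors over 3 classes; the frame-wise argmax path is [0, 0, 2, 0, 1, 1, 2].
preds = np.array([[.9, .05, .05],
                  [.8, .1, .1],
                  [.1, .1, .8],
                  [.7, .2, .1],
                  [.1, .8, .1],
                  [.2, .7, .1],
                  [.1, .1, .8]])
print(argmax_decode(preds, exclude=(2,)))  # [0 0 1]
```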
recording = SPHFile(dirpath + \"/\" + f).content\n", + " files.append(dirpath + \"/\" + f[:-4])\n", + " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", + "\n", + "files = np.array(files)\n", + "train_subset = np.array(train_subset, dtype=np.bool)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", + " features = []\n", + " labels = []\n", + "\n", + " for f in files:\n", + " recording = SPHFile(f + \".WAV\")\n", + " signal = recording.content\n", + " samplerate = recording.format['sample_rate']\n", + "\n", + " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", + " numcep=13, nfilt=26, appendEnergy=True)\n", + " derivatives = np.concatenate([\n", + " mfccfeats[1, None] - mfccfeats[0, None],\n", + " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", + " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", + "\n", + " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", + "\n", + " with open(f + \".PHN\") as phonem_file:\n", + " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", + "\n", + " m = np.mean(np.concatenate(features, axis=0))\n", + " s = np.std(np.concatenate(features, axis=0))\n", + "\n", + " for i in range(len(features)):\n", + " features[i] = (features[i] - m) / s\n", + "\n", + " vocabulary = set()\n", + " for lseq in labels:\n", + " vocabulary |= set(lseq)\n", + "\n", + " vocabulary = list(vocabulary)\n", + " vocabulary[-1], vocabulary[vocabulary.index('h#')] = \\\n", + " vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", + " blank = len(vocabulary) - 1\n", + "\n", + " for i in range(len(labels)):\n", + " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", + " \n", + " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", + " pkl.dump((features, labels, vocabulary, blank), f, -1)\n", + "\n", + "\n", + "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", + " features, labels, vocabulary, blank = pkl.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# let's go brutal and shove that in GPU memory\n", + "\n", + "n_sequences = len(features)\n", + "feat_size = features[0].shape[1]\n", + "max_duration = max(len(seq) for seq in features)\n", + "max_labels = max(len(seq) - 2 for seq in labels) # -2 for init and final blank\n", + "\n", + "durations = np.array([len(seq) for seq in features], dtype=np.int32)\n", + "nlabels = np.array([len(seq) - 2 for seq in labels], dtype=np.int32)\n", + "all_features = np.zeros((n_sequences, max_duration, feat_size), dtype=np.float32)\n", + "for i in range(n_sequences):\n", + " all_features[i, :durations[i]] = features[i]\n", + "all_labels = np.zeros((n_sequences, max_labels), dtype=np.int32)\n", + "for i in range(n_sequences):\n", + " all_labels[i, :nlabels[i]] = labels[i][1:-1]\n", + "\n", + "durations_var = T.as_tensor_variable(durations, name=\"durations\")\n", + "all_features_var = T.as_tensor_variable(all_features, name=\"all_features\")\n", + "nlabels_var = T.as_tensor_variable(nlabels, name=\"nlabels\")\n", + "all_labels_var = T.as_tensor_variable(all_labels, name=\"all_labels\")\n", + "\n", + "minibatch_indexes = T.ivector()\n", + "batch_features = all_features_var[minibatch_indexes]\n", + "batch_durations = 
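The cell above packs all variable-length utterances into one zero-padded tensor plus vectors of durations and label counts, so minibatches can be sliced by index on the GPU. A toy NumPy sketch of the same packing and of the boolean mask that the `ExpressionLayer` later derives from the durations (sequence lengths here are made up):

```python
import numpy as np

# Toy batch of variable-length sequences (made-up lengths), packed the same
# way as `all_features` / `durations` above.
feat_size = 26
seqs = [np.random.randn(t, feat_size).astype(np.float32) for t in (5, 3, 7)]

durations = np.array([len(s) for s in seqs], dtype=np.int32)
max_duration = int(durations.max())
batch = np.zeros((len(seqs), max_duration, feat_size), dtype=np.float32)
for i, s in enumerate(seqs):
    batch[i, :len(s)] = s

# The mask the ExpressionLayer derives from the durations: True on valid frames.
mask = np.arange(max_duration)[None, :] < durations[:, None]
print(batch.shape, mask.sum(axis=1))  # (3, 7, 26) [5 3 7]
```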
durations_var[minibatch_indexes]\n", + "batch_nlabels = nlabels_var[minibatch_indexes]\n", + "batch_labels = all_labels_var[minibatch_indexes]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 16\n", + "\n", + "l_in = InputLayer(\n", + " input_var=batch_features,\n", + " shape=(batch_size, max_duration, feat_size))\n", + "\n", + "l_duration = InputLayer(input_var=batch_durations, shape=(1,))\n", + "\n", + "l_mask = lasagne.layers.ExpressionLayer(\n", + " l_duration, \n", + " lambda d: T.arange(max_duration)[None, :] < d[:, None])\n", + "\n", + "l_noise = GaussianNoiseLayer(l_in, sigma=0.6)\n", + "# l_noise = l_in\n", + "\n", + "l_fwlstm = LSTMLayer(\n", + " l_noise, 100,\n", + " ingate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " forgetgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " cell=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=tanh),\n", + " outgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " nonlinearity=tanh,\n", + " mask_input=l_mask, peepholes=True)\n", + "l_bwlstm = LSTMLayer(\n", + " l_noise, 100,\n", + " ingate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " forgetgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " cell=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=tanh),\n", + " outgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " nonlinearity=tanh,\n", + " mask_input=l_mask, peepholes=True, backwards=True)\n", + "\n", + "l_cat = ConcatLayer([l_fwlstm, l_bwlstm], axis=2)\n", + "\n", + "l_linout = DenseLayer(\n", + " l_cat, len(vocabulary), \n", + " nonlinearity=None,\n", + " num_leading_axes=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_output = lasagne.layers.get_output(\n", + " l_linout, deterministic=False).dimshuffle(1, 0, 2)\n", + "\n", + "loss = ctc_loss(\n", + " linout=train_output,\n", + " durations=batch_durations,\n", + " labels=batch_labels,\n", + " label_sizes=batch_nlabels,\n", + " blank=blank)\n", + "\n", + "params = lasagne.layers.get_all_params(l_linout, trainable=True)\n", + "grads = theano.grad(loss.sum(), params)\n", + "updates = lasagne.updates.adam(\n", + " grads, params, \n", + " learning_rate=1e-4)\n", + "update_fn = theano.function(\n", + " [minibatch_indexes], \n", + " loss,\n", + " updates=updates)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "i = 0\n", + "nsteps = int(100 * n_sequences / batch_size)\n", + "params_history = []\n", + "loss_history = np.zeros((nsteps,))\n", + "\n", + "def update_plot(fig, ax1, ax2, loss_history):\n", + " ax1.clear()\n", + " ax1.set_xlim(0, len(loss_history))\n", + " ax1.set_yscale('log')\n", + " ax1.set_ylim(0.8 * np.percentile(loss_history, 1), \n", + " 1.2 * np.percentile(loss_history, 99))\n", + " ax1.grid(color='gray', linestyle='-', linewidth=1)\n", + " ax1.grid(color='gray', linestyle=':', which='minor', linewidth=1)\n", + " ax1.set_axisbelow(True)\n", + " xticks = np.arange(len(loss_history))\n", + " ax1.scatter(xticks, loss_history, marker='.', \n", + " color='firebrick', edgecolor=\"none\", alpha=0.1)\n", + " smooth_history = 
smooth(loss_history, 0.6)\n", + " ax1.plot(xticks, smooth_history, linewidth=2, color='firebrick')\n", + "\n", + " ax2.clear()\n", + " ax2.set_yscale('log')\n", + " ax2.set_ylim(0.8 * np.percentile(loss_history, 1), \n", + " 1.2 * np.percentile(loss_history, 99))\n", + " ax2.grid(False)\n", + " ax2.yaxis.set_label_position(\"right\")\n", + " ax2.set_yticks([], minor=True)\n", + " ax2.set_yticks([smooth_history[-1]])\n", + " ax2.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())\n", + "\n", + " fig.canvas.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure()\n", + "ax1 = fig.add_subplot(111)\n", + "xticks = np.arange(i)\n", + "ax1.set_xlim(0, i + 1)\n", + "ax1.set_ylim(0, 1)\n", + "ax2 = ax1.twinx()\n", + "\n", + "# Note: you can interrupt and resume the execution of this cell\n", + "while i < nsteps:\n", + " t1 = time.time()\n", + " batch_loss = np.mean(update_fn(\n", + " np.random.choice(n_sequences, batch_size).astype(np.int32)))\n", + " t2 = time.time()\n", + " \n", + " print(\"\\r{:<6d} loss = {:>5.0f}, (d={:1.2f})\".format(i, batch_loss, t2 - t1), end='', flush=True)\n", + " loss_history[i] = batch_loss\n", + "\n", + " if (i + 1) % 10 == 0: \n", + " update_plot(fig, ax1, ax2, loss_history[:i])\n", + "\n", + "# if (i + 1) % 1000 == 0:\n", + "# params_history.append(lasagne.layers.get_all_param_values(l_linout))\n", + "\n", + " i += 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_output = lasagne.layers.get_output(l_linout, deterministic=True)\n", + "\n", + "logits_fn = theano.function(\n", + " [minibatch_indexes],\n", + " [batch_features, batch_durations, \n", + " batch_labels, batch_nlabels, \n", + " test_output])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequence = 3\n", + "\n", + "f, d, l, n, p = logits_fn(np.array([sequence], dtype=np.int32))\n", + "f = f[0, :d[0]]\n", + "l = l[0, :n[0]]\n", + "p = p[0, :d[0]]\n", + "s = np.exp(p - np.max(p, axis=-1, keepdims=True)) \\\n", + " / np.sum(np.exp(p - np.max(p, axis=-1, keepdims=True)), axis=-1, keepdims=True)\n", + "\n", + "fig = plt.figure()\n", + "ax = plt.subplot(111)\n", + "lines = []\n", + "\n", + "for c in np.argsort(vocabulary[:-1]):\n", + " if c in l:\n", + " line, = ax.plot(np.arange(len(p)), s[:, c], label=vocabulary[c], picker=5)\n", + " lines.append(line)\n", + "\n", + "ax.plot(np.arange(len(p)), s[:, -1], linestyle=\":\")\n", + "\n", + "ax.set_ylim(0.0, 1.2)\n", + "# ax.set_yscale('log')\n", + "ax.set_title('Select curve to see the label')\n", + "\n", + "ax.legend(\n", + " framealpha=1,\n", + " loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=8)\n", + "\n", + "fig.subplots_adjust(bottom=0.5)\n", + "fig.show()\n", + "\n", + "def onpick(event):\n", + " for line in lines:\n", + " line.set_alpha(0.3)\n", + " line.set_linewidth(2)\n", + " \n", + " event.artist.set_alpha(1)\n", + " event.artist.set_linewidth(2)\n", + " ax.set_title(event.artist.get_label())\n", + "\n", + "cid = fig.canvas.mpl_connect('pick_event', onpick)\n", + "\n", + "print(\"target : {}\".format(\", \".join(vocabulary[l_] for l_ in l)))\n", + "print(\"prediction: {}\".format(\", \".join(vocabulary[l_] for l_ in argmax_decode(s, [blank]))))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 
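The evaluation cell converts the linear outputs to posteriors with a max-shifted softmax before plotting and decoding them. The same computation as a small standalone helper, shown only to illustrate why the shift keeps large logits from overflowing:

```python
import numpy as np

def softmax(logits, axis=-1):
    # Shift by the maximum before exponentiating so large logits cannot overflow.
    z = logits - np.max(logits, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

p = np.array([[1000.0, 1001.0, 999.0]])
print(softmax(p))  # [[0.2447 0.6652 0.0900]], rows sum to 1, no overflow
```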
3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index c6f8a54..4535d1d 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -21,7 +21,7 @@ def logaddexp(x, y, magnitude=20): x, y = T.minimum(x, y), T.maximum(x, y) diff = T.minimum(y - x, magnitude) res = x + T.log(1 + T.exp(diff)) - return T.switch((y - x > magnitude), y, res) + return T.switch((y - x) > magnitude, y, res) def logsumexp(x, axis, keepdims=False): @@ -51,13 +51,13 @@ def insert_alternating_blanks(labels, blank_label): return blanked_labels -def ctc_forward(log_odds, seq_sizes, +def ctc_forward(log_odds, durations, blanked_labels, label_sizes, not_repeated): seqsize, batch_sz, _ = log_odds.shape label_size = blanked_labels.shape[1] def step(t, a_tm1, log_odds_, - seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): + durations_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] a_t = a_tm1 a_t = T.set_subtensor( @@ -68,7 +68,7 @@ def step(t, a_tm1, log_odds_, logaddexp(a_t[:, 2:], T.switch(not_repeated_, a_tm1[:, :-2], -2e9))) # stop after a_T(|l'|) - mask = T.ge(t, seq_sizes_)[:, None] \ + mask = T.ge(t, durations_)[:, None] \ + T.ge(T.arange(label_size)[None, :], 2 * label_sizes_[:, None] + 1) @@ -82,32 +82,33 @@ def step(t, a_tm1, log_odds_, alphas, _ = theano.scan( fn=step, + n_steps=seqsize, + strict=True, sequences=[T.arange(seqsize)], outputs_info=alpha_init, - non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, + non_sequences=[log_odds, durations, blanked_labels, label_sizes, not_repeated], name="ctc_forward") return alphas -def ctc_backward(log_odds, seq_sizes, - blanked_labels, label_sizes, not_repeated): +def ctc_backward(log_odds, durations, blanked_labels, label_sizes, not_repeated): seqsize, batch_sz, _ = log_odds.shape label_size = blanked_labels.shape[1] def step(t, b_tp1, log_odds_, - seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): + durations_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion - starter_t = T.eq(t, seq_sizes_ - 1)[:, None] \ + starter_t = T.eq(t, durations_ - 1)[:, None] \ * T.eq((2 * label_sizes_)[:, None], T.arange(label_size)[None, :]) b_tp1_2lp1 = b_tp1[T.arange(batch_sz), 2 * label_sizes_] b_tp1 = T.set_subtensor( b_tp1_2lp1, - T.switch(T.eq(t, seq_sizes_ - 1), 0, b_tp1_2lp1)) + T.switch(T.eq(t, durations_ - 1), 0, b_tp1_2lp1)) b_tp1 = T.switch(starter_t, 0, b_tp1) # initialize recursion b_t = b_tp1 @@ -125,9 +126,11 @@ def step(t, b_tp1, log_odds_, betas, _ = theano.scan( fn=step, + n_steps=seqsize, + strict=True, sequences=[T.arange(seqsize)], outputs_info=beta_init, - non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, + non_sequences=[log_odds, durations, blanked_labels, label_sizes, not_repeated], go_backwards=True, name="ctc_backward") @@ -138,34 +141,28 @@ def step(t, b_tp1, log_odds_, # Theano Op ------------------------------------------------------------------- -def ctc_perform_graph(linout, seq_sizes, labels, label_sizes, blank): +def 
ctc_propagate(linout, durations, blanked_labels, label_sizes, not_repeated): _, batch_size, voca_size = linout.shape logits = log_softmax(linout) - blanked_labels = insert_alternating_blanks(labels, blank) - not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) - betas = ctc_backward(logits, seq_sizes, + betas = ctc_backward(logits, durations, blanked_labels, label_sizes, not_repeated) loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) - # alphas = ctc_forward(logits, seq_sizes, + # alphas = ctc_forward(logits, durations, # blanked_labels, label_sizes, not_repeated) # loss = - logaddexp( - # alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes - 1], - # alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes]) + # alphas[durations - 1, T.arange(batch_size), 2 * label_sizes - 1], + # alphas[durations - 1, T.arange(batch_size), 2 * label_sizes]) - return logits, blanked_labels, not_repeated, betas, loss + return loss, logits, betas -def ctc_grad_graph(inputs, output_gradients): - linout, seq_durations, labels, label_sizes, _ = inputs - seq_size, batch_size, voca_size = linout.shape - label_size = labels.shape[1] +def ctc_backprop(durations, blanked_labels, label_sizes, not_repeated, + logits, betas, loss, output_gradient): + seq_size, batch_size, voca_size = logits.shape - logits, blanked_labels, not_repeated, betas, loss = \ - ctc_perform_graph(*inputs) - - alphas = ctc_forward(logits, seq_durations, + alphas = ctc_forward(logits, durations, blanked_labels, label_sizes, not_repeated) # log(sum_{s \in lab(l, k)} a_t(s) b_t(s)) @@ -179,11 +176,11 @@ def fwbw_sum_step(k, s, labels_, ab_): ab = alphas + betas fwbw_sum = theano.scan( fn=fwbw_sum_step, - sequences=[T.arange(2 * label_size + 1)], + sequences=[T.arange(blanked_labels.shape[1])], outputs_info=-2e9 * T.ones((seq_size, batch_size, voca_size)), non_sequences=[blanked_labels, ab], strict=True, - name="fwbw_sum")[0][-1] + name="fwbw_sum")[0][-1] # should be unrolled if label_size is known A = loss[None, :, None] + logits \ + logsumexp(fwbw_sum - logits, axis=2, keepdims=True) @@ -193,38 +190,83 @@ def fwbw_sum_step(k, s, labels_, ab_): dloss_dy = T.switch(T.all(isneginf(fwbw_sum), axis=2, keepdims=True), 0, dloss_dy) - return [dloss_dy * output_gradients[0][None, :, None], - theano.gradient.disconnected_type(), - theano.gradient.disconnected_type(), - theano.gradient.disconnected_type(), - theano.gradient.disconnected_type()] + return dloss_dy * output_gradient[None, :, None] def make_ctc_op(): preds_var = T.tensor3() - seq_durations_var = T.ivector() - labels_var = T.imatrix() + durations_var = T.ivector() + blanked_labels_var = T.imatrix() + bool_matrix = T.TensorType("bool", (False, False)) + not_repeated_var = bool_matrix() label_sizes_var = T.ivector() - blank_var = T.iscalar() - _, _, _, _, loss = ctc_perform_graph( - preds_var, seq_durations_var, labels_var, - label_sizes_var, blank_var) + # linout, durations, labels, label_sizes, blank = inputs + # seq_size, batch_size, voca_size = linout.shape + # + # logits, blanked_labels, not_repeated, betas, loss = \ + # ctc_perform_graph(linout, durations, labels, label_sizes, blank) + + loss, logits, betas = ctc_propagate(preds_var, durations_var, blanked_labels_var, + label_sizes_var, not_repeated_var) + + def backprop_op1(inputs, output_gradients): + del inputs + return [ + output_gradients[0], + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type()] + + op1 = 
theano.OpFromGraph( + inputs=[preds_var, durations_var, + blanked_labels_var, label_sizes_var, + not_repeated_var], + outputs=[preds_var, logits, betas, loss], + grad_overrides=backprop_op1, + inline=True, name="ctcLossOp1") + + def backprop_op2(inputs, output_gradients): + preds_var_, logits_, betas_, loss_, \ + durations_, blanked_labels_, label_sizes_, not_repeated_ = inputs + output_gradient, = output_gradients + + g = ctc_backprop(durations_, blanked_labels_, label_sizes_, not_repeated_, + logits_, betas_, loss_, output_gradient) + + return [ + g, + T.zeros_like(logits_), + # theano.gradient.disconnected_type(), + T.zeros_like(betas_), + # theano.gradient.disconnected_type(), + T.zeros_like(loss_), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type()] - return theano.OpFromGraph( - inputs=[preds_var, seq_durations_var, - labels_var, label_sizes_var, blank_var], - outputs=[loss], - grad_overrides=ctc_grad_graph, - inline=True, name="ctcLossOp") + preds, logits, betas, loss = op1( + preds_var, durations_var, + blanked_labels_var, label_sizes_var, + not_repeated_var) + op2 = theano.OpFromGraph( + inputs=[preds, logits, betas, loss, + durations_var, blanked_labels_var, label_sizes_var, + not_repeated_var], + outputs=[loss + preds.sum() * 0 + logits.sum() * 0 + betas.sum() * 0], + grad_overrides=backprop_op2, + inline=True, name="ctcLossOp2") -CTCLossOp = make_ctc_op() + return op1, op2 # ----------------------------------------------------------------------------- -def ctc_loss(linout, durations, labels, label_sizes, blank=-1): +def ctc_loss(preds, durations, labels, label_sizes, blank=-1): """Compute the Connectionnist Temporal Classification loss [#graves2006]_. .. math:: L = - ln\left( \sum_{\pi \in \mathcal{B}^{-1}(l)} P(\pi | y) @@ -238,7 +280,7 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): Parameters ---------- - linout : Theano shared variable, expression or numpy array + preds : Theano shared variable, expression or numpy array The input values for the softmax function with shape duration x batch_size x nclasses. durations: Theano shared variable, expression or numpy array @@ -266,13 +308,13 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): Proceedings of the 23rd international conference on Machine learning (pp. 369-376). ACM. 
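Both `ctcLossOp1` and `ctcLossOp2` are built with `theano.OpFromGraph` and a `grad_overrides` callback that receives the op inputs and the output gradients and returns one gradient per input. As a minimal illustration of that mechanism, here is a toy square op with a hand-written gradient (hypothetical example, not part of the patch):

```python
import numpy as np
import theano
import theano.tensor as T

x = T.vector("x")

def grad_override(inputs, output_gradients):
    # Receives the op inputs and the gradient w.r.t. each output,
    # and must return one gradient expression per input.
    x_, = inputs
    g_y, = output_gradients
    return [2 * x_ * g_y]

square_op = theano.OpFromGraph(
    inputs=[x], outputs=[x ** 2],
    grad_overrides=grad_override,
    inline=True, name="squareOp")

z = T.vector("z")
y = square_op(z)
g = theano.grad(y.sum(), z)
print(g.eval({z: np.array([1., 2., 3.], dtype=theano.config.floatX)}))  # [2. 4. 6.]
```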
ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf """ - linout = T.as_tensor_variable(linout) + preds = T.as_tensor_variable(preds) durations = T.as_tensor_variable(durations) labels = T.as_tensor_variable(labels) label_sizes = T.as_tensor_variable(label_sizes) blank = T.cast(T.as_tensor_variable(blank), 'int32') - if not(linout.dtype in continuous_dtypes and linout.ndim == 3): + if not(preds.dtype in continuous_dtypes and preds.ndim == 3): raise ValueError("preds must continuous with dimension 3") if not (durations.dtype in discrete_dtypes and durations.ndim == 1): raise ValueError("durations must be a integer vector") @@ -283,8 +325,19 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): if not (blank.dtype in discrete_dtypes and blank.ndim == 0): raise ValueError("blank must be an integer value") - voca_size = T.cast(linout.shape[2], 'int32') + voca_size = T.cast(preds.shape[2], 'int32') labels = labels % voca_size blank = blank % voca_size - return CTCLossOp(linout, durations, labels, label_sizes, blank) + op1, op2 = make_ctc_op() + + blanked_labels = insert_alternating_blanks(labels, blank) + not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) + + preds, logits, betas, loss = op1(preds, durations, + blanked_labels, label_sizes, + not_repeated) + loss = op2(preds, logits, betas, loss, + durations, blanked_labels, label_sizes, not_repeated) + + return loss diff --git a/papers/connectionist_temporal_classification/experiments-tf.ipynb b/papers/connectionist_temporal_classification/experiments-tf.ipynb deleted file mode 100644 index effc1cb..0000000 --- a/papers/connectionist_temporal_classification/experiments-tf.ipynb +++ /dev/null @@ -1,369 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Requirements\n", - "\n", - "Please download the timit dataset at http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the TIMIT.zip file next to this file.\n", - "\n", - "The following python packages are required:\n", - "- lasagne\n", - "- matplotlib\n", - "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", - "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "% autoreload 2\n", - "#%matplotlib inline\n", - "# %env CUDA_VISIBLE_DEVICES=\"1\"\n", - "from IPython.core.interactiveshell import InteractiveShell\n", - "InteractiveShell.ast_node_interactivity = \"all\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pickle as pkl\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "from zipfile import ZipFile\n", - "from sphfile import SPHFile\n", - "from python_speech_features import mfcc\n", - "import tensorflow as tf\n", - "import keras as K\n", - "from keras.models import Model\n", - "from keras.layers import Input, Dense, LSTM, Concatenate, Layer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", - " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", - " with ZipFile(\"TIMIT.zip\", 'r') as f:\n", - " f.extractall(path=\".\")" - ] - }, 
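The `ctc_loss` wrapper now computes `blanked_labels` and `not_repeated` once and hands them to both ops. A quick NumPy illustration of the layout these two arrays encode, mirroring `insert_alternating_blanks` on a hypothetical sequence with a repeated label (blank index 4):

```python
import numpy as np

labels = np.array([[0, 1, 1]], dtype=np.int32)  # hypothetical sequence "a b b"
blank = 4

# Blanked sequence l': a blank before, between and after every label,
# mirroring what insert_alternating_blanks builds.
blanked = np.full((1, 2 * labels.shape[1] + 1), blank, dtype=np.int32)
blanked[:, 1::2] = labels
print(blanked)  # [[4 0 4 1 4 1 4]]

# not_repeated gates the "skip one position" transition of the CTC recursion:
# it is True only where l'_s differs from l'_{s-2}, so the repeated label (the
# second "1") cannot be reached by skipping the blank that separates it.
not_repeated = blanked[:, 2:] != blanked[:, :-2]
print(not_repeated.astype(int))  # [[0 1 0 0 0]]
```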
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "files = []\n", - "train_subset = []\n", - "\n", - "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", - " for f in filenames:\n", - " if f.endswith(\"WAV\"):\n", - " recording = SPHFile(dirpath + \"/\" + f).content\n", - " files.append(dirpath + \"/\" + f[:-4])\n", - " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", - "\n", - "files = np.array(files)\n", - "train_subset = np.array(train_subset, dtype=np.bool)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", - " features = []\n", - " labels = []\n", - "\n", - " for f in files:\n", - " recording = SPHFile(f + \".WAV\")\n", - " signal = recording.content\n", - " samplerate = recording.format['sample_rate']\n", - "\n", - " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", - " numcep=13, nfilt=26, appendEnergy=True)\n", - " derivatives = np.concatenate([\n", - " mfccfeats[1, None] - mfccfeats[0, None],\n", - " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", - " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", - "\n", - " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", - "\n", - " with open(f + \".PHN\") as phonem_file:\n", - " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", - "\n", - " m = np.mean(np.concatenate(features, axis=0))\n", - " s = np.std(np.concatenate(features, axis=0))\n", - "\n", - " for i in range(len(features)):\n", - " features[i] = (features[i] - m) / s\n", - "\n", - " vocabulary = set()\n", - " for lseq in labels:\n", - " vocabulary |= set(lseq)\n", - "\n", - " vocabulary = list(vocabulary)\n", - " vocabulary[-1], vocabulary[vocabulary.index('h#')] = vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", - "\n", - " for i in range(len(labels)):\n", - " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", - "\n", - " blank = 60\n", - " \n", - " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", - " pkl.dump((features, labels, vocabulary, blank), f)\n", - "\n", - "\n", - "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", - " features, labels, vocabulary, blank = pkl.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=(20, 9))\n", - "plt.imshow(features[1].transpose(), clim=(-4, 4))\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def zero_loss(y_true, y_pred):\n", - " return K.backend.zeros_like(y_pred)\n", - "\n", - "def dense_to_sparse(x):\n", - " idx = tf.where(tf.greater_equal(x, 0))\n", - " return tf.SparseTensor(idx, tf.gather_nd(x, idx), tf.shape(x, out_type=tf.int64))\n", - "\n", - "class CTCLossLayer(Layer):\n", - " def __init__(self, **kwargs):\n", - " super(CTCLossLayer, self).__init__(**kwargs)\n", - "\n", - " def call(self, x, mask=None):\n", - " linout = x[0]\n", - " targets = x[1]\n", - " durations = x[2]\n", - " loss = tf.nn.ctc_loss(\n", - " dense_to_sparse(targets), linout,\n", - " sequence_length=durations[:, 0],\n", - " time_major=False)\n", - " 
self.add_loss(tf.reduce_sum(loss), x)\n", - " return loss\n", - "\n", - " def compute_output_shape(self, input_shape):\n", - " return input_shape[0][0]\n", - "\n", - "a = Input(shape=(None, features[0].shape[1]), name=\"features\")\n", - "targets = Input(shape=[None], dtype='int32', name=\"targets\")\n", - "durations = Input(shape=[1], dtype='int32', name=\"durations\")\n", - "b1 = LSTM(100, return_sequences=True)(a)\n", - "b2 = LSTM(100, return_sequences=True, go_backwards=True)(a)\n", - "c = Concatenate(axis=2)([b1, b2])\n", - "d = Dense(len(vocabulary), activation=None)(c)\n", - "l = CTCLossLayer()([d, targets, durations])\n", - "model = Model(inputs=[a, targets, durations], outputs=[d, l])\n", - "sgd = K.optimizers.SGD(lr=1e-4, momentum=0.9, nesterov=True)\n", - "\n", - "model.summary()\n", - "\n", - "model.compile(\n", - " target_tensors=[targets, targets], \n", - " loss=[zero_loss, zero_loss], \n", - " optimizer=sgd)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# lasagne.layers.set_all_param_values(l_linout, params_backup[0])\n", - "\n", - "params_backup = []\n", - "running_loss = None\n", - "\n", - "for i in np.random.permutation(len(labels))[:300]:\n", - " f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", - "\n", - " batch_loss = model.train_on_batch(\n", - " x=[f, l, np.array([f.shape[1]], np.int32)],\n", - " y=[l, l])[0]\n", - "\n", - " if batch_loss > 10000:\n", - " print(\"\\nskipped i = {}\".format(i))\n", - " continue\n", - " else:\n", - " running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss\n", - " print(\"\\rloss = {:>5.0f}\".format(running_loss), end='', flush=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluate model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def argmax_decode(preds):\n", - " decoded = [preds[0]]\n", - " for v in preds:\n", - " if v != decoded[-1]:\n", - " decoded.append(v)\n", - " \n", - " return np.array(decoded, dtype=np.int32)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "features[i].shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "i = 0\n", - "inputs = [features[i][None, :, :], labels[i][None, 1:-1], np.array([features[i].shape[0]], np.int32)]\n", - "logits = model.predict(inputs)[0][0]\n", - "# preds -= np.max(preds, axis=1, keepdims=True)\n", - "# preds = np.exp(preds)\n", - "# preds /= np.sum(preds, axis=1, keepdims=True)\n", - "lbl_preds = argmax_decode(np.argmax(preds, axis=-1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "o = np.argsort(np.mean(logits[:, :60], axis=0))\n", - "plt.figure(figsize=(10, 10))\n", - "for c in o:\n", - " plt.plot(np.arange(len(logits)), logits[:, c]);\n", - "\n", - "plt.plot(np.arange(len(logits)), logits[:, -1], linestyle=\":\");\n", - "plt.legend([vocabulary[o_] for o_ in o] + [vocabulary[-1]], bbox_to_anchor=(.6, 0, 1, 1), ncol=5)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "preds[:, -1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"model.layers[4].get_weights()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "weights" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/papers/connectionist_temporal_classification/experiments.ipynb b/papers/connectionist_temporal_classification/experiments.ipynb deleted file mode 100644 index 1e8b626..0000000 --- a/papers/connectionist_temporal_classification/experiments.ipynb +++ /dev/null @@ -1,522 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Requirements\n", - "\n", - "Please download the timit dataset at http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the TIMIT.zip file next to this file.\n", - "\n", - "The following python packages are required:\n", - "- lasagne\n", - "- matplotlib\n", - "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", - "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%matplotlib inline\n", - "\n", - "import os\n", - "os.environ['THEANO_FLAGS'] = \"device=cpu\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle as pkl\n", - "import numpy as np\n", - "from zipfile import ZipFile\n", - "from sphfile import SPHFile\n", - "from python_speech_features import mfcc\n", - "import lasagne\n", - "from lasagne.layers import InputLayer, GaussianNoiseLayer, LSTMLayer, DenseLayer, ConcatLayer, ReshapeLayer\n", - "import theano\n", - "import theano.tensor as T\n", - "from theano.compile.nanguardmode import NanGuardMode\n", - "import matplotlib.pyplot as plt\n", - "from ctc import ctc_loss, log_softmax, insert_alternating_blanks, ctc_backward" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", - " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", - " with ZipFile(\"TIMIT.zip\", 'r') as f:\n", - " f.extractall(path=\".\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "files = []\n", - "train_subset = []\n", - "\n", - "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", - " for f in filenames:\n", - " if f.endswith(\"WAV\"):\n", - " recording = SPHFile(dirpath + \"/\" + f).content\n", - " files.append(dirpath + \"/\" + f[:-4])\n", - " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", - "\n", - "files = np.array(files)\n", - "train_subset = np.array(train_subset, dtype=np.bool)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preprocessing" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", - " features = []\n", - " labels = []\n", - "\n", - " for f in files:\n", - " recording = SPHFile(f + \".WAV\")\n", - " signal = recording.content\n", - " samplerate = recording.format['sample_rate']\n", - "\n", - " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", - " numcep=13, nfilt=26, appendEnergy=True)\n", - " derivatives = np.concatenate([\n", - " mfccfeats[1, None] - mfccfeats[0, None],\n", - " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", - " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", - "\n", - " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", - "\n", - " with open(f + \".PHN\") as phonem_file:\n", - " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", - "\n", - " m = np.mean(np.concatenate(features, axis=0))\n", - " s = np.std(np.concatenate(features, axis=0))\n", - "\n", - " for i in range(len(features)):\n", - " features[i] = (features[i] - m) / s\n", - "\n", - " vocabulary = set()\n", - " for lseq in labels:\n", - " vocabulary |= set(lseq)\n", - "\n", - " vocabulary = list(vocabulary)\n", - " vocabulary[-1], vocabulary[vocabulary.index('h#')] = vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", - "\n", - " for i in range(len(labels)):\n", - " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", - "\n", - " blank = len(labels) - 1\n", - " \n", - " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", - " pkl.dump((features, labels, vocabulary, blank), f, -1)\n", - "\n", - "\n", - "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", - " features, labels, vocabulary, blank = pkl.load(f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n", - "\n", - "class SmallGaussianNoiseLayer(lasagne.layers.Layer):\n", - " \"\"\"Gaussian noise layer (clipped for safety)\"\"\"\n", - " def __init__(self, incoming, sigma=0.1, **kwargs):\n", - " super(SmallGaussianNoiseLayer, self).__init__(incoming, **kwargs)\n", - " self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))\n", - " self.sigma = sigma\n", - "\n", - " def get_output_for(self, input, deterministic=False, **kwargs):\n", - " if deterministic or self.sigma == 0:\n", - " return input\n", - " else:\n", - " noise = self._srng.normal(input.shape, avg=0.0, std=self.sigma)\n", - " return input + T.clip(noise, -3 * self.sigma, 3 * self.sigma)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "l_in = InputLayer(shape=(None, 1557, 26))\n", - "l_duration = InputLayer(input_var=T.ivector(name=\"duration\"), shape=(1,))\n", - "l_mask = lasagne.layers.ExpressionLayer(l_duration, lambda d: T.arange(1557)[None, :] < d[:, None])\n", - "l_noise = SmallGaussianNoiseLayer(l_in, sigma=0.6)\n", - "l_fwlstm = LSTMLayer(\n", - " l_noise, 100, mask_input=l_mask)\n", - "l_bwlstm = LSTMLayer(\n", - " l_noise, 100, mask_input=l_mask,\n", - " backwards=True)\n", - "l_cat = ConcatLayer([l_fwlstm, l_bwlstm], axis=2)\n", - "l_linout = DenseLayer(l_cat, len(vocabulary), nonlinearity=None, num_leading_axes=2)\n", - "\n", - "input_var = l_in.input_var\n", - "duration_var = 
l_duration.input_var\n", - "labels_var = T.imatrix()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_output = lasagne.layers.get_output(l_linout, deterministic=False).dimshuffle(1, 0, 2)\n", - "\n", - "loss = ctc_loss(\n", - " linout=train_output,\n", - " durations=duration_var,\n", - " labels=labels_var,\n", - " label_sizes=T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", - " blank=blank\n", - ")\n", - "\n", - "params = lasagne.layers.get_all_params(l_linout, trainable=True)\n", - "grads = theano.grad(loss.sum(), params)\n", - "updates = lasagne.updates.nesterov_momentum(grads, params, learning_rate=1e-4)\n", - "\n", - "update_fn = theano.function(\n", - " [input_var, duration_var, labels_var], \n", - " loss, \n", - " updates=updates,\n", - " # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "params_history = []\n", - "loss_history = []\n", - "running_loss = None\n", - "failed = []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for e in range(10):\n", - " for i in np.random.permutation(len(labels)):\n", - " f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", - " d = np.array([f.shape[1]], dtype=np.int32)\n", - " f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)\n", - "\n", - " batch_loss = float(update_fn(f, d, l))\n", - "\n", - " if batch_loss > 10000 or np.isnan(batch_loss):\n", - " print(\"\\nskipped i = {} because loss was {}\".format(i, batch_loss))\n", - " raise RuntimeError()\n", - " else:\n", - " running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss\n", - " print(\"\\r{:4d} loss = {:>5.0f} -> {:>5.0f}\".format(i, batch_loss, running_loss), end='', flush=True)\n", - "\n", - " if i % 25:\n", - " params_history.append(lasagne.layers.get_all_param_values(l_linout))\n", - " loss_history.append(running_loss)\n", - "\n", - " # batch_loss = loss_fn(f, l)\n", - " # if batch_loss > 5000:\n", - " # print('loss = {:>5.0f} > 5000 at element {:d}'.format(batch_loss, i))\n", - " # raise\n", - " # else:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.plot(loss_history)\n", - "plt.yscale('log')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.argmin(loss_history[::25])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "lasagne.layers.set_all_param_values(l_linout, params_history[6000//25])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluate model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "i = 0\n", - "f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", - "f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "blanked_labels = insert_alternating_blanks(labels_var, blank)\n", - "not_repeated = T.neq(blanked_labels[:, 2:], 
blanked_labels[:, :-2])\n", - "betas = ctc_backward(\n", - " log_softmax(T.unbroadcast(train_output.dimshuffle(1, 0, 2), 1)),\n", - " T.cast(T.reshape(train_output.shape[0], (1,)), 'int32'), \n", - " blanked_labels,\n", - " T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", - " not_repeated)\n", - "test_output = lasagne.layers.get_output(l_linout, deterministic=True)\n", - "\n", - "loss_fn = theano.function([input_var, duration_var, labels_var], loss)\n", - "beta_fn = theano.function([input_var, duration_var, labels_var], betas)\n", - "grads_fn = theano.function([input_var, duration_var, labels_var], grads)\n", - "predict_fn = theano.function([input_var, duration_var], T.exp(log_softmax(test_output[:, 0, :])))\n", - "logits_fn = theano.function([input_var, duration_var], test_output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "logits = logits_fn(f, d)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "o = np.argsort(np.mean(logits[:, :60], axis=0))\n", - "plt.figure(figsize=(10, 10))\n", - "for c in o:\n", - " plt.plot(np.arange(len(logits)), logits[:, c])\n", - "\n", - "plt.plot(np.arange(len(logits)), logits[:, -1], linestyle=\":\")\n", - "plt.legend([vocabulary[o_] for o_ in o] + [vocabulary[-1]], bbox_to_anchor=(.6, 0, 1, 1), ncol=5)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# beta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "features[i].shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "betas = ctc_backward(\n", - " log_softmax(train_output),\n", - " T.cast(T.reshape(train_output.shape[0], (1,)), 'int32'), \n", - " blanked_labels,\n", - " T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", - " not_repeated)\n", - "beta_fn = theano.function([input_var, duration_var, labels_var], betas)\n", - "\n", - "b = beta_fn(f, d, l)\n", - "\n", - "plt.figure(figsize=(10, 6))\n", - "plt.imshow(b[0:, 0, 0:], clim=(-5000, max(0, np.max(b))))\n", - "plt.gca().set_aspect(0.1)\n", - "plt.colorbar()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "p = lasagne.layers.get_all_param_values(l_linout, trainable=True)\n", - "for p_ in p:\n", - " print((p_.min(), p_.max()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "g = theano.grad(loss.sum(), wrt=train_output).eval({\n", - " input_var: f,\n", - " duration_var: d,\n", - " labels_var: l\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.subplot(2, 1, 1)\n", - "plt.bar(np.arange(len(vocabulary)), g[:, 0, np.concatenate((o, [60]))].mean(axis=0))\n", - "plt.subplot(2, 1, 2)\n", - "plt.plot(g[:, 0, :].mean(axis=1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def argmax_decode(preds):\n", - " decoded = [preds[0]]\n", - " for v in preds:\n", - " if v != decoded[-1]:\n", - " decoded.append(v)\n", - " \n", - " return np.array(decoded, dtype=np.int32)\n", - "\n", - "lbl_preds = 
argmax_decode(np.argmax(logits, axis=-1))\n", - "lbl_tgt = labels[i]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/papers/connectionist_temporal_classification/test_ctc.py b/papers/connectionist_temporal_classification/test_ctc.py new file mode 100644 index 0000000..204d5a5 --- /dev/null +++ b/papers/connectionist_temporal_classification/test_ctc.py @@ -0,0 +1,181 @@ +import numpy as np +import theano +import theano.tensor as T +from theano.tests import unittest_tools + +from papers.connectionist_temporal_classification.ctc import ctc_loss, isneginf + + +# def test_forward_backward(): +# batch_size = 6 +# label_size = 7 +# voca_size = 5 +# seq_size = 10 +# +# label_lengths = np.random.randint(0, label_size, +# size=(batch_size,), dtype=np.int32) +# label_lengths[0] = label_size # extremum case +# label_lengths[1] = 0 # extremum case +# labels = np.array( +# [np.random.randint(0, voca_size - 1, size=label_size, dtype=np.int32) +# for _ in range(batch_size)]) +# for i in range(batch_size): +# labels[i, label_lengths[i]:] = -1 +# +# seq_durations = np.array([ +# np.random.randint(max(1, label_lengths[i]), seq_size) +# for i in range(batch_size)], dtype=np.int32) +# +# linear_out = np.random.randn(seq_size, batch_size, voca_size) \ +# .astype(np.float32) +# +# blank_class = -1 +# blank_class = np.mod(blank_class, voca_size) +# +# labels = np.mod(labels, voca_size) +# +# log_odds = log_softmax(linear_out) +# blanked_labels = insert_alternating_blanks(T.mod(labels, voca_size), +# blank_class) +# not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) +# +# alphas = ctc_forward(log_odds, seq_durations, +# blanked_labels, label_lengths, not_repeated) +# betas = ctc_backward(log_odds, seq_durations, +# blanked_labels, label_lengths, not_repeated) +# +# preds = log_softmax(linear_out) +# +# y_blanks = preds[:, T.arange(batch_size)[:, None], blanked_labels] +# p_l = T.sum(T.exp(alphas + betas - y_blanks), axis=2) +# +# alphas = alphas.eval() +# betas = betas.eval() +# preds = preds.eval() +# +# for i in range(batch_size): +# assert np.allclose(alphas[0, i, 0], preds[0, i, -1]) +# if label_lengths[i] > 0: +# assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]]) +# else: +# assert isneginf(alphas[0, i, 1]) +# assert np.all(isneginf(alphas[0, i, 2:])) +# +# for i in range(batch_size): +# t = seq_durations[i] - 1 +# l = label_lengths[i] +# assert np.allclose(betas[t, i, 2 * l], preds[t, i, -1]) +# if l > 0: +# assert np.allclose(betas[t, i, 2 * l - 1], +# preds[t, i, labels[i, l - 1]]) +# assert np.all(isneginf(betas[t, i, :max(l - 2, 0)])) +# else: +# assert np.all(isneginf(betas[t, i, 1:])) +# +# p_l = p_l.eval() +# +# for i in range(batch_size): +# assert (np.allclose(p_l[:seq_durations[i], i], p_l[0, i])) +# a, b = max(0, 2 * label_lengths[i] - 1), 2 * label_lengths[i] + 
1 +# p_li = np.exp(alphas[seq_durations[i] - 1, i, a:b]).sum() +# assert np.allclose(p_li, p_l[0, i]) +# p_li = np.exp(betas[0, i, :2]).sum() +# assert np.allclose(p_li, p_l[0, i]) + + +def test_simple_precomputed(): + # Test obtained from Torch tutorial at: + # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md + + linear_out = np.asarray([ + [[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]], + [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]], + [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]] + ], dtype=np.float32) + + seq_sizes = np.asarray([1, 3, 3], dtype=np.int32) + + labels = np.asarray([[1, 0], [3, 3], [2, 3]], dtype=np.int32) + + label_sizes = np.asarray([1, 2, 2], dtype=np.int32) + + expected_losses = np.asarray([1.609437943, 7.355742931, 4.938849926], + dtype=np.float32) + + blank = 0 + + expected_grad = np.asarray([ + [[0.2, -0.8, 0.2, 0.2, 0.2], + [ 0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]], + [[0, 0, 0, 0, 0], + [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627], + [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]], + [[0, 0, 0, 0, 0], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]] + ], dtype=np.float32) + + linear_out_var = T.as_tensor_variable(linear_out) + losses = ctc_loss( + linear_out_var, seq_sizes, labels, label_sizes, blank) + + assert np.allclose(losses.eval(), expected_losses, atol=1) + + grad = theano.grad(losses.sum(), wrt=linear_out_var) + + assert np.allclose(grad.eval(), expected_grad, rtol=.001, atol=1) + + +def test_random(): + batch_size = 16 + label_size = 5 + voca_size = 4 + seq_size = 20 + + label_sizes = np.random.randint( + 0, label_size, size=(batch_size,), dtype=np.int32) + label_sizes[0] = label_size + label_sizes[1] = 0 + label_sizes[2] = 5 + label_sizes[3] = 5 + + labels = np.random.randint( + 0, voca_size - 1, + size=(batch_size, label_size), dtype=np.int32) + labels[3] = 0 + + seq_sizes = np.array([ + np.random.randint(max(1, label_sizes[i]), seq_size) + for i in range(batch_size)], dtype=np.int32) + seq_sizes[2] = 4 + + linear_out = np.random.randn( + seq_size, batch_size, voca_size).astype(np.float32) + + # check edge cases + # TODO + + # check the gradient can be computed at all + linear_out_var = T.tensor3() + preds = T.nnet.softmax( + linear_out_var.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + + g = theano.grad(ctc_loss(preds, seq_sizes, + labels, label_sizes).sum(), + wrt=linear_out_var).eval( + {linear_out_var: linear_out.astype(np.float32)}) + assert not np.any(np.isnan(g)) + + # check correctness against finite difference approximation + def f(linear_out_): + preds_ = T.nnet.softmax( + linear_out_.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) + # prevent finite differences from failing + loss = T.switch(isneginf(-loss), 0, loss) + return loss + + unittest_tools.verify_grad(f, [linear_out], abs_tol=0.05, rel_tol=0.05) diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py deleted file mode 100644 index 10d3425..0000000 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ /dev/null @@ -1,186 +0,0 @@ -import unittest 
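As a quick cross-check of the precomputed values in test_simple_precomputed above: batch element 0 has a single frame of all-zero logits over 5 classes and a single target label, so the only valid CTC path emits that label at t=0 with probability softmax(0)[1] = 1/5, and the expected loss is -log(1/5). A small sketch of that arithmetic:

import numpy as np

logits = np.zeros(5)
probs = np.exp(logits) / np.exp(logits).sum()      # uniform, 0.2 per class
assert np.isclose(-np.log(probs[1]), 1.609437943)  # matches expected_losses[0]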
-import numpy as np -import theano -import theano.tensor as T -from theano.tests import unittest_tools - -from papers.connectionist_temporal_classification.ctc import \ - ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks, \ - isneginf, log_softmax - - -class TestCTC(unittest.TestCase): - def setUp(self): - unittest_tools.seed_rng() - - def test_forward_backward(self): - batch_size = 6 - label_size = 7 - voca_size = 5 - seq_size = 10 - - label_lengths = np.random.randint(0, label_size, - size=(batch_size,), dtype=np.int32) - label_lengths[0] = label_size # extremum case - label_lengths[1] = 0 # extremum case - labels = np.array( - [np.random.randint(0, voca_size - 1, size=label_size, dtype=np.int32) - for _ in range(batch_size)]) - for i in range(batch_size): - labels[i, label_lengths[i]:] = -1 - - seq_durations = np.array([ - np.random.randint(max(1, label_lengths[i]), seq_size) - for i in range(batch_size)], dtype=np.int32) - - linear_out = np.random.randn(seq_size, batch_size, voca_size) \ - .astype(np.float32) - - blank_class = -1 - blank_class = np.mod(blank_class, voca_size) - - labels = np.mod(labels, voca_size) - - log_odds = log_softmax(linear_out) - blanked_labels = insert_alternating_blanks(T.mod(labels, voca_size), - blank_class) - not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) - - alphas = ctc_forward(log_odds, seq_durations, - blanked_labels, label_lengths, not_repeated) - betas = ctc_backward(log_odds, seq_durations, - blanked_labels, label_lengths, not_repeated) - - preds = log_softmax(linear_out) - - y_blanks = preds[:, T.arange(batch_size)[:, None], blanked_labels] - p_l = T.sum(T.exp(alphas + betas - y_blanks), axis=2) - - alphas = alphas.eval() - betas = betas.eval() - preds = preds.eval() - - for i in range(batch_size): - assert np.allclose(alphas[0, i, 0], preds[0, i, -1]) - if label_lengths[i] > 0: - assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]]) - else: - assert isneginf(alphas[0, i, 1]) - assert np.all(isneginf(alphas[0, i, 2:])) - - for i in range(batch_size): - t = seq_durations[i] - 1 - l = label_lengths[i] - assert np.allclose(betas[t, i, 2 * l], preds[t, i, -1]) - if l > 0: - assert np.allclose(betas[t, i, 2 * l - 1], - preds[t, i, labels[i, l - 1]]) - assert np.all(isneginf(betas[t, i, :max(l - 2, 0)])) - else: - assert np.all(isneginf(betas[t, i, 1:])) - - p_l = p_l.eval() - - for i in range(batch_size): - assert (np.allclose(p_l[:seq_durations[i], i], p_l[0, i])) - a, b = max(0, 2 * label_lengths[i] - 1), 2 * label_lengths[i] + 1 - p_li = np.exp(alphas[seq_durations[i] - 1, i, a:b]).sum() - assert np.allclose(p_li, p_l[0, i]) - p_li = np.exp(betas[0, i, :2]).sum() - assert np.allclose(p_li, p_l[0, i]) - - def test_simple_precomputed(self): - # Test obtained from Torch tutorial at: - # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md - - linear_out = np.asarray([ - [[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]], - [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]], - [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]] - ], dtype=np.float32) - - seq_sizes = np.asarray([1, 3, 3], dtype=np.int32) - - labels = np.asarray([[1, 0], [3, 3], [2, 3]], dtype=np.int32) - - label_sizes = np.asarray([1, 2, 2], dtype=np.int32) - - expected_losses = np.asarray([1.609437943, 7.355742931, 4.938849926], - dtype=np.float32) - - blank = 0 - - expected_grad = np.asarray([ - [[0.2, -0.8, 0.2, 0.2, 0.2], - [ 0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 
0.636408627], - [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]], - [[0, 0, 0, 0, 0], - [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627], - [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]], - [[0, 0, 0, 0, 0], - [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], - [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]] - ], dtype=np.float32) - - linear_out_var = T.as_tensor_variable(linear_out) - losses = ctc_loss( - linear_out_var, seq_sizes, labels, label_sizes, blank) - - assert np.allclose(losses.eval(), expected_losses, atol=1) - - grad = theano.grad(losses.sum(), wrt=linear_out_var) - - assert np.allclose(grad.eval(), expected_grad, rtol=.001, atol=1) - - def test_random(self): - batch_size = 16 - label_size = 5 - voca_size = 4 - seq_size = 20 - - label_sizes = np.random.randint( - 0, label_size, size=(batch_size,), dtype=np.int32) - label_sizes[0] = label_size - label_sizes[1] = 0 - label_sizes[2] = 5 - label_sizes[3] = 5 - - labels = np.random.randint( - 0, voca_size - 1, - size=(batch_size, label_size), dtype=np.int32) - labels[3] = 0 - - seq_sizes = np.array([ - np.random.randint(max(1, label_sizes[i]), seq_size) - for i in range(batch_size)], dtype=np.int32) - seq_sizes[2] = 4 - - linear_out = np.random.randn( - seq_size, batch_size, voca_size).astype(np.float32) - - # check edge cases - # TODO - - # check the gradient can be computed at all - linear_out_var = T.tensor3() - preds = T.nnet.softmax( - linear_out_var.reshape((-1, voca_size)) - ).reshape((seq_size, batch_size, voca_size)) - - g = theano.grad(ctc_loss(preds, seq_sizes, - labels, label_sizes).sum(), - wrt=linear_out_var).eval( - {linear_out_var: linear_out.astype(np.float32)}) - assert not np.any(np.isnan(g)) - - # check correctness against finite difference approximation - def f(linear_out_): - preds_ = T.nnet.softmax( - linear_out_.reshape((-1, voca_size)) - ).reshape((seq_size, batch_size, voca_size)) - loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) - # prevent finite differences from failing - loss = T.switch(isneginf(-loss), 0, loss) - return loss - - unittest_tools.verify_grad(f, [linear_out], abs_tol=0.05, rel_tol=0.05) diff --git a/papers/connectionist_temporal_classification/tests.ipynb b/papers/connectionist_temporal_classification/tests.ipynb deleted file mode 100644 index abd4061..0000000 --- a/papers/connectionist_temporal_classification/tests.ipynb +++ /dev/null @@ -1,186 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%matplotlib inline\n", - "\n", - "import sys\n", - "import os\n", - "from IPython.core.interactiveshell import InteractiveShell\n", - "\n", - "sys.path.insert(-1, os.getcwd())\n", - "InteractiveShell.ast_node_interactivity = \"all\"\n", - "os.environ['THEANO_FLAGS'] = \"device=cpu\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import tensorflow as tf\n", - "import theano\n", - "import theano.tensor as T\n", - "\n", - "from ctc import ctc_loss as my_ctc_loss" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch_size = 16\n", - "max_labsize = 20\n", - "voca_size = 20 # excluding blank\n", - "max_seqsize = 100\n", - "blank = -1\n", - "\n", - "labsize = np.random.randint(\n", 
- " 1, max_labsize + 1, size=(batch_size,), dtype=np.int32)\n", - "labsize[0] = max_labsize\n", - "labsize[1] = 1\n", - "labsize[2] = max_labsize\n", - "labsize[3] = max_labsize\n", - "\n", - "labels = np.random.randint(\n", - " 0, voca_size,\n", - " size=(batch_size, max_labsize), dtype=np.int32)\n", - "for b in range(batch_size):\n", - " labels[b, labsize[b]:] = blank\n", - "\n", - "seqsize = np.array([\n", - " np.random.randint(labsize[i] + 1, max_seqsize + 1)\n", - " for i in range(batch_size)], dtype=np.int32)\n", - "seqsize[0] = max_seqsize\n", - "\n", - "linout = np.random.randn(\n", - " max_seqsize, batch_size, voca_size + 1).astype(np.float32)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "th_linout_var = T.tensor3()\n", - "th_seqsize_var = T.ivector()\n", - "th_labels_var = T.imatrix()\n", - "th_labsize_var = T.ivector()\n", - "th_loss = my_ctc_loss(th_linout_var, th_seqsize_var, th_labels_var, th_labsize_var)\n", - "\n", - "def dense_to_sparse(x):\n", - " idx = tf.where(tf.greater_equal(x, 0))\n", - " return tf.SparseTensor(idx, tf.gather_nd(x, idx), tf.shape(x, out_type=tf.int64))\n", - "\n", - "tf_linout_var = tf.placeholder(tf.float32, shape=[max_seqsize, batch_size, voca_size + 1])\n", - "tf_seqsize_var = tf.placeholder(tf.int32, shape=[batch_size])\n", - "tf_labels_var = tf.placeholder(tf.int32, shape=[batch_size, max_labsize])\n", - "\n", - "tf_loss = tf.nn.ctc_loss(\n", - " dense_to_sparse(tf_labels_var), tf_linout_var,\n", - " sequence_length=tf_seqsize_var,\n", - " time_major=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with tf.Session() as sess:\n", - " tf_result = sess.run(\n", - " tf_loss, {\n", - " tf_linout_var: linout,\n", - " tf_seqsize_var: seqsize,\n", - " tf_labels_var: labels\n", - " })\n", - " \n", - " th_results = th_loss.eval({\n", - " th_linout_var: linout,\n", - " th_seqsize_var: seqsize,\n", - " th_labels_var: labels,\n", - " th_labsize_var: labsize\n", - " })\n", - " \n", - " print(np.abs(tf_result - th_results) / tf_result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tf_g = tf.gradients(xs=tf_linout_var, ys=tf.reduce_sum(tf_loss))[0]\n", - "\n", - "with tf.Session() as sess:\n", - " tf_grad = sess.run(\n", - " tf_g, {\n", - " tf_linout_var: linout,\n", - " tf_seqsize_var: seqsize,\n", - " tf_labels_var: labels\n", - " })\n", - " \n", - " th_grad = theano.grad(th_loss.sum(), wrt=th_linout_var).eval({\n", - " th_linout_var: linout,\n", - " th_seqsize_var: seqsize,\n", - " th_labels_var: labels,\n", - " th_labsize_var: labsize\n", - " })\n", - " \n", - " print(np.abs(tf_grad - th_grad) / (tf_grad + .000001))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "th_grad[:, 0, :]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tf_grad[:, 0, :]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}