From 9ca152de2f57002f9813b73537023da6026af7b6 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Sat, 26 Aug 2017 20:23:27 +0200 Subject: [PATCH 1/8] Implementation of CTC in pure theano with custom gradient (which should hopefully be more robust to precision issues) --- .../ctc.py | 262 ++++++++++++++++++ .../test_ctc2.py | 123 ++++++++ 2 files changed, 385 insertions(+) create mode 100644 papers/connectionist_temporal_classification/ctc.py create mode 100644 papers/connectionist_temporal_classification/test_ctc2.py diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py new file mode 100644 index 0000000..70e5526 --- /dev/null +++ b/papers/connectionist_temporal_classification/ctc.py @@ -0,0 +1,262 @@ +# Author: Nicolas Granger +# +# Implements the connectionist temporal classification loss from: +# Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. (2006, June). +# Connectionist temporal classification: labelling unsegmented sequence data +# with recurrent neural networks. In Proceedings of the 23rd international +# conference on Machine learning (pp. 369-376). ACM. +# ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf + +import numpy as np +import theano +import theano.tensor as T +from theano.tensor import discrete_dtypes, continuous_dtypes + + +# Bits of the CTC algorithm --------------------------------------------------- + +def insert_alternating_blanks(labels, blank_label): + batch_size, label_size = labels.shape + blanked_labels = T.zeros((batch_size, 2 * label_size + 1), dtype=np.int32) + blanked_labels = T.set_subtensor(blanked_labels[:, 0::2], blank_label) + blanked_labels = T.set_subtensor(blanked_labels[:, 1:-1:2], labels) + return blanked_labels + + +def ctc_forward(log_odds, seq_sizes, + blanked_labels, label_sizes, not_repeated): + batch_dur, batch_sz, _ = log_odds.shape + batch_dur, batch_sz = T.cast(batch_dur, 'int32'), T.cast(batch_sz, 'int32') + label_size = blanked_labels.shape[1] + + def step(t, a_tm1, log_odds_, + seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): + y_t = log_odds_[t] + k = T.max(a_tm1, axis=-1, keepdims=True) + k = T.switch(T.isinf(k), 0, k) + a_tm1 = T.switch(T.isinf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space + a_t = a_tm1 + a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) + a_t = T.inc_subtensor(a_t[:, 2:], a_tm1[:, :-2] * not_repeated_) + + # stop after a_T(|l'|) + mask = T.ge(t, seq_sizes_)[:, None] \ + + T.ge(T.arange(label_size)[None, :], + 2 * label_sizes_[:, None] + 1) + + a_t = T.switch( # back to log space + T.eq(a_t, 0) + mask, -np.inf, + T.log(a_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + return a_t + + alpha_init = -np.inf * T.ones((batch_sz, label_size)) + alpha_init = T.set_subtensor(alpha_init[:, 0], 0) + + alphas, _ = theano.scan( + fn=step, + sequences=[T.arange(batch_dur)], + outputs_info=[alpha_init], + non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, + not_repeated], + name="ctc_forward", + profile=True) + + return alphas + + +def ctc_backward(log_odds, seq_sizes, + blanked_labels, label_sizes, not_repeated): + batch_dur, batch_sz, _ = log_odds.shape + label_size = blanked_labels.shape[1] + + def step(t, b_tp1, log_odds_, + seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): + y_t = log_odds_[t] + k = T.max(b_tp1, axis=-1, keepdims=True) + k = T.switch(T.isinf(k), 0, k) + b_tp1 = T.switch(T.isinf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space + + # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion + 
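+        # Because sequences in a batch have different lengths, the recursion
+        # cannot simply start at the last step of the scan: a unit of
+        # probability mass is injected at position 2*|l| (the trailing blank)
+        # when t reaches the last valid frame of each sequence, and after the
+        # shift below it also reaches position 2*|l| - 1. This reproduces the
+        # initialisation beta_T(|l'|) = y_T(blank) and
+        # beta_T(|l'| - 1) = y_T(l_|l|) from Graves et al. (2006).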
starter_t = T.eq(t, seq_sizes_ - 1)[:, None] \ + * T.eq((2 * label_sizes_)[:, None], + T.arange(label_size)[None, :]) * 1 + b_tp1 += starter_t # initialize recursion + + b_t = b_tp1 + b_t = T.inc_subtensor(b_t[:, :-1], b_tp1[:, 1:]) + b_t = T.inc_subtensor(b_t[:, :-2], b_tp1[:, 2:] * not_repeated_) + b_t = T.switch( # back to log space + T.eq(b_t, 0), -np.inf, + T.log(b_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + return b_t + + beta_init = - np.inf * T.ones((batch_sz, label_size)) + + betas, _ = theano.scan( + fn=step, + sequences=[T.arange(batch_dur)], + outputs_info=[beta_init], + non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, + not_repeated], + go_backwards=True, + name="ctc_backward", + profile=True) + betas = betas[::-1, :, :] + + return betas + + +# Theano Op ------------------------------------------------------------------- + +def ctc_perform_graph(preds, seq_sizes, labels, label_sizes, blank): + _, batch_size, voca_size = preds.shape + + log_preds = T.log(preds) + blanked_labels = insert_alternating_blanks(labels, blank) + not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) + betas = ctc_backward(log_preds, seq_sizes, + blanked_labels, label_sizes, not_repeated) + + loss = -T.switch(T.all(T.isinf(betas[0, :, :2]), axis=1), + -np.inf, # impossible sequences, eg: too short + T.log(T.exp(betas[0, :, 0]) + T.exp(betas[0, :, 1]))) + + return log_preds, blanked_labels, not_repeated, betas, loss + + +def ctc_grad_graph(inputs, output_gradients): + linear_out, seq_durations, labels, label_sizes, _ = inputs + seq_size, batch_size, voca_size = linear_out.shape + label_size = labels.shape[1] + + # TODO: will theano optimize this redundant call when both loss and + # gradient are requested separately? 
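+    # The gradient computed below follows Graves et al. (2006): for each
+    # class k and time step t, alpha_t(s) * beta_t(s) is summed over the
+    # positions s of the blanked label sequence that carry class k, and then
+    #   d(-ln p(l|x)) / dy_t(k) =
+    #       - (1 / (p(l|x) * y_t(k)^2)) * sum_{s in lab(l,k)} alpha_t(s) beta_t(s)
+    # where y_t(k) are the softmax outputs. Everything is kept in log space
+    # and only exponentiated at the very end.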
+ log_preds, blanked_labels, not_repeated, betas, loss = \ + ctc_perform_graph(*inputs) + + alphas = ctc_forward(log_preds, seq_durations, + blanked_labels, label_sizes, not_repeated) + + log_pl = - loss + + # sum_{s \in lab(l, k)} a_t(s) b_t(s) + def fwbw_sum_step(k, s, labels_, ab_): + s_view = s[:, T.arange(batch_size), labels_[:, k]] + ab_view = ab_[:, :, k] + next_sum = ab_view + T.switch(T.isinf(s_view), + 0, T.log(1 + T.exp(s_view - ab_view))) + next_sum = T.switch(T.isinf(ab_view), s_view, next_sum) + s = T.set_subtensor(s_view, next_sum) + return s + + ab = T.switch(T.isinf(alphas) + T.isinf(betas), -np.inf, alphas + betas) + fwbw_sum = theano.scan( + fn=fwbw_sum_step, + sequences=[T.arange(2 * label_size + 1)], + outputs_info=[-np.inf * T.ones((seq_size, batch_size, voca_size))], + non_sequences=[blanked_labels, ab], + name="fwbw_sum", + profile=True)[0][-1] + + # d(loss) / dy + dloss_dy = T.switch( + T.isinf(loss)[None, :, None], + 0, + - T.exp(fwbw_sum - log_pl[None, :, None] - 2 * log_preds)) + + return [dloss_dy * output_gradients[0][None, :, None], + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type()] + + +def make_ctc_op(): + linear_out_var = T.tensor3() + seq_durations_var = T.ivector() + labels_var = T.imatrix() + label_sizes_var = T.ivector() + blank_var = T.iscalar() + + _, _, _, _, loss = ctc_perform_graph( + linear_out_var, seq_durations_var, labels_var, + label_sizes_var, blank_var) + + return theano.OpFromGraph( + inputs=[linear_out_var, seq_durations_var, + labels_var, label_sizes_var, blank_var], + outputs=[loss], + grad_overrides=ctc_grad_graph, + inline=True, name="ctcLossOp") + + +CTCLossOp = make_ctc_op() + + +# ----------------------------------------------------------------------------- + +def ctc_loss(preds, durations, labels, label_sizes, blank=-1): + """Compute the Connectionnist Temporal Classification loss [#graves2006]_. + + .. math:: L = - ln\left( \sum_{\pi \in \mathcal{B}^{-1}(l)} P(\pi | y) + \right) + + where :math:`y` is the sequence of predictions, :math:`l` the target + label sequence without blanks or repetition, :math:`\pi` is taken from the + ensemble of possible label assignments over the observations and + :math:`\mathcal{B}` is a function that remove blanks and repetitions for a + sequence of labels. + + Parameters + ---------- + preds : Theano shared variable, expression or numpy array + The probabilities of each class (for example the output of a softmax + function) with shape duration x batch_size x nclasses. + durations: Theano shared variable, expression or numpy array + An _integer_ vector of size batch_size contining the actual length of + each sequence in preds. + labels: Theano shared variable, expression or numpy array + An _integer_ matrix of size batch_size x label_size containg the target + labels. + label_sizes: Theano shared variable, expression or numpy array + An _integer_ vector of size batch_size contining the actual length of + each sequence in labels. + blank: + The blank label class, by default the last one. + + Returns + ------- + Theano tensor + A vector expression with the CTC loss of each sequence. + + Reference + --------- + .. [#graves2006] Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. + (2006, June). Connectionist temporal classification: labelling + unsegmented sequence data with recurrent neural networks. In + Proceedings of the 23rd international conference on Machine learning + (pp. 369-376). 
ACM. ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf + + """ + preds = T.as_tensor_variable(preds) + durations = T.as_tensor_variable(durations) + labels = T.as_tensor_variable(labels) + label_sizes = T.as_tensor_variable(label_sizes) + blank = T.cast(T.as_tensor_variable(blank), 'int32') + + if not(preds.dtype in continuous_dtypes and preds.ndim == 3): + raise ValueError("preds must continuous with dimension 3") + if not (durations.dtype in discrete_dtypes and durations.ndim == 1): + raise ValueError("durations must be a integer vector") + if not (labels.dtype in discrete_dtypes and labels.ndim == 2): + raise ValueError("labels must be an integer matrix") + if not (label_sizes.dtype in discrete_dtypes and label_sizes.ndim == 1): + raise ValueError("label_sizes must be an integer vector") + if not (blank.dtype in discrete_dtypes and blank.ndim == 0): + raise ValueError("blank must be an integer value") + + voca_size = T.cast(preds.shape[2], 'int32') + labels = labels % voca_size + blank = blank % voca_size + + return CTCLossOp(preds, durations, labels, label_sizes, blank) diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py new file mode 100644 index 0000000..ff10c0b --- /dev/null +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -0,0 +1,123 @@ +import unittest +import numpy as np +import theano +import theano.tensor as T +from theano.tests import unittest_tools + +from ctc import ctc_loss + + +class TestCTC(unittest.TestCase): + def setUp(self): + unittest_tools.seed_rng() + + def test_simple_precomputed(self): + # Test obtained from Torch tutorial at: + # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md + + linear_out = np.asarray([ + [[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]], + [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]], + [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]] + ], dtype=np.float32) + + seq_sizes = np.asarray([1, 3, 3], dtype=np.int32) + + labels = np.asarray([[1, 0], [3, 3], [2, 3]], dtype=np.int32) + + label_sizes = np.asarray([1, 2, 2], dtype=np.int32) + + expected_losses = np.asarray([1.609437943, 7.355742931, 4.938849926], + dtype=np.float32) + + blank = 0 + + expected_grad = np.asarray([ + [[0.2, -0.8, 0.2, 0.2, 0.2], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, + 0.636408627], + [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, + 0.636408627]], + [[0, 0, 0, 0, 0], + [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, + 0.636408627], + [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, + 0.636408627]], + [[0, 0, 0, 0, 0], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, + 0.636408627], + [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, + 0.636408627]] + ], dtype=np.float32) + + seq_size, batch_size, voca_size = linear_out.shape + + linear_out_t = T.as_tensor_variable(linear_out) + seq_sizes_t = T.as_tensor_variable(seq_sizes) + labels_t = T.as_tensor_variable(labels) + label_sizes_t = T.as_tensor_variable(label_sizes) + blank_t = T.as_tensor_variable(blank) + + preds = T.nnet.softmax( + linear_out_t.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + losses = ctc_loss(preds, seq_sizes_t, labels_t, label_sizes_t, blank_t) + + assert np.allclose(losses.eval(), expected_losses) + + grad = theano.grad(losses.sum(), wrt=linear_out_t) + + assert np.allclose(grad.eval(), expected_grad) + + def test_random(self): 
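+        # Random batch exercising edge cases: a full-length label sequence,
+        # an empty one, a label made of a single repeated class, and a
+        # sequence too short for its labels. The test only checks that the
+        # gradient evaluates without NaNs and that it agrees with a finite
+        # difference approximation (verify_grad).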
+ batch_size = 16 + label_size = 5 + voca_size = 4 + seq_size = 20 + + label_sizes = np.random.randint( + 0, label_size, size=(batch_size,), dtype=np.int32) + label_sizes[0] = label_size + label_sizes[1] = 0 + label_sizes[2] = 5 + label_sizes[3] = 5 + + labels = np.random.randint( + 0, voca_size - 1, + size=(batch_size, label_size), dtype=np.int32) + labels[3] = 0 + + seq_sizes = np.array([ + np.random.randint(max(1, label_sizes[i]), seq_size) + for i in range(batch_size)], dtype=np.int32) + seq_sizes[2] = 4 + + linear_out = np.random.randn( + seq_size, batch_size, voca_size).astype(np.float32) + + # check edge cases + # TODO + + # check the gradient can be computed at all + linear_out_var = T.tensor3() + preds = T.nnet.softmax( + linear_out_var.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + + g = theano.grad(ctc_loss(preds, seq_sizes, + labels, label_sizes).sum(), + wrt=linear_out_var).eval({linear_out_var: linear_out}) + assert not np.any(np.isnan(g)) + + # check correctness against finite difference approximation + def f(linear_out_): + preds_ = T.nnet.softmax( + linear_out_.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) + # prevent finite differences from failing + loss = T.switch(T.isinf(loss), 0, loss) + return loss + + unittest_tools.verify_grad( + f, [linear_out], rel_tol=0.1) From 96b8d68f5c355e8c8b3ecd0f0824c71539a7291b Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Fri, 15 Dec 2017 16:23:22 +0100 Subject: [PATCH 2/8] fix error with empty sequences+added low-level test for forward backward passes --- .../ctc.py | 13 ++- .../test_ctc2.py | 86 ++++++++++++++++++- 2 files changed, 90 insertions(+), 9 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index 70e5526..22ff9fc 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -33,7 +33,7 @@ def step(t, a_tm1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] k = T.max(a_tm1, axis=-1, keepdims=True) - k = T.switch(T.isinf(k), 0, k) + k = T.switch(T.all(T.isinf(a_tm1), axis=-1, keepdims=True), 0, k) a_tm1 = T.switch(T.isinf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space a_t = a_tm1 a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) @@ -58,8 +58,7 @@ def step(t, a_tm1, log_odds_, outputs_info=[alpha_init], non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated], - name="ctc_forward", - profile=True) + name="ctc_forward") return alphas @@ -73,7 +72,7 @@ def step(t, b_tp1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] k = T.max(b_tp1, axis=-1, keepdims=True) - k = T.switch(T.isinf(k), 0, k) + k = T.switch(T.all(T.isinf(b_tp1), axis=-1, keepdims=True), 0, k) b_tp1 = T.switch(T.isinf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion @@ -99,8 +98,7 @@ def step(t, b_tp1, log_odds_, non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated], go_backwards=True, - name="ctc_backward", - profile=True) + name="ctc_backward") betas = betas[::-1, :, :] return betas @@ -155,8 +153,7 @@ def fwbw_sum_step(k, s, labels_, ab_): sequences=[T.arange(2 * label_size + 1)], outputs_info=[-np.inf * T.ones((seq_size, batch_size, voca_size))], non_sequences=[blanked_labels, ab], - name="fwbw_sum", - profile=True)[0][-1] + 
name="fwbw_sum")[0][-1] # d(loss) / dy dloss_dy = T.switch( diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index ff10c0b..b10e61b 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -4,13 +4,97 @@ import theano.tensor as T from theano.tests import unittest_tools -from ctc import ctc_loss +from papers.connectionist_temporal_classification.ctc import \ + ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks + + +def log_softmax(X): + k = T.max(X, axis=-1, keepdims=True) + norm_X = X - k + log_sum_exp_X = T.log(T.sum(T.exp(norm_X), axis=-1, keepdims=True)) + return norm_X - log_sum_exp_X class TestCTC(unittest.TestCase): def setUp(self): unittest_tools.seed_rng() + def test_forward_backward(self): + batch_size = 6 + label_size = 7 + voca_size = 5 + seq_size = 10 + + label_lengths = np.random.randint(0, label_size, + size=(batch_size,), dtype=np.int32) + label_lengths[0] = label_size # extremum case + label_lengths[1] = 0 # extremum case + labels = np.array( + [np.random.randint(0, voca_size - 1, size=label_size, dtype=np.int32) + for _ in range(batch_size)]) + for i in range(batch_size): + labels[i, label_lengths[i]:] = -1 + + seq_durations = np.array([ + np.random.randint(max(1, label_lengths[i]), seq_size) + for i in range(batch_size)], dtype=np.int32) + + linear_out = np.random.randn(seq_size, batch_size, voca_size) \ + .astype(np.float32) + + blank_class = -1 + blank_class = np.mod(blank_class, voca_size) + + labels = np.mod(labels, voca_size) + + log_odds = log_softmax(linear_out) + blanked_labels = insert_alternating_blanks(T.mod(labels, voca_size), + blank_class) + not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) + + alphas = ctc_forward(log_odds, seq_durations, + blanked_labels, label_lengths, not_repeated) + betas = ctc_backward(log_odds, seq_durations, + blanked_labels, label_lengths, not_repeated) + + preds = log_softmax(linear_out) + + y_blanks = preds[:, T.arange(batch_size)[:, None], blanked_labels] + p_l = T.sum(T.exp(alphas + betas - y_blanks), axis=2) + + alphas = alphas.eval() + betas = betas.eval() + preds = preds.eval() + + for i in range(batch_size): + assert np.allclose(alphas[0, i, 0], preds[0, i, -1]) + if label_lengths[i] > 0: + assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]]) + else: + assert np.isneginf(alphas[0, i, 1]) + assert np.all(np.isneginf(alphas[0, i, 2:])) + + for i in range(batch_size): + t = seq_durations[i] - 1 + l = label_lengths[i] + assert np.allclose(betas[t, i, 2 * l], preds[t, i, -1]) + if l > 0: + assert np.allclose(betas[t, i, 2 * l - 1], + preds[t, i, labels[i, l - 1]]) + assert np.all(np.isneginf(betas[t, i, :max(l - 2, 0)])) + else: + assert np.all(np.isneginf(betas[t, i, 1:])) + + p_l = p_l.eval() + + for i in range(batch_size): + assert (np.allclose(p_l[:seq_durations[i], i], p_l[0, i])) + a, b = max(0, 2 * label_lengths[i] - 1), 2 * label_lengths[i] + 1 + p_li = np.exp(alphas[seq_durations[i] - 1, i, a:b]).sum() + assert np.allclose(p_li, p_l[0, i]) + p_li = np.exp(betas[0, i, :2]).sum() + assert np.allclose(p_li, p_l[0, i]) + def test_simple_precomputed(self): # Test obtained from Torch tutorial at: # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md From afcf4b7ebacc5feb3a93366167750e1f56638d9a Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Sat, 16 Dec 2017 00:55:49 +0100 Subject: [PATCH 3/8] 
more fixes for precision issues --- .../ctc.py | 54 +++++++++++-------- .../test_ctc2.py | 21 ++++---- 2 files changed, 43 insertions(+), 32 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index 22ff9fc..a358726 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -11,6 +11,18 @@ import theano import theano.tensor as T from theano.tensor import discrete_dtypes, continuous_dtypes +from theano.printing import Print + + +def isneginf(x, neginf=-1e27): + return x < neginf + + +def logaddexp(x, y, inf=1e9): + x, y = T.minimum(x, y), T.maximum(x, y) + diff = T.minimum(y - x, T.log(inf) / T.log(10)) + res = x + T.log(1 + T.exp(diff)) + return T.switch((y - x > T.log(inf) / T.log(10)), y, res) # Bits of the CTC algorithm --------------------------------------------------- @@ -33,8 +45,8 @@ def step(t, a_tm1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] k = T.max(a_tm1, axis=-1, keepdims=True) - k = T.switch(T.all(T.isinf(a_tm1), axis=-1, keepdims=True), 0, k) - a_tm1 = T.switch(T.isinf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space + k = T.switch(T.all(isneginf(a_tm1), axis=-1, keepdims=True), 0, k) + a_tm1 = T.switch(isneginf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space a_t = a_tm1 a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) a_t = T.inc_subtensor(a_t[:, 2:], a_tm1[:, :-2] * not_repeated_) @@ -45,11 +57,11 @@ def step(t, a_tm1, log_odds_, 2 * label_sizes_[:, None] + 1) a_t = T.switch( # back to log space - T.eq(a_t, 0) + mask, -np.inf, + T.eq(a_t, 0) + mask, -1e30, T.log(a_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) return a_t - alpha_init = -np.inf * T.ones((batch_sz, label_size)) + alpha_init = -1e30 * T.ones((batch_sz, label_size)) alpha_init = T.set_subtensor(alpha_init[:, 0], 0) alphas, _ = theano.scan( @@ -72,8 +84,9 @@ def step(t, b_tp1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] k = T.max(b_tp1, axis=-1, keepdims=True) - k = T.switch(T.all(T.isinf(b_tp1), axis=-1, keepdims=True), 0, k) - b_tp1 = T.switch(T.isinf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space + k = T.switch(T.all(isneginf(b_tp1), axis=-1, keepdims=True), 0, k) + b_tp1 = T.switch(isneginf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space + b_tp1 = b_tp1 # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion starter_t = T.eq(t, seq_sizes_ - 1)[:, None] \ @@ -85,11 +98,11 @@ def step(t, b_tp1, log_odds_, b_t = T.inc_subtensor(b_t[:, :-1], b_tp1[:, 1:]) b_t = T.inc_subtensor(b_t[:, :-2], b_tp1[:, 2:] * not_repeated_) b_t = T.switch( # back to log space - T.eq(b_t, 0), -np.inf, + T.eq(b_t, 0), -1e30, T.log(b_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) return b_t - beta_init = - np.inf * T.ones((batch_sz, label_size)) + beta_init = -1e30 * T.ones((batch_sz, label_size)) betas, _ = theano.scan( fn=step, @@ -115,16 +128,14 @@ def ctc_perform_graph(preds, seq_sizes, labels, label_sizes, blank): betas = ctc_backward(log_preds, seq_sizes, blanked_labels, label_sizes, not_repeated) - loss = -T.switch(T.all(T.isinf(betas[0, :, :2]), axis=1), - -np.inf, # impossible sequences, eg: too short - T.log(T.exp(betas[0, :, 0]) + T.exp(betas[0, :, 1]))) + loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) return log_preds, blanked_labels, not_repeated, betas, loss def ctc_grad_graph(inputs, output_gradients): - linear_out, seq_durations, labels, label_sizes, _ = 
inputs - seq_size, batch_size, voca_size = linear_out.shape + preds, seq_durations, labels, label_sizes, _ = inputs + seq_size, batch_size, voca_size = preds.shape label_size = labels.shape[1] # TODO: will theano optimize this redundant call when both loss and @@ -141,23 +152,22 @@ def ctc_grad_graph(inputs, output_gradients): def fwbw_sum_step(k, s, labels_, ab_): s_view = s[:, T.arange(batch_size), labels_[:, k]] ab_view = ab_[:, :, k] - next_sum = ab_view + T.switch(T.isinf(s_view), - 0, T.log(1 + T.exp(s_view - ab_view))) - next_sum = T.switch(T.isinf(ab_view), s_view, next_sum) + next_sum = logaddexp(s_view, ab_view) s = T.set_subtensor(s_view, next_sum) return s - ab = T.switch(T.isinf(alphas) + T.isinf(betas), -np.inf, alphas + betas) + ab = alphas + betas fwbw_sum = theano.scan( fn=fwbw_sum_step, sequences=[T.arange(2 * label_size + 1)], - outputs_info=[-np.inf * T.ones((seq_size, batch_size, voca_size))], + outputs_info=[-1e30 * T.ones((seq_size, batch_size, voca_size))], non_sequences=[blanked_labels, ab], + strict=True, name="fwbw_sum")[0][-1] # d(loss) / dy dloss_dy = T.switch( - T.isinf(loss)[None, :, None], + isneginf(loss)[None, :, None], 0, - T.exp(fwbw_sum - log_pl[None, :, None] - 2 * log_preds)) @@ -169,18 +179,18 @@ def fwbw_sum_step(k, s, labels_, ab_): def make_ctc_op(): - linear_out_var = T.tensor3() + preds_var = T.tensor3() seq_durations_var = T.ivector() labels_var = T.imatrix() label_sizes_var = T.ivector() blank_var = T.iscalar() _, _, _, _, loss = ctc_perform_graph( - linear_out_var, seq_durations_var, labels_var, + preds_var, seq_durations_var, labels_var, label_sizes_var, blank_var) return theano.OpFromGraph( - inputs=[linear_out_var, seq_durations_var, + inputs=[preds_var, seq_durations_var, labels_var, label_sizes_var, blank_var], outputs=[loss], grad_overrides=ctc_grad_graph, diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index b10e61b..902c69f 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -5,7 +5,7 @@ from theano.tests import unittest_tools from papers.connectionist_temporal_classification.ctc import \ - ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks + ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks, isneginf def log_softmax(X): @@ -71,8 +71,8 @@ def test_forward_backward(self): if label_lengths[i] > 0: assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]]) else: - assert np.isneginf(alphas[0, i, 1]) - assert np.all(np.isneginf(alphas[0, i, 2:])) + assert isneginf(alphas[0, i, 1]) + assert np.all(isneginf(alphas[0, i, 2:])) for i in range(batch_size): t = seq_durations[i] - 1 @@ -81,9 +81,9 @@ def test_forward_backward(self): if l > 0: assert np.allclose(betas[t, i, 2 * l - 1], preds[t, i, labels[i, l - 1]]) - assert np.all(np.isneginf(betas[t, i, :max(l - 2, 0)])) + assert np.all(isneginf(betas[t, i, :max(l - 2, 0)])) else: - assert np.all(np.isneginf(betas[t, i, 1:])) + assert np.all(isneginf(betas[t, i, 1:])) p_l = p_l.eval() @@ -147,11 +147,11 @@ def test_simple_precomputed(self): ).reshape((seq_size, batch_size, voca_size)) losses = ctc_loss(preds, seq_sizes_t, labels_t, label_sizes_t, blank_t) - assert np.allclose(losses.eval(), expected_losses) + assert np.allclose(losses.eval(), expected_losses, atol=1) grad = theano.grad(losses.sum(), wrt=linear_out_t) - assert np.allclose(grad.eval(), expected_grad) + assert np.allclose(grad.eval(), 
expected_grad, rtol=.001, atol=1) def test_random(self): batch_size = 16 @@ -190,7 +190,8 @@ def test_random(self): g = theano.grad(ctc_loss(preds, seq_sizes, labels, label_sizes).sum(), - wrt=linear_out_var).eval({linear_out_var: linear_out}) + wrt=linear_out_var).eval( + {linear_out_var: linear_out.astype(np.float32)}) assert not np.any(np.isnan(g)) # check correctness against finite difference approximation @@ -200,8 +201,8 @@ def f(linear_out_): ).reshape((seq_size, batch_size, voca_size)) loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) # prevent finite differences from failing - loss = T.switch(T.isinf(loss), 0, loss) + loss = T.switch(isneginf(-loss), 0, loss) return loss unittest_tools.verify_grad( - f, [linear_out], rel_tol=0.1) + f, [linear_out], rel_tol=0.1, abs_tol=1) From c7ce022a99f9f41e34e64fa2e8b944fe02ad4545 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Mon, 18 Dec 2017 11:39:58 +0100 Subject: [PATCH 4/8] Improved ctc gradient stability The CTC loss function now takes predictions in log space (before softmax) to avoid precision issues. --- .../ctc.py | 71 ++++++++++++------- .../test_ctc2.py | 41 ++++------- 2 files changed, 59 insertions(+), 53 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index a358726..ac62703 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -11,18 +11,36 @@ import theano import theano.tensor as T from theano.tensor import discrete_dtypes, continuous_dtypes -from theano.printing import Print +# from theano.printing import Print def isneginf(x, neginf=-1e27): return x < neginf -def logaddexp(x, y, inf=1e9): +def logaddexp(x, y, magnitude=9): x, y = T.minimum(x, y), T.maximum(x, y) - diff = T.minimum(y - x, T.log(inf) / T.log(10)) + diff = T.minimum(y - x, magnitude) res = x + T.log(1 + T.exp(diff)) - return T.switch((y - x > T.log(inf) / T.log(10)), y, res) + return T.switch((y - x > magnitude), y, res) + + +def logsumexp(x, axis, keepdims=False): + k = T.max(x, axis=axis, keepdims=True) + return T.log(T.sum(T.exp(x - k), axis=axis, keepdims=keepdims)) + + +def log_softmax(X, axis=-1, clip=None): + k = T.max(X, axis=axis, keepdims=True) + norm_X = X - k + + if clip is not None: + mini = T.log((T.cast(X.shape[axis], 'floatX') - 1) * clip / (1 - clip)) + # norm_X *= - T.min(norm_X, axis=axis, keepdims=True) / mini + norm_X = T.maximum(norm_X, mini) + + log_sum_exp_X = logsumexp(norm_X, axis=axis, keepdims=True) + return norm_X - log_sum_exp_X # Bits of the CTC algorithm --------------------------------------------------- @@ -119,31 +137,31 @@ def step(t, b_tp1, log_odds_, # Theano Op ------------------------------------------------------------------- -def ctc_perform_graph(preds, seq_sizes, labels, label_sizes, blank): - _, batch_size, voca_size = preds.shape +def ctc_perform_graph(linout, seq_sizes, labels, label_sizes, blank): + _, batch_size, voca_size = linout.shape - log_preds = T.log(preds) + log_odds = log_softmax(linout) blanked_labels = insert_alternating_blanks(labels, blank) not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) - betas = ctc_backward(log_preds, seq_sizes, + betas = ctc_backward(log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated) loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) - return log_preds, blanked_labels, not_repeated, betas, loss + return log_odds, blanked_labels, not_repeated, betas, loss def ctc_grad_graph(inputs, 
output_gradients): - preds, seq_durations, labels, label_sizes, _ = inputs - seq_size, batch_size, voca_size = preds.shape + linout, seq_durations, labels, label_sizes, _ = inputs + seq_size, batch_size, voca_size = linout.shape label_size = labels.shape[1] # TODO: will theano optimize this redundant call when both loss and # gradient are requested separately? - log_preds, blanked_labels, not_repeated, betas, loss = \ + log_odds, blanked_labels, not_repeated, betas, loss = \ ctc_perform_graph(*inputs) - alphas = ctc_forward(log_preds, seq_durations, + alphas = ctc_forward(log_odds, seq_durations, blanked_labels, label_sizes, not_repeated) log_pl = - loss @@ -165,11 +183,14 @@ def fwbw_sum_step(k, s, labels_, ab_): strict=True, name="fwbw_sum")[0][-1] - # d(loss) / dy + A = fwbw_sum - log_pl[None, :, None] - 2 * log_odds + + dloss_dy = T.exp(2 * log_odds + logsumexp(A, axis=2, keepdims=True)) \ + - T.exp(log_odds + A) + dloss_dy = T.switch( - isneginf(loss)[None, :, None], - 0, - - T.exp(fwbw_sum - log_pl[None, :, None] - 2 * log_preds)) + (loss[None, :, None] > 1e10) + T.isinf(loss[None, :, None]), + 0, dloss_dy) return [dloss_dy * output_gradients[0][None, :, None], theano.gradient.disconnected_type(), @@ -202,7 +223,7 @@ def make_ctc_op(): # ----------------------------------------------------------------------------- -def ctc_loss(preds, durations, labels, label_sizes, blank=-1): +def ctc_loss(linout, durations, labels, label_sizes, blank=-1): """Compute the Connectionnist Temporal Classification loss [#graves2006]_. .. math:: L = - ln\left( \sum_{\pi \in \mathcal{B}^{-1}(l)} P(\pi | y) @@ -216,9 +237,9 @@ def ctc_loss(preds, durations, labels, label_sizes, blank=-1): Parameters ---------- - preds : Theano shared variable, expression or numpy array - The probabilities of each class (for example the output of a softmax - function) with shape duration x batch_size x nclasses. + linout : Theano shared variable, expression or numpy array + The input values for the softmax function with shape + duration x batch_size x nclasses. durations: Theano shared variable, expression or numpy array An _integer_ vector of size batch_size contining the actual length of each sequence in preds. @@ -245,13 +266,13 @@ def ctc_loss(preds, durations, labels, label_sizes, blank=-1): (pp. 369-376). ACM. 
ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf """ - preds = T.as_tensor_variable(preds) + linout = T.as_tensor_variable(linout) durations = T.as_tensor_variable(durations) labels = T.as_tensor_variable(labels) label_sizes = T.as_tensor_variable(label_sizes) blank = T.cast(T.as_tensor_variable(blank), 'int32') - if not(preds.dtype in continuous_dtypes and preds.ndim == 3): + if not(linout.dtype in continuous_dtypes and linout.ndim == 3): raise ValueError("preds must continuous with dimension 3") if not (durations.dtype in discrete_dtypes and durations.ndim == 1): raise ValueError("durations must be a integer vector") @@ -262,8 +283,8 @@ def ctc_loss(preds, durations, labels, label_sizes, blank=-1): if not (blank.dtype in discrete_dtypes and blank.ndim == 0): raise ValueError("blank must be an integer value") - voca_size = T.cast(preds.shape[2], 'int32') + voca_size = T.cast(linout.shape[2], 'int32') labels = labels % voca_size blank = blank % voca_size - return CTCLossOp(preds, durations, labels, label_sizes, blank) + return CTCLossOp(linout, durations, labels, label_sizes, blank) diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index 902c69f..7c66c90 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -117,39 +117,24 @@ def test_simple_precomputed(self): blank = 0 expected_grad = np.asarray([ - [[0.2, -0.8, 0.2, 0.2, 0.2], - [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, - 0.636408627], - [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, - 0.636408627]], - [[0, 0, 0, 0, 0], - [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, - 0.636408627], - [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, - 0.636408627]], - [[0, 0, 0, 0, 0], - [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, - 0.636408627], - [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, - 0.636408627]] + [[0.2, -0.8, 0.2, 0.2, 0.2], + [ 0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]], + [[0, 0, 0, 0, 0], + [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627], + [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]], + [[0, 0, 0, 0, 0], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]] ], dtype=np.float32) - seq_size, batch_size, voca_size = linear_out.shape - - linear_out_t = T.as_tensor_variable(linear_out) - seq_sizes_t = T.as_tensor_variable(seq_sizes) - labels_t = T.as_tensor_variable(labels) - label_sizes_t = T.as_tensor_variable(label_sizes) - blank_t = T.as_tensor_variable(blank) - - preds = T.nnet.softmax( - linear_out_t.reshape((-1, voca_size)) - ).reshape((seq_size, batch_size, voca_size)) - losses = ctc_loss(preds, seq_sizes_t, labels_t, label_sizes_t, blank_t) + linear_out_var = T.as_tensor_variable(linear_out) + losses = ctc_loss( + linear_out_var, seq_sizes, labels, label_sizes, blank) assert np.allclose(losses.eval(), expected_losses, atol=1) - grad = theano.grad(losses.sum(), wrt=linear_out_t) + grad = theano.grad(losses.sum(), wrt=linear_out_var) assert np.allclose(grad.eval(), expected_grad, rtol=.001, atol=1) From 0f739a8ca91957749c530a4315d50b41d9923655 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Tue, 9 Jan 2018 11:57:36 +0100 
Subject: [PATCH 5/8] test for more stability in computations --- .../ctc.py | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index ac62703..d7079a6 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -11,14 +11,14 @@ import theano import theano.tensor as T from theano.tensor import discrete_dtypes, continuous_dtypes -# from theano.printing import Print +from theano.printing import Print -def isneginf(x, neginf=-1e27): +def isneginf(x, neginf=-1e9): return x < neginf -def logaddexp(x, y, magnitude=9): +def logaddexp(x, y, magnitude=20): x, y = T.minimum(x, y), T.maximum(x, y) diff = T.minimum(y - x, magnitude) res = x + T.log(1 + T.exp(diff)) @@ -64,7 +64,7 @@ def step(t, a_tm1, log_odds_, y_t = log_odds_[t] k = T.max(a_tm1, axis=-1, keepdims=True) k = T.switch(T.all(isneginf(a_tm1), axis=-1, keepdims=True), 0, k) - a_tm1 = T.switch(isneginf(a_tm1), 0, T.exp(a_tm1 - k)) # exit log space + a_tm1 = T.switch(a_tm1 - k < - 88, 0, T.exp(a_tm1 - k)) a_t = a_tm1 a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) a_t = T.inc_subtensor(a_t[:, 2:], a_tm1[:, :-2] * not_repeated_) @@ -75,11 +75,11 @@ def step(t, a_tm1, log_odds_, 2 * label_sizes_[:, None] + 1) a_t = T.switch( # back to log space - T.eq(a_t, 0) + mask, -1e30, + T.eq(a_t, 0) + mask, -2e9, T.log(a_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) return a_t - alpha_init = -1e30 * T.ones((batch_sz, label_size)) + alpha_init = -2e9 * T.ones((batch_sz, label_size)) alpha_init = T.set_subtensor(alpha_init[:, 0], 0) alphas, _ = theano.scan( @@ -101,26 +101,32 @@ def ctc_backward(log_odds, seq_sizes, def step(t, b_tp1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] - k = T.max(b_tp1, axis=-1, keepdims=True) - k = T.switch(T.all(isneginf(b_tp1), axis=-1, keepdims=True), 0, k) - b_tp1 = T.switch(isneginf(b_tp1), 0, T.exp(b_tp1 - k)) # exit log space - b_tp1 = b_tp1 # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion starter_t = T.eq(t, seq_sizes_ - 1)[:, None] \ * T.eq((2 * label_sizes_)[:, None], - T.arange(label_size)[None, :]) * 1 - b_tp1 += starter_t # initialize recursion + T.arange(label_size)[None, :]) + b_tp1_2lp1 = b_tp1[T.arange(batch_sz), 2 * label_sizes_] + b_tp1 = T.set_subtensor( + b_tp1_2lp1, + T.switch(T.eq(t, seq_sizes_ - 1), 0, b_tp1_2lp1)) + b_tp1 = T.switch(starter_t, 0, b_tp1) # initialize recursion b_t = b_tp1 - b_t = T.inc_subtensor(b_t[:, :-1], b_tp1[:, 1:]) - b_t = T.inc_subtensor(b_t[:, :-2], b_tp1[:, 2:] * not_repeated_) - b_t = T.switch( # back to log space - T.eq(b_t, 0), -1e30, - T.log(b_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + b_t = T.set_subtensor( + b_t[:, :-1], + logaddexp(b_t[:, :-1], b_tp1[:, 1:])) + b_t = T.set_subtensor( + b_t[:, :-2], + logaddexp(b_t[:, :-2], T.switch(not_repeated_, b_tp1[:, 2:], -2e9))) + b_t += y_t[T.arange(batch_sz)[:, None], blanked_labels_] + # idx = Print("idx")(T.maximum(0, 2 * label_sizes_ + 1 + 2 * t - 2 * batch_dur)) + # m = Print("m")(T.max(b_t).sum()) + # b_t = b_t + (m - m) + b_t = T.switch(isneginf(b_t), -2e9, b_t) return b_t - beta_init = -1e30 * T.ones((batch_sz, label_size)) + beta_init = -2e9 * T.ones((batch_sz, label_size)) betas, _ = theano.scan( fn=step, @@ -178,7 +184,7 @@ def fwbw_sum_step(k, s, labels_, ab_): fwbw_sum = theano.scan( fn=fwbw_sum_step, sequences=[T.arange(2 * 
label_size + 1)], - outputs_info=[-1e30 * T.ones((seq_size, batch_size, voca_size))], + outputs_info=[-2e9 * T.ones((seq_size, batch_size, voca_size))], non_sequences=[blanked_labels, ab], strict=True, name="fwbw_sum")[0][-1] @@ -189,7 +195,7 @@ def fwbw_sum_step(k, s, labels_, ab_): - T.exp(log_odds + A) dloss_dy = T.switch( - (loss[None, :, None] > 1e10) + T.isinf(loss[None, :, None]), + (loss[None, :, None] > 1e9) + T.isinf(loss[None, :, None]), 0, dloss_dy) return [dloss_dy * output_gradients[0][None, :, None], From 51a35851b555bf67d002fa219d98143826d87557 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Fri, 12 Jan 2018 10:50:52 +0100 Subject: [PATCH 6/8] fixes errors and precision issues, adds demos and tests. --- .../ctc.py | 73 +-- .../experiments-tf.ipynb | 369 +++++++++++++ .../experiments.ipynb | 517 ++++++++++++++++++ .../test_ctc2.py | 2 +- .../tests.ipynb | 199 +++++++ 5 files changed, 1125 insertions(+), 35 deletions(-) create mode 100644 papers/connectionist_temporal_classification/experiments-tf.ipynb create mode 100644 papers/connectionist_temporal_classification/experiments.ipynb create mode 100644 papers/connectionist_temporal_classification/tests.ipynb diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index d7079a6..1a5c48d 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -35,9 +35,7 @@ def log_softmax(X, axis=-1, clip=None): norm_X = X - k if clip is not None: - mini = T.log((T.cast(X.shape[axis], 'floatX') - 1) * clip / (1 - clip)) - # norm_X *= - T.min(norm_X, axis=axis, keepdims=True) / mini - norm_X = T.maximum(norm_X, mini) + norm_X = T.maximum(norm_X, clip) log_sum_exp_X = logsumexp(norm_X, axis=axis, keepdims=True) return norm_X - log_sum_exp_X @@ -55,28 +53,28 @@ def insert_alternating_blanks(labels, blank_label): def ctc_forward(log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated): - batch_dur, batch_sz, _ = log_odds.shape - batch_dur, batch_sz = T.cast(batch_dur, 'int32'), T.cast(batch_sz, 'int32') + seqsize, batch_sz, _ = log_odds.shape label_size = blanked_labels.shape[1] def step(t, a_tm1, log_odds_, seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] - k = T.max(a_tm1, axis=-1, keepdims=True) - k = T.switch(T.all(isneginf(a_tm1), axis=-1, keepdims=True), 0, k) - a_tm1 = T.switch(a_tm1 - k < - 88, 0, T.exp(a_tm1 - k)) a_t = a_tm1 - a_t = T.inc_subtensor(a_t[:, 1:], a_tm1[:, :-1]) - a_t = T.inc_subtensor(a_t[:, 2:], a_tm1[:, :-2] * not_repeated_) + a_t = T.set_subtensor( + a_t[:, 1:], + logaddexp(a_t[:, 1:], a_tm1[:, :-1])) + a_t = T.set_subtensor( + a_t[:, 2:], + logaddexp(a_t[:, 2:], T.switch(not_repeated_, a_tm1[:, :-2], -2e9))) # stop after a_T(|l'|) mask = T.ge(t, seq_sizes_)[:, None] \ + T.ge(T.arange(label_size)[None, :], 2 * label_sizes_[:, None] + 1) - a_t = T.switch( # back to log space - T.eq(a_t, 0) + mask, -2e9, - T.log(a_t) + k + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) + a_t = T.switch( + isneginf(a_t) + mask, -2e9, + a_t + y_t[T.arange(batch_sz)[:, None], blanked_labels_]) return a_t alpha_init = -2e9 * T.ones((batch_sz, label_size)) @@ -84,8 +82,8 @@ def step(t, a_tm1, log_odds_, alphas, _ = theano.scan( fn=step, - sequences=[T.arange(batch_dur)], - outputs_info=[alpha_init], + sequences=[T.arange(seqsize)], + outputs_info=alpha_init, non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated], name="ctc_forward") @@ -95,7 
+93,7 @@ def step(t, a_tm1, log_odds_, def ctc_backward(log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated): - batch_dur, batch_sz, _ = log_odds.shape + seqsize, batch_sz, _ = log_odds.shape label_size = blanked_labels.shape[1] def step(t, b_tp1, log_odds_, @@ -120,9 +118,6 @@ def step(t, b_tp1, log_odds_, b_t[:, :-2], logaddexp(b_t[:, :-2], T.switch(not_repeated_, b_tp1[:, 2:], -2e9))) b_t += y_t[T.arange(batch_sz)[:, None], blanked_labels_] - # idx = Print("idx")(T.maximum(0, 2 * label_sizes_ + 1 + 2 * t - 2 * batch_dur)) - # m = Print("m")(T.max(b_t).sum()) - # b_t = b_t + (m - m) b_t = T.switch(isneginf(b_t), -2e9, b_t) return b_t @@ -130,8 +125,8 @@ def step(t, b_tp1, log_odds_, betas, _ = theano.scan( fn=step, - sequences=[T.arange(batch_dur)], - outputs_info=[beta_init], + sequences=[T.arange(seqsize)], + outputs_info=beta_init, non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, not_repeated], go_backwards=True, @@ -146,15 +141,20 @@ def step(t, b_tp1, log_odds_, def ctc_perform_graph(linout, seq_sizes, labels, label_sizes, blank): _, batch_size, voca_size = linout.shape - log_odds = log_softmax(linout) + logits = log_softmax(linout) blanked_labels = insert_alternating_blanks(labels, blank) not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) - betas = ctc_backward(log_odds, seq_sizes, + betas = ctc_backward(logits, seq_sizes, blanked_labels, label_sizes, not_repeated) - loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) - return log_odds, blanked_labels, not_repeated, betas, loss + # alphas = ctc_forward(logits, seq_sizes, + # blanked_labels, label_sizes, not_repeated) + # loss = - logaddexp( + # alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes - 1], + # alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes]) + + return logits, blanked_labels, not_repeated, betas, loss def ctc_grad_graph(inputs, output_gradients): @@ -164,10 +164,10 @@ def ctc_grad_graph(inputs, output_gradients): # TODO: will theano optimize this redundant call when both loss and # gradient are requested separately? 
- log_odds, blanked_labels, not_repeated, betas, loss = \ + logits, blanked_labels, not_repeated, betas, loss = \ ctc_perform_graph(*inputs) - alphas = ctc_forward(log_odds, seq_durations, + alphas = ctc_forward(logits, seq_durations, blanked_labels, label_sizes, not_repeated) log_pl = - loss @@ -184,19 +184,24 @@ def fwbw_sum_step(k, s, labels_, ab_): fwbw_sum = theano.scan( fn=fwbw_sum_step, sequences=[T.arange(2 * label_size + 1)], - outputs_info=[-2e9 * T.ones((seq_size, batch_size, voca_size))], + outputs_info=-2e9 * T.ones((seq_size, batch_size, voca_size)), non_sequences=[blanked_labels, ab], strict=True, name="fwbw_sum")[0][-1] - A = fwbw_sum - log_pl[None, :, None] - 2 * log_odds + A = fwbw_sum - log_pl[None, :, None] - logits + B = logits + logsumexp(A, axis=2, keepdims=True) + dloss_dy = T.exp(B) - T.exp(A) + # A = fwbw_sum - log_pl[None, :, None] - 2 * logits + # dloss_dy = T.exp(2 * logits + logsumexp(A, axis=2, keepdims=True)) \ + # - T.exp(logits + A) - dloss_dy = T.exp(2 * log_odds + logsumexp(A, axis=2, keepdims=True)) \ - - T.exp(log_odds + A) + dloss_dy = T.switch(T.all(isneginf(A), axis=2, keepdims=True), + 0, dloss_dy) - dloss_dy = T.switch( - (loss[None, :, None] > 1e9) + T.isinf(loss[None, :, None]), - 0, dloss_dy) + # dloss_dy = T.switch( + # (loss[None, :, None] > 1e9) + T.isinf(loss[None, :, None]), + # 0, dloss_dy) return [dloss_dy * output_gradients[0][None, :, None], theano.gradient.disconnected_type(), diff --git a/papers/connectionist_temporal_classification/experiments-tf.ipynb b/papers/connectionist_temporal_classification/experiments-tf.ipynb new file mode 100644 index 0000000..effc1cb --- /dev/null +++ b/papers/connectionist_temporal_classification/experiments-tf.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Requirements\n", + "\n", + "Please download the timit dataset at http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the TIMIT.zip file next to this file.\n", + "\n", + "The following python packages are required:\n", + "- lasagne\n", + "- matplotlib\n", + "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", + "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "% autoreload 2\n", + "#%matplotlib inline\n", + "# %env CUDA_VISIBLE_DEVICES=\"1\"\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = \"all\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pickle as pkl\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from zipfile import ZipFile\n", + "from sphfile import SPHFile\n", + "from python_speech_features import mfcc\n", + "import tensorflow as tf\n", + "import keras as K\n", + "from keras.models import Model\n", + "from keras.layers import Input, Dense, LSTM, Concatenate, Layer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", + " with ZipFile(\"TIMIT.zip\", 'r') as 
f:\n", + " f.extractall(path=\".\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = []\n", + "train_subset = []\n", + "\n", + "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " for f in filenames:\n", + " if f.endswith(\"WAV\"):\n", + " recording = SPHFile(dirpath + \"/\" + f).content\n", + " files.append(dirpath + \"/\" + f[:-4])\n", + " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", + "\n", + "files = np.array(files)\n", + "train_subset = np.array(train_subset, dtype=np.bool)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", + " features = []\n", + " labels = []\n", + "\n", + " for f in files:\n", + " recording = SPHFile(f + \".WAV\")\n", + " signal = recording.content\n", + " samplerate = recording.format['sample_rate']\n", + "\n", + " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", + " numcep=13, nfilt=26, appendEnergy=True)\n", + " derivatives = np.concatenate([\n", + " mfccfeats[1, None] - mfccfeats[0, None],\n", + " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", + " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", + "\n", + " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", + "\n", + " with open(f + \".PHN\") as phonem_file:\n", + " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", + "\n", + " m = np.mean(np.concatenate(features, axis=0))\n", + " s = np.std(np.concatenate(features, axis=0))\n", + "\n", + " for i in range(len(features)):\n", + " features[i] = (features[i] - m) / s\n", + "\n", + " vocabulary = set()\n", + " for lseq in labels:\n", + " vocabulary |= set(lseq)\n", + "\n", + " vocabulary = list(vocabulary)\n", + " vocabulary[-1], vocabulary[vocabulary.index('h#')] = vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", + "\n", + " for i in range(len(labels)):\n", + " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", + "\n", + " blank = 60\n", + " \n", + " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", + " pkl.dump((features, labels, vocabulary, blank), f)\n", + "\n", + "\n", + "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", + " features, labels, vocabulary, blank = pkl.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(20, 9))\n", + "plt.imshow(features[1].transpose(), clim=(-4, 4))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def zero_loss(y_true, y_pred):\n", + " return K.backend.zeros_like(y_pred)\n", + "\n", + "def dense_to_sparse(x):\n", + " idx = tf.where(tf.greater_equal(x, 0))\n", + " return tf.SparseTensor(idx, tf.gather_nd(x, idx), tf.shape(x, out_type=tf.int64))\n", + "\n", + "class CTCLossLayer(Layer):\n", + " def __init__(self, **kwargs):\n", + " super(CTCLossLayer, self).__init__(**kwargs)\n", + "\n", + " def call(self, x, mask=None):\n", + " linout = x[0]\n", + " targets = x[1]\n", + " durations = x[2]\n", + " loss = tf.nn.ctc_loss(\n", + " dense_to_sparse(targets), linout,\n", + " sequence_length=durations[:, 0],\n", + " 
time_major=False)\n", + " self.add_loss(tf.reduce_sum(loss), x)\n", + " return loss\n", + "\n", + " def compute_output_shape(self, input_shape):\n", + " return input_shape[0][0]\n", + "\n", + "a = Input(shape=(None, features[0].shape[1]), name=\"features\")\n", + "targets = Input(shape=[None], dtype='int32', name=\"targets\")\n", + "durations = Input(shape=[1], dtype='int32', name=\"durations\")\n", + "b1 = LSTM(100, return_sequences=True)(a)\n", + "b2 = LSTM(100, return_sequences=True, go_backwards=True)(a)\n", + "c = Concatenate(axis=2)([b1, b2])\n", + "d = Dense(len(vocabulary), activation=None)(c)\n", + "l = CTCLossLayer()([d, targets, durations])\n", + "model = Model(inputs=[a, targets, durations], outputs=[d, l])\n", + "sgd = K.optimizers.SGD(lr=1e-4, momentum=0.9, nesterov=True)\n", + "\n", + "model.summary()\n", + "\n", + "model.compile(\n", + " target_tensors=[targets, targets], \n", + " loss=[zero_loss, zero_loss], \n", + " optimizer=sgd)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# lasagne.layers.set_all_param_values(l_linout, params_backup[0])\n", + "\n", + "params_backup = []\n", + "running_loss = None\n", + "\n", + "for i in np.random.permutation(len(labels))[:300]:\n", + " f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", + "\n", + " batch_loss = model.train_on_batch(\n", + " x=[f, l, np.array([f.shape[1]], np.int32)],\n", + " y=[l, l])[0]\n", + "\n", + " if batch_loss > 10000:\n", + " print(\"\\nskipped i = {}\".format(i))\n", + " continue\n", + " else:\n", + " running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss\n", + " print(\"\\rloss = {:>5.0f}\".format(running_loss), end='', flush=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def argmax_decode(preds):\n", + " decoded = [preds[0]]\n", + " for v in preds:\n", + " if v != decoded[-1]:\n", + " decoded.append(v)\n", + " \n", + " return np.array(decoded, dtype=np.int32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features[i].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "i = 0\n", + "inputs = [features[i][None, :, :], labels[i][None, 1:-1], np.array([features[i].shape[0]], np.int32)]\n", + "logits = model.predict(inputs)[0][0]\n", + "# preds -= np.max(preds, axis=1, keepdims=True)\n", + "# preds = np.exp(preds)\n", + "# preds /= np.sum(preds, axis=1, keepdims=True)\n", + "lbl_preds = argmax_decode(np.argmax(preds, axis=-1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "o = np.argsort(np.mean(logits[:, :60], axis=0))\n", + "plt.figure(figsize=(10, 10))\n", + "for c in o:\n", + " plt.plot(np.arange(len(logits)), logits[:, c]);\n", + "\n", + "plt.plot(np.arange(len(logits)), logits[:, -1], linestyle=\":\");\n", + "plt.legend([vocabulary[o_] for o_ in o] + [vocabulary[-1]], bbox_to_anchor=(.6, 0, 1, 1), ncol=5)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds[:, -1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "model.layers[4].get_weights()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "weights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/papers/connectionist_temporal_classification/experiments.ipynb b/papers/connectionist_temporal_classification/experiments.ipynb new file mode 100644 index 0000000..4db5447 --- /dev/null +++ b/papers/connectionist_temporal_classification/experiments.ipynb @@ -0,0 +1,517 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Requirements\n", + "\n", + "Please download the timit dataset at http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the TIMIT.zip file next to this file.\n", + "\n", + "The following python packages are required:\n", + "- lasagne\n", + "- matplotlib\n", + "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", + "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "\n", + "import os\n", + "os.environ['THEANO_FLAGS'] = \"device=cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle as pkl\n", + "import numpy as np\n", + "from zipfile import ZipFile\n", + "from sphfile import SPHFile\n", + "from python_speech_features import mfcc\n", + "import lasagne\n", + "from lasagne.layers import InputLayer, GaussianNoiseLayer, LSTMLayer, DenseLayer, ConcatLayer, ReshapeLayer\n", + "import theano\n", + "import theano.tensor as T\n", + "from theano.compile.nanguardmode import NanGuardMode\n", + "import matplotlib.pyplot as plt\n", + "from ctc import ctc_loss, log_softmax, insert_alternating_blanks, ctc_backward" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.seterr(all='raise')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", + " with ZipFile(\"TIMIT.zip\", 'r') as f:\n", + " f.extractall(path=\".\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = []\n", + "train_subset = []\n", + "\n", + "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " for f in filenames:\n", + " if f.endswith(\"WAV\"):\n", + " recording = SPHFile(dirpath + \"/\" + f).content\n", + " files.append(dirpath + \"/\" + f[:-4])\n", + " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", + "\n", + "files = 
np.array(files)\n", + "train_subset = np.array(train_subset, dtype=np.bool)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", + " features = []\n", + " labels = []\n", + "\n", + " for f in files:\n", + " recording = SPHFile(f + \".WAV\")\n", + " signal = recording.content\n", + " samplerate = recording.format['sample_rate']\n", + "\n", + " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", + " numcep=13, nfilt=26, appendEnergy=True)\n", + " derivatives = np.concatenate([\n", + " mfccfeats[1, None] - mfccfeats[0, None],\n", + " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", + " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", + "\n", + " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", + "\n", + " with open(f + \".PHN\") as phonem_file:\n", + " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", + "\n", + " m = np.mean(np.concatenate(features, axis=0))\n", + " s = np.std(np.concatenate(features, axis=0))\n", + "\n", + " for i in range(len(features)):\n", + " features[i] = (features[i] - m) / s\n", + "\n", + " vocabulary = set()\n", + " for lseq in labels:\n", + " vocabulary |= set(lseq)\n", + "\n", + " vocabulary = list(vocabulary)\n", + " vocabulary[-1], vocabulary[vocabulary.index('h#')] = vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", + "\n", + " for i in range(len(labels)):\n", + " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", + "\n", + " blank = len(labels) - 1\n", + " \n", + " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", + " pkl.dump((features, labels, vocabulary, blank), f)\n", + "\n", + "\n", + "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", + " features, labels, vocabulary, blank = pkl.load(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n", + "\n", + "class SmallGaussianNoiseLayer(lasagne.layers.Layer):\n", + " \"\"\"Gaussian noise layer (clipped for safety)\"\"\"\n", + " def __init__(self, incoming, sigma=0.1, **kwargs):\n", + " super(SmallGaussianNoiseLayer, self).__init__(incoming, **kwargs)\n", + " self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))\n", + " self.sigma = sigma\n", + "\n", + " def get_output_for(self, input, deterministic=False, **kwargs):\n", + " if deterministic or self.sigma == 0:\n", + " return input\n", + " else:\n", + " noise = self._srng.normal(input.shape, avg=0.0, std=self.sigma)\n", + " return input + T.clip(noise, -3 * self.sigma, 3 * self.sigma)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "l_in = InputLayer(shape=(None, 1557, 26))\n", + "l_duration = InputLayer(input_var=T.ivector(name=\"duration\"), shape=(1,))\n", + "l_mask = lasagne.layers.ExpressionLayer(l_duration, lambda d: T.arange(1557)[None, :] < d[:, None])\n", + "l_noise = SmallGaussianNoiseLayer(l_in, sigma=0.6)\n", + "l_fwlstm = LSTMLayer(\n", + " l_noise, 100, mask_input=l_mask)\n", + "l_bwlstm = LSTMLayer(\n", + " l_noise, 100, mask_input=l_mask,\n", + " backwards=True)\n", + "l_cat = ConcatLayer([l_fwlstm, 
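The preprocessing cell above appends first-order time derivatives to the MFCC frames (forward difference at the first frame, central differences inside, backward difference at the last frame). Below is a small NumPy sketch of that delta computation on a hypothetical 100-frame utterance. Note also that this revision of the notebook computes the blank index as `len(labels) - 1`, while the later notebook uses `len(vocabulary) - 1`, which appears to be the intended value.

```python
import numpy as np

def delta_features(feats):
    # Forward difference for the first frame, central differences for the
    # interior frames, backward difference for the last frame -- the same
    # scheme as the preprocessing cell above.
    return np.concatenate([
        feats[1:2] - feats[0:1],
        .5 * feats[2:] - .5 * feats[:-2],
        feats[-1:] - feats[-2:-1]], axis=0)

mfccfeats = np.random.randn(100, 13).astype(np.float32)  # hypothetical 100-frame utterance
full_feats = np.concatenate([mfccfeats, delta_features(mfccfeats)], axis=1)
print(full_feats.shape)  # (100, 26)
```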
l_bwlstm], axis=2)\n", + "l_linout = DenseLayer(l_cat, len(vocabulary), nonlinearity=None, num_leading_axes=2)\n", + "\n", + "input_var = l_in.input_var\n", + "duration_var = l_duration.input_var\n", + "labels_var = T.imatrix()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_output = lasagne.layers.get_output(l_linout, deterministic=False).dimshuffle(1, 0, 2)\n", + "\n", + "loss = ctc_loss(\n", + " linout=train_output,\n", + " durations=duration_var,\n", + " labels=labels_var,\n", + " label_sizes=T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", + " blank=blank\n", + ")\n", + "\n", + "params = lasagne.layers.get_all_params(l_linout, trainable=True)\n", + "grads = theano.grad(loss.sum(), params)\n", + "updates = lasagne.updates.nesterov_momentum(grads, params, learning_rate=1e-4)\n", + "\n", + "update_fn = theano.function(\n", + " [input_var, duration_var, labels_var], \n", + " loss, \n", + " updates=updates,\n", + " # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params_history = []\n", + "loss_history = []\n", + "running_loss = None\n", + "failed = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for e in range(10):\n", + " for i in np.random.permutation(len(labels)):\n", + " f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", + " d = np.array([f.shape[1]], dtype=np.int32)\n", + " f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)\n", + "\n", + " batch_loss = float(update_fn(f, d, l))\n", + "\n", + " if batch_loss > 10000 or np.isnan(batch_loss):\n", + " print(\"\\nskipped i = {} because loss was {}\".format(i, batch_loss))\n", + " raise RuntimeError()\n", + " else:\n", + " running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss\n", + " print(\"\\r{:4d} loss = {:>5.0f} -> {:>5.0f}\".format(i, batch_loss, running_loss), end='', flush=True)\n", + "\n", + " if i % 25:\n", + " params_history.append(lasagne.layers.get_all_param_values(l_linout))\n", + " loss_history.append(running_loss)\n", + "\n", + " # batch_loss = loss_fn(f, l)\n", + " # if batch_loss > 5000:\n", + " # print('loss = {:>5.0f} > 5000 at element {:d}'.format(batch_loss, i))\n", + " # raise\n", + " # else:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.plot(loss_history)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lasagne.layers.set_all_param_values(l_linout, params_history[2000 // 25])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "i = 920\n", + "f, l = features[i][None, :, :], labels[i][None, 1:-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "blanked_labels = insert_alternating_blanks(labels_var, blank)\n", + "not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2])\n", + "betas = ctc_backward(\n", + " log_softmax(T.unbroadcast(train_output.dimshuffle(1, 0, 2), 
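The training cell above feeds `ctc_loss` the time-major network output (`dimshuffle(1, 0, 2)`), the sequence duration, the label matrix and the label sizes. As a hypothetical, untested sketch of calling `ctc_loss` directly on constant NumPy inputs (shapes are duration x batch x classes, and the `-1` padding in `labels` wraps around to the blank class, as in the test notebook):

```python
import numpy as np
from ctc import ctc_loss

# duration x batch x classes, classes include the blank (here class 3)
linout = np.random.randn(5, 2, 4).astype(np.float32)
durations = np.array([5, 3], dtype=np.int32)
labels = np.array([[0, 1, 2],
                   [2, 2, -1]], dtype=np.int32)   # -1 padding wraps around to the blank class
label_sizes = np.array([3, 2], dtype=np.int32)

loss = ctc_loss(linout, durations, labels, label_sizes, blank=-1)
print(loss.eval())  # one negative log-likelihood per batch element
```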
1)),\n", + " T.cast(T.reshape(train_output.shape[0], (1,)), 'int32'), \n", + " blanked_labels,\n", + " T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", + " not_repeated)\n", + "test_output = lasagne.layers.get_output(l_linout, deterministic=True)\n", + "\n", + "loss_fn = theano.function([input_var, duration_var, labels_var], loss)\n", + "beta_fn = theano.function([input_var, duration_var, labels_var], betas)\n", + "grads_fn = theano.function([input_var, duration_var, labels_var], grads)\n", + "predict_fn = theano.function([input_var, duration_var], T.exp(log_softmax(test_output[:, 0, :])))\n", + "logits_fn = theano.function([input_var, duration_var], test_output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logits = logits_fn(f, d)[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "o = np.argsort(np.mean(logits[:, :60], axis=0))\n", + "plt.figure(figsize=(10, 10))\n", + "for c in o:\n", + " plt.plot(np.arange(len(logits)), logits[:, c])\n", + "\n", + "plt.plot(np.arange(len(logits)), logits[:, -1], linestyle=\":\")\n", + "plt.legend([vocabulary[o_] for o_ in o] + [vocabulary[-1]], bbox_to_anchor=(.6, 0, 1, 1), ncol=5)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logits.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# beta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features[i].shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "betas = ctc_backward(\n", + " log_softmax(train_output),\n", + " T.cast(T.reshape(train_output.shape[0], (1,)), 'int32'), \n", + " blanked_labels,\n", + " T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", + " not_repeated)\n", + "beta_fn = theano.function([input_var, duration_var, labels_var], betas)\n", + "\n", + "b = beta_fn(f, d, l)\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.imshow(b[0:, 0, 0:], clim=(-1100, max(0, np.max(b))))\n", + "plt.gca().set_aspect(0.1)\n", + "plt.colorbar()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p = lasagne.layers.get_all_param_values(l_linout, trainable=True)\n", + "for p_ in p:\n", + " print((p_.min(), p_.max()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = theano.grad(loss.sum(), wrt=train_output).eval({\n", + " input_var: f,\n", + " duration_var: d,\n", + " labels_var: l\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplot(2, 1, 1)\n", + "plt.bar(np.arange(len(vocabulary)), g[:, 0, np.concatenate((o, [60]))].mean(axis=0))\n", + "plt.subplot(2, 1, 2)\n", + "plt.plot(g[:, 0, :].mean(axis=1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(vocabulary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def argmax_decode(preds):\n", + " decoded = [preds[0]]\n", + " for v in preds:\n", + " if v != decoded[-1]:\n", + " decoded.append(v)\n", + " \n", + " return np.array(decoded, dtype=np.int32)\n", + "\n", + "lbl_preds = 
argmax_decode(np.argmax(logits, axis=-1))\n", + "lbl_tgt = labels[i]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index 7c66c90..eca7be7 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -190,4 +190,4 @@ def f(linear_out_): return loss unittest_tools.verify_grad( - f, [linear_out], rel_tol=0.1, abs_tol=1) + f, [linear_out], rel_tol=.1, abs_tol=.1) diff --git a/papers/connectionist_temporal_classification/tests.ipynb b/papers/connectionist_temporal_classification/tests.ipynb new file mode 100644 index 0000000..538c39c --- /dev/null +++ b/papers/connectionist_temporal_classification/tests.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "\n", + "import sys\n", + "import os\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "\n", + "sys.path.insert(-1, os.getcwd())\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "os.environ['THEANO_FLAGS'] = \"device=cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import tensorflow as tf\n", + "import theano\n", + "import theano.tensor as T\n", + "\n", + "from ctc import ctc_loss as my_ctc_loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 16\n", + "max_labsize = 20\n", + "voca_size = 20 # excluding blank\n", + "max_seqsize = 100\n", + "blank = -1\n", + "\n", + "labsize = np.random.randint(\n", + " 1, max_labsize + 1, size=(batch_size,), dtype=np.int32)\n", + "labsize[0] = max_labsize\n", + "labsize[1] = 1\n", + "labsize[2] = max_labsize\n", + "labsize[3] = max_labsize\n", + "\n", + "labels = np.random.randint(\n", + " 0, voca_size,\n", + " size=(batch_size, max_labsize), dtype=np.int32)\n", + "for b in range(batch_size):\n", + " labels[b, labsize[b]:] = blank\n", + "\n", + "seqsize = np.array([\n", + " np.random.randint(labsize[i] + 1, max_seqsize + 1)\n", + " for i in range(batch_size)], dtype=np.int32)\n", + "\n", + "linout = np.random.randn(\n", + " max_seqsize, batch_size, voca_size + 1).astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "th_linout_var = T.tensor3()\n", + "th_seqsize_var = T.ivector()\n", + "th_labels_var = T.imatrix()\n", + "th_labsize_var = T.ivector()\n", + "th_loss = my_ctc_loss(th_linout_var, th_seqsize_var, th_labels_var, th_labsize_var)\n", + "\n", + "def dense_to_sparse(x):\n", + " idx = tf.where(tf.greater_equal(x, 0))\n", + " return tf.SparseTensor(idx, tf.gather_nd(x, idx), tf.shape(x, out_type=tf.int64))\n", + "\n", + "tf_linout_var = tf.placeholder(tf.float32, shape=[max_seqsize, batch_size, 
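`tf.nn.ctc_loss` expects the targets as a `tf.SparseTensor`, which `dense_to_sparse` builds from the `-1`-padded label matrix. A plain NumPy illustration of what that conversion extracts, on hypothetical label values:

```python
import numpy as np

# Hypothetical padded label batch: -1 marks padding, mirroring
# `labels[b, labsize[b]:] = blank` with blank = -1 above.
labels = np.array([[3, 1, 4, -1, -1],
                   [2, 7, -1, -1, -1]], dtype=np.int32)

# What dense_to_sparse hands to tf.nn.ctc_loss:
indices = np.argwhere(labels >= 0)   # coordinates of the valid entries
values = labels[labels >= 0]         # [3 1 4 2 7]
dense_shape = labels.shape           # (2, 5)
print(indices.tolist(), values.tolist(), dense_shape)
```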
voca_size + 1])\n", + "tf_seqsize_var = tf.placeholder(tf.int32, shape=[batch_size])\n", + "tf_labels_var = tf.placeholder(tf.int32, shape=[batch_size, max_labsize])\n", + "\n", + "tf_loss = tf.nn.ctc_loss(\n", + " dense_to_sparse(tf_labels_var), tf_linout_var,\n", + " sequence_length=tf_seqsize_var,\n", + " time_major=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with tf.Session() as sess:\n", + " tf_result = sess.run(\n", + " tf_loss, {\n", + " tf_linout_var: linout,\n", + " tf_seqsize_var: seqsize,\n", + " tf_labels_var: labels\n", + " })\n", + " \n", + " th_results = th_loss.eval({\n", + " th_linout_var: linout,\n", + " th_seqsize_var: seqsize,\n", + " th_labels_var: labels,\n", + " th_labsize_var: labsize\n", + " })\n", + " \n", + " print(np.abs(tf_result - th_results) / tf_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf_g = tf.gradients(xs=tf_linout_var, ys=tf.reduce_sum(tf_loss))[0]\n", + "\n", + "with tf.Session() as sess:\n", + " tf_grad = sess.run(\n", + " tf_g, {\n", + " tf_linout_var: linout,\n", + " tf_seqsize_var: seqsize,\n", + " tf_labels_var: labels\n", + " })\n", + " \n", + " th_grad = theano.grad(th_loss.sum(), wrt=th_linout_var).eval({\n", + " th_linout_var: linout,\n", + " th_seqsize_var: seqsize,\n", + " th_labels_var: labels,\n", + " th_labsize_var: labsize\n", + " })\n", + " \n", + " print(np.abs(tf_grad - th_grad) / (tf_grad + .000001))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "th_grad[:, 1, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf_grad[:, 1, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f58174f25ff2b609e63eb7435f730e36a2c7e09f Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Wed, 24 Jan 2018 10:32:24 +0100 Subject: [PATCH 7/8] fixed error in logsumexp, ctc gradient is now equal to tensorflow's --- .../ctc.py | 35 +++++------- .../experiments.ipynb | 53 ++++++++++--------- .../test_ctc2.py | 13 ++--- .../tests.ipynb | 19 ++----- 4 files changed, 47 insertions(+), 73 deletions(-) diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index 1a5c48d..c6f8a54 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -11,7 +11,6 @@ import theano import theano.tensor as T from theano.tensor import discrete_dtypes, continuous_dtypes -from theano.printing import Print def isneginf(x, neginf=-1e9): @@ -27,7 +26,8 @@ def logaddexp(x, y, magnitude=20): def logsumexp(x, axis, keepdims=False): k = T.max(x, axis=axis, keepdims=True) - return T.log(T.sum(T.exp(x - k), axis=axis, keepdims=keepdims)) + res = T.log(T.sum(T.exp(x - k), axis=axis, 
keepdims=keepdims)) + k + return T.switch(isneginf(k), -2e9, res) def log_softmax(X, axis=-1, clip=None): @@ -162,17 +162,13 @@ def ctc_grad_graph(inputs, output_gradients): seq_size, batch_size, voca_size = linout.shape label_size = labels.shape[1] - # TODO: will theano optimize this redundant call when both loss and - # gradient are requested separately? logits, blanked_labels, not_repeated, betas, loss = \ ctc_perform_graph(*inputs) alphas = ctc_forward(logits, seq_durations, blanked_labels, label_sizes, not_repeated) - log_pl = - loss - - # sum_{s \in lab(l, k)} a_t(s) b_t(s) + # log(sum_{s \in lab(l, k)} a_t(s) b_t(s)) def fwbw_sum_step(k, s, labels_, ab_): s_view = s[:, T.arange(batch_size), labels_[:, k]] ab_view = ab_[:, :, k] @@ -189,20 +185,14 @@ def fwbw_sum_step(k, s, labels_, ab_): strict=True, name="fwbw_sum")[0][-1] - A = fwbw_sum - log_pl[None, :, None] - logits - B = logits + logsumexp(A, axis=2, keepdims=True) - dloss_dy = T.exp(B) - T.exp(A) - # A = fwbw_sum - log_pl[None, :, None] - 2 * logits - # dloss_dy = T.exp(2 * logits + logsumexp(A, axis=2, keepdims=True)) \ - # - T.exp(logits + A) + A = loss[None, :, None] + logits \ + + logsumexp(fwbw_sum - logits, axis=2, keepdims=True) + B = loss[None, :, None] + fwbw_sum - logits + dloss_dy = T.exp(A) - T.exp(B) - dloss_dy = T.switch(T.all(isneginf(A), axis=2, keepdims=True), + dloss_dy = T.switch(T.all(isneginf(fwbw_sum), axis=2, keepdims=True), 0, dloss_dy) - # dloss_dy = T.switch( - # (loss[None, :, None] > 1e9) + T.isinf(loss[None, :, None]), - # 0, dloss_dy) - return [dloss_dy * output_gradients[0][None, :, None], theano.gradient.disconnected_type(), theano.gradient.disconnected_type(), @@ -255,13 +245,13 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): An _integer_ vector of size batch_size contining the actual length of each sequence in preds. labels: Theano shared variable, expression or numpy array - An _integer_ matrix of size batch_size x label_size containg the target - labels. + An _integer_ matrix of size batch_size x label_size containing the + target labels. label_sizes: Theano shared variable, expression or numpy array - An _integer_ vector of size batch_size contining the actual length of + An _integer_ vector of size batch_size containing the actual length of each sequence in labels. blank: - The blank label class, by default the last one. + The blank label class, by default the last index. Returns ------- @@ -275,7 +265,6 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): unsegmented sequence data with recurrent neural networks. In Proceedings of the 23rd international conference on Machine learning (pp. 369-376). ACM. 
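The corrected `logsumexp` above re-adds the subtracted maximum `k` and pins slices that are entirely log-zero instead of letting them turn into NaN. A minimal NumPy sketch of the same computation, using `-inf` where the Theano code uses the `-2e9` sentinel:

```python
import numpy as np

def logsumexp(x, axis, keepdims=False):
    # Subtract the per-slice maximum for stability, then add it back
    # (the "+ k" term restored by this patch); slices that are entirely
    # log-zero stay at -inf instead of becoming NaN.
    k = np.max(x, axis=axis, keepdims=True)
    k = np.where(np.isinf(k), 0.0, k)
    s = np.sum(np.exp(x - k), axis=axis, keepdims=True)
    out = np.where(s > 0, np.log(np.maximum(s, 1e-300)) + k, -np.inf)
    return out if keepdims else np.squeeze(out, axis=axis)

x = np.array([[1000.0, 1000.0],
              [-np.inf, -np.inf]])
print(logsumexp(x, axis=1))  # [1000.6931..., -inf], no overflow and no NaN
```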
ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf - """ linout = T.as_tensor_variable(linout) durations = T.as_tensor_variable(durations) diff --git a/papers/connectionist_temporal_classification/experiments.ipynb b/papers/connectionist_temporal_classification/experiments.ipynb index 4db5447..1e8b626 100644 --- a/papers/connectionist_temporal_classification/experiments.ipynb +++ b/papers/connectionist_temporal_classification/experiments.ipynb @@ -49,15 +49,6 @@ "from ctc import ctc_loss, log_softmax, insert_alternating_blanks, ctc_backward" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.seterr(all='raise')" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -150,7 +141,7 @@ " blank = len(labels) - 1\n", " \n", " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", - " pkl.dump((features, labels, vocabulary, blank), f)\n", + " pkl.dump((features, labels, vocabulary, blank), f, -1)\n", "\n", "\n", "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", @@ -295,7 +286,8 @@ "metadata": {}, "outputs": [], "source": [ - "plt.plot(loss_history)" + "plt.plot(loss_history)\n", + "plt.yscale('log')" ] }, { @@ -304,7 +296,16 @@ "metadata": {}, "outputs": [], "source": [ - "lasagne.layers.set_all_param_values(l_linout, params_history[2000 // 25])" + "np.argmin(loss_history[::25])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lasagne.layers.set_all_param_values(l_linout, params_history[6000//25])" ] }, { @@ -320,8 +321,9 @@ "metadata": {}, "outputs": [], "source": [ - "i = 920\n", - "f, l = features[i][None, :, :], labels[i][None, 1:-1]" + "i = 0\n", + "f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", + "f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)" ] }, { @@ -372,15 +374,6 @@ "plt.show()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "logits.shape" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -414,7 +407,7 @@ "b = beta_fn(f, d, l)\n", "\n", "plt.figure(figsize=(10, 6))\n", - "plt.imshow(b[0:, 0, 0:], clim=(-1100, max(0, np.max(b))))\n", + "plt.imshow(b[0:, 0, 0:], clim=(-5000, max(0, np.max(b))))\n", "plt.gca().set_aspect(0.1)\n", "plt.colorbar()\n", "plt.show()" @@ -510,6 +503,18 @@ "display_name": "Python 3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" } }, "nbformat": 4, diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py index eca7be7..10d3425 100644 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ b/papers/connectionist_temporal_classification/test_ctc2.py @@ -5,14 +5,8 @@ from theano.tests import unittest_tools from papers.connectionist_temporal_classification.ctc import \ - ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks, isneginf - - -def log_softmax(X): - k = T.max(X, axis=-1, keepdims=True) - norm_X = X - k - log_sum_exp_X = T.log(T.sum(T.exp(norm_X), axis=-1, keepdims=True)) - return norm_X - log_sum_exp_X + ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks, \ + isneginf, log_softmax class TestCTC(unittest.TestCase): @@ -189,5 +183,4 @@ def 
f(linear_out_): loss = T.switch(isneginf(-loss), 0, loss) return loss - unittest_tools.verify_grad( - f, [linear_out], rel_tol=.1, abs_tol=.1) + unittest_tools.verify_grad(f, [linear_out], abs_tol=0.05, rel_tol=0.05) diff --git a/papers/connectionist_temporal_classification/tests.ipynb b/papers/connectionist_temporal_classification/tests.ipynb index 538c39c..abd4061 100644 --- a/papers/connectionist_temporal_classification/tests.ipynb +++ b/papers/connectionist_temporal_classification/tests.ipynb @@ -61,6 +61,7 @@ "seqsize = np.array([\n", " np.random.randint(labsize[i] + 1, max_seqsize + 1)\n", " for i in range(batch_size)], dtype=np.int32)\n", + "seqsize[0] = max_seqsize\n", "\n", "linout = np.random.randn(\n", " max_seqsize, batch_size, voca_size + 1).astype(np.float32)" @@ -92,13 +93,6 @@ " time_major=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -155,7 +149,7 @@ "metadata": {}, "outputs": [], "source": [ - "th_grad[:, 1, :]" + "th_grad[:, 0, :]" ] }, { @@ -164,15 +158,8 @@ "metadata": {}, "outputs": [], "source": [ - "tf_grad[:, 1, :]" + "tf_grad[:, 0, :]" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 324f2a97ede4a2ed498b92d58982ced8a516fa80 Mon Sep 17 00:00:00 2001 From: Nicolas Granger Date: Sun, 24 Jun 2018 18:27:48 +0200 Subject: [PATCH 8/8] split ctc op in two to avoid redundant computation: no improvement though... --- .gitignore | 1 + ...onnectionist Temporal Classification.ipynb | 486 ++++++++++++++++ .../ctc.py | 159 ++++-- .../experiments-tf.ipynb | 369 ------------- .../experiments.ipynb | 522 ------------------ .../test_ctc.py | 181 ++++++ .../test_ctc2.py | 186 ------- .../tests.ipynb | 186 ------- 8 files changed, 774 insertions(+), 1316 deletions(-) create mode 100644 papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb delete mode 100644 papers/connectionist_temporal_classification/experiments-tf.ipynb delete mode 100644 papers/connectionist_temporal_classification/experiments.ipynb create mode 100644 papers/connectionist_temporal_classification/test_ctc.py delete mode 100644 papers/connectionist_temporal_classification/test_ctc2.py delete mode 100644 papers/connectionist_temporal_classification/tests.ipynb diff --git a/.gitignore b/.gitignore index db78efc..0053c11 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ __pycache__/ # datasets *.zip +papers/connectionist_temporal_classification/TIMIT # Distribution / packaging .Python diff --git a/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb b/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb new file mode 100644 index 0000000..350030a --- /dev/null +++ b/papers/connectionist_temporal_classification/Connectionist Temporal Classification.ipynb @@ -0,0 +1,486 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is an implementation the Connectionist Temporal Classification loss function:\n", + "\n", + "> Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. (2006, June). Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In Proceedings of the 23rd international conference on Machine learning (pp. 369-376). ACM. 
ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf\n", + "\n", + "This notebook only show the learning procedure, no thorough testing is performed and the prefix search decoding is not implemented (contributions are welcome!).\n", + "\n", + "The original paper seems to use size 1 minibatches instead of 16 here. There shouldn't be any significant variations otherwise.\n", + "\n", + "Please download the [TIMIT dataset](http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3) and place the `TIMIT.zip` file next to this one.\n", + "\n", + "The following python packages are required:\n", + "- scipy\n", + "- lasagne\n", + "- matplotlib\n", + "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", + "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib notebook\n", + "\n", + "import os\n", + "os.environ['THEANO_FLAGS'] = \"device=cuda\"\n", + "#os.environ['CUDA_LAUNCH_BLOCKING'] = \"1\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle as pkl\n", + "import numpy as np\n", + "from zipfile import ZipFile\n", + "from sphfile import SPHFile\n", + "from python_speech_features import mfcc\n", + "import theano\n", + "import theano.tensor as T\n", + "import lasagne\n", + "from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, GaussianNoiseLayer\n", + "from lasagne.init import Uniform\n", + "from lasagne.nonlinearities import tanh, sigmoid\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from ctc import ctc_loss, log_softmax, ctc_backward\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## small useful functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def smooth(x, w):\n", + " window = int(np.ceil(len(x) / 2 * (1000 ** w - 1) / 999))\n", + " window += 1 - window % 2\n", + " \n", + " if window < 3 or len(x) < window:\n", + " return x\n", + " \n", + " edge_weights = np.arange(1, window // 2 + 1)\n", + " return np.concatenate([\n", + " np.cumsum(x[:window // 2]) / edge_weights,\n", + " np.convolve(x, np.full([window], 1 / window), 'valid'),\n", + " np.cumsum(x[:-window // 2:-1])[::-1] / edge_weights[::-1]])\n", + "\n", + "def argmax_decode(preds, exclude=()):\n", + " preds = np.argmax(preds, axis=1)\n", + " decoded = [preds[0]]\n", + " for v in preds:\n", + " if v != decoded[-1]:\n", + " decoded.append(v)\n", + " \n", + " return np.array([v for v in decoded if v not in exclude])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", + " with ZipFile(\"TIMIT.zip\", 'r') as f:\n", + " f.extractall(path=\".\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = []\n", + "train_subset = []\n", + "\n", + "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", + " for f in filenames:\n", + " if f.endswith(\"WAV\"):\n", + " 
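To make the `argmax_decode` helper above concrete: it takes frame-wise class scores, collapses consecutive repeats of the best class and then drops the excluded labels (typically the blank). This is best-path decoding only, not the prefix search mentioned in the introduction. A self-contained toy example with hypothetical posteriors over three classes, class 2 acting as the blank:

```python
import numpy as np

def argmax_decode(preds, exclude=()):
    # Same helper as above: frame-wise argmax, collapse repeats, drop excluded labels.
    preds = np.argmax(preds, axis=1)
    decoded = [preds[0]]
    for v in preds:
        if v != decoded[-1]:
            decoded.append(v)
    return np.array([v for v in decoded if v not in exclude])

# Hypothetical posteriors over 3 classes; the frame-wise argmax path is [0, 0, 2, 0, 1, 1, 2].
preds = np.array([[.9, .05, .05],
                  [.8, .1, .1],
                  [.1, .1, .8],
                  [.7, .2, .1],
                  [.1, .8, .1],
                  [.2, .7, .1],
                  [.1, .1, .8]])
print(argmax_decode(preds, exclude=(2,)))  # [0 0 1]
```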
recording = SPHFile(dirpath + \"/\" + f).content\n", + " files.append(dirpath + \"/\" + f[:-4])\n", + " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", + "\n", + "files = np.array(files)\n", + "train_subset = np.array(train_subset, dtype=np.bool)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", + " features = []\n", + " labels = []\n", + "\n", + " for f in files:\n", + " recording = SPHFile(f + \".WAV\")\n", + " signal = recording.content\n", + " samplerate = recording.format['sample_rate']\n", + "\n", + " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", + " numcep=13, nfilt=26, appendEnergy=True)\n", + " derivatives = np.concatenate([\n", + " mfccfeats[1, None] - mfccfeats[0, None],\n", + " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", + " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", + "\n", + " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", + "\n", + " with open(f + \".PHN\") as phonem_file:\n", + " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", + "\n", + " m = np.mean(np.concatenate(features, axis=0))\n", + " s = np.std(np.concatenate(features, axis=0))\n", + "\n", + " for i in range(len(features)):\n", + " features[i] = (features[i] - m) / s\n", + "\n", + " vocabulary = set()\n", + " for lseq in labels:\n", + " vocabulary |= set(lseq)\n", + "\n", + " vocabulary = list(vocabulary)\n", + " vocabulary[-1], vocabulary[vocabulary.index('h#')] = \\\n", + " vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", + " blank = len(vocabulary) - 1\n", + "\n", + " for i in range(len(labels)):\n", + " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", + " \n", + " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", + " pkl.dump((features, labels, vocabulary, blank), f, -1)\n", + "\n", + "\n", + "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", + " features, labels, vocabulary, blank = pkl.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# let's go brutal and shove that in GPU memory\n", + "\n", + "n_sequences = len(features)\n", + "feat_size = features[0].shape[1]\n", + "max_duration = max(len(seq) for seq in features)\n", + "max_labels = max(len(seq) - 2 for seq in labels) # -2 for init and final blank\n", + "\n", + "durations = np.array([len(seq) for seq in features], dtype=np.int32)\n", + "nlabels = np.array([len(seq) - 2 for seq in labels], dtype=np.int32)\n", + "all_features = np.zeros((n_sequences, max_duration, feat_size), dtype=np.float32)\n", + "for i in range(n_sequences):\n", + " all_features[i, :durations[i]] = features[i]\n", + "all_labels = np.zeros((n_sequences, max_labels), dtype=np.int32)\n", + "for i in range(n_sequences):\n", + " all_labels[i, :nlabels[i]] = labels[i][1:-1]\n", + "\n", + "durations_var = T.as_tensor_variable(durations, name=\"durations\")\n", + "all_features_var = T.as_tensor_variable(all_features, name=\"all_features\")\n", + "nlabels_var = T.as_tensor_variable(nlabels, name=\"nlabels\")\n", + "all_labels_var = T.as_tensor_variable(all_labels, name=\"all_labels\")\n", + "\n", + "minibatch_indexes = T.ivector()\n", + "batch_features = all_features_var[minibatch_indexes]\n", + "batch_durations = 
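The cell above packs all variable-length utterances into one zero-padded tensor plus vectors of durations and label counts, so minibatches can be sliced by index on the GPU. A toy NumPy sketch of the same packing and of the boolean mask that the `ExpressionLayer` later derives from the durations (sequence lengths here are made up):

```python
import numpy as np

# Toy batch of variable-length sequences (made-up lengths), packed the same
# way as `all_features` / `durations` above.
feat_size = 26
seqs = [np.random.randn(t, feat_size).astype(np.float32) for t in (5, 3, 7)]

durations = np.array([len(s) for s in seqs], dtype=np.int32)
max_duration = int(durations.max())
batch = np.zeros((len(seqs), max_duration, feat_size), dtype=np.float32)
for i, s in enumerate(seqs):
    batch[i, :len(s)] = s

# The mask the ExpressionLayer derives from the durations: True on valid frames.
mask = np.arange(max_duration)[None, :] < durations[:, None]
print(batch.shape, mask.sum(axis=1))  # (3, 7, 26) [5 3 7]
```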
durations_var[minibatch_indexes]\n", + "batch_nlabels = nlabels_var[minibatch_indexes]\n", + "batch_labels = all_labels_var[minibatch_indexes]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 16\n", + "\n", + "l_in = InputLayer(\n", + " input_var=batch_features,\n", + " shape=(batch_size, max_duration, feat_size))\n", + "\n", + "l_duration = InputLayer(input_var=batch_durations, shape=(1,))\n", + "\n", + "l_mask = lasagne.layers.ExpressionLayer(\n", + " l_duration, \n", + " lambda d: T.arange(max_duration)[None, :] < d[:, None])\n", + "\n", + "l_noise = GaussianNoiseLayer(l_in, sigma=0.6)\n", + "# l_noise = l_in\n", + "\n", + "l_fwlstm = LSTMLayer(\n", + " l_noise, 100,\n", + " ingate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " forgetgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " cell=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=tanh),\n", + " outgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " nonlinearity=tanh,\n", + " mask_input=l_mask, peepholes=True)\n", + "l_bwlstm = LSTMLayer(\n", + " l_noise, 100,\n", + " ingate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " forgetgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " cell=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=tanh),\n", + " outgate=lasagne.layers.Gate(W_cell=Uniform(0.1), nonlinearity=sigmoid),\n", + " nonlinearity=tanh,\n", + " mask_input=l_mask, peepholes=True, backwards=True)\n", + "\n", + "l_cat = ConcatLayer([l_fwlstm, l_bwlstm], axis=2)\n", + "\n", + "l_linout = DenseLayer(\n", + " l_cat, len(vocabulary), \n", + " nonlinearity=None,\n", + " num_leading_axes=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_output = lasagne.layers.get_output(\n", + " l_linout, deterministic=False).dimshuffle(1, 0, 2)\n", + "\n", + "loss = ctc_loss(\n", + " linout=train_output,\n", + " durations=batch_durations,\n", + " labels=batch_labels,\n", + " label_sizes=batch_nlabels,\n", + " blank=blank)\n", + "\n", + "params = lasagne.layers.get_all_params(l_linout, trainable=True)\n", + "grads = theano.grad(loss.sum(), params)\n", + "updates = lasagne.updates.adam(\n", + " grads, params, \n", + " learning_rate=1e-4)\n", + "update_fn = theano.function(\n", + " [minibatch_indexes], \n", + " loss,\n", + " updates=updates)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "i = 0\n", + "nsteps = int(100 * n_sequences / batch_size)\n", + "params_history = []\n", + "loss_history = np.zeros((nsteps,))\n", + "\n", + "def update_plot(fig, ax1, ax2, loss_history):\n", + " ax1.clear()\n", + " ax1.set_xlim(0, len(loss_history))\n", + " ax1.set_yscale('log')\n", + " ax1.set_ylim(0.8 * np.percentile(loss_history, 1), \n", + " 1.2 * np.percentile(loss_history, 99))\n", + " ax1.grid(color='gray', linestyle='-', linewidth=1)\n", + " ax1.grid(color='gray', linestyle=':', which='minor', linewidth=1)\n", + " ax1.set_axisbelow(True)\n", + " xticks = np.arange(len(loss_history))\n", + " ax1.scatter(xticks, loss_history, marker='.', \n", + " color='firebrick', edgecolor=\"none\", alpha=0.1)\n", + " smooth_history = 
smooth(loss_history, 0.6)\n", + " ax1.plot(xticks, smooth_history, linewidth=2, color='firebrick')\n", + "\n", + " ax2.clear()\n", + " ax2.set_yscale('log')\n", + " ax2.set_ylim(0.8 * np.percentile(loss_history, 1), \n", + " 1.2 * np.percentile(loss_history, 99))\n", + " ax2.grid(False)\n", + " ax2.yaxis.set_label_position(\"right\")\n", + " ax2.set_yticks([], minor=True)\n", + " ax2.set_yticks([smooth_history[-1]])\n", + " ax2.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())\n", + "\n", + " fig.canvas.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure()\n", + "ax1 = fig.add_subplot(111)\n", + "xticks = np.arange(i)\n", + "ax1.set_xlim(0, i + 1)\n", + "ax1.set_ylim(0, 1)\n", + "ax2 = ax1.twinx()\n", + "\n", + "# Note: you can interrupt and resume the execution of this cell\n", + "while i < nsteps:\n", + " t1 = time.time()\n", + " batch_loss = np.mean(update_fn(\n", + " np.random.choice(n_sequences, batch_size).astype(np.int32)))\n", + " t2 = time.time()\n", + " \n", + " print(\"\\r{:<6d} loss = {:>5.0f}, (d={:1.2f})\".format(i, batch_loss, t2 - t1), end='', flush=True)\n", + " loss_history[i] = batch_loss\n", + "\n", + " if (i + 1) % 10 == 0: \n", + " update_plot(fig, ax1, ax2, loss_history[:i])\n", + "\n", + "# if (i + 1) % 1000 == 0:\n", + "# params_history.append(lasagne.layers.get_all_param_values(l_linout))\n", + "\n", + " i += 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_output = lasagne.layers.get_output(l_linout, deterministic=True)\n", + "\n", + "logits_fn = theano.function(\n", + " [minibatch_indexes],\n", + " [batch_features, batch_durations, \n", + " batch_labels, batch_nlabels, \n", + " test_output])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequence = 3\n", + "\n", + "f, d, l, n, p = logits_fn(np.array([sequence], dtype=np.int32))\n", + "f = f[0, :d[0]]\n", + "l = l[0, :n[0]]\n", + "p = p[0, :d[0]]\n", + "s = np.exp(p - np.max(p, axis=-1, keepdims=True)) \\\n", + " / np.sum(np.exp(p - np.max(p, axis=-1, keepdims=True)), axis=-1, keepdims=True)\n", + "\n", + "fig = plt.figure()\n", + "ax = plt.subplot(111)\n", + "lines = []\n", + "\n", + "for c in np.argsort(vocabulary[:-1]):\n", + " if c in l:\n", + " line, = ax.plot(np.arange(len(p)), s[:, c], label=vocabulary[c], picker=5)\n", + " lines.append(line)\n", + "\n", + "ax.plot(np.arange(len(p)), s[:, -1], linestyle=\":\")\n", + "\n", + "ax.set_ylim(0.0, 1.2)\n", + "# ax.set_yscale('log')\n", + "ax.set_title('Select curve to see the label')\n", + "\n", + "ax.legend(\n", + " framealpha=1,\n", + " loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=8)\n", + "\n", + "fig.subplots_adjust(bottom=0.5)\n", + "fig.show()\n", + "\n", + "def onpick(event):\n", + " for line in lines:\n", + " line.set_alpha(0.3)\n", + " line.set_linewidth(2)\n", + " \n", + " event.artist.set_alpha(1)\n", + " event.artist.set_linewidth(2)\n", + " ax.set_title(event.artist.get_label())\n", + "\n", + "cid = fig.canvas.mpl_connect('pick_event', onpick)\n", + "\n", + "print(\"target : {}\".format(\", \".join(vocabulary[l_] for l_ in l)))\n", + "print(\"prediction: {}\".format(\", \".join(vocabulary[l_] for l_ in argmax_decode(s, [blank]))))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 
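The evaluation cell converts the linear outputs to posteriors with a max-shifted softmax before plotting and decoding them. The same computation as a small standalone helper, shown only to illustrate why the shift keeps large logits from overflowing:

```python
import numpy as np

def softmax(logits, axis=-1):
    # Shift by the maximum before exponentiating so large logits cannot overflow.
    z = logits - np.max(logits, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

p = np.array([[1000.0, 1001.0, 999.0]])
print(softmax(p))  # [[0.2447 0.6652 0.0900]], rows sum to 1, no overflow
```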
3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/papers/connectionist_temporal_classification/ctc.py b/papers/connectionist_temporal_classification/ctc.py index c6f8a54..4535d1d 100644 --- a/papers/connectionist_temporal_classification/ctc.py +++ b/papers/connectionist_temporal_classification/ctc.py @@ -21,7 +21,7 @@ def logaddexp(x, y, magnitude=20): x, y = T.minimum(x, y), T.maximum(x, y) diff = T.minimum(y - x, magnitude) res = x + T.log(1 + T.exp(diff)) - return T.switch((y - x > magnitude), y, res) + return T.switch((y - x) > magnitude, y, res) def logsumexp(x, axis, keepdims=False): @@ -51,13 +51,13 @@ def insert_alternating_blanks(labels, blank_label): return blanked_labels -def ctc_forward(log_odds, seq_sizes, +def ctc_forward(log_odds, durations, blanked_labels, label_sizes, not_repeated): seqsize, batch_sz, _ = log_odds.shape label_size = blanked_labels.shape[1] def step(t, a_tm1, log_odds_, - seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): + durations_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] a_t = a_tm1 a_t = T.set_subtensor( @@ -68,7 +68,7 @@ def step(t, a_tm1, log_odds_, logaddexp(a_t[:, 2:], T.switch(not_repeated_, a_tm1[:, :-2], -2e9))) # stop after a_T(|l'|) - mask = T.ge(t, seq_sizes_)[:, None] \ + mask = T.ge(t, durations_)[:, None] \ + T.ge(T.arange(label_size)[None, :], 2 * label_sizes_[:, None] + 1) @@ -82,32 +82,33 @@ def step(t, a_tm1, log_odds_, alphas, _ = theano.scan( fn=step, + n_steps=seqsize, + strict=True, sequences=[T.arange(seqsize)], outputs_info=alpha_init, - non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, + non_sequences=[log_odds, durations, blanked_labels, label_sizes, not_repeated], name="ctc_forward") return alphas -def ctc_backward(log_odds, seq_sizes, - blanked_labels, label_sizes, not_repeated): +def ctc_backward(log_odds, durations, blanked_labels, label_sizes, not_repeated): seqsize, batch_sz, _ = log_odds.shape label_size = blanked_labels.shape[1] def step(t, b_tp1, log_odds_, - seq_sizes_, blanked_labels_, label_sizes_, not_repeated_): + durations_, blanked_labels_, label_sizes_, not_repeated_): y_t = log_odds_[t] # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion - starter_t = T.eq(t, seq_sizes_ - 1)[:, None] \ + starter_t = T.eq(t, durations_ - 1)[:, None] \ * T.eq((2 * label_sizes_)[:, None], T.arange(label_size)[None, :]) b_tp1_2lp1 = b_tp1[T.arange(batch_sz), 2 * label_sizes_] b_tp1 = T.set_subtensor( b_tp1_2lp1, - T.switch(T.eq(t, seq_sizes_ - 1), 0, b_tp1_2lp1)) + T.switch(T.eq(t, durations_ - 1), 0, b_tp1_2lp1)) b_tp1 = T.switch(starter_t, 0, b_tp1) # initialize recursion b_t = b_tp1 @@ -125,9 +126,11 @@ def step(t, b_tp1, log_odds_, betas, _ = theano.scan( fn=step, + n_steps=seqsize, + strict=True, sequences=[T.arange(seqsize)], outputs_info=beta_init, - non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes, + non_sequences=[log_odds, durations, blanked_labels, label_sizes, not_repeated], go_backwards=True, name="ctc_backward") @@ -138,34 +141,28 @@ def step(t, b_tp1, log_odds_, # Theano Op ------------------------------------------------------------------- -def ctc_perform_graph(linout, seq_sizes, labels, label_sizes, blank): +def 
ctc_propagate(linout, durations, blanked_labels, label_sizes, not_repeated): _, batch_size, voca_size = linout.shape logits = log_softmax(linout) - blanked_labels = insert_alternating_blanks(labels, blank) - not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) - betas = ctc_backward(logits, seq_sizes, + betas = ctc_backward(logits, durations, blanked_labels, label_sizes, not_repeated) loss = - logaddexp(betas[0, :, 0], betas[0, :, 1]) - # alphas = ctc_forward(logits, seq_sizes, + # alphas = ctc_forward(logits, durations, # blanked_labels, label_sizes, not_repeated) # loss = - logaddexp( - # alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes - 1], - # alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes]) + # alphas[durations - 1, T.arange(batch_size), 2 * label_sizes - 1], + # alphas[durations - 1, T.arange(batch_size), 2 * label_sizes]) - return logits, blanked_labels, not_repeated, betas, loss + return loss, logits, betas -def ctc_grad_graph(inputs, output_gradients): - linout, seq_durations, labels, label_sizes, _ = inputs - seq_size, batch_size, voca_size = linout.shape - label_size = labels.shape[1] +def ctc_backprop(durations, blanked_labels, label_sizes, not_repeated, + logits, betas, loss, output_gradient): + seq_size, batch_size, voca_size = logits.shape - logits, blanked_labels, not_repeated, betas, loss = \ - ctc_perform_graph(*inputs) - - alphas = ctc_forward(logits, seq_durations, + alphas = ctc_forward(logits, durations, blanked_labels, label_sizes, not_repeated) # log(sum_{s \in lab(l, k)} a_t(s) b_t(s)) @@ -179,11 +176,11 @@ def fwbw_sum_step(k, s, labels_, ab_): ab = alphas + betas fwbw_sum = theano.scan( fn=fwbw_sum_step, - sequences=[T.arange(2 * label_size + 1)], + sequences=[T.arange(blanked_labels.shape[1])], outputs_info=-2e9 * T.ones((seq_size, batch_size, voca_size)), non_sequences=[blanked_labels, ab], strict=True, - name="fwbw_sum")[0][-1] + name="fwbw_sum")[0][-1] # should be unrolled if label_size is known A = loss[None, :, None] + logits \ + logsumexp(fwbw_sum - logits, axis=2, keepdims=True) @@ -193,38 +190,83 @@ def fwbw_sum_step(k, s, labels_, ab_): dloss_dy = T.switch(T.all(isneginf(fwbw_sum), axis=2, keepdims=True), 0, dloss_dy) - return [dloss_dy * output_gradients[0][None, :, None], - theano.gradient.disconnected_type(), - theano.gradient.disconnected_type(), - theano.gradient.disconnected_type(), - theano.gradient.disconnected_type()] + return dloss_dy * output_gradient[None, :, None] def make_ctc_op(): preds_var = T.tensor3() - seq_durations_var = T.ivector() - labels_var = T.imatrix() + durations_var = T.ivector() + blanked_labels_var = T.imatrix() + bool_matrix = T.TensorType("bool", (False, False)) + not_repeated_var = bool_matrix() label_sizes_var = T.ivector() - blank_var = T.iscalar() - _, _, _, _, loss = ctc_perform_graph( - preds_var, seq_durations_var, labels_var, - label_sizes_var, blank_var) + # linout, durations, labels, label_sizes, blank = inputs + # seq_size, batch_size, voca_size = linout.shape + # + # logits, blanked_labels, not_repeated, betas, loss = \ + # ctc_perform_graph(linout, durations, labels, label_sizes, blank) + + loss, logits, betas = ctc_propagate(preds_var, durations_var, blanked_labels_var, + label_sizes_var, not_repeated_var) + + def backprop_op1(inputs, output_gradients): + del inputs + return [ + output_gradients[0], + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type()] + + op1 = 
theano.OpFromGraph( + inputs=[preds_var, durations_var, + blanked_labels_var, label_sizes_var, + not_repeated_var], + outputs=[preds_var, logits, betas, loss], + grad_overrides=backprop_op1, + inline=True, name="ctcLossOp1") + + def backprop_op2(inputs, output_gradients): + preds_var_, logits_, betas_, loss_, \ + durations_, blanked_labels_, label_sizes_, not_repeated_ = inputs + output_gradient, = output_gradients + + g = ctc_backprop(durations_, blanked_labels_, label_sizes_, not_repeated_, + logits_, betas_, loss_, output_gradient) + + return [ + g, + T.zeros_like(logits_), + # theano.gradient.disconnected_type(), + T.zeros_like(betas_), + # theano.gradient.disconnected_type(), + T.zeros_like(loss_), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type(), + theano.gradient.disconnected_type()] - return theano.OpFromGraph( - inputs=[preds_var, seq_durations_var, - labels_var, label_sizes_var, blank_var], - outputs=[loss], - grad_overrides=ctc_grad_graph, - inline=True, name="ctcLossOp") + preds, logits, betas, loss = op1( + preds_var, durations_var, + blanked_labels_var, label_sizes_var, + not_repeated_var) + op2 = theano.OpFromGraph( + inputs=[preds, logits, betas, loss, + durations_var, blanked_labels_var, label_sizes_var, + not_repeated_var], + outputs=[loss + preds.sum() * 0 + logits.sum() * 0 + betas.sum() * 0], + grad_overrides=backprop_op2, + inline=True, name="ctcLossOp2") -CTCLossOp = make_ctc_op() + return op1, op2 # ----------------------------------------------------------------------------- -def ctc_loss(linout, durations, labels, label_sizes, blank=-1): +def ctc_loss(preds, durations, labels, label_sizes, blank=-1): """Compute the Connectionnist Temporal Classification loss [#graves2006]_. .. math:: L = - ln\left( \sum_{\pi \in \mathcal{B}^{-1}(l)} P(\pi | y) @@ -238,7 +280,7 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): Parameters ---------- - linout : Theano shared variable, expression or numpy array + preds : Theano shared variable, expression or numpy array The input values for the softmax function with shape duration x batch_size x nclasses. durations: Theano shared variable, expression or numpy array @@ -266,13 +308,13 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): Proceedings of the 23rd international conference on Machine learning (pp. 369-376). ACM. 
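Both `ctcLossOp1` and `ctcLossOp2` are built with `theano.OpFromGraph` and a `grad_overrides` callback that receives the op inputs and the output gradients and returns one gradient per input. As a minimal illustration of that mechanism, here is a toy square op with a hand-written gradient (hypothetical example, not part of the patch):

```python
import numpy as np
import theano
import theano.tensor as T

x = T.vector("x")

def grad_override(inputs, output_gradients):
    # Receives the op inputs and the gradient w.r.t. each output,
    # and must return one gradient expression per input.
    x_, = inputs
    g_y, = output_gradients
    return [2 * x_ * g_y]

square_op = theano.OpFromGraph(
    inputs=[x], outputs=[x ** 2],
    grad_overrides=grad_override,
    inline=True, name="squareOp")

z = T.vector("z")
y = square_op(z)
g = theano.grad(y.sum(), z)
print(g.eval({z: np.array([1., 2., 3.], dtype=theano.config.floatX)}))  # [2. 4. 6.]
```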
ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf """ - linout = T.as_tensor_variable(linout) + preds = T.as_tensor_variable(preds) durations = T.as_tensor_variable(durations) labels = T.as_tensor_variable(labels) label_sizes = T.as_tensor_variable(label_sizes) blank = T.cast(T.as_tensor_variable(blank), 'int32') - if not(linout.dtype in continuous_dtypes and linout.ndim == 3): + if not(preds.dtype in continuous_dtypes and preds.ndim == 3): raise ValueError("preds must continuous with dimension 3") if not (durations.dtype in discrete_dtypes and durations.ndim == 1): raise ValueError("durations must be a integer vector") @@ -283,8 +325,19 @@ def ctc_loss(linout, durations, labels, label_sizes, blank=-1): if not (blank.dtype in discrete_dtypes and blank.ndim == 0): raise ValueError("blank must be an integer value") - voca_size = T.cast(linout.shape[2], 'int32') + voca_size = T.cast(preds.shape[2], 'int32') labels = labels % voca_size blank = blank % voca_size - return CTCLossOp(linout, durations, labels, label_sizes, blank) + op1, op2 = make_ctc_op() + + blanked_labels = insert_alternating_blanks(labels, blank) + not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) + + preds, logits, betas, loss = op1(preds, durations, + blanked_labels, label_sizes, + not_repeated) + loss = op2(preds, logits, betas, loss, + durations, blanked_labels, label_sizes, not_repeated) + + return loss diff --git a/papers/connectionist_temporal_classification/experiments-tf.ipynb b/papers/connectionist_temporal_classification/experiments-tf.ipynb deleted file mode 100644 index effc1cb..0000000 --- a/papers/connectionist_temporal_classification/experiments-tf.ipynb +++ /dev/null @@ -1,369 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Requirements\n", - "\n", - "Please download the timit dataset at http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the TIMIT.zip file next to this file.\n", - "\n", - "The following python packages are required:\n", - "- lasagne\n", - "- matplotlib\n", - "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", - "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "% autoreload 2\n", - "#%matplotlib inline\n", - "# %env CUDA_VISIBLE_DEVICES=\"1\"\n", - "from IPython.core.interactiveshell import InteractiveShell\n", - "InteractiveShell.ast_node_interactivity = \"all\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pickle as pkl\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "from zipfile import ZipFile\n", - "from sphfile import SPHFile\n", - "from python_speech_features import mfcc\n", - "import tensorflow as tf\n", - "import keras as K\n", - "from keras.models import Model\n", - "from keras.layers import Input, Dense, LSTM, Concatenate, Layer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", - " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", - " with ZipFile(\"TIMIT.zip\", 'r') as f:\n", - " f.extractall(path=\".\")" - ] - }, 
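The `ctc_loss` wrapper now computes `blanked_labels` and `not_repeated` once and hands them to both ops. A quick NumPy illustration of the layout these two arrays encode, mirroring `insert_alternating_blanks` on a hypothetical sequence with a repeated label (blank index 4):

```python
import numpy as np

labels = np.array([[0, 1, 1]], dtype=np.int32)  # hypothetical sequence "a b b"
blank = 4

# Blanked sequence l': a blank before, between and after every label,
# mirroring what insert_alternating_blanks builds.
blanked = np.full((1, 2 * labels.shape[1] + 1), blank, dtype=np.int32)
blanked[:, 1::2] = labels
print(blanked)  # [[4 0 4 1 4 1 4]]

# not_repeated gates the "skip one position" transition of the CTC recursion:
# it is True only where l'_s differs from l'_{s-2}, so the repeated label (the
# second "1") cannot be reached by skipping the blank that separates it.
not_repeated = blanked[:, 2:] != blanked[:, :-2]
print(not_repeated.astype(int))  # [[0 1 0 0 0]]
```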
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "files = []\n", - "train_subset = []\n", - "\n", - "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", - " for f in filenames:\n", - " if f.endswith(\"WAV\"):\n", - " recording = SPHFile(dirpath + \"/\" + f).content\n", - " files.append(dirpath + \"/\" + f[:-4])\n", - " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", - "\n", - "files = np.array(files)\n", - "train_subset = np.array(train_subset, dtype=np.bool)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", - " features = []\n", - " labels = []\n", - "\n", - " for f in files:\n", - " recording = SPHFile(f + \".WAV\")\n", - " signal = recording.content\n", - " samplerate = recording.format['sample_rate']\n", - "\n", - " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", - " numcep=13, nfilt=26, appendEnergy=True)\n", - " derivatives = np.concatenate([\n", - " mfccfeats[1, None] - mfccfeats[0, None],\n", - " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", - " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", - "\n", - " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", - "\n", - " with open(f + \".PHN\") as phonem_file:\n", - " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", - "\n", - " m = np.mean(np.concatenate(features, axis=0))\n", - " s = np.std(np.concatenate(features, axis=0))\n", - "\n", - " for i in range(len(features)):\n", - " features[i] = (features[i] - m) / s\n", - "\n", - " vocabulary = set()\n", - " for lseq in labels:\n", - " vocabulary |= set(lseq)\n", - "\n", - " vocabulary = list(vocabulary)\n", - " vocabulary[-1], vocabulary[vocabulary.index('h#')] = vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", - "\n", - " for i in range(len(labels)):\n", - " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", - "\n", - " blank = 60\n", - " \n", - " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", - " pkl.dump((features, labels, vocabulary, blank), f)\n", - "\n", - "\n", - "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", - " features, labels, vocabulary, blank = pkl.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=(20, 9))\n", - "plt.imshow(features[1].transpose(), clim=(-4, 4))\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def zero_loss(y_true, y_pred):\n", - " return K.backend.zeros_like(y_pred)\n", - "\n", - "def dense_to_sparse(x):\n", - " idx = tf.where(tf.greater_equal(x, 0))\n", - " return tf.SparseTensor(idx, tf.gather_nd(x, idx), tf.shape(x, out_type=tf.int64))\n", - "\n", - "class CTCLossLayer(Layer):\n", - " def __init__(self, **kwargs):\n", - " super(CTCLossLayer, self).__init__(**kwargs)\n", - "\n", - " def call(self, x, mask=None):\n", - " linout = x[0]\n", - " targets = x[1]\n", - " durations = x[2]\n", - " loss = tf.nn.ctc_loss(\n", - " dense_to_sparse(targets), linout,\n", - " sequence_length=durations[:, 0],\n", - " time_major=False)\n", - " 
self.add_loss(tf.reduce_sum(loss), x)\n", - " return loss\n", - "\n", - " def compute_output_shape(self, input_shape):\n", - " return input_shape[0][0]\n", - "\n", - "a = Input(shape=(None, features[0].shape[1]), name=\"features\")\n", - "targets = Input(shape=[None], dtype='int32', name=\"targets\")\n", - "durations = Input(shape=[1], dtype='int32', name=\"durations\")\n", - "b1 = LSTM(100, return_sequences=True)(a)\n", - "b2 = LSTM(100, return_sequences=True, go_backwards=True)(a)\n", - "c = Concatenate(axis=2)([b1, b2])\n", - "d = Dense(len(vocabulary), activation=None)(c)\n", - "l = CTCLossLayer()([d, targets, durations])\n", - "model = Model(inputs=[a, targets, durations], outputs=[d, l])\n", - "sgd = K.optimizers.SGD(lr=1e-4, momentum=0.9, nesterov=True)\n", - "\n", - "model.summary()\n", - "\n", - "model.compile(\n", - " target_tensors=[targets, targets], \n", - " loss=[zero_loss, zero_loss], \n", - " optimizer=sgd)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# lasagne.layers.set_all_param_values(l_linout, params_backup[0])\n", - "\n", - "params_backup = []\n", - "running_loss = None\n", - "\n", - "for i in np.random.permutation(len(labels))[:300]:\n", - " f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", - "\n", - " batch_loss = model.train_on_batch(\n", - " x=[f, l, np.array([f.shape[1]], np.int32)],\n", - " y=[l, l])[0]\n", - "\n", - " if batch_loss > 10000:\n", - " print(\"\\nskipped i = {}\".format(i))\n", - " continue\n", - " else:\n", - " running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss\n", - " print(\"\\rloss = {:>5.0f}\".format(running_loss), end='', flush=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluate model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def argmax_decode(preds):\n", - " decoded = [preds[0]]\n", - " for v in preds:\n", - " if v != decoded[-1]:\n", - " decoded.append(v)\n", - " \n", - " return np.array(decoded, dtype=np.int32)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "features[i].shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "i = 0\n", - "inputs = [features[i][None, :, :], labels[i][None, 1:-1], np.array([features[i].shape[0]], np.int32)]\n", - "logits = model.predict(inputs)[0][0]\n", - "# preds -= np.max(preds, axis=1, keepdims=True)\n", - "# preds = np.exp(preds)\n", - "# preds /= np.sum(preds, axis=1, keepdims=True)\n", - "lbl_preds = argmax_decode(np.argmax(preds, axis=-1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "o = np.argsort(np.mean(logits[:, :60], axis=0))\n", - "plt.figure(figsize=(10, 10))\n", - "for c in o:\n", - " plt.plot(np.arange(len(logits)), logits[:, c]);\n", - "\n", - "plt.plot(np.arange(len(logits)), logits[:, -1], linestyle=\":\");\n", - "plt.legend([vocabulary[o_] for o_ in o] + [vocabulary[-1]], bbox_to_anchor=(.6, 0, 1, 1), ncol=5)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "preds[:, -1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"model.layers[4].get_weights()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "weights" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/papers/connectionist_temporal_classification/experiments.ipynb b/papers/connectionist_temporal_classification/experiments.ipynb deleted file mode 100644 index 1e8b626..0000000 --- a/papers/connectionist_temporal_classification/experiments.ipynb +++ /dev/null @@ -1,522 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Requirements\n", - "\n", - "Please download the timit dataset at http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the TIMIT.zip file next to this file.\n", - "\n", - "The following python packages are required:\n", - "- lasagne\n", - "- matplotlib\n", - "- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)\n", - "- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%matplotlib inline\n", - "\n", - "import os\n", - "os.environ['THEANO_FLAGS'] = \"device=cpu\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle as pkl\n", - "import numpy as np\n", - "from zipfile import ZipFile\n", - "from sphfile import SPHFile\n", - "from python_speech_features import mfcc\n", - "import lasagne\n", - "from lasagne.layers import InputLayer, GaussianNoiseLayer, LSTMLayer, DenseLayer, ConcatLayer, ReshapeLayer\n", - "import theano\n", - "import theano.tensor as T\n", - "from theano.compile.nanguardmode import NanGuardMode\n", - "import matplotlib.pyplot as plt\n", - "from ctc import ctc_loss, log_softmax, insert_alternating_blanks, ctc_backward" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.isdir(\"data/lisa/data/timit/raw/TIMIT\"):\n", - " assert os.path.exists(\"TIMIT.zip\"), \"Missing data archive\"\n", - " with ZipFile(\"TIMIT.zip\", 'r') as f:\n", - " f.extractall(path=\".\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "files = []\n", - "train_subset = []\n", - "\n", - "for dirpath, _, filenames in os.walk(\"data/lisa/data/timit/raw/TIMIT\"):\n", - " for f in filenames:\n", - " if f.endswith(\"WAV\"):\n", - " recording = SPHFile(dirpath + \"/\" + f).content\n", - " files.append(dirpath + \"/\" + f[:-4])\n", - " train_subset.append(dirpath[31:36] == \"TRAIN\")\n", - "\n", - "files = np.array(files)\n", - "train_subset = np.array(train_subset, dtype=np.bool)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preprocessing" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists(\"preprocessed_dataset.pkl\"):\n", - " features = []\n", - " labels = []\n", - "\n", - " for f in files:\n", - " recording = SPHFile(f + \".WAV\")\n", - " signal = recording.content\n", - " samplerate = recording.format['sample_rate']\n", - "\n", - " mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005, \n", - " numcep=13, nfilt=26, appendEnergy=True)\n", - " derivatives = np.concatenate([\n", - " mfccfeats[1, None] - mfccfeats[0, None],\n", - " .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],\n", - " mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)\n", - "\n", - " features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))\n", - "\n", - " with open(f + \".PHN\") as phonem_file:\n", - " labels.append([l.split()[2] for l in phonem_file.readlines()])\n", - "\n", - " m = np.mean(np.concatenate(features, axis=0))\n", - " s = np.std(np.concatenate(features, axis=0))\n", - "\n", - " for i in range(len(features)):\n", - " features[i] = (features[i] - m) / s\n", - "\n", - " vocabulary = set()\n", - " for lseq in labels:\n", - " vocabulary |= set(lseq)\n", - "\n", - " vocabulary = list(vocabulary)\n", - " vocabulary[-1], vocabulary[vocabulary.index('h#')] = vocabulary[vocabulary.index('h#')], vocabulary[-1]\n", - "\n", - " for i in range(len(labels)):\n", - " labels[i] = np.array([vocabulary.index(l) for l in labels[i]], dtype=np.int32)\n", - "\n", - " blank = len(labels) - 1\n", - " \n", - " with open(\"preprocessed_dataset.pkl\", 'wb') as f:\n", - " pkl.dump((features, labels, vocabulary, blank), f, -1)\n", - "\n", - "\n", - "with open(\"preprocessed_dataset.pkl\", 'rb') as f:\n", - " features, labels, vocabulary, blank = pkl.load(f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n", - "\n", - "class SmallGaussianNoiseLayer(lasagne.layers.Layer):\n", - " \"\"\"Gaussian noise layer (clipped for safety)\"\"\"\n", - " def __init__(self, incoming, sigma=0.1, **kwargs):\n", - " super(SmallGaussianNoiseLayer, self).__init__(incoming, **kwargs)\n", - " self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))\n", - " self.sigma = sigma\n", - "\n", - " def get_output_for(self, input, deterministic=False, **kwargs):\n", - " if deterministic or self.sigma == 0:\n", - " return input\n", - " else:\n", - " noise = self._srng.normal(input.shape, avg=0.0, std=self.sigma)\n", - " return input + T.clip(noise, -3 * self.sigma, 3 * self.sigma)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "l_in = InputLayer(shape=(None, 1557, 26))\n", - "l_duration = InputLayer(input_var=T.ivector(name=\"duration\"), shape=(1,))\n", - "l_mask = lasagne.layers.ExpressionLayer(l_duration, lambda d: T.arange(1557)[None, :] < d[:, None])\n", - "l_noise = SmallGaussianNoiseLayer(l_in, sigma=0.6)\n", - "l_fwlstm = LSTMLayer(\n", - " l_noise, 100, mask_input=l_mask)\n", - "l_bwlstm = LSTMLayer(\n", - " l_noise, 100, mask_input=l_mask,\n", - " backwards=True)\n", - "l_cat = ConcatLayer([l_fwlstm, l_bwlstm], axis=2)\n", - "l_linout = DenseLayer(l_cat, len(vocabulary), nonlinearity=None, num_leading_axes=2)\n", - "\n", - "input_var = l_in.input_var\n", - "duration_var = 
l_duration.input_var\n", - "labels_var = T.imatrix()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_output = lasagne.layers.get_output(l_linout, deterministic=False).dimshuffle(1, 0, 2)\n", - "\n", - "loss = ctc_loss(\n", - " linout=train_output,\n", - " durations=duration_var,\n", - " labels=labels_var,\n", - " label_sizes=T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", - " blank=blank\n", - ")\n", - "\n", - "params = lasagne.layers.get_all_params(l_linout, trainable=True)\n", - "grads = theano.grad(loss.sum(), params)\n", - "updates = lasagne.updates.nesterov_momentum(grads, params, learning_rate=1e-4)\n", - "\n", - "update_fn = theano.function(\n", - " [input_var, duration_var, labels_var], \n", - " loss, \n", - " updates=updates,\n", - " # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "params_history = []\n", - "loss_history = []\n", - "running_loss = None\n", - "failed = []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for e in range(10):\n", - " for i in np.random.permutation(len(labels)):\n", - " f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", - " d = np.array([f.shape[1]], dtype=np.int32)\n", - " f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)\n", - "\n", - " batch_loss = float(update_fn(f, d, l))\n", - "\n", - " if batch_loss > 10000 or np.isnan(batch_loss):\n", - " print(\"\\nskipped i = {} because loss was {}\".format(i, batch_loss))\n", - " raise RuntimeError()\n", - " else:\n", - " running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss\n", - " print(\"\\r{:4d} loss = {:>5.0f} -> {:>5.0f}\".format(i, batch_loss, running_loss), end='', flush=True)\n", - "\n", - " if i % 25:\n", - " params_history.append(lasagne.layers.get_all_param_values(l_linout))\n", - " loss_history.append(running_loss)\n", - "\n", - " # batch_loss = loss_fn(f, l)\n", - " # if batch_loss > 5000:\n", - " # print('loss = {:>5.0f} > 5000 at element {:d}'.format(batch_loss, i))\n", - " # raise\n", - " # else:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.plot(loss_history)\n", - "plt.yscale('log')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.argmin(loss_history[::25])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "lasagne.layers.set_all_param_values(l_linout, params_history[6000//25])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluate model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "i = 0\n", - "f, l = features[i][None, :, :], labels[i][None, 1:-1]\n", - "f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "blanked_labels = insert_alternating_blanks(labels_var, blank)\n", - "not_repeated = T.neq(blanked_labels[:, 2:], 
blanked_labels[:, :-2])\n", - "betas = ctc_backward(\n", - " log_softmax(T.unbroadcast(train_output.dimshuffle(1, 0, 2), 1)),\n", - " T.cast(T.reshape(train_output.shape[0], (1,)), 'int32'), \n", - " blanked_labels,\n", - " T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", - " not_repeated)\n", - "test_output = lasagne.layers.get_output(l_linout, deterministic=True)\n", - "\n", - "loss_fn = theano.function([input_var, duration_var, labels_var], loss)\n", - "beta_fn = theano.function([input_var, duration_var, labels_var], betas)\n", - "grads_fn = theano.function([input_var, duration_var, labels_var], grads)\n", - "predict_fn = theano.function([input_var, duration_var], T.exp(log_softmax(test_output[:, 0, :])))\n", - "logits_fn = theano.function([input_var, duration_var], test_output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "logits = logits_fn(f, d)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "o = np.argsort(np.mean(logits[:, :60], axis=0))\n", - "plt.figure(figsize=(10, 10))\n", - "for c in o:\n", - " plt.plot(np.arange(len(logits)), logits[:, c])\n", - "\n", - "plt.plot(np.arange(len(logits)), logits[:, -1], linestyle=\":\")\n", - "plt.legend([vocabulary[o_] for o_ in o] + [vocabulary[-1]], bbox_to_anchor=(.6, 0, 1, 1), ncol=5)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# beta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "features[i].shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "betas = ctc_backward(\n", - " log_softmax(train_output),\n", - " T.cast(T.reshape(train_output.shape[0], (1,)), 'int32'), \n", - " blanked_labels,\n", - " T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),\n", - " not_repeated)\n", - "beta_fn = theano.function([input_var, duration_var, labels_var], betas)\n", - "\n", - "b = beta_fn(f, d, l)\n", - "\n", - "plt.figure(figsize=(10, 6))\n", - "plt.imshow(b[0:, 0, 0:], clim=(-5000, max(0, np.max(b))))\n", - "plt.gca().set_aspect(0.1)\n", - "plt.colorbar()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "p = lasagne.layers.get_all_param_values(l_linout, trainable=True)\n", - "for p_ in p:\n", - " print((p_.min(), p_.max()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "g = theano.grad(loss.sum(), wrt=train_output).eval({\n", - " input_var: f,\n", - " duration_var: d,\n", - " labels_var: l\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.subplot(2, 1, 1)\n", - "plt.bar(np.arange(len(vocabulary)), g[:, 0, np.concatenate((o, [60]))].mean(axis=0))\n", - "plt.subplot(2, 1, 2)\n", - "plt.plot(g[:, 0, :].mean(axis=1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def argmax_decode(preds):\n", - " decoded = [preds[0]]\n", - " for v in preds:\n", - " if v != decoded[-1]:\n", - " decoded.append(v)\n", - " \n", - " return np.array(decoded, dtype=np.int32)\n", - "\n", - "lbl_preds = 
argmax_decode(np.argmax(logits, axis=-1))\n", - "lbl_tgt = labels[i]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/papers/connectionist_temporal_classification/test_ctc.py b/papers/connectionist_temporal_classification/test_ctc.py new file mode 100644 index 0000000..204d5a5 --- /dev/null +++ b/papers/connectionist_temporal_classification/test_ctc.py @@ -0,0 +1,181 @@ +import numpy as np +import theano +import theano.tensor as T +from theano.tests import unittest_tools + +from papers.connectionist_temporal_classification.ctc import ctc_loss, isneginf + + +# def test_forward_backward(): +# batch_size = 6 +# label_size = 7 +# voca_size = 5 +# seq_size = 10 +# +# label_lengths = np.random.randint(0, label_size, +# size=(batch_size,), dtype=np.int32) +# label_lengths[0] = label_size # extremum case +# label_lengths[1] = 0 # extremum case +# labels = np.array( +# [np.random.randint(0, voca_size - 1, size=label_size, dtype=np.int32) +# for _ in range(batch_size)]) +# for i in range(batch_size): +# labels[i, label_lengths[i]:] = -1 +# +# seq_durations = np.array([ +# np.random.randint(max(1, label_lengths[i]), seq_size) +# for i in range(batch_size)], dtype=np.int32) +# +# linear_out = np.random.randn(seq_size, batch_size, voca_size) \ +# .astype(np.float32) +# +# blank_class = -1 +# blank_class = np.mod(blank_class, voca_size) +# +# labels = np.mod(labels, voca_size) +# +# log_odds = log_softmax(linear_out) +# blanked_labels = insert_alternating_blanks(T.mod(labels, voca_size), +# blank_class) +# not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) +# +# alphas = ctc_forward(log_odds, seq_durations, +# blanked_labels, label_lengths, not_repeated) +# betas = ctc_backward(log_odds, seq_durations, +# blanked_labels, label_lengths, not_repeated) +# +# preds = log_softmax(linear_out) +# +# y_blanks = preds[:, T.arange(batch_size)[:, None], blanked_labels] +# p_l = T.sum(T.exp(alphas + betas - y_blanks), axis=2) +# +# alphas = alphas.eval() +# betas = betas.eval() +# preds = preds.eval() +# +# for i in range(batch_size): +# assert np.allclose(alphas[0, i, 0], preds[0, i, -1]) +# if label_lengths[i] > 0: +# assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]]) +# else: +# assert isneginf(alphas[0, i, 1]) +# assert np.all(isneginf(alphas[0, i, 2:])) +# +# for i in range(batch_size): +# t = seq_durations[i] - 1 +# l = label_lengths[i] +# assert np.allclose(betas[t, i, 2 * l], preds[t, i, -1]) +# if l > 0: +# assert np.allclose(betas[t, i, 2 * l - 1], +# preds[t, i, labels[i, l - 1]]) +# assert np.all(isneginf(betas[t, i, :max(l - 2, 0)])) +# else: +# assert np.all(isneginf(betas[t, i, 1:])) +# +# p_l = p_l.eval() +# +# for i in range(batch_size): +# assert (np.allclose(p_l[:seq_durations[i], i], p_l[0, i])) +# a, b = max(0, 2 * label_lengths[i] - 1), 2 * label_lengths[i] + 
1 +# p_li = np.exp(alphas[seq_durations[i] - 1, i, a:b]).sum() +# assert np.allclose(p_li, p_l[0, i]) +# p_li = np.exp(betas[0, i, :2]).sum() +# assert np.allclose(p_li, p_l[0, i]) + + +def test_simple_precomputed(): + # Test obtained from Torch tutorial at: + # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md + + linear_out = np.asarray([ + [[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]], + [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]], + [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]] + ], dtype=np.float32) + + seq_sizes = np.asarray([1, 3, 3], dtype=np.int32) + + labels = np.asarray([[1, 0], [3, 3], [2, 3]], dtype=np.int32) + + label_sizes = np.asarray([1, 2, 2], dtype=np.int32) + + expected_losses = np.asarray([1.609437943, 7.355742931, 4.938849926], + dtype=np.float32) + + blank = 0 + + expected_grad = np.asarray([ + [[0.2, -0.8, 0.2, 0.2, 0.2], + [ 0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]], + [[0, 0, 0, 0, 0], + [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627], + [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]], + [[0, 0, 0, 0, 0], + [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], + [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]] + ], dtype=np.float32) + + linear_out_var = T.as_tensor_variable(linear_out) + losses = ctc_loss( + linear_out_var, seq_sizes, labels, label_sizes, blank) + + assert np.allclose(losses.eval(), expected_losses, atol=1) + + grad = theano.grad(losses.sum(), wrt=linear_out_var) + + assert np.allclose(grad.eval(), expected_grad, rtol=.001, atol=1) + + +def test_random(): + batch_size = 16 + label_size = 5 + voca_size = 4 + seq_size = 20 + + label_sizes = np.random.randint( + 0, label_size, size=(batch_size,), dtype=np.int32) + label_sizes[0] = label_size + label_sizes[1] = 0 + label_sizes[2] = 5 + label_sizes[3] = 5 + + labels = np.random.randint( + 0, voca_size - 1, + size=(batch_size, label_size), dtype=np.int32) + labels[3] = 0 + + seq_sizes = np.array([ + np.random.randint(max(1, label_sizes[i]), seq_size) + for i in range(batch_size)], dtype=np.int32) + seq_sizes[2] = 4 + + linear_out = np.random.randn( + seq_size, batch_size, voca_size).astype(np.float32) + + # check edge cases + # TODO + + # check the gradient can be computed at all + linear_out_var = T.tensor3() + preds = T.nnet.softmax( + linear_out_var.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + + g = theano.grad(ctc_loss(preds, seq_sizes, + labels, label_sizes).sum(), + wrt=linear_out_var).eval( + {linear_out_var: linear_out.astype(np.float32)}) + assert not np.any(np.isnan(g)) + + # check correctness against finite difference approximation + def f(linear_out_): + preds_ = T.nnet.softmax( + linear_out_.reshape((-1, voca_size)) + ).reshape((seq_size, batch_size, voca_size)) + loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) + # prevent finite differences from failing + loss = T.switch(isneginf(-loss), 0, loss) + return loss + + unittest_tools.verify_grad(f, [linear_out], abs_tol=0.05, rel_tol=0.05) diff --git a/papers/connectionist_temporal_classification/test_ctc2.py b/papers/connectionist_temporal_classification/test_ctc2.py deleted file mode 100644 index 10d3425..0000000 --- a/papers/connectionist_temporal_classification/test_ctc2.py +++ /dev/null @@ -1,186 +0,0 @@ -import unittest 
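As a quick cross-check of the precomputed values in test_simple_precomputed above: batch element 0 has a single frame of all-zero logits over 5 classes and a single target label, so the only valid CTC path emits that label at t=0 with probability softmax(0)[1] = 1/5, and the expected loss is -log(1/5). A small sketch of that arithmetic:

import numpy as np

logits = np.zeros(5)
probs = np.exp(logits) / np.exp(logits).sum()      # uniform, 0.2 per class
assert np.isclose(-np.log(probs[1]), 1.609437943)  # matches expected_losses[0]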
-import numpy as np -import theano -import theano.tensor as T -from theano.tests import unittest_tools - -from papers.connectionist_temporal_classification.ctc import \ - ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks, \ - isneginf, log_softmax - - -class TestCTC(unittest.TestCase): - def setUp(self): - unittest_tools.seed_rng() - - def test_forward_backward(self): - batch_size = 6 - label_size = 7 - voca_size = 5 - seq_size = 10 - - label_lengths = np.random.randint(0, label_size, - size=(batch_size,), dtype=np.int32) - label_lengths[0] = label_size # extremum case - label_lengths[1] = 0 # extremum case - labels = np.array( - [np.random.randint(0, voca_size - 1, size=label_size, dtype=np.int32) - for _ in range(batch_size)]) - for i in range(batch_size): - labels[i, label_lengths[i]:] = -1 - - seq_durations = np.array([ - np.random.randint(max(1, label_lengths[i]), seq_size) - for i in range(batch_size)], dtype=np.int32) - - linear_out = np.random.randn(seq_size, batch_size, voca_size) \ - .astype(np.float32) - - blank_class = -1 - blank_class = np.mod(blank_class, voca_size) - - labels = np.mod(labels, voca_size) - - log_odds = log_softmax(linear_out) - blanked_labels = insert_alternating_blanks(T.mod(labels, voca_size), - blank_class) - not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2]) - - alphas = ctc_forward(log_odds, seq_durations, - blanked_labels, label_lengths, not_repeated) - betas = ctc_backward(log_odds, seq_durations, - blanked_labels, label_lengths, not_repeated) - - preds = log_softmax(linear_out) - - y_blanks = preds[:, T.arange(batch_size)[:, None], blanked_labels] - p_l = T.sum(T.exp(alphas + betas - y_blanks), axis=2) - - alphas = alphas.eval() - betas = betas.eval() - preds = preds.eval() - - for i in range(batch_size): - assert np.allclose(alphas[0, i, 0], preds[0, i, -1]) - if label_lengths[i] > 0: - assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]]) - else: - assert isneginf(alphas[0, i, 1]) - assert np.all(isneginf(alphas[0, i, 2:])) - - for i in range(batch_size): - t = seq_durations[i] - 1 - l = label_lengths[i] - assert np.allclose(betas[t, i, 2 * l], preds[t, i, -1]) - if l > 0: - assert np.allclose(betas[t, i, 2 * l - 1], - preds[t, i, labels[i, l - 1]]) - assert np.all(isneginf(betas[t, i, :max(l - 2, 0)])) - else: - assert np.all(isneginf(betas[t, i, 1:])) - - p_l = p_l.eval() - - for i in range(batch_size): - assert (np.allclose(p_l[:seq_durations[i], i], p_l[0, i])) - a, b = max(0, 2 * label_lengths[i] - 1), 2 * label_lengths[i] + 1 - p_li = np.exp(alphas[seq_durations[i] - 1, i, a:b]).sum() - assert np.allclose(p_li, p_l[0, i]) - p_li = np.exp(betas[0, i, :2]).sum() - assert np.allclose(p_li, p_l[0, i]) - - def test_simple_precomputed(self): - # Test obtained from Torch tutorial at: - # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md - - linear_out = np.asarray([ - [[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]], - [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]], - [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]] - ], dtype=np.float32) - - seq_sizes = np.asarray([1, 3, 3], dtype=np.int32) - - labels = np.asarray([[1, 0], [3, 3], [2, 3]], dtype=np.int32) - - label_sizes = np.asarray([1, 2, 2], dtype=np.int32) - - expected_losses = np.asarray([1.609437943, 7.355742931, 4.938849926], - dtype=np.float32) - - blank = 0 - - expected_grad = np.asarray([ - [[0.2, -0.8, 0.2, 0.2, 0.2], - [ 0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 
0.636408627], - [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]], - [[0, 0, 0, 0, 0], - [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627], - [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]], - [[0, 0, 0, 0, 0], - [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627], - [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]] - ], dtype=np.float32) - - linear_out_var = T.as_tensor_variable(linear_out) - losses = ctc_loss( - linear_out_var, seq_sizes, labels, label_sizes, blank) - - assert np.allclose(losses.eval(), expected_losses, atol=1) - - grad = theano.grad(losses.sum(), wrt=linear_out_var) - - assert np.allclose(grad.eval(), expected_grad, rtol=.001, atol=1) - - def test_random(self): - batch_size = 16 - label_size = 5 - voca_size = 4 - seq_size = 20 - - label_sizes = np.random.randint( - 0, label_size, size=(batch_size,), dtype=np.int32) - label_sizes[0] = label_size - label_sizes[1] = 0 - label_sizes[2] = 5 - label_sizes[3] = 5 - - labels = np.random.randint( - 0, voca_size - 1, - size=(batch_size, label_size), dtype=np.int32) - labels[3] = 0 - - seq_sizes = np.array([ - np.random.randint(max(1, label_sizes[i]), seq_size) - for i in range(batch_size)], dtype=np.int32) - seq_sizes[2] = 4 - - linear_out = np.random.randn( - seq_size, batch_size, voca_size).astype(np.float32) - - # check edge cases - # TODO - - # check the gradient can be computed at all - linear_out_var = T.tensor3() - preds = T.nnet.softmax( - linear_out_var.reshape((-1, voca_size)) - ).reshape((seq_size, batch_size, voca_size)) - - g = theano.grad(ctc_loss(preds, seq_sizes, - labels, label_sizes).sum(), - wrt=linear_out_var).eval( - {linear_out_var: linear_out.astype(np.float32)}) - assert not np.any(np.isnan(g)) - - # check correctness against finite difference approximation - def f(linear_out_): - preds_ = T.nnet.softmax( - linear_out_.reshape((-1, voca_size)) - ).reshape((seq_size, batch_size, voca_size)) - loss = ctc_loss(preds_, seq_sizes, labels, label_sizes) - # prevent finite differences from failing - loss = T.switch(isneginf(-loss), 0, loss) - return loss - - unittest_tools.verify_grad(f, [linear_out], abs_tol=0.05, rel_tol=0.05) diff --git a/papers/connectionist_temporal_classification/tests.ipynb b/papers/connectionist_temporal_classification/tests.ipynb deleted file mode 100644 index abd4061..0000000 --- a/papers/connectionist_temporal_classification/tests.ipynb +++ /dev/null @@ -1,186 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%matplotlib inline\n", - "\n", - "import sys\n", - "import os\n", - "from IPython.core.interactiveshell import InteractiveShell\n", - "\n", - "sys.path.insert(-1, os.getcwd())\n", - "InteractiveShell.ast_node_interactivity = \"all\"\n", - "os.environ['THEANO_FLAGS'] = \"device=cpu\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import tensorflow as tf\n", - "import theano\n", - "import theano.tensor as T\n", - "\n", - "from ctc import ctc_loss as my_ctc_loss" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch_size = 16\n", - "max_labsize = 20\n", - "voca_size = 20 # excluding blank\n", - "max_seqsize = 100\n", - "blank = -1\n", - "\n", - "labsize = np.random.randint(\n", 
- " 1, max_labsize + 1, size=(batch_size,), dtype=np.int32)\n", - "labsize[0] = max_labsize\n", - "labsize[1] = 1\n", - "labsize[2] = max_labsize\n", - "labsize[3] = max_labsize\n", - "\n", - "labels = np.random.randint(\n", - " 0, voca_size,\n", - " size=(batch_size, max_labsize), dtype=np.int32)\n", - "for b in range(batch_size):\n", - " labels[b, labsize[b]:] = blank\n", - "\n", - "seqsize = np.array([\n", - " np.random.randint(labsize[i] + 1, max_seqsize + 1)\n", - " for i in range(batch_size)], dtype=np.int32)\n", - "seqsize[0] = max_seqsize\n", - "\n", - "linout = np.random.randn(\n", - " max_seqsize, batch_size, voca_size + 1).astype(np.float32)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "th_linout_var = T.tensor3()\n", - "th_seqsize_var = T.ivector()\n", - "th_labels_var = T.imatrix()\n", - "th_labsize_var = T.ivector()\n", - "th_loss = my_ctc_loss(th_linout_var, th_seqsize_var, th_labels_var, th_labsize_var)\n", - "\n", - "def dense_to_sparse(x):\n", - " idx = tf.where(tf.greater_equal(x, 0))\n", - " return tf.SparseTensor(idx, tf.gather_nd(x, idx), tf.shape(x, out_type=tf.int64))\n", - "\n", - "tf_linout_var = tf.placeholder(tf.float32, shape=[max_seqsize, batch_size, voca_size + 1])\n", - "tf_seqsize_var = tf.placeholder(tf.int32, shape=[batch_size])\n", - "tf_labels_var = tf.placeholder(tf.int32, shape=[batch_size, max_labsize])\n", - "\n", - "tf_loss = tf.nn.ctc_loss(\n", - " dense_to_sparse(tf_labels_var), tf_linout_var,\n", - " sequence_length=tf_seqsize_var,\n", - " time_major=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with tf.Session() as sess:\n", - " tf_result = sess.run(\n", - " tf_loss, {\n", - " tf_linout_var: linout,\n", - " tf_seqsize_var: seqsize,\n", - " tf_labels_var: labels\n", - " })\n", - " \n", - " th_results = th_loss.eval({\n", - " th_linout_var: linout,\n", - " th_seqsize_var: seqsize,\n", - " th_labels_var: labels,\n", - " th_labsize_var: labsize\n", - " })\n", - " \n", - " print(np.abs(tf_result - th_results) / tf_result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tf_g = tf.gradients(xs=tf_linout_var, ys=tf.reduce_sum(tf_loss))[0]\n", - "\n", - "with tf.Session() as sess:\n", - " tf_grad = sess.run(\n", - " tf_g, {\n", - " tf_linout_var: linout,\n", - " tf_seqsize_var: seqsize,\n", - " tf_labels_var: labels\n", - " })\n", - " \n", - " th_grad = theano.grad(th_loss.sum(), wrt=th_linout_var).eval({\n", - " th_linout_var: linout,\n", - " th_seqsize_var: seqsize,\n", - " th_labels_var: labels,\n", - " th_labsize_var: labsize\n", - " })\n", - " \n", - " print(np.abs(tf_grad - th_grad) / (tf_grad + .000001))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "th_grad[:, 0, :]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tf_grad[:, 0, :]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}