From 1951e7036bd166d7ec6677a3f3ab4a269abfcbe7 Mon Sep 17 00:00:00 2001
From: Dominik Welland
Date: Fri, 22 Apr 2016 15:04:08 +0200
Subject: [PATCH] remove lstm gradient descent for the moment

---
 micropsi_core/nodenet/native_modules.py | 698 ------------------------
 1 file changed, 698 deletions(-)

diff --git a/micropsi_core/nodenet/native_modules.py b/micropsi_core/nodenet/native_modules.py
index 47ded65f..b8cc049a 100644
--- a/micropsi_core/nodenet/native_modules.py
+++ b/micropsi_core/nodenet/native_modules.py
@@ -21,67 +21,6 @@
 if numpy_installed:
     # only register these native modules if we
     # have theano and numpy installed.
-    nodetypes["GradientDescentLSTM"] = {
-        "name": "GradientDescentLSTM",
-        "engine": "theano_engine",
-        "slottypes": ["trigger", "debug"],
-        "gatetypes": ["e"],
-        "nodefunction_name": "gradient_descent_lstm",
-        "symbol": "↺",
-        "category": "nn_learning",
-        "path": os.path.abspath(__file__),
-        "parameters": [
-            "adadelta_rho",
-            "adadelta_epsilon",
-            "sequence_length",
-            "links_io",
-            "links_porpor",
-            "links_porgin",
-            "links_porgou",
-            "links_porgfg",
-            "bias_gin",
-            "bias_gou",
-            "bias_gfg",
-            "group_t_nodes",
-            "group_t_gates",
-            "group_i_nodes",
-            "group_i_gates",
-            "group_c_nodes",
-            "group_o_nodes",
-            "group_o_gates"
-        ],
-        "parameter_values": {
-            "links_io": ["true", "false"],
-            "links_porpor": ["true", "false"],
-            "links_porgin": ["true", "false"],
-            "links_porgou": ["true", "false"],
-            "links_porgfg": ["true", "false"],
-            "bias_gin": ["true", "false"],
-            "bias_gou": ["true", "false"],
-            "bias_gfg": ["true", "false"]
-        },
-        "parameter_defaults": {
-            "adadelta_rho": "0.95",
-            "adadelta_epsilon": "0.000001",
-            "sequence_length": "5",
-            "links_io": "true",
-            "links_porpor": "true",
-            "links_porgin": "true",
-            "links_porgou": "true",
-            "links_porgfg": "true",
-            "bias_gin": "true",
-            "bias_gou": "true",
-            "bias_gfg": "true",
-            "group_t_nodes": "target",
-            "group_t_gates": "gen",
-            "group_i_nodes": "input",
-            "group_i_gates": "gen",
-            "group_c_nodes": "lstm",
-            "group_o_nodes": "output",
-            "group_o_gates": "gen"
-        }
-    }
-
     nodetypes["GradientDescent"] = {
         "name": "GradientDescent",
         "engine": "theano_engine",
@@ -121,643 +60,6 @@
     }
 
 
-def gradient_descent_lstm(netapi, node=None, **params):
-    """
-    Gradient Descent for LSTMs
-
-    The following assumes a three-layer architecture, with hidden LSTM nodes.
-    There is always a single LSTM cell per block (no multi-block cells are implemented).
-
-    The following sets of weights are defined:
-    input -> output
-    input -> cell
-    input -> input gate
-    input -> output gate
-    input -> forget gate
-    cell -> output
-    cell -> input gate
-    cell -> output gate
-    cell -> forget gate
-
-    The cell's constant error carousel link is explicitly modelled (as a gen loop).
-    Note that input, output and forget gate links aren't updated right now.
-
-    Variable naming and implementation follow:
-    Gers et al. 1999, Learning to Forget - Continual Prediction with LSTM
-
-    Other helpful papers:
-    Hochreiter et al. 1997, Long Short-Term Memory (introduces naming convention and most of the math)
-    Graves et al. 2005, Framewise Phoneme Classification with Bidirectional LSTM and Other NN Architectures
-
-    For the Graves paper, a minimal, almost readable python implementation can be found at:
-    https://gist.github.com/neubig/ff2f97d91c9bed820c15
-
-    The ADADELTA implementation follows the original ADADELTA paper:
-    Zeiler 2012, ADADELTA: An Adaptive Learning Rate Method
-
-    A nice theano adadelta implementation is here:
-    https://blog.wtf.sg/2014/08/28/implementing-adadelta/
-    """
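For orientation when reading the step function below: f is the logistic sigmoid, h(s) = 2*sigmoid(s) - 1 squashes the cell state, and g(net_c) = 4*sigmoid(net_c) - 2 squashes the cell input. The equation numbers in the code comments refer to Gers et al. 1999; transcribed from what the code actually computes (not quoted from the paper), the central quantities are:

    E = \frac{1}{2} \sum_k (t_k - y_k)^2                                          (12)
    \delta_k = f'(net_k) (t_k - y_k)                                              (14)
    \delta_{out} = f'(net_{out}) \, h(s) \sum_k w_{kc} \delta_k                   (15)
    e_s = y_{out} \, h'(s) \sum_k w_{kc} \delta_k                                 (17)
    \frac{\partial s}{\partial w_{cm}}(t) = \frac{\partial s}{\partial w_{cm}}(t-1) \, y_\varphi + g'(net_c) \, y_{in} \, y_m    (19)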
-
-    from numbers import Number
-    from theano import tensor as T
-
-    SEQUENCE_LENGTH = 3
-    sequence_length_string = node.get_parameter("sequence_length")
-    if sequence_length_string is not None:
-        SEQUENCE_LENGTH = int(sequence_length_string)
-
-    target_node_group = node.get_parameter("group_t_nodes")
-    target_gate = node.get_parameter("group_t_gates")
-    output_node_group = node.get_parameter("group_o_nodes")
-    output_gate = node.get_parameter("group_o_gates")
-    input_node_group = node.get_parameter("group_i_nodes")
-    input_gate = node.get_parameter("group_i_gates")
-    lstm = node.get_parameter("group_c_nodes")
-    lstm_gen = "%s_gen" % lstm
-    lstm_por = "%s_por" % lstm
-    lstm_gin = "%s_gin" % lstm
-    lstm_gou = "%s_gou" % lstm
-    lstm_gfg = "%s_gfg" % lstm
-    input = "%s_input" % input_node_group
-    output = "%s_output" % output_node_group
-    target = "%s_target" % target_node_group
-
-    nodespace = node.parent_nodespace
-
-    if not hasattr(node, 'initialized'):
-
-        # create the groups
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=target_node_group, gate=target_gate, group_name=target)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=output_node_group, gate=output_gate, group_name=output)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=input_node_group, gate=input_gate, group_name=input)
-
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="gen", group_name=lstm_gen)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="por", group_name=lstm_por)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="gin", group_name=lstm_gin)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="gou", group_name=lstm_gou)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="gfg", group_name=lstm_gfg)
-
-        len_output = len(netapi.get_activations(nodespace, output))
-        len_input = len(netapi.get_activations(nodespace, input))
-        len_hidden = len(netapi.get_activations(nodespace, lstm_por))
-
-        # define a single LSTM-style backpropagation through time step, to be scanned over by theano
-        def bpttstep(
-                s, tgt, y_k, y_i, y_c, net_in, net_out, net_phi,
-                error, drv_ci_prev, drv_cc_prev, drv_ini_prev, drv_inc_prev, drv_in1_prev, drv_phii_prev, drv_phic_prev, drv_phi1_prev,
-                delta_w_ki, delta_w_kc, delta_w_outi, delta_w_outc, delta_w_ci, delta_w_cc, delta_w_ini, delta_w_inc, delta_w_phii, delta_w_phic,
-                delta_theta_i, delta_theta_k, delta_theta_in, delta_theta_out, delta_theta_phi,
-                w_kc, w_ci, w_cc, w_outc, w_outi, w_ini, w_inc, w_phii, w_phic):
-
-            # calculate error
-            e_k = tgt - y_k  # (12) error per output element
-            E = T.sum(T.square(e_k)) / 2.  # (12) squared sum to be minimized
-
-            # Part I: standard (truncated) BPTT for links to output registers and lstm output gate slots
-            # cell -> output
-            # cell -> output gate
-            # input -> output
-            # input -> output gate
-
-            # functions and derivatives
-            y_in = T.nnet.sigmoid(net_in)    # (3) y_in = f(net_in)
-            y_out = T.nnet.sigmoid(net_out)  # (3) y_out = f(net_out)
-            y_phi = T.nnet.sigmoid(net_phi)  # (3) y_phi = f(net_phi)
-
-            h_s = 2 * T.nnet.sigmoid(s) - 1  # (8)
-
-            f_primed_net_k = y_k * (1. - y_k)  # f'(net_k) = f(net_k) * (1 - f(net_k)), f(net_k) provided as y_k
-            f_primed_net_out = y_out * (1. - y_out)
-            f_primed_net_in = y_in * (1. - y_in)
-            f_primed_net_phi = y_phi * (1. - y_phi)
-            # f_primed_net_i = y_i * (1. - y_i)
-            h_primed_s = (2 * T.exp(s)) / T.square(T.exp(s) + 1)
-
-            delta_k = f_primed_net_k * e_k  # (14) delta per output element
-            delta_out = f_primed_net_out * h_s * T.sum(w_kc * T.reshape(delta_k, (len_output, 1)), axis=0)  # (15) delta per output gate
-
-            # we use y_c and y_i here instead of y_i_prev because we have "flattened snapshots" to work with
-            # i.e. the partial derivative of net_k(t) with respect to w_kc is delta_k(t) * y_c(t)
-            # (y_c is what was propagated and created net_k)
-            delta_w_kc += T.dot(T.reshape(delta_k, (len_output, 1)), T.reshape(y_c, (1, len_hidden)))      # (13) m = c
-            delta_w_ki += T.dot(T.reshape(delta_k, (len_output, 1)), T.reshape(y_i, (1, len_input)))       # (13) m = i
-            delta_w_outi += T.dot(T.reshape(delta_out, (len_hidden, 1)), T.reshape(y_i, (1, len_input)))   # (13) m = i
-            delta_w_outc += T.dot(T.reshape(delta_out, (len_hidden, 1)), T.reshape(y_c, (1, len_hidden)))  # (13) m = c
-
-            delta_theta_k += delta_k
-            delta_theta_out += delta_out
-
-            # Part II: RTRL-style updates
-            # input -> cell
-            # cell -> cell
-            # input -> input gate
-            # cell -> input gate
-            # input -> forget gate
-            # cell -> forget gate
-
-            net_c = T.dot(w_ci, y_i)  # ugly re-calculation of forward pass for net_c
-            g_net_c = 4 * T.nnet.sigmoid(net_c) - 2  # (5)
-            g_primed_net_c = (4 * T.exp(net_c)) / T.square(T.exp(net_c) + 1)
-
-            e_s = y_out * h_primed_s * T.sum(w_kc * T.reshape(delta_k, (len_output, 1)), axis=0)  # (17)
-
-            drv_ci = drv_ci_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(g_primed_net_c * y_in, (len_hidden, 1)), T.reshape(y_i, (1, len_input)))   # (19) m = i
-            drv_cc = drv_cc_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(g_primed_net_c * y_in, (len_hidden, 1)), T.reshape(y_c, (1, len_hidden)))  # (19) m = c
-
-            drv_ini = drv_ini_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(g_net_c * f_primed_net_in, (len_hidden, 1)), T.reshape(y_i, (1, len_input)))   # (20) m = i
-            drv_inc = drv_inc_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(g_net_c * f_primed_net_in, (len_hidden, 1)), T.reshape(y_c, (1, len_hidden)))  # (20) m = c
-            drv_in1 = drv_in1_prev * y_phi + g_net_c * f_primed_net_in
-
-            drv_phii = drv_phii_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(h_s * f_primed_net_phi, (len_hidden, 1)), T.reshape(y_i, (1, len_input)))   # (21) m = i
-            drv_phic = drv_phic_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(h_s * f_primed_net_phi, (len_hidden, 1)), T.reshape(y_c, (1, len_hidden)))  # (21) m = c
-            drv_phi1 = drv_phi1_prev * y_phi + h_s * f_primed_net_phi
-
-            delta_w_ci += T.reshape(e_s, (len_hidden, 1)) * drv_ci
-            delta_w_cc += T.reshape(e_s, (len_hidden, 1)) * drv_cc
-
-            delta_w_ini += T.reshape(e_s, (len_hidden, 1)) * drv_ini
-            delta_w_inc += T.reshape(e_s, (len_hidden, 1)) * drv_inc
-
-            delta_w_phii += T.reshape(e_s, (len_hidden, 1)) * drv_phii
-            delta_w_phic += T.reshape(e_s, (len_hidden, 1)) * drv_phic
-
-            # delta_theta_i += 0
-            delta_theta_in += e_s * drv_in1
-            delta_theta_phi += e_s * drv_phi1
-
-            error = E
-
-            return error, drv_ci, drv_cc, drv_ini, drv_inc, drv_in1, drv_phii, drv_phic, drv_phi1, \
-                delta_w_ki, delta_w_kc, delta_w_outi, delta_w_outc, delta_w_ci, delta_w_cc, delta_w_ini, delta_w_inc, delta_w_phii, delta_w_phic, \
-                delta_theta_i, delta_theta_k, delta_theta_in, delta_theta_out, delta_theta_phi  # cumulate
-
-        node.set_state('current_error', 0.)
-        node.set_state('error', 0.)
-        node.set_state('updates', 0)
-        node.t = -1
-        node.samples = 0
-
-        t_a_i_matrix = node.t_a_i_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_input)).astype(T.config.floatX)
-        t_a_t_matrix = node.t_a_t_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_output)).astype(T.config.floatX)
-        t_a_o_matrix = node.t_a_o_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_output)).astype(T.config.floatX)
-        t_a_h_gen_matrix = node.t_a_h_gen_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-        t_a_h_por_matrix = node.t_a_h_por_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-        t_a_h_gin_matrix = node.t_a_h_gin_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-        t_a_h_gou_matrix = node.t_a_h_gou_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-        t_a_h_gfg_matrix = node.t_a_h_gfg_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-
-        w_oh_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, output)
-        w_oi_array = netapi.get_link_weights(nodespace, input, nodespace, output)
-        w_h_por_i_array = netapi.get_link_weights(nodespace, input, nodespace, lstm_por)
-        w_h_gou_h_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gou)
-        w_h_gou_i_array = netapi.get_link_weights(nodespace, input, nodespace, lstm_gou)
-        w_h_por_h_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_por)
-        w_h_gin_i_array = netapi.get_link_weights(nodespace, input, nodespace, lstm_gin)
-        w_h_gin_h_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gin)
-        w_h_gfg_i_array = netapi.get_link_weights(nodespace, input, nodespace, lstm_gfg)
-        w_h_gfg_h_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gfg)
-
-        theta_input_array = netapi.get_thetas(nodespace, input)
-        theta_output_array = netapi.get_thetas(nodespace, output)
-        theta_lstm_gin_array = netapi.get_thetas(nodespace, lstm_gin)
-        theta_lstm_gou_array = netapi.get_thetas(nodespace, lstm_gou)
-        theta_lstm_gfg_array = netapi.get_thetas(nodespace, lstm_gfg)
-
-        steps = T.iscalar("steps")
-
-        # adadelta hyperparameters
-        rho = T.scalar("rho")
-        epsilon = T.scalar("epsilon")
-
-        # activations -- post node/gatefunction, i.e. post-nonlinearities: y
-        # tgt t(t)
-        tgt = node.tgt = theano.shared(value=t_a_t_matrix.astype(T.config.floatX), name="tgt", borrow=False)
-        # output k(t)
-        y_k = node.y_k = theano.shared(value=t_a_o_matrix.astype(T.config.floatX), name="y_k", borrow=False)
-        # input i(t)
-        y_i = node.y_i = theano.shared(value=t_a_i_matrix.astype(T.config.floatX), name="y_i", borrow=False)
-        # cell state c(t)
-        y_c = node.y_c = theano.shared(value=t_a_h_por_matrix.astype(T.config.floatX), name="y_c", borrow=False)
-        # cell internal state (cec) s(t)
-        s = node.s = theano.shared(value=t_a_h_gen_matrix.astype(T.config.floatX), name="s", borrow=False)
-
-        # for the LSTM gates, no node/gatefunction has been calculated, so we get net sums, not post-nonlinearity values
-        # output gate out(t)
-        net_out = node.net_out = theano.shared(value=t_a_h_gou_matrix.astype(T.config.floatX), name="net_out", borrow=False)
-        # input gate in(t)
-        net_in = node.net_in = theano.shared(value=t_a_h_gin_matrix.astype(T.config.floatX), name="net_in", borrow=False)
-        # forget gate phi(t)
-        net_phi = node.net_phi = theano.shared(value=t_a_h_gfg_matrix.astype(T.config.floatX), name="net_phi", borrow=False)
-
-        # weight sets to be updated
-        # cell (c) -> output (k)
-        w_kc = node.w_kc = theano.shared(value=w_oh_por_array.astype(T.config.floatX), name="w_kc", borrow=False)
-        # input (i) -> output (k)
-        w_ki = node.w_ki = theano.shared(value=w_oi_array.astype(T.config.floatX), name="w_ki", borrow=False)
-        # cell (c) -> output gate (out)
-        w_outc = node.w_outc = theano.shared(value=w_h_gou_h_por_array.astype(T.config.floatX), name="w_outc", borrow=False)
-        # input (i) -> output gate (out)
-        w_outi = node.w_outi = theano.shared(value=w_h_gou_i_array.astype(T.config.floatX), name="w_outi", borrow=False)
-        # input (i) -> cell (c)
-        w_ci = node.w_ci = theano.shared(value=w_h_por_i_array.astype(T.config.floatX), name="w_ci", borrow=False)
-        # cell (c) -> cell (c)
-        w_cc = node.w_cc = theano.shared(value=w_h_por_h_por_array.astype(T.config.floatX), name="w_cc", borrow=False)
-        # input (i) -> input gate (in)
-        w_ini = node.w_ini = theano.shared(value=w_h_gin_i_array.astype(T.config.floatX), name="w_ini", borrow=False)
-        # cell (c) -> input gate (in)
-        w_inc = node.w_inc = theano.shared(value=w_h_gin_h_por_array.astype(T.config.floatX), name="w_inc", borrow=False)
-        # input (i) -> forget gate (phi)
-        w_phii = node.w_phii = theano.shared(value=w_h_gfg_i_array.astype(T.config.floatX), name="w_phii", borrow=False)
-        # cell (c) -> forget gate (phi)
-        w_phic = node.w_phic = theano.shared(value=w_h_gfg_h_por_array.astype(T.config.floatX), name="w_phic", borrow=False)
-
-        # bias sets to be updated
-        theta_i = node.theta_i = theano.shared(value=theta_input_array.astype(T.config.floatX), name="theta_i", borrow=False)
-        theta_k = node.theta_k = theano.shared(value=theta_output_array.astype(T.config.floatX), name="theta_k", borrow=False)
-        theta_in = node.theta_in = theano.shared(value=theta_lstm_gin_array.astype(T.config.floatX), name="theta_in", borrow=False)
-        theta_out = node.theta_out = theano.shared(value=theta_lstm_gou_array.astype(T.config.floatX), name="theta_out", borrow=False)
-        theta_phi = node.theta_phi = theano.shared(value=theta_lstm_gfg_array.astype(T.config.floatX), name="theta_phi", borrow=False)
-
-        # adadelta gradients and delta accumulation variables
-        node.accu_grad_w_kc = theano.shared(value=np.zeros_like(w_oh_por_array), name="accu_grad_w_kc", borrow=True)
-        node.accu_delta_w_kc = theano.shared(value=np.zeros_like(w_oh_por_array), name="accu_delta_w_kc", borrow=True)
name="accu_delta_w_kc", borrow=True) - node.accu_grad_w_ki = theano.shared(value=np.zeros_like(w_oi_array), name="accu_grad_w_ki", borrow=True) - node.accu_delta_w_ki = theano.shared(value=np.zeros_like(w_oi_array), name="accu_delta_w_ki", borrow=True) - node.accu_grad_w_outc = theano.shared(value=np.zeros_like(w_h_gou_h_por_array), name="accu_grad_w_outc", borrow=True) - node.accu_delta_w_outc = theano.shared(value=np.zeros_like(w_h_gou_h_por_array), name="accu_delta_w_outc", borrow=True) - node.accu_grad_w_outi = theano.shared(value=np.zeros_like(w_h_gou_i_array), name="accu_grad_w_outi", borrow=True) - node.accu_delta_w_outi = theano.shared(value=np.zeros_like(w_h_gou_i_array), name="accu_delta_w_outi", borrow=True) - node.accu_grad_w_ci = theano.shared(value=np.zeros_like(w_h_por_i_array), name="accu_grad_w_ci", borrow=True) - node.accu_delta_w_ci = theano.shared(value=np.zeros_like(w_h_por_i_array), name="accu_delta_w_ci", borrow=True) - node.accu_grad_w_cc = theano.shared(value=np.zeros_like(w_h_por_h_por_array), name="accu_grad_w_cc", borrow=True) - node.accu_delta_w_cc = theano.shared(value=np.zeros_like(w_h_por_h_por_array), name="accu_delta_w_cc", borrow=True) - node.accu_grad_w_ini = theano.shared(value=np.zeros_like(w_h_gin_i_array), name="accu_grad_w_ini", borrow=True) - node.accu_delta_w_ini = theano.shared(value=np.zeros_like(w_h_gin_i_array), name="accu_delta_w_ini", borrow=True) - node.accu_grad_w_inc = theano.shared(value=np.zeros_like(w_h_gin_h_por_array), name="accu_grad_w_inc", borrow=True) - node.accu_delta_w_inc = theano.shared(value=np.zeros_like(w_h_gin_h_por_array), name="accu_delta_w_inc", borrow=True) - node.accu_grad_w_phii = theano.shared(value=np.zeros_like(w_h_gfg_i_array), name="accu_grad_w_phii", borrow=True) - node.accu_delta_w_phii = theano.shared(value=np.zeros_like(w_h_gfg_i_array), name="accu_delta_w_phii", borrow=True) - node.accu_grad_w_phic = theano.shared(value=np.zeros_like(w_h_gfg_h_por_array), name="accu_grad_w_phic", borrow=True) - node.accu_delta_w_phic = theano.shared(value=np.zeros_like(w_h_gfg_h_por_array), name="accu_delta_w_phic", borrow=True) - node.accu_grad_theta_k = theano.shared(value=np.zeros_like(theta_output_array), name="accu_grad_theta_k", borrow=True) - node.accu_delta_theta_k = theano.shared(value=np.zeros_like(theta_output_array), name="accu_delta_theta_k", borrow=True) - node.accu_grad_theta_out = theano.shared(value=np.zeros_like(theta_lstm_gou_array), name="accu_grad_theta_out", borrow=True) - node.accu_delta_theta_out = theano.shared(value=np.zeros_like(theta_lstm_gou_array), name="accu_delta_theta_out", borrow=True) - node.accu_grad_theta_in = theano.shared(value=np.zeros_like(theta_lstm_gin_array), name="accu_grad_theta_in", borrow=True) - node.accu_delta_theta_in = theano.shared(value=np.zeros_like(theta_lstm_gin_array), name="accu_delta_theta_in", borrow=True) - node.accu_grad_theta_phi = theano.shared(value=np.zeros_like(theta_lstm_gfg_array), name="accu_grad_theta_phi", borrow=True) - node.accu_delta_theta_phi = theano.shared(value=np.zeros_like(theta_lstm_gfg_array), name="accu_delta_theta_phi", borrow=True) - - [errors, - deriv_ci_prev, - deriv_cc_prev, - deriv_ini_prev, - deriv_inc_prev, - deriv_in1_prev, - deriv_phii_prev, - deriv_phic_prev, - deriv_phi1_prev, - grad_w_ki, - grad_w_kc, - grad_w_outi, - grad_w_outc, - grad_w_ci, - grad_w_cc, - grad_w_ini, - grad_w_inc, - grad_w_phii, - grad_w_phic, - grad_theta_i, - grad_theta_k, - grad_theta_in, - grad_theta_out, - grad_theta_phi], updates = theano.scan( - 
-        [errors,
-         deriv_ci_prev,
-         deriv_cc_prev,
-         deriv_ini_prev,
-         deriv_inc_prev,
-         deriv_in1_prev,
-         deriv_phii_prev,
-         deriv_phic_prev,
-         deriv_phi1_prev,
-         grad_w_ki,
-         grad_w_kc,
-         grad_w_outi,
-         grad_w_outc,
-         grad_w_ci,
-         grad_w_cc,
-         grad_w_ini,
-         grad_w_inc,
-         grad_w_phii,
-         grad_w_phic,
-         grad_theta_i,
-         grad_theta_k,
-         grad_theta_in,
-         grad_theta_out,
-         grad_theta_phi], updates = theano.scan(
-            fn=bpttstep,
-            sequences=[dict(input=s, taps=[-0]),
-                       dict(input=tgt, taps=[-0]),
-                       dict(input=y_k, taps=[-0]),
-                       dict(input=y_i, taps=[-0]),
-                       dict(input=y_c, taps=[-0]),
-                       dict(input=net_in, taps=[-0]),
-                       dict(input=net_out, taps=[-0]),
-                       dict(input=net_phi, taps=[-0])],
-            outputs_info=[0.,  # error
-                          T.zeros_like(w_ci, dtype=T.config.floatX),  # deriv_ci_prev
-                          T.zeros_like(w_cc, dtype=T.config.floatX),  # deriv_cc_prev
-                          T.zeros_like(w_ini, dtype=T.config.floatX),  # deriv_ini_prev
-                          T.zeros_like(w_inc, dtype=T.config.floatX),  # deriv_inc_prev
-                          T.zeros_like(theta_in, dtype=T.config.floatX),  # deriv_in1_prev
-                          T.zeros_like(w_phii, dtype=T.config.floatX),  # deriv_phii_prev
-                          T.zeros_like(w_phic, dtype=T.config.floatX),  # deriv_phic_prev
-                          T.zeros_like(theta_phi, dtype=T.config.floatX),  # deriv_phi1_prev
-                          T.zeros_like(w_ki, dtype=T.config.floatX),  # delta_w_ki
-                          T.zeros_like(w_kc, dtype=T.config.floatX),  # delta_w_kc
-                          T.zeros_like(w_outi, dtype=T.config.floatX),  # delta_w_outi
-                          T.zeros_like(w_outc, dtype=T.config.floatX),  # delta_w_outc
-                          T.zeros_like(w_ci, dtype=T.config.floatX),  # delta_w_ci
-                          T.zeros_like(w_cc, dtype=T.config.floatX),  # delta_w_cc
-                          T.zeros_like(w_ini, dtype=T.config.floatX),  # delta_w_ini
-                          T.zeros_like(w_inc, dtype=T.config.floatX),  # delta_w_inc
-                          T.zeros_like(w_phii, dtype=T.config.floatX),  # delta_w_phii
-                          T.zeros_like(w_phic, dtype=T.config.floatX),  # delta_w_phic
-                          T.zeros_like(theta_i, dtype=T.config.floatX),  # delta_theta_i
-                          T.zeros_like(theta_k, dtype=T.config.floatX),  # delta_theta_k
-                          T.zeros_like(theta_in, dtype=T.config.floatX),  # delta_theta_in
-                          T.zeros_like(theta_out, dtype=T.config.floatX),  # delta_theta_out
-                          T.zeros_like(theta_phi, dtype=T.config.floatX)],  # delta_theta_phi
-            non_sequences=[w_kc,
-                           w_ci,
-                           w_cc,
-                           w_outc,
-                           w_outi,
-                           w_ini,
-                           w_inc,
-                           w_phii,
-                           w_phic],
-            go_backwards=True,
-            n_steps=steps,
-            strict=True)
-
-        # adadelta momentum
-        accu_grad_w_kc = rho * node.accu_grad_w_kc + (1. - rho) * (grad_w_kc[SEQUENCE_LENGTH - 1]**2)
-        delta_w_kc = (T.sqrt(node.accu_delta_w_kc + epsilon) / T.sqrt(accu_grad_w_kc + epsilon)) * grad_w_kc[SEQUENCE_LENGTH - 1]
-        accu_delta_w_kc = rho * node.accu_delta_w_kc + (1. - rho) * (delta_w_kc**2)
-
-        accu_grad_w_ki = rho * node.accu_grad_w_ki + (1. - rho) * (grad_w_ki[SEQUENCE_LENGTH - 1]**2)
-        delta_w_ki = (T.sqrt(node.accu_delta_w_ki + epsilon) / T.sqrt(accu_grad_w_ki + epsilon)) * grad_w_ki[SEQUENCE_LENGTH - 1]
-        accu_delta_w_ki = rho * node.accu_delta_w_ki + (1. - rho) * (delta_w_ki**2)
-
-        accu_grad_w_outc = rho * node.accu_grad_w_outc + (1. - rho) * (grad_w_outc[SEQUENCE_LENGTH - 1]**2)
-        delta_w_outc = (T.sqrt(node.accu_delta_w_outc + epsilon) / T.sqrt(accu_grad_w_outc + epsilon)) * grad_w_outc[SEQUENCE_LENGTH - 1]
-        accu_delta_w_outc = rho * node.accu_delta_w_outc + (1. - rho) * (delta_w_outc**2)
-
-        accu_grad_w_outi = rho * node.accu_grad_w_outi + (1. - rho) * (grad_w_outi[SEQUENCE_LENGTH - 1]**2)
-        delta_w_outi = (T.sqrt(node.accu_delta_w_outi + epsilon) / T.sqrt(accu_grad_w_outi + epsilon)) * grad_w_outi[SEQUENCE_LENGTH - 1]
-        accu_delta_w_outi = rho * node.accu_delta_w_outi + (1. - rho) * (delta_w_outi**2)
-
-        accu_grad_w_ci = rho * node.accu_grad_w_ci + (1. - rho) * (grad_w_ci[SEQUENCE_LENGTH - 1]**2)
-        delta_w_ci = (T.sqrt(node.accu_delta_w_ci + epsilon) / T.sqrt(accu_grad_w_ci + epsilon)) * grad_w_ci[SEQUENCE_LENGTH - 1]
-        accu_delta_w_ci = rho * node.accu_delta_w_ci + (1. - rho) * (delta_w_ci**2)
-
-        accu_grad_w_cc = rho * node.accu_grad_w_cc + (1. - rho) * (grad_w_cc[SEQUENCE_LENGTH - 1]**2)
-        delta_w_cc = (T.sqrt(node.accu_delta_w_cc + epsilon) / T.sqrt(accu_grad_w_cc + epsilon)) * grad_w_cc[SEQUENCE_LENGTH - 1]
-        accu_delta_w_cc = rho * node.accu_delta_w_cc + (1. - rho) * (delta_w_cc**2)
-
-        accu_grad_w_ini = rho * node.accu_grad_w_ini + (1. - rho) * (grad_w_ini[SEQUENCE_LENGTH - 1]**2)
-        delta_w_ini = (T.sqrt(node.accu_delta_w_ini + epsilon) / T.sqrt(accu_grad_w_ini + epsilon)) * grad_w_ini[SEQUENCE_LENGTH - 1]
-        accu_delta_w_ini = rho * node.accu_delta_w_ini + (1. - rho) * (delta_w_ini**2)
-
-        accu_grad_w_inc = rho * node.accu_grad_w_inc + (1. - rho) * (grad_w_inc[SEQUENCE_LENGTH - 1]**2)
-        delta_w_inc = (T.sqrt(node.accu_delta_w_inc + epsilon) / T.sqrt(accu_grad_w_inc + epsilon)) * grad_w_inc[SEQUENCE_LENGTH - 1]
-        accu_delta_w_inc = rho * node.accu_delta_w_inc + (1. - rho) * (delta_w_inc**2)
-
-        accu_grad_w_phii = rho * node.accu_grad_w_phii + (1. - rho) * (grad_w_phii[SEQUENCE_LENGTH - 1]**2)
-        delta_w_phii = (T.sqrt(node.accu_delta_w_phii + epsilon) / T.sqrt(accu_grad_w_phii + epsilon)) * grad_w_phii[SEQUENCE_LENGTH - 1]
-        accu_delta_w_phii = rho * node.accu_delta_w_phii + (1. - rho) * (delta_w_phii**2)
-
-        accu_grad_w_phic = rho * node.accu_grad_w_phic + (1. - rho) * (grad_w_phic[SEQUENCE_LENGTH - 1]**2)
-        delta_w_phic = (T.sqrt(node.accu_delta_w_phic + epsilon) / T.sqrt(accu_grad_w_phic + epsilon)) * grad_w_phic[SEQUENCE_LENGTH - 1]
-        accu_delta_w_phic = rho * node.accu_delta_w_phic + (1. - rho) * (delta_w_phic**2)
-
-        accu_grad_theta_k = rho * node.accu_grad_theta_k + (1. - rho) * (grad_theta_k[SEQUENCE_LENGTH - 1]**2)
-        delta_theta_k = (T.sqrt(node.accu_delta_theta_k + epsilon) / T.sqrt(accu_grad_theta_k + epsilon)) * grad_theta_k[SEQUENCE_LENGTH - 1]
-        accu_delta_theta_k = rho * node.accu_delta_theta_k + (1. - rho) * (delta_theta_k**2)
-
-        accu_grad_theta_out = rho * node.accu_grad_theta_out + (1. - rho) * (grad_theta_out[SEQUENCE_LENGTH - 1]**2)
-        delta_theta_out = (T.sqrt(node.accu_delta_theta_out + epsilon) / T.sqrt(accu_grad_theta_out + epsilon)) * grad_theta_out[SEQUENCE_LENGTH - 1]
-        accu_delta_theta_out = rho * node.accu_delta_theta_out + (1. - rho) * (delta_theta_out**2)
-
-        accu_grad_theta_in = rho * node.accu_grad_theta_in + (1. - rho) * (grad_theta_in[SEQUENCE_LENGTH - 1]**2)
-        delta_theta_in = (T.sqrt(node.accu_delta_theta_in + epsilon) / T.sqrt(accu_grad_theta_in + epsilon)) * grad_theta_in[SEQUENCE_LENGTH - 1]
-        accu_delta_theta_in = rho * node.accu_delta_theta_in + (1. - rho) * (delta_theta_in**2)
-
-        accu_grad_theta_phi = rho * node.accu_grad_theta_phi + (1. - rho) * (grad_theta_phi[SEQUENCE_LENGTH - 1]**2)
-        delta_theta_phi = (T.sqrt(node.accu_delta_theta_phi + epsilon) / T.sqrt(accu_grad_theta_phi + epsilon)) * grad_theta_phi[SEQUENCE_LENGTH - 1]
-        accu_delta_theta_phi = rho * node.accu_delta_theta_phi + (1. - rho) * (delta_theta_phi**2)
-
-        # update weights
-        w_kc += delta_w_kc
-        w_ki += delta_w_ki
-        w_outc += delta_w_outc
-        w_outi += delta_w_outi
-        w_ci += delta_w_ci
-        w_cc += delta_w_cc
-        w_ini += delta_w_ini
-        w_inc += delta_w_inc
-        w_phii += delta_w_phii
-        w_phic += delta_w_phic
-
-        # update biases
-        # theta_i += delta_theta_i
-        theta_k += delta_theta_k
-        theta_out += delta_theta_out
-        theta_in += delta_theta_in
-        theta_phi += delta_theta_phi
-
-        # this will provide new w values to be written back to the node net,
-        # as well as deriv_lm_prev values to be used in the next step
-        node.get_updated_parameters = theano.function([rho, epsilon, steps],
-                                                      errors,
-                                                      updates=[(node.w_kc, w_kc),
-                                                               (node.w_ki, w_ki),
-                                                               (node.w_outc, w_outc),
-                                                               (node.w_outi, w_outi),
-                                                               (node.w_ci, w_ci),
-                                                               (node.w_cc, w_cc),
-                                                               (node.w_ini, w_ini),
-                                                               (node.w_inc, w_inc),
-                                                               (node.w_phii, w_phii),
-                                                               (node.w_phic, w_phic),
-                                                               (node.theta_i, theta_i),
-                                                               (node.theta_k, theta_k),
-                                                               (node.theta_in, theta_in),
-                                                               (node.theta_out, theta_out),
-                                                               (node.theta_phi, theta_phi),
-                                                               (node.accu_grad_w_kc, accu_grad_w_kc),
-                                                               (node.accu_delta_w_kc, accu_delta_w_kc),
-                                                               (node.accu_grad_w_ki, accu_grad_w_ki),
-                                                               (node.accu_delta_w_ki, accu_delta_w_ki),
-                                                               (node.accu_grad_w_outc, accu_grad_w_outc),
-                                                               (node.accu_delta_w_outc, accu_delta_w_outc),
-                                                               (node.accu_grad_w_outi, accu_grad_w_outi),
-                                                               (node.accu_delta_w_outi, accu_delta_w_outi),
-                                                               (node.accu_grad_w_ci, accu_grad_w_ci),
-                                                               (node.accu_delta_w_ci, accu_delta_w_ci),
-                                                               (node.accu_grad_w_cc, accu_grad_w_cc),
-                                                               (node.accu_delta_w_cc, accu_delta_w_cc),
-                                                               (node.accu_grad_w_ini, accu_grad_w_ini),
-                                                               (node.accu_delta_w_ini, accu_delta_w_ini),
-                                                               (node.accu_grad_w_inc, accu_grad_w_inc),
-                                                               (node.accu_delta_w_inc, accu_delta_w_inc),
-                                                               (node.accu_grad_w_phii, accu_grad_w_phii),
-                                                               (node.accu_delta_w_phii, accu_delta_w_phii),
-                                                               (node.accu_grad_w_phic, accu_grad_w_phic),
-                                                               (node.accu_delta_w_phic, accu_delta_w_phic),
-                                                               (node.accu_grad_theta_k, accu_grad_theta_k),
-                                                               (node.accu_delta_theta_k, accu_delta_theta_k),
-                                                               (node.accu_grad_theta_out, accu_grad_theta_out),
-                                                               (node.accu_delta_theta_out, accu_delta_theta_out),
-                                                               (node.accu_grad_theta_in, accu_grad_theta_in),
-                                                               (node.accu_delta_theta_in, accu_delta_theta_in),
-                                                               (node.accu_grad_theta_phi, accu_grad_theta_phi),
-                                                               (node.accu_delta_theta_phi, accu_delta_theta_phi)
-                                                               ],
-                                                      on_unused_input='warn')
-
-        node.get_error = theano.function([], T.sum(T.square(tgt[SEQUENCE_LENGTH - 1] - y_k[SEQUENCE_LENGTH - 1])) / 2.)
-
-        node.initialized = True
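The per-weight-set blocks in the adadelta momentum section above all instantiate the same rule from Zeiler 2012: a decaying average of squared gradients rescales each step, and a decaying average of squared past steps takes the place of a global learning rate. Sketched in numpy for a single parameter tensor (illustrative names, not from the module):

    import numpy as np

    def adadelta_step(grad, accu_grad, accu_delta, rho=0.95, epsilon=1e-6):
        # E[g^2]: decaying average of squared gradients
        accu_grad = rho * accu_grad + (1. - rho) * grad**2
        # step: RMS of past deltas over RMS of gradients, times the gradient
        delta = np.sqrt(accu_delta + epsilon) / np.sqrt(accu_grad + epsilon) * grad
        # E[dx^2]: decaying average of squared deltas
        accu_delta = rho * accu_delta + (1. - rho) * delta**2
        return delta, accu_grad, accu_delta

The module adds rather than subtracts the resulting delta (w += delta) because the quantities accumulated by bpttstep are built from e_k = target - output and therefore already point in the descent direction.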
-
-    # every step
-
-    error_prev = node.get_state("current_error")
-    if error_prev is None:
-        error_prev = 0.
-    node.get_gate('e').gate_function(error_prev)
-
-    if netapi.step % 3 == 0 and node.get_slot("debug").activation > 0.5:
-        netapi.logger.debug("%10i: lstm sample step" % netapi.step)
-
-    if netapi.step % 3 != 1:
-        return
-    # every three steps, sample activation from LSTMs
-
-    node.t += 1
-    if node.t >= SEQUENCE_LENGTH:
-        node.t = 0
-
-    # roll time snapshots to the left
-    node.t_a_i_matrix = np.roll(node.t_a_i_matrix, -1, 0)
-    node.t_a_t_matrix = np.roll(node.t_a_t_matrix, -1, 0)
-    node.t_a_o_matrix = np.roll(node.t_a_o_matrix, -1, 0)
-    node.t_a_h_gen_matrix = np.roll(node.t_a_h_gen_matrix, -1, 0)
-    node.t_a_h_por_matrix = np.roll(node.t_a_h_por_matrix, -1, 0)
-    node.t_a_h_gin_matrix = np.roll(node.t_a_h_gin_matrix, -1, 0)
-    node.t_a_h_gou_matrix = np.roll(node.t_a_h_gou_matrix, -1, 0)
-    node.t_a_h_gfg_matrix = np.roll(node.t_a_h_gfg_matrix, -1, 0)
-
-    # insert new snapshot at the end
-    node.t_a_i_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, input)
-    node.t_a_t_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, target)
-    node.t_a_o_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, output)
-    node.t_a_h_gen_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_gen)
-    node.t_a_h_por_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_por)
-    node.t_a_h_gin_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_gin)
-    node.t_a_h_gou_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_gou)
-    node.t_a_h_gfg_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_gfg)
-    node.samples += 1
-
-    if node.get_slot("debug").activation > 0.5:
-        netapi.logger.debug("%10i: bp sample #%i t, i, c, k data: t[0]=%.6f i[0]=%.6f c[0]=%.6f k[0]=%.6f"
-                            % (netapi.step, node.t, node.t_a_t_matrix[node.t, 0], node.t_a_i_matrix[node.t, 0],
-                               node.t_a_h_por_matrix[node.t, 0], node.t_a_o_matrix[node.t, 0]))
-
-    if node.t != SEQUENCE_LENGTH - 1 or node.samples < 3:
-        return
-    # every SEQUENCE_LENGTH samples, do backpropagation-through-time for the sampled sequence
-
-    # netapi.logger.debug("t=%.6f o=%.6f s=%.6f c=%.6f i=%.6f" % (node.t_a_t_matrix[0, 0], node.t_a_o_matrix[0, 0],
-    #                     node.t_a_h_gen_matrix[0, 0], node.t_a_h_por_matrix[0, 0], node.t_a_i_matrix[0, 0]))
-    # netapi.logger.debug("t=%.6f o=%.6f s=%.6f c=%.6f i=%.6f" % (node.t_a_t_matrix[1, 0], node.t_a_o_matrix[1, 0],
-    #                     node.t_a_h_gen_matrix[1, 0], node.t_a_h_por_matrix[1, 0], node.t_a_i_matrix[1, 0]))
-    # netapi.logger.debug("t=%.6f o=%.6f s=%.6f c=%.6f i=%.6f" % (node.t_a_t_matrix[2, 0], node.t_a_o_matrix[2, 0],
-    #                     node.t_a_h_gen_matrix[2, 0], node.t_a_h_por_matrix[2, 0], node.t_a_i_matrix[2, 0]))
-
-    # fill w and a variables with values from the Node Net
-    node.w_kc.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, output), borrow=True)
-    node.w_ki.set_value(netapi.get_link_weights(nodespace, input, nodespace, output), borrow=True)
-    node.w_outc.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gou), borrow=True)
-    node.w_outi.set_value(netapi.get_link_weights(nodespace, input, nodespace, lstm_gou), borrow=True)
-    node.w_ci.set_value(netapi.get_link_weights(nodespace, input, nodespace, lstm_por), borrow=True)
-    node.w_cc.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_por), borrow=True)
-    node.w_ini.set_value(netapi.get_link_weights(nodespace, input, nodespace, lstm_gin), borrow=True)
-    node.w_inc.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gin), borrow=True)
-    node.w_phii.set_value(netapi.get_link_weights(nodespace, input, nodespace, lstm_gfg), borrow=True)
-    node.w_phic.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gfg), borrow=True)
-
-    node.theta_i.set_value(netapi.get_thetas(nodespace, input), borrow=True)
-    node.theta_k.set_value(netapi.get_thetas(nodespace, output), borrow=True)
-    node.theta_in.set_value(netapi.get_thetas(nodespace, lstm_gin), borrow=True)
-    node.theta_out.set_value(netapi.get_thetas(nodespace, lstm_gou), borrow=True)
-    node.theta_phi.set_value(netapi.get_thetas(nodespace, lstm_gfg), borrow=True)
-
-    node.tgt.set_value(node.t_a_t_matrix, borrow=True)
-    node.y_k.set_value(node.t_a_o_matrix, borrow=True)
-    node.y_i.set_value(node.t_a_i_matrix, borrow=True)
-    node.y_c.set_value(node.t_a_h_por_matrix, borrow=True)
-    node.s.set_value(node.t_a_h_gen_matrix, borrow=True)
-    node.net_out.set_value(node.t_a_h_gou_matrix, borrow=True)
-    node.net_in.set_value(node.t_a_h_gin_matrix, borrow=True)
-    node.net_phi.set_value(node.t_a_h_gfg_matrix, borrow=True)
-
-    try:
-        rho = float(node.get_parameter('adadelta_rho'))
-    except (TypeError, ValueError):
-        rho = 0.95
-        node.set_parameter('adadelta_rho', rho)
-
-    try:
-        epsilon = float(node.get_parameter('adadelta_epsilon'))
-    except (TypeError, ValueError):
-        epsilon = 0.000001
-        node.set_parameter('adadelta_epsilon', epsilon)
-
-    len_output = len(netapi.get_activations(nodespace, output))
-    len_input = len(netapi.get_activations(nodespace, input))
-    len_hidden = len(netapi.get_activations(nodespace, lstm_por))
-
-    # update the weights; all derivatives and weight update sums are 0 for the first step
-    errors = node.get_updated_parameters(rho, epsilon, node.t + 1)
-
-    if node.get_slot("debug").activation > 0.5:
-        netapi.logger.debug("%10i: bp with error %.4f" % (netapi.step, errors[SEQUENCE_LENGTH - 1]))
-
-    # write back changed weights to node net
-
-    # netapi.set_thetas(nodespace, input, node.theta_i.get_value(borrow=True))
-    if node.get_parameter("bias_gin") == "true":
-        netapi.set_thetas(nodespace, lstm_gin, node.theta_in.get_value(borrow=True))
-    if node.get_parameter("bias_gou") == "true":
-        netapi.set_thetas(nodespace, lstm_gou, node.theta_out.get_value(borrow=True))
-    if node.get_parameter("bias_gfg") == "true":
-        netapi.set_thetas(nodespace, lstm_gfg, node.theta_phi.get_value(borrow=True))
-
-    netapi.set_link_weights(nodespace, input, nodespace, lstm_gou, node.w_outi.get_value(borrow=True))
-    netapi.set_link_weights(nodespace, input, nodespace, lstm_por, node.w_ci.get_value(borrow=True))
-    netapi.set_link_weights(nodespace, input, nodespace, lstm_gin, node.w_ini.get_value(borrow=True))
-    netapi.set_link_weights(nodespace, input, nodespace, lstm_gfg, node.w_phii.get_value(borrow=True))
-    netapi.set_link_weights(nodespace, lstm_por, nodespace, output, node.w_kc.get_value(borrow=True))
-
-    if node.get_parameter("links_io") == "true":
-        netapi.set_link_weights(nodespace, input, nodespace, output, node.w_ki.get_value(borrow=True))
-    if node.get_parameter("links_porpor") == "true":
-        netapi.set_link_weights(nodespace, lstm_por, nodespace, lstm_por, node.w_cc.get_value(borrow=True))
-    if node.get_parameter("links_porgin") == "true":
-        netapi.set_link_weights(nodespace, lstm_por, nodespace, lstm_gin, node.w_inc.get_value(borrow=True))
-    if node.get_parameter("links_porgou") == "true":
-        netapi.set_link_weights(nodespace, lstm_por, nodespace, lstm_gou, node.w_outc.get_value(borrow=True))
-    if node.get_parameter("links_porgfg") == "true":
-        netapi.set_link_weights(nodespace, lstm_por, nodespace, lstm_gfg, node.w_phic.get_value(borrow=True))
-
-    node.set_state('current_error', errors[SEQUENCE_LENGTH - 1])
-    node.set_state('error', node.get_state('error') + errors[SEQUENCE_LENGTH - 1])
-    if node.get_state('updates') % 100 == 0:
-        netapi.logger.debug("Number of lstm backprop steps computed %d" % node.get_state('updates'))
-        netapi.logger.debug("Error %.6f (Latest from loop: 0=%.6f)" % ((node.get_state('error') / 100), errors[SEQUENCE_LENGTH - 1]))
-        node.set_state('error', 0.0)
-
-    # after weight updates, reset gen loops of lstms
-    netapi.substitute_activations(nodespace, lstm_gen, np.zeros_like(netapi.get_activations(nodespace, lstm_gen)))
-    # netapi.substitute_activations(nodespace, "lstm_por", np.zeros_like(a_h_por_array))
-
-    node.set_state('updates', node.get_state('updates') + 1)
-
-
 def gradient_descent(netapi, node=None, **params):
     """
     Online gradient descent with backpropagation for three layers (input, hidden,