From 1951e7036bd166d7ec6677a3f3ab4a269abfcbe7 Mon Sep 17 00:00:00 2001
From: Dominik Welland
Date: Fri, 22 Apr 2016 15:04:08 +0200
Subject: [PATCH] remove lstm gradient descent for the moment

---
 micropsi_core/nodenet/native_modules.py | 698 ------------------------
 1 file changed, 698 deletions(-)

diff --git a/micropsi_core/nodenet/native_modules.py b/micropsi_core/nodenet/native_modules.py
index 47ded65f..b8cc049a 100644
--- a/micropsi_core/nodenet/native_modules.py
+++ b/micropsi_core/nodenet/native_modules.py
@@ -21,67 +21,6 @@
 if numpy_installed:
     # only register these native modules if we
     # have theano and numpy installed.
-    nodetypes["GradientDescentLSTM"] = {
-        "name": "GradientDescentLSTM",
-        "engine": "theano_engine",
-        "slottypes": ["trigger", "debug"],
-        "gatetypes": ["e"],
-        "nodefunction_name": "gradient_descent_lstm",
-        "symbol": "↺",
-        "category": "nn_learning",
-        "path": os.path.abspath(__file__),
-        "parameters": [
-            "adadelta_rho",
-            "adadelta_epsilon",
-            "sequence_length",
-            "links_io",
-            "links_porpor",
-            "links_porgin",
-            "links_porgou",
-            "links_porgfg",
-            "bias_gin",
-            "bias_gou",
-            "bias_gfg",
-            "group_t_nodes",
-            "group_t_gates",
-            "group_i_nodes",
-            "group_i_gates",
-            "group_c_nodes",
-            "group_o_nodes",
-            "group_o_gates"
-        ],
-        "parameter_values": {
-            "links_io": ["true", "false"],
-            "links_porpor": ["true", "false"],
-            "links_porgin": ["true", "false"],
-            "links_porgou": ["true", "false"],
-            "links_porgfg": ["true", "false"],
-            "bias_gin": ["true", "false"],
-            "bias_gou": ["true", "false"],
-            "bias_gfg": ["true", "false"]
-        },
-        "parameter_defaults": {
-            "adadelta_rho": "0.95",
-            "adadelta_epsilon": "0.000001",
-            "sequence_length": "5",
-            "links_io": "true",
-            "links_porpor": "true",
-            "links_porgin": "true",
-            "links_porgou": "true",
-            "links_porgfg": "true",
-            "bias_gin": "true",
-            "bias_gou": "true",
-            "bias_gfg": "true",
-            "group_t_nodes": "target",
-            "group_t_gates": "gen",
-            "group_i_nodes": "input",
-            "group_i_gates": "gen",
-            "group_c_nodes": "lstm",
-            "group_o_nodes": "output",
-            "group_o_gates": "gen"
-        }
-    }
-
     nodetypes["GradientDescent"] = {
         "name": "GradientDescent",
         "engine": "theano_engine",
@@ -121,643 +60,6 @@
     }
 
 
-def gradient_descent_lstm(netapi, node=None, **params):
-    """
-    Gradient Descent for LSTMs
-
-    The following assumes a three-layer architecture, with hidden LSTM nodes.
-    There is always a single LSTM cell per block (no multi-block cells are implemented).
-
-    The following sets of weights are defined:
-    input -> output
-    input -> cell
-    input -> input gate
-    input -> output gate
-    input -> forget gate
-    cell -> output
-    cell -> input gate
-    cell -> output gate
-    cell -> forget gate
-
-    The cell's constant error carousel link is explicitly modelled (as a gen loop).
-    Note that input, output and forget gate links aren't updated right now.
-
-    Variable naming and implementation follow:
-    Gers et al. 1999, Learning to Forget - Continual Prediction with LSTM
-
-    Other helpful papers:
-    Hochreiter et al. 1997, Long Short-Term Memory (introduces naming convention and most of the math)
-    Graves et al. 2005, Framewise Phoneme Classification with Bidirectional LSTM and Other NN Architectures
-
-    For the Graves paper, a minimal, almost readable python implementation can be found at:
-    https://gist.github.com/neubig/ff2f97d91c9bed820c15
-
-    The ADADELTA implementation follows the original ADADELTA paper:
-    Zeiler 2012, ADADELTA: An Adaptive Learning Rate Method
-
-    A nice theano adadelta implementation is here:
-    https://blog.wtf.sg/2014/08/28/implementing-adadelta/
-    """
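For orientation when reading the step function below: f is the logistic sigmoid, h(s) = 2*sigmoid(s) - 1 squashes the cell state, and g(net_c) = 4*sigmoid(net_c) - 2 squashes the cell input. The equation numbers in the code comments refer to Gers et al. 1999; transcribed from what the code actually computes (not quoted from the paper), the central quantities are:

    E = \frac{1}{2} \sum_k (t_k - y_k)^2                                          (12)
    \delta_k = f'(net_k) (t_k - y_k)                                              (14)
    \delta_{out} = f'(net_{out}) \, h(s) \sum_k w_{kc} \delta_k                   (15)
    e_s = y_{out} \, h'(s) \sum_k w_{kc} \delta_k                                 (17)
    \frac{\partial s}{\partial w_{cm}}(t) = \frac{\partial s}{\partial w_{cm}}(t-1) \, y_\varphi + g'(net_c) \, y_{in} \, y_m    (19)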
-
-    from numbers import Number
-    from theano import tensor as T
-
-    SEQUENCE_LENGTH = 3
-    sequence_length_string = node.get_parameter("sequence_length")
-    if sequence_length_string is not None:
-        SEQUENCE_LENGTH = int(sequence_length_string)
-
-    target_node_group = node.get_parameter("group_t_nodes")
-    target_gate = node.get_parameter("group_t_gates")
-    output_node_group = node.get_parameter("group_o_nodes")
-    output_gate = node.get_parameter("group_o_gates")
-    input_node_group = node.get_parameter("group_i_nodes")
-    input_gate = node.get_parameter("group_i_gates")
-    lstm = node.get_parameter("group_c_nodes")
-    lstm_gen = "%s_gen" % lstm
-    lstm_por = "%s_por" % lstm
-    lstm_gin = "%s_gin" % lstm
-    lstm_gou = "%s_gou" % lstm
-    lstm_gfg = "%s_gfg" % lstm
-    input = "%s_input" % input_node_group
-    output = "%s_output" % output_node_group
-    target = "%s_target" % target_node_group
-
-    nodespace = node.parent_nodespace
-
-    if not hasattr(node, 'initialized'):
-
-        # create the groups
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=target_node_group, gate=target_gate, group_name=target)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=output_node_group, gate=output_gate, group_name=output)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=input_node_group, gate=input_gate, group_name=input)
-
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="gen", group_name=lstm_gen)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="por", group_name=lstm_por)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="gin", group_name=lstm_gin)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="gou", group_name=lstm_gou)
-        netapi.group_nodes_by_names(nodespace, node_name_prefix=lstm, gate="gfg", group_name=lstm_gfg)
-
-        len_output = len(netapi.get_activations(nodespace, output))
-        len_input = len(netapi.get_activations(nodespace, input))
-        len_hidden = len(netapi.get_activations(nodespace, lstm_por))
-
-        # define a single LSTM-style backpropagation through time step, to be scanned over by theano
-        def bpttstep(
-                s, tgt, y_k, y_i, y_c, net_in, net_out, net_phi,
-                error, drv_ci_prev, drv_cc_prev, drv_ini_prev, drv_inc_prev, drv_in1_prev, drv_phii_prev, drv_phic_prev, drv_phi1_prev,
-                delta_w_ki, delta_w_kc, delta_w_outi, delta_w_outc, delta_w_ci, delta_w_cc, delta_w_ini, delta_w_inc, delta_w_phii, delta_w_phic,
-                delta_theta_i, delta_theta_k, delta_theta_in, delta_theta_out, delta_theta_phi,
-                w_kc, w_ci, w_cc, w_outc, w_outi, w_ini, w_inc, w_phii, w_phic):
-
-            # calculate error
-            e_k = tgt - y_k  # (12) error per output element
-            E = T.sum(T.square(e_k)) / 2.  # (12) squared sum to be minimized
-
-            # Part I: standard (truncated) BPTT for links to output registers and lstm output gate slots
-            # cell -> output
-            # cell -> output gate
-            # input -> output
-            # input -> output gate
-
-            # functions and derivatives
-            y_in = T.nnet.sigmoid(net_in)    # (3) y_in = f(net_in)
-            y_out = T.nnet.sigmoid(net_out)  # (3) y_out = f(net_out)
-            y_phi = T.nnet.sigmoid(net_phi)  # (3) y_phi = f(net_phi)
-
-            h_s = 2 * T.nnet.sigmoid(s) - 1  # (8)
-
-            f_primed_net_k = y_k * (1. - y_k)  # f'(net_k) = f(net_k) * (1 - f(net_k)), f(net_k) provided as y_k
-            f_primed_net_out = y_out * (1. - y_out)
-            f_primed_net_in = y_in * (1. - y_in)
-            f_primed_net_phi = y_phi * (1. - y_phi)
-            # f_primed_net_i = y_i * (1. - y_i)
-            h_primed_s = (2 * T.exp(s)) / T.square(T.exp(s) + 1)
-
-            delta_k = f_primed_net_k * e_k  # (14) delta per output element
-            delta_out = f_primed_net_out * h_s * T.sum(w_kc * T.reshape(delta_k, (len_output, 1)), axis=0)  # (15) delta per output gate
-
-            # we use y_c and y_i here instead of y_i_prev because we have "flattened snapshots" to work with
-            # i.e. the partial derivative of net_k(t) with respect to w_kc is delta_k(t) * y_c(t)
-            # (y_c is what was propagated and created net_k)
-            delta_w_kc += T.dot(T.reshape(delta_k, (len_output, 1)), T.reshape(y_c, (1, len_hidden)))      # (13) m = c
-            delta_w_ki += T.dot(T.reshape(delta_k, (len_output, 1)), T.reshape(y_i, (1, len_input)))       # (13) m = i
-            delta_w_outi += T.dot(T.reshape(delta_out, (len_hidden, 1)), T.reshape(y_i, (1, len_input)))   # (13) m = i
-            delta_w_outc += T.dot(T.reshape(delta_out, (len_hidden, 1)), T.reshape(y_c, (1, len_hidden)))  # (13) m = c
-
-            delta_theta_k += delta_k
-            delta_theta_out += delta_out
-
-            # Part II: RTRL-style updates
-            # input -> cell
-            # cell -> cell
-            # input -> input gate
-            # cell -> input gate
-            # input -> forget gate
-            # cell -> forget gate
-
-            net_c = T.dot(w_ci, y_i)  # ugly re-calculation of forward pass for net_c
-            g_net_c = 4 * T.nnet.sigmoid(net_c) - 2  # (5)
-            g_primed_net_c = (4 * T.exp(net_c)) / T.square(T.exp(net_c) + 1)
-
-            e_s = y_out * h_primed_s * T.sum(w_kc * T.reshape(delta_k, (len_output, 1)), axis=0)  # (17)
-
-            drv_ci = drv_ci_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(g_primed_net_c * y_in, (len_hidden, 1)), T.reshape(y_i, (1, len_input)))   # (19) m = i
-            drv_cc = drv_cc_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(g_primed_net_c * y_in, (len_hidden, 1)), T.reshape(y_c, (1, len_hidden)))  # (19) m = c
-
-            drv_ini = drv_ini_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(g_net_c * f_primed_net_in, (len_hidden, 1)), T.reshape(y_i, (1, len_input)))   # (20) m = i
-            drv_inc = drv_inc_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(g_net_c * f_primed_net_in, (len_hidden, 1)), T.reshape(y_c, (1, len_hidden)))  # (20) m = c
-            drv_in1 = drv_in1_prev * y_phi + g_net_c * f_primed_net_in
-
-            drv_phii = drv_phii_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(h_s * f_primed_net_phi, (len_hidden, 1)), T.reshape(y_i, (1, len_input)))   # (21) m = i
-            drv_phic = drv_phic_prev * T.reshape(y_phi, (len_hidden, 1)) \
-                + T.dot(T.reshape(h_s * f_primed_net_phi, (len_hidden, 1)), T.reshape(y_c, (1, len_hidden)))  # (21) m = c
-            drv_phi1 = drv_phi1_prev * y_phi + h_s * f_primed_net_phi
-
-            delta_w_ci += T.reshape(e_s, (len_hidden, 1)) * drv_ci
-            delta_w_cc += T.reshape(e_s, (len_hidden, 1)) * drv_cc
-
-            delta_w_ini += T.reshape(e_s, (len_hidden, 1)) * drv_ini
-            delta_w_inc += T.reshape(e_s, (len_hidden, 1)) * drv_inc
-
-            delta_w_phii += T.reshape(e_s, (len_hidden, 1)) * drv_phii
-            delta_w_phic += T.reshape(e_s, (len_hidden, 1)) * drv_phic
-
-            # delta_theta_i += 0
-            delta_theta_in += e_s * drv_in1
-            delta_theta_phi += e_s * drv_phi1
-
-            error = E
-
-            return error, drv_ci, drv_cc, drv_ini, drv_inc, drv_in1, drv_phii, drv_phic, drv_phi1, \
-                delta_w_ki, delta_w_kc, delta_w_outi, delta_w_outc, delta_w_ci, delta_w_cc, delta_w_ini, delta_w_inc, delta_w_phii, delta_w_phic, \
-                delta_theta_i, delta_theta_k, delta_theta_in, delta_theta_out, delta_theta_phi  # cumulate
-
-        node.set_state('current_error', 0.)
-        node.set_state('error', 0.)
-        node.set_state('updates', 0)
-        node.t = -1
-        node.samples = 0
-
-        t_a_i_matrix = node.t_a_i_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_input)).astype(T.config.floatX)
-        t_a_t_matrix = node.t_a_t_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_output)).astype(T.config.floatX)
-        t_a_o_matrix = node.t_a_o_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_output)).astype(T.config.floatX)
-        t_a_h_gen_matrix = node.t_a_h_gen_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-        t_a_h_por_matrix = node.t_a_h_por_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-        t_a_h_gin_matrix = node.t_a_h_gin_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-        t_a_h_gou_matrix = node.t_a_h_gou_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-        t_a_h_gfg_matrix = node.t_a_h_gfg_matrix = np.zeros(shape=(SEQUENCE_LENGTH, len_hidden)).astype(T.config.floatX)
-
-        w_oh_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, output)
-        w_oi_array = netapi.get_link_weights(nodespace, input, nodespace, output)
-        w_h_por_i_array = netapi.get_link_weights(nodespace, input, nodespace, lstm_por)
-        w_h_gou_h_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gou)
-        w_h_gou_i_array = netapi.get_link_weights(nodespace, input, nodespace, lstm_gou)
-        w_h_por_h_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_por)
-        w_h_gin_i_array = netapi.get_link_weights(nodespace, input, nodespace, lstm_gin)
-        w_h_gin_h_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gin)
-        w_h_gfg_i_array = netapi.get_link_weights(nodespace, input, nodespace, lstm_gfg)
-        w_h_gfg_h_por_array = netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gfg)
-
-        theta_input_array = netapi.get_thetas(nodespace, input)
-        theta_output_array = netapi.get_thetas(nodespace, output)
-        theta_lstm_gin_array = netapi.get_thetas(nodespace, lstm_gin)
-        theta_lstm_gou_array = netapi.get_thetas(nodespace, lstm_gou)
-        theta_lstm_gfg_array = netapi.get_thetas(nodespace, lstm_gfg)
-
-        steps = T.iscalar("steps")
-
-        # adadelta hyperparameters
-        rho = T.scalar("rho")
-        epsilon = T.scalar("epsilon")
-
-        # activations -- post node/gatefunction, i.e. post-nonlinearities: y
-        # tgt t(t)
-        tgt = node.tgt = theano.shared(value=t_a_t_matrix.astype(T.config.floatX), name="tgt", borrow=False)
-        # output k(t)
-        y_k = node.y_k = theano.shared(value=t_a_o_matrix.astype(T.config.floatX), name="y_k", borrow=False)
-        # input i(t)
-        y_i = node.y_i = theano.shared(value=t_a_i_matrix.astype(T.config.floatX), name="y_i", borrow=False)
-        # cell state c(t)
-        y_c = node.y_c = theano.shared(value=t_a_h_por_matrix.astype(T.config.floatX), name="y_c", borrow=False)
-        # cell internal state (cec) s(t)
-        s = node.s = theano.shared(value=t_a_h_gen_matrix.astype(T.config.floatX), name="s", borrow=False)
-
-        # for the LSTM gates, no node/gatefunction has been calculated, so we get net sums, not post-nonlinearity values
-        # output gate out(t)
-        net_out = node.net_out = theano.shared(value=t_a_h_gou_matrix.astype(T.config.floatX), name="net_out", borrow=False)
-        # input gate in(t)
-        net_in = node.net_in = theano.shared(value=t_a_h_gin_matrix.astype(T.config.floatX), name="net_in", borrow=False)
-        # forget gate phi(t)
-        net_phi = node.net_phi = theano.shared(value=t_a_h_gfg_matrix.astype(T.config.floatX), name="net_phi", borrow=False)
-
-        # weight sets to be updated
-        # cell (c) -> output (k)
-        w_kc = node.w_kc = theano.shared(value=w_oh_por_array.astype(T.config.floatX), name="w_kc", borrow=False)
-        # input (i) -> output (k)
-        w_ki = node.w_ki = theano.shared(value=w_oi_array.astype(T.config.floatX), name="w_ki", borrow=False)
-        # cell (c) -> output gate (out)
-        w_outc = node.w_outc = theano.shared(value=w_h_gou_h_por_array.astype(T.config.floatX), name="w_outc", borrow=False)
-        # input (i) -> output gate (out)
-        w_outi = node.w_outi = theano.shared(value=w_h_gou_i_array.astype(T.config.floatX), name="w_outi", borrow=False)
-        # input (i) -> cell (c)
-        w_ci = node.w_ci = theano.shared(value=w_h_por_i_array.astype(T.config.floatX), name="w_ci", borrow=False)
-        # cell (c) -> cell (c)
-        w_cc = node.w_cc = theano.shared(value=w_h_por_h_por_array.astype(T.config.floatX), name="w_cc", borrow=False)
-        # input (i) -> input gate (in)
-        w_ini = node.w_ini = theano.shared(value=w_h_gin_i_array.astype(T.config.floatX), name="w_ini", borrow=False)
-        # cell (c) -> input gate (in)
-        w_inc = node.w_inc = theano.shared(value=w_h_gin_h_por_array.astype(T.config.floatX), name="w_inc", borrow=False)
-        # input (i) -> forget gate (phi)
-        w_phii = node.w_phii = theano.shared(value=w_h_gfg_i_array.astype(T.config.floatX), name="w_phii", borrow=False)
-        # cell (c) -> forget gate (phi)
-        w_phic = node.w_phic = theano.shared(value=w_h_gfg_h_por_array.astype(T.config.floatX), name="w_phic", borrow=False)
-
-        # bias sets to be updated
-        theta_i = node.theta_i = theano.shared(value=theta_input_array.astype(T.config.floatX), name="theta_i", borrow=False)
-        theta_k = node.theta_k = theano.shared(value=theta_output_array.astype(T.config.floatX), name="theta_k", borrow=False)
-        theta_in = node.theta_in = theano.shared(value=theta_lstm_gin_array.astype(T.config.floatX), name="theta_in", borrow=False)
-        theta_out = node.theta_out = theano.shared(value=theta_lstm_gou_array.astype(T.config.floatX), name="theta_out", borrow=False)
-        theta_phi = node.theta_phi = theano.shared(value=theta_lstm_gfg_array.astype(T.config.floatX), name="theta_phi", borrow=False)
-
-        # adadelta gradients and delta accumulation variables
-        node.accu_grad_w_kc = theano.shared(value=np.zeros_like(w_oh_por_array), name="accu_grad_w_kc", borrow=True)
-        node.accu_delta_w_kc = theano.shared(value=np.zeros_like(w_oh_por_array), name="accu_delta_w_kc", borrow=True)
name="accu_delta_w_kc", borrow=True) - node.accu_grad_w_ki = theano.shared(value=np.zeros_like(w_oi_array), name="accu_grad_w_ki", borrow=True) - node.accu_delta_w_ki = theano.shared(value=np.zeros_like(w_oi_array), name="accu_delta_w_ki", borrow=True) - node.accu_grad_w_outc = theano.shared(value=np.zeros_like(w_h_gou_h_por_array), name="accu_grad_w_outc", borrow=True) - node.accu_delta_w_outc = theano.shared(value=np.zeros_like(w_h_gou_h_por_array), name="accu_delta_w_outc", borrow=True) - node.accu_grad_w_outi = theano.shared(value=np.zeros_like(w_h_gou_i_array), name="accu_grad_w_outi", borrow=True) - node.accu_delta_w_outi = theano.shared(value=np.zeros_like(w_h_gou_i_array), name="accu_delta_w_outi", borrow=True) - node.accu_grad_w_ci = theano.shared(value=np.zeros_like(w_h_por_i_array), name="accu_grad_w_ci", borrow=True) - node.accu_delta_w_ci = theano.shared(value=np.zeros_like(w_h_por_i_array), name="accu_delta_w_ci", borrow=True) - node.accu_grad_w_cc = theano.shared(value=np.zeros_like(w_h_por_h_por_array), name="accu_grad_w_cc", borrow=True) - node.accu_delta_w_cc = theano.shared(value=np.zeros_like(w_h_por_h_por_array), name="accu_delta_w_cc", borrow=True) - node.accu_grad_w_ini = theano.shared(value=np.zeros_like(w_h_gin_i_array), name="accu_grad_w_ini", borrow=True) - node.accu_delta_w_ini = theano.shared(value=np.zeros_like(w_h_gin_i_array), name="accu_delta_w_ini", borrow=True) - node.accu_grad_w_inc = theano.shared(value=np.zeros_like(w_h_gin_h_por_array), name="accu_grad_w_inc", borrow=True) - node.accu_delta_w_inc = theano.shared(value=np.zeros_like(w_h_gin_h_por_array), name="accu_delta_w_inc", borrow=True) - node.accu_grad_w_phii = theano.shared(value=np.zeros_like(w_h_gfg_i_array), name="accu_grad_w_phii", borrow=True) - node.accu_delta_w_phii = theano.shared(value=np.zeros_like(w_h_gfg_i_array), name="accu_delta_w_phii", borrow=True) - node.accu_grad_w_phic = theano.shared(value=np.zeros_like(w_h_gfg_h_por_array), name="accu_grad_w_phic", borrow=True) - node.accu_delta_w_phic = theano.shared(value=np.zeros_like(w_h_gfg_h_por_array), name="accu_delta_w_phic", borrow=True) - node.accu_grad_theta_k = theano.shared(value=np.zeros_like(theta_output_array), name="accu_grad_theta_k", borrow=True) - node.accu_delta_theta_k = theano.shared(value=np.zeros_like(theta_output_array), name="accu_delta_theta_k", borrow=True) - node.accu_grad_theta_out = theano.shared(value=np.zeros_like(theta_lstm_gou_array), name="accu_grad_theta_out", borrow=True) - node.accu_delta_theta_out = theano.shared(value=np.zeros_like(theta_lstm_gou_array), name="accu_delta_theta_out", borrow=True) - node.accu_grad_theta_in = theano.shared(value=np.zeros_like(theta_lstm_gin_array), name="accu_grad_theta_in", borrow=True) - node.accu_delta_theta_in = theano.shared(value=np.zeros_like(theta_lstm_gin_array), name="accu_delta_theta_in", borrow=True) - node.accu_grad_theta_phi = theano.shared(value=np.zeros_like(theta_lstm_gfg_array), name="accu_grad_theta_phi", borrow=True) - node.accu_delta_theta_phi = theano.shared(value=np.zeros_like(theta_lstm_gfg_array), name="accu_delta_theta_phi", borrow=True) - - [errors, - deriv_ci_prev, - deriv_cc_prev, - deriv_ini_prev, - deriv_inc_prev, - deriv_in1_prev, - deriv_phii_prev, - deriv_phic_prev, - deriv_phi1_prev, - grad_w_ki, - grad_w_kc, - grad_w_outi, - grad_w_outc, - grad_w_ci, - grad_w_cc, - grad_w_ini, - grad_w_inc, - grad_w_phii, - grad_w_phic, - grad_theta_i, - grad_theta_k, - grad_theta_in, - grad_theta_out, - grad_theta_phi], updates = theano.scan( - 
-        [errors,
-         deriv_ci_prev,
-         deriv_cc_prev,
-         deriv_ini_prev,
-         deriv_inc_prev,
-         deriv_in1_prev,
-         deriv_phii_prev,
-         deriv_phic_prev,
-         deriv_phi1_prev,
-         grad_w_ki,
-         grad_w_kc,
-         grad_w_outi,
-         grad_w_outc,
-         grad_w_ci,
-         grad_w_cc,
-         grad_w_ini,
-         grad_w_inc,
-         grad_w_phii,
-         grad_w_phic,
-         grad_theta_i,
-         grad_theta_k,
-         grad_theta_in,
-         grad_theta_out,
-         grad_theta_phi], updates = theano.scan(
-            fn=bpttstep,
-            sequences=[dict(input=s, taps=[-0]),
-                       dict(input=tgt, taps=[-0]),
-                       dict(input=y_k, taps=[-0]),
-                       dict(input=y_i, taps=[-0]),
-                       dict(input=y_c, taps=[-0]),
-                       dict(input=net_in, taps=[-0]),
-                       dict(input=net_out, taps=[-0]),
-                       dict(input=net_phi, taps=[-0])],
-            outputs_info=[0.,  # error
-                          T.zeros_like(w_ci, dtype=T.config.floatX),  # deriv_ci_prev
-                          T.zeros_like(w_cc, dtype=T.config.floatX),  # deriv_cc_prev
-                          T.zeros_like(w_ini, dtype=T.config.floatX),  # deriv_ini_prev
-                          T.zeros_like(w_inc, dtype=T.config.floatX),  # deriv_inc_prev
-                          T.zeros_like(theta_in, dtype=T.config.floatX),  # deriv_in1_prev
-                          T.zeros_like(w_phii, dtype=T.config.floatX),  # deriv_phii_prev
-                          T.zeros_like(w_phic, dtype=T.config.floatX),  # deriv_phic_prev
-                          T.zeros_like(theta_phi, dtype=T.config.floatX),  # deriv_phi1_prev
-                          T.zeros_like(w_ki, dtype=T.config.floatX),  # delta_w_ki
-                          T.zeros_like(w_kc, dtype=T.config.floatX),  # delta_w_kc
-                          T.zeros_like(w_outi, dtype=T.config.floatX),  # delta_w_outi
-                          T.zeros_like(w_outc, dtype=T.config.floatX),  # delta_w_outc
-                          T.zeros_like(w_ci, dtype=T.config.floatX),  # delta_w_ci
-                          T.zeros_like(w_cc, dtype=T.config.floatX),  # delta_w_cc
-                          T.zeros_like(w_ini, dtype=T.config.floatX),  # delta_w_ini
-                          T.zeros_like(w_inc, dtype=T.config.floatX),  # delta_w_inc
-                          T.zeros_like(w_phii, dtype=T.config.floatX),  # delta_w_phii
-                          T.zeros_like(w_phic, dtype=T.config.floatX),  # delta_w_phic
-                          T.zeros_like(theta_i, dtype=T.config.floatX),  # delta_theta_i
-                          T.zeros_like(theta_k, dtype=T.config.floatX),  # delta_theta_k
-                          T.zeros_like(theta_in, dtype=T.config.floatX),  # delta_theta_in
-                          T.zeros_like(theta_out, dtype=T.config.floatX),  # delta_theta_out
-                          T.zeros_like(theta_phi, dtype=T.config.floatX)],  # delta_theta_phi
-            non_sequences=[w_kc,
-                           w_ci,
-                           w_cc,
-                           w_outc,
-                           w_outi,
-                           w_ini,
-                           w_inc,
-                           w_phii,
-                           w_phic],
-            go_backwards=True,
-            n_steps=steps,
-            strict=True)
-
-        # adadelta momentum
-        accu_grad_w_kc = rho * node.accu_grad_w_kc + (1. - rho) * (grad_w_kc[SEQUENCE_LENGTH - 1]**2)
-        delta_w_kc = (T.sqrt(node.accu_delta_w_kc + epsilon) / T.sqrt(accu_grad_w_kc + epsilon)) * grad_w_kc[SEQUENCE_LENGTH - 1]
-        accu_delta_w_kc = rho * node.accu_delta_w_kc + (1. - rho) * (delta_w_kc**2)
-
-        accu_grad_w_ki = rho * node.accu_grad_w_ki + (1. - rho) * (grad_w_ki[SEQUENCE_LENGTH - 1]**2)
-        delta_w_ki = (T.sqrt(node.accu_delta_w_ki + epsilon) / T.sqrt(accu_grad_w_ki + epsilon)) * grad_w_ki[SEQUENCE_LENGTH - 1]
-        accu_delta_w_ki = rho * node.accu_delta_w_ki + (1. - rho) * (delta_w_ki**2)
-
-        accu_grad_w_outc = rho * node.accu_grad_w_outc + (1. - rho) * (grad_w_outc[SEQUENCE_LENGTH - 1]**2)
-        delta_w_outc = (T.sqrt(node.accu_delta_w_outc + epsilon) / T.sqrt(accu_grad_w_outc + epsilon)) * grad_w_outc[SEQUENCE_LENGTH - 1]
-        accu_delta_w_outc = rho * node.accu_delta_w_outc + (1. - rho) * (delta_w_outc**2)
-
-        accu_grad_w_outi = rho * node.accu_grad_w_outi + (1. - rho) * (grad_w_outi[SEQUENCE_LENGTH - 1]**2)
-        delta_w_outi = (T.sqrt(node.accu_delta_w_outi + epsilon) / T.sqrt(accu_grad_w_outi + epsilon)) * grad_w_outi[SEQUENCE_LENGTH - 1]
-        accu_delta_w_outi = rho * node.accu_delta_w_outi + (1. - rho) * (delta_w_outi**2)
-
-        accu_grad_w_ci = rho * node.accu_grad_w_ci + (1. - rho) * (grad_w_ci[SEQUENCE_LENGTH - 1]**2)
-        delta_w_ci = (T.sqrt(node.accu_delta_w_ci + epsilon) / T.sqrt(accu_grad_w_ci + epsilon)) * grad_w_ci[SEQUENCE_LENGTH - 1]
-        accu_delta_w_ci = rho * node.accu_delta_w_ci + (1. - rho) * (delta_w_ci**2)
-
-        accu_grad_w_cc = rho * node.accu_grad_w_cc + (1. - rho) * (grad_w_cc[SEQUENCE_LENGTH - 1]**2)
-        delta_w_cc = (T.sqrt(node.accu_delta_w_cc + epsilon) / T.sqrt(accu_grad_w_cc + epsilon)) * grad_w_cc[SEQUENCE_LENGTH - 1]
-        accu_delta_w_cc = rho * node.accu_delta_w_cc + (1. - rho) * (delta_w_cc**2)
-
-        accu_grad_w_ini = rho * node.accu_grad_w_ini + (1. - rho) * (grad_w_ini[SEQUENCE_LENGTH - 1]**2)
-        delta_w_ini = (T.sqrt(node.accu_delta_w_ini + epsilon) / T.sqrt(accu_grad_w_ini + epsilon)) * grad_w_ini[SEQUENCE_LENGTH - 1]
-        accu_delta_w_ini = rho * node.accu_delta_w_ini + (1. - rho) * (delta_w_ini**2)
-
-        accu_grad_w_inc = rho * node.accu_grad_w_inc + (1. - rho) * (grad_w_inc[SEQUENCE_LENGTH - 1]**2)
-        delta_w_inc = (T.sqrt(node.accu_delta_w_inc + epsilon) / T.sqrt(accu_grad_w_inc + epsilon)) * grad_w_inc[SEQUENCE_LENGTH - 1]
-        accu_delta_w_inc = rho * node.accu_delta_w_inc + (1. - rho) * (delta_w_inc**2)
-
-        accu_grad_w_phii = rho * node.accu_grad_w_phii + (1. - rho) * (grad_w_phii[SEQUENCE_LENGTH - 1]**2)
-        delta_w_phii = (T.sqrt(node.accu_delta_w_phii + epsilon) / T.sqrt(accu_grad_w_phii + epsilon)) * grad_w_phii[SEQUENCE_LENGTH - 1]
-        accu_delta_w_phii = rho * node.accu_delta_w_phii + (1. - rho) * (delta_w_phii**2)
-
-        accu_grad_w_phic = rho * node.accu_grad_w_phic + (1. - rho) * (grad_w_phic[SEQUENCE_LENGTH - 1]**2)
-        delta_w_phic = (T.sqrt(node.accu_delta_w_phic + epsilon) / T.sqrt(accu_grad_w_phic + epsilon)) * grad_w_phic[SEQUENCE_LENGTH - 1]
-        accu_delta_w_phic = rho * node.accu_delta_w_phic + (1. - rho) * (delta_w_phic**2)
-
-        accu_grad_theta_k = rho * node.accu_grad_theta_k + (1. - rho) * (grad_theta_k[SEQUENCE_LENGTH - 1]**2)
-        delta_theta_k = (T.sqrt(node.accu_delta_theta_k + epsilon) / T.sqrt(accu_grad_theta_k + epsilon)) * grad_theta_k[SEQUENCE_LENGTH - 1]
-        accu_delta_theta_k = rho * node.accu_delta_theta_k + (1. - rho) * (delta_theta_k**2)
-
-        accu_grad_theta_out = rho * node.accu_grad_theta_out + (1. - rho) * (grad_theta_out[SEQUENCE_LENGTH - 1]**2)
-        delta_theta_out = (T.sqrt(node.accu_delta_theta_out + epsilon) / T.sqrt(accu_grad_theta_out + epsilon)) * grad_theta_out[SEQUENCE_LENGTH - 1]
-        accu_delta_theta_out = rho * node.accu_delta_theta_out + (1. - rho) * (delta_theta_out**2)
-
-        accu_grad_theta_in = rho * node.accu_grad_theta_in + (1. - rho) * (grad_theta_in[SEQUENCE_LENGTH - 1]**2)
-        delta_theta_in = (T.sqrt(node.accu_delta_theta_in + epsilon) / T.sqrt(accu_grad_theta_in + epsilon)) * grad_theta_in[SEQUENCE_LENGTH - 1]
-        accu_delta_theta_in = rho * node.accu_delta_theta_in + (1. - rho) * (delta_theta_in**2)
-
-        accu_grad_theta_phi = rho * node.accu_grad_theta_phi + (1. - rho) * (grad_theta_phi[SEQUENCE_LENGTH - 1]**2)
-        delta_theta_phi = (T.sqrt(node.accu_delta_theta_phi + epsilon) / T.sqrt(accu_grad_theta_phi + epsilon)) * grad_theta_phi[SEQUENCE_LENGTH - 1]
-        accu_delta_theta_phi = rho * node.accu_delta_theta_phi + (1. - rho) * (delta_theta_phi**2)
-
-        # update weights
-        w_kc += delta_w_kc
-        w_ki += delta_w_ki
-        w_outc += delta_w_outc
-        w_outi += delta_w_outi
-        w_ci += delta_w_ci
-        w_cc += delta_w_cc
-        w_ini += delta_w_ini
-        w_inc += delta_w_inc
-        w_phii += delta_w_phii
-        w_phic += delta_w_phic
-
-        # update biases
-        # theta_i += delta_theta_i
-        theta_k += delta_theta_k
-        theta_out += delta_theta_out
-        theta_in += delta_theta_in
-        theta_phi += delta_theta_phi
-
-        # this will provide new w values to be written back to the node net,
-        # as well as deriv_lm_prev values to be used in the next step
-        node.get_updated_parameters = theano.function([rho, epsilon, steps],
-                                                      errors,
-                                                      updates=[(node.w_kc, w_kc),
-                                                               (node.w_ki, w_ki),
-                                                               (node.w_outc, w_outc),
-                                                               (node.w_outi, w_outi),
-                                                               (node.w_ci, w_ci),
-                                                               (node.w_cc, w_cc),
-                                                               (node.w_ini, w_ini),
-                                                               (node.w_inc, w_inc),
-                                                               (node.w_phii, w_phii),
-                                                               (node.w_phic, w_phic),
-                                                               (node.theta_i, theta_i),
-                                                               (node.theta_k, theta_k),
-                                                               (node.theta_in, theta_in),
-                                                               (node.theta_out, theta_out),
-                                                               (node.theta_phi, theta_phi),
-                                                               (node.accu_grad_w_kc, accu_grad_w_kc),
-                                                               (node.accu_delta_w_kc, accu_delta_w_kc),
-                                                               (node.accu_grad_w_ki, accu_grad_w_ki),
-                                                               (node.accu_delta_w_ki, accu_delta_w_ki),
-                                                               (node.accu_grad_w_outc, accu_grad_w_outc),
-                                                               (node.accu_delta_w_outc, accu_delta_w_outc),
-                                                               (node.accu_grad_w_outi, accu_grad_w_outi),
-                                                               (node.accu_delta_w_outi, accu_delta_w_outi),
-                                                               (node.accu_grad_w_ci, accu_grad_w_ci),
-                                                               (node.accu_delta_w_ci, accu_delta_w_ci),
-                                                               (node.accu_grad_w_cc, accu_grad_w_cc),
-                                                               (node.accu_delta_w_cc, accu_delta_w_cc),
-                                                               (node.accu_grad_w_ini, accu_grad_w_ini),
-                                                               (node.accu_delta_w_ini, accu_delta_w_ini),
-                                                               (node.accu_grad_w_inc, accu_grad_w_inc),
-                                                               (node.accu_delta_w_inc, accu_delta_w_inc),
-                                                               (node.accu_grad_w_phii, accu_grad_w_phii),
-                                                               (node.accu_delta_w_phii, accu_delta_w_phii),
-                                                               (node.accu_grad_w_phic, accu_grad_w_phic),
-                                                               (node.accu_delta_w_phic, accu_delta_w_phic),
-                                                               (node.accu_grad_theta_k, accu_grad_theta_k),
-                                                               (node.accu_delta_theta_k, accu_delta_theta_k),
-                                                               (node.accu_grad_theta_out, accu_grad_theta_out),
-                                                               (node.accu_delta_theta_out, accu_delta_theta_out),
-                                                               (node.accu_grad_theta_in, accu_grad_theta_in),
-                                                               (node.accu_delta_theta_in, accu_delta_theta_in),
-                                                               (node.accu_grad_theta_phi, accu_grad_theta_phi),
-                                                               (node.accu_delta_theta_phi, accu_delta_theta_phi)
-                                                               ],
-                                                      on_unused_input='warn')
-
-        node.get_error = theano.function([], T.sum(T.square(tgt[SEQUENCE_LENGTH - 1] - y_k[SEQUENCE_LENGTH - 1])) / 2.)
-
-        node.initialized = True
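The per-weight-set blocks in the adadelta momentum section above all instantiate the same rule from Zeiler 2012: a decaying average of squared gradients rescales each step, and a decaying average of squared past steps takes the place of a global learning rate. Sketched in numpy for a single parameter tensor (illustrative names, not from the module):

    import numpy as np

    def adadelta_step(grad, accu_grad, accu_delta, rho=0.95, epsilon=1e-6):
        # E[g^2]: decaying average of squared gradients
        accu_grad = rho * accu_grad + (1. - rho) * grad**2
        # step: RMS of past deltas over RMS of gradients, times the gradient
        delta = np.sqrt(accu_delta + epsilon) / np.sqrt(accu_grad + epsilon) * grad
        # E[dx^2]: decaying average of squared deltas
        accu_delta = rho * accu_delta + (1. - rho) * delta**2
        return delta, accu_grad, accu_delta

The module adds rather than subtracts the resulting delta (w += delta) because the quantities accumulated by bpttstep are built from e_k = target - output and therefore already point in the descent direction.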
-
-    # every step
-
-    error_prev = node.get_state("current_error")
-    if error_prev is None:
-        error_prev = 0.
-    node.get_gate('e').gate_function(error_prev)
-
-    if netapi.step % 3 == 0 and node.get_slot("debug").activation > 0.5:
-        netapi.logger.debug("%10i: lstm sample step" % netapi.step)
-
-    if netapi.step % 3 != 1:
-        return
-    # every three steps, sample activation from LSTMs
-
-    node.t += 1
-    if node.t >= SEQUENCE_LENGTH:
-        node.t = 0
-
-    # roll time snapshots to the left
-    node.t_a_i_matrix = np.roll(node.t_a_i_matrix, -1, 0)
-    node.t_a_t_matrix = np.roll(node.t_a_t_matrix, -1, 0)
-    node.t_a_o_matrix = np.roll(node.t_a_o_matrix, -1, 0)
-    node.t_a_h_gen_matrix = np.roll(node.t_a_h_gen_matrix, -1, 0)
-    node.t_a_h_por_matrix = np.roll(node.t_a_h_por_matrix, -1, 0)
-    node.t_a_h_gin_matrix = np.roll(node.t_a_h_gin_matrix, -1, 0)
-    node.t_a_h_gou_matrix = np.roll(node.t_a_h_gou_matrix, -1, 0)
-    node.t_a_h_gfg_matrix = np.roll(node.t_a_h_gfg_matrix, -1, 0)
-
-    # insert new snapshot at the end
-    node.t_a_i_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, input)
-    node.t_a_t_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, target)
-    node.t_a_o_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, output)
-    node.t_a_h_gen_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_gen)
-    node.t_a_h_por_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_por)
-    node.t_a_h_gin_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_gin)
-    node.t_a_h_gou_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_gou)
-    node.t_a_h_gfg_matrix[SEQUENCE_LENGTH - 1, :] = netapi.get_activations(nodespace, lstm_gfg)
-    node.samples += 1
-
-    if node.get_slot("debug").activation > 0.5:
-        netapi.logger.debug("%10i: bp sample #%i t, i, c, k data: t[0]=%.6f i[0]=%.6f c[0]=%.6f k[0]=%.6f"
-                            % (netapi.step, node.t, node.t_a_t_matrix[node.t, 0], node.t_a_i_matrix[node.t, 0],
-                               node.t_a_h_por_matrix[node.t, 0], node.t_a_o_matrix[node.t, 0]))
-
-    if node.t != SEQUENCE_LENGTH - 1 or node.samples < 3:
-        return
-    # every SEQUENCE_LENGTH samples, do backpropagation-through-time for the sampled sequence
-
-    # netapi.logger.debug("t=%.6f o=%.6f s=%.6f c=%.6f i=%.6f" % (node.t_a_t_matrix[0, 0], node.t_a_o_matrix[0, 0],
-    #                     node.t_a_h_gen_matrix[0, 0], node.t_a_h_por_matrix[0, 0], node.t_a_i_matrix[0, 0]))
-    # netapi.logger.debug("t=%.6f o=%.6f s=%.6f c=%.6f i=%.6f" % (node.t_a_t_matrix[1, 0], node.t_a_o_matrix[1, 0],
-    #                     node.t_a_h_gen_matrix[1, 0], node.t_a_h_por_matrix[1, 0], node.t_a_i_matrix[1, 0]))
-    # netapi.logger.debug("t=%.6f o=%.6f s=%.6f c=%.6f i=%.6f" % (node.t_a_t_matrix[2, 0], node.t_a_o_matrix[2, 0],
-    #                     node.t_a_h_gen_matrix[2, 0], node.t_a_h_por_matrix[2, 0], node.t_a_i_matrix[2, 0]))
-
-    # fill w and a variables with values from the Node Net
-    node.w_kc.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, output), borrow=True)
-    node.w_ki.set_value(netapi.get_link_weights(nodespace, input, nodespace, output), borrow=True)
-    node.w_outc.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gou), borrow=True)
-    node.w_outi.set_value(netapi.get_link_weights(nodespace, input, nodespace, lstm_gou), borrow=True)
-    node.w_ci.set_value(netapi.get_link_weights(nodespace, input, nodespace, lstm_por), borrow=True)
-    node.w_cc.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_por), borrow=True)
-    node.w_ini.set_value(netapi.get_link_weights(nodespace, input, nodespace, lstm_gin), borrow=True)
-    node.w_inc.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gin), borrow=True)
-    node.w_phii.set_value(netapi.get_link_weights(nodespace, input, nodespace, lstm_gfg), borrow=True)
-    node.w_phic.set_value(netapi.get_link_weights(nodespace, lstm_por, nodespace, lstm_gfg), borrow=True)
-
-    node.theta_i.set_value(netapi.get_thetas(nodespace, input), borrow=True)
-    node.theta_k.set_value(netapi.get_thetas(nodespace, output), borrow=True)
-    node.theta_in.set_value(netapi.get_thetas(nodespace, lstm_gin), borrow=True)
-    node.theta_out.set_value(netapi.get_thetas(nodespace, lstm_gou), borrow=True)
-    node.theta_phi.set_value(netapi.get_thetas(nodespace, lstm_gfg), borrow=True)
-
-    node.tgt.set_value(node.t_a_t_matrix, borrow=True)
-    node.y_k.set_value(node.t_a_o_matrix, borrow=True)
-    node.y_i.set_value(node.t_a_i_matrix, borrow=True)
-    node.y_c.set_value(node.t_a_h_por_matrix, borrow=True)
-    node.s.set_value(node.t_a_h_gen_matrix, borrow=True)
-    node.net_out.set_value(node.t_a_h_gou_matrix, borrow=True)
-    node.net_in.set_value(node.t_a_h_gin_matrix, borrow=True)
-    node.net_phi.set_value(node.t_a_h_gfg_matrix, borrow=True)
-
-    try:
-        rho = float(node.get_parameter('adadelta_rho'))
-    except (TypeError, ValueError):
-        rho = 0.95
-        node.set_parameter('adadelta_rho', rho)
-
-    try:
-        epsilon = float(node.get_parameter('adadelta_epsilon'))
-    except (TypeError, ValueError):
-        epsilon = 0.000001
-        node.set_parameter('adadelta_epsilon', epsilon)
-
-    len_output = len(netapi.get_activations(nodespace, output))
-    len_input = len(netapi.get_activations(nodespace, input))
-    len_hidden = len(netapi.get_activations(nodespace, lstm_por))
-
-    # update the weights; all derivatives and weight update sums are 0 for the first step
-    errors = node.get_updated_parameters(rho, epsilon, node.t + 1)
-
-    if node.get_slot("debug").activation > 0.5:
-        netapi.logger.debug("%10i: bp with error %.4f" % (netapi.step, errors[SEQUENCE_LENGTH - 1]))
-
-    # write back changed weights to node net
-
-    # netapi.set_thetas(nodespace, input, node.theta_i.get_value(borrow=True))
-    if node.get_parameter("bias_gin") == "true":
-        netapi.set_thetas(nodespace, lstm_gin, node.theta_in.get_value(borrow=True))
-    if node.get_parameter("bias_gou") == "true":
-        netapi.set_thetas(nodespace, lstm_gou, node.theta_out.get_value(borrow=True))
-    if node.get_parameter("bias_gfg") == "true":
-        netapi.set_thetas(nodespace, lstm_gfg, node.theta_phi.get_value(borrow=True))
-
-    netapi.set_link_weights(nodespace, input, nodespace, lstm_gou, node.w_outi.get_value(borrow=True))
-    netapi.set_link_weights(nodespace, input, nodespace, lstm_por, node.w_ci.get_value(borrow=True))
-    netapi.set_link_weights(nodespace, input, nodespace, lstm_gin, node.w_ini.get_value(borrow=True))
-    netapi.set_link_weights(nodespace, input, nodespace, lstm_gfg, node.w_phii.get_value(borrow=True))
-    netapi.set_link_weights(nodespace, lstm_por, nodespace, output, node.w_kc.get_value(borrow=True))
-
-    if node.get_parameter("links_io") == "true":
-        netapi.set_link_weights(nodespace, input, nodespace, output, node.w_ki.get_value(borrow=True))
-    if node.get_parameter("links_porpor") == "true":
-        netapi.set_link_weights(nodespace, lstm_por, nodespace, lstm_por, node.w_cc.get_value(borrow=True))
-    if node.get_parameter("links_porgin") == "true":
-        netapi.set_link_weights(nodespace, lstm_por, nodespace, lstm_gin, node.w_inc.get_value(borrow=True))
-    if node.get_parameter("links_porgou") == "true":
-        netapi.set_link_weights(nodespace, lstm_por, nodespace, lstm_gou, node.w_outc.get_value(borrow=True))
-    if node.get_parameter("links_porgfg") == "true":
-        netapi.set_link_weights(nodespace, lstm_por, nodespace, lstm_gfg, node.w_phic.get_value(borrow=True))
-
-    node.set_state('current_error', errors[SEQUENCE_LENGTH - 1])
-    node.set_state('error', node.get_state('error') + errors[SEQUENCE_LENGTH - 1])
-    if node.get_state('updates') % 100 == 0:
-        netapi.logger.debug("Number of lstm backprop steps computed %d" % node.get_state('updates'))
-        netapi.logger.debug("Error %.6f (Latest from loop: 0=%.6f)" % ((node.get_state('error') / 100), errors[SEQUENCE_LENGTH - 1]))
-        node.set_state('error', 0.0)
-
-    # after weight updates, reset gen loops of lstms
-    netapi.substitute_activations(nodespace, lstm_gen, np.zeros_like(netapi.get_activations(nodespace, lstm_gen)))
-    # netapi.substitute_activations(nodespace, "lstm_por", np.zeros_like(a_h_por_array))
-
-    node.set_state('updates', node.get_state('updates') + 1)
-
-
 def gradient_descent(netapi, node=None, **params):
     """
     Online gradient descent with backpropagation for three layers (input, hidden,