From e2b225549eadc1f9c49c10d69a88bc93f662f20a Mon Sep 17 00:00:00 2001
From: DeLaVlag <michielvandervlag@gmail.com>
Date: Fri, 27 Sep 2019 14:18:05 +0200
Subject: [PATCH] first version of LEMS interpreter for TVB

---
 NeuroML/NeuroMl/RateBased_kura.xml            |  34 ++++
 NeuroML/README.md                             |  54 ++++++
 NeuroML/dsl_python/tvbLEMS.py                 | 145 ++++++++++++++++
 NeuroML/models/CUDA/kuramoto_network.c        | 128 ++++++++++++++
 .../models/CUDA/kuramoto_network_template.c   | 128 ++++++++++++++
 NeuroML/models/CUDA/network_2d.c              | 142 ++++++++++++++++
 NeuroML/models/CUDA/network_rww.c             | 157 ++++++++++++++++++
 NeuroML/models/CUDA/rateB_kuramoto.c          | 128 ++++++++++++++
 NeuroML/unittests/validate_output_equality.py |  13 ++
 9 files changed, 929 insertions(+)
 create mode 100755 NeuroML/NeuroMl/RateBased_kura.xml
 create mode 100644 NeuroML/README.md
 create mode 100644 NeuroML/dsl_python/tvbLEMS.py
 create mode 100644 NeuroML/models/CUDA/kuramoto_network.c
 create mode 100644 NeuroML/models/CUDA/kuramoto_network_template.c
 create mode 100755 NeuroML/models/CUDA/network_2d.c
 create mode 100644 NeuroML/models/CUDA/network_rww.c
 create mode 100644 NeuroML/models/CUDA/rateB_kuramoto.c
 create mode 100644 NeuroML/unittests/validate_output_equality.py
diff --git a/NeuroML/NeuroMl/RateBased_kura.xml b/NeuroML/NeuroMl/RateBased_kura.xml
new file mode 100755
index 0000000..60adc6a
--- /dev/null
+++ b/NeuroML/NeuroMl/RateBased_kura.xml
@@ -0,0 +1,34 @@
+<Lems
+      xmlns="http://www.neuroml.org/lems/0.7.4"
+      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+      xsi:schemaLocation="http://www.neuroml.org/lems/0.7.4 ../../LEMS/Schemas/LEMS/LEMS_v0.7.4.xsd"
+      description="A number of ComponentTypes for rate based/population models">
+
+    <ComponentType name="Kuramoto"
+                   description="Base type of any cell/population which has a (dimensionless) rate _R."
+                   value="none">
+
+        <Parameter name="global_coupling" dimension='float'/>
+        <Parameter name="global_speed" dimension='float'/>
+
+        <Requirement name="MODEL_LIMIT" dimension="2_pi"/> <!-- put helping functions in util, select wrapping functions.  -->
+
+        <Constant name="rec_n" dimension="none" value="1.0f / n_node" />
+        <Constant name="rec_speed_dt" dimension="1/distance" value="1.0f / global_speed / (dt)" />
+        <Constant name="omega" dimension="MHz" value="60.0 * 2.0 * M_PI_F / 1e3" />
+        <Constant name="sig" dimension="none" value="sqrt(dt) * sqrt(2.0 * 1e-5)" />
+        
+        <Exposure name="OBSERV_OUTPUT" dimension="sin(theta_i)"/>
+
+        <Dynamics>
+            <StateVariable name="MODEL_STATE" dimension="theta_i"/>
+
+            <DerivedVariable name="COUPLING_FUNCTION" exposure="coupling_value" value="wij * sin(theta_j - theta_i)" reduce="add"/>
+            <DerivedVariable name="MODEL_DYNAMICS" exposure="dynamics1" value="(omega + global_coupling * rec_n * coupling_value)" select="noiseON"/>
+
+            <TimeDerivative variable="INTEGRATION_SCHEME" value="dt"/>
+        </Dynamics>
+
+    </ComponentType>
+
+</Lems>
diff --git a/NeuroML/README.md b/NeuroML/README.md
new file mode 100644
index 0000000..381268b
--- /dev/null
+++ b/NeuroML/README.md
@@ -0,0 +1,54 @@
+﻿# TVB_DSL XML (LEMS) to CUDA code generation
+This readme describes the usage of the automatic code generation of the XML to CUDA. As an example 'tvbLEMS.py' takes the Kuramoto model (model for the behaviour of large set of coupled ocillators) defined in 'RateBased_kura.xml' and translates this in a CUDA file called 'rateB_kura.c'. It then compares the resulting output file to the golden file 'kuramoto_network.c' and outputs any differences between the file. A unit test which compares two files can also be selected.
+
+# Author
+Sandra Diaz & Michiel van der Vlag @ Forschungszentrum Juelich.
+
+# License
+Apache License. Version 2.0, January 2004
+
+# Files
+* /dsl_python/tvbLEMS.py 			: python script to convert XML(LEMS) to CUDA
+* /NeuroMl/RateBased_kura.xml 			: XML LEMS format containing Kuramoto model
+* /models/CUDA/kuratmoto_network_template.c 	: Template with placeholders for XML model
+* /models/CUDA/kuramoto_network.c 		: Golden model file for comparison
+* /models/CUDA/rateB_kura.c 			: Resulting Cuda output file
+* /unittests/validate_output_equality.py	: Unittest to validate equality of output
+
+# Prerequisites
+pyLEMS: https://github.com/LEMS/pylems. 
+sudo pip install pylems
+
+# XML LEMS Definitions 
+http://lems.github.io/LEMS/elements.html
+This section defines the fields on the XML file and how they have been interpretated for the Kuramoto model.
+
+```XML
+* <Parameter name="string" dimension="string"/>
+	Sets the name and dimensinality of the parameter that must be supplied when a component is defined. These are the inputs from outside to the model.
+
+* <Requirement name="string" dimension="string" description="string"/>
+	Sets the name and dimensionality that should be accessible within the scope of a model component. Used for selecting the wrapping function for the limits of the model. The desciption holds the actual value.
+
+* <Constant name="string" dimension="string" value="string"/>
+	This is like a parameter but the value is supplied within the model definition itself. The dimension is used to set the unit of the constant. 
+
+* <Exposure name="string" dimension="string"/>
+	For variables that should be made available to other components. Used to output the results of the model. 
+
+Dynamics
+	Specifies the dynamical behaviour of the model.
+
+   * <StateVariable name="string" dimension="string"/>
+   		Name of the state variable in state elements. Dimension is used for the value
+
+   * <DerivedVariable name="string" exposure="string" value="string" reduce="add/mul" select="noiseOn/noiseOff"/>
+   		A quantity that depends algebraically on other quantities in the model. 
+		The 'value' field can be set to a mathematical expression. 
+		The reduce field is used to set the mathematical expression (Add: +=, Mul: *=). 		The select field can be used to add noise to the integration step.
+
+   * <TimeDerivative variable="string" value="string"/>
+   		Expresses the time step uped for the model. Variable is used for the name.
+```
+
+
diff --git a/NeuroML/dsl_python/tvbLEMS.py b/NeuroML/dsl_python/tvbLEMS.py
new file mode 100644
index 0000000..c249e0a
--- /dev/null
+++ b/NeuroML/dsl_python/tvbLEMS.py
@@ -0,0 +1,145 @@
+#! /usr/bin/python
+
+import xml.etree.ElementTree as xe
+import sys
+# from lems.model.model import Model
+from lems.api import Model
+import inspect
+import filecmp
+
+i=0
+
+def build_constants():
+    """
+    Builds the constants section for the cuda kernel
+    @raise ValueError: Raised when the string is not in kuramoto_network_template.c.
+    """
+
+    datatowrite.insert(num, datatowrite.pop(i).replace("<TYPE_1>",
+                                                       modelist[0].parameters['global_coupling'].dimension))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<PARAMETER_1>",
+                                                       modelist[0].parameters['global_coupling'].name))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<TYPE_2>",
+                                                       modelist[0].parameters['global_speed'].dimension))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<PARAMETER_2>",
+                                                       modelist[0].parameters['global_speed'].name))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<RECN_NAME>",
+                                                       modelist[0].constants['rec_n'].name))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<RECN_VALUE>",
+                                                       modelist[0].constants['rec_n'].value))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<RECSPEED_NAME>",
+                                                       modelist[0].constants['rec_speed_dt'].name))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<RECSPEED_VALUE>",
+                                                       modelist[0].constants['rec_speed_dt'].value))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<OMEGA_NAME>",
+                                                       modelist[0].constants['omega'].name))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<OMEGA_VALUE>",
+                                                       modelist[0].constants['omega'].value))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<NOISE_NAME>",
+                                                       modelist[0].constants['sig'].name))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<NOISE_VALUE>",
+                                                       modelist[0].constants['sig'].value))
+
+def build_modeldyn():
+    """
+    Builds the model dynamics section for the cuda kernel
+    Noise is either on or off set by the noiseOn/noiseOff select option in the xmlmodel
+    @raise ValueError: Raised when the string is not in kuramoto_network_template.c.
+    """
+
+    global i
+
+    dyn_noise = model.component_types['Kuramoto'].dynamics.derived_variables['MODEL_DYNAMICS'].select
+
+    datatowrite.insert(num, datatowrite.pop(i).replace("<MODEL_STATE>",
+                                                    modelist[0].dynamics.state_variables['MODEL_STATE'].dimension))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<INTEGRATION_SCHEME>",
+                                                    modelist[0].dynamics.time_derivatives['INTEGRATION_SCHEME'].value))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<MODEL_DYNAMICS>",
+                                                    modelist[0].dynamics.derived_variables['MODEL_DYNAMICS'].value))
+    if dyn_noise == 'noiseON':
+        datatowrite.insert(num, datatowrite.pop(i).replace("<NOISE_PARAM>",
+                                                    modelist[0].constants['sig'].name))
+    if dyn_noise == 'noiseOFF' and '<NOISE_PARAM>' in line:
+        datatowrite.pop(i)
+        datatowrite.pop(i - 1)
+        i -= 2
+    datatowrite.insert(num, datatowrite.pop(i).replace("<MODEL_LIMIT>",
+                                                    modelist[0].requirements['MODEL_LIMIT'].dimension))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<OBSERV_OUTPUT>",
+                                                    modelist[0].exposures['OBSERV_OUTPUT'].dimension))
+
+
+def build_coupling():
+    """
+    Builds the model couling section for the cuda kernel
+    @raise ValueError: Raised when the string is not in kuramoto_network_template.c.
+    """
+
+    # switch reduce for the actual csyntax
+    switcher = {
+        'add': '+=',
+        'sub': '-=',
+        'mul': '*=',
+        'div': '/='
+    }
+
+    datatowrite.insert(num, datatowrite.pop(i).replace("<COUPLING_FUNCTION_NAME>",
+                                                modelist[0].dynamics.derived_variables['COUPLING_FUNCTION'].exposure))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<COUPLING_FUNCTION_REDUCE>",switcher.get(
+                                                modelist[0].dynamics.derived_variables['COUPLING_FUNCTION']
+                                                    .reduce, "reduce argument not listed")))
+    datatowrite.insert(num, datatowrite.pop(i).replace("<COUPLING_FUNCTION_VALUE>",
+                                                modelist[0].dynamics.derived_variables['COUPLING_FUNCTION'].value))
+
+def comparefiles():
+
+    # print(filecmp.cmp(fp_cuda, fp_golden, shallow = False))
+    import difflib
+    import sys
+    from termcolor import colored, cprint
+
+    text1 = open(fp_cuda).readlines()
+    text2 = open(fp_golden).readlines()
+
+    num=0
+    for num, line in enumerate(difflib.unified_diff(text1, text2),0):
+        print(line)
+
+    if num == 0:
+        toprint=('\n The file ' + fp_cuda + ' equals ' + fp_golden + ' dead on!')
+        cprint(toprint, 'cyan')
+
+if __name__ == '__main__':
+
+    fp_xml = '../NeuroMl/RateBased_kura.xml'
+    fp_templ = '../models/CUDA/kuramoto_network_template.c'
+    fp_cuda = '../models/CUDA/rateB_kuramoto.c'
+    fp_golden = '../models/CUDA/kuramoto_network.c'
+
+    datatowrite=[]
+    with open(fp_templ, 'r') as c:
+        datatoread = c.readlines()
+
+    model = Model()
+    model.import_from_file(fp_xml)
+
+    modelist=list()
+    modelist.append(model.component_types['Kuramoto'])
+
+    print(inspect.getmembers(modelist[0]))
+
+    for num, line in enumerate(datatoread,1):
+        datatowrite.append(line)
+        build_constants()
+        build_modeldyn()
+        build_coupling()
+        i += 1
+
+    with open(fp_cuda, "w") as f:
+        f.writelines(datatowrite)
+
+    #compare files
+    comparefiles()
+
+    print("----------------------------------------------")
\ No newline at end of file
diff --git a/NeuroML/models/CUDA/kuramoto_network.c b/NeuroML/models/CUDA/kuramoto_network.c
new file mode 100644
index 0000000..da2cdb8
--- /dev/null
+++ b/NeuroML/models/CUDA/kuramoto_network.c
@@ -0,0 +1,128 @@
+#define PI_2 (2 * M_PI_F)
+
+// buffer length defaults to the argument to the integrate kernel
+// but if it's known at compile time, it can be provided which allows
+// compiler to change i%n to i&(n-1) if n is a power of two.
+#ifndef NH
+#define NH nh
+#endif
+
+#ifndef WARP_SIZE
+#define WARP_SIZE 32
+#endif
+
+#include <stdio.h> // for printf
+#include <curand_kernel.h>
+#include <curand.h>
+
+__device__ float wrap_2_pi_(float x)/*{{{*/
+{
+    bool neg_mask = x < 0.0f;
+    bool pos_mask = !neg_mask;
+    // fmodf diverges 51% of time
+    float pos_val = fmodf(x, PI_2);
+    float neg_val = PI_2 - fmodf(-x, PI_2);
+    return neg_mask * neg_val + pos_mask * pos_val;
+}/*}}}*/
+
+__device__ float wrap_2_pi(float x) // not divergent/*{{{*/
+{
+    bool lt_0 = x < 0.0f;
+    bool gt_2pi = x > PI_2;
+    return (x + PI_2)*lt_0 + x*(!lt_0)*(!gt_2pi) + (x - PI_2)*gt_2pi;
+}/*}}}*/
+
+__global__ void integrate(/*{{{*/
+        // config
+        unsigned int i_step, unsigned int n_node, unsigned int nh, unsigned int n_step, unsigned int n_params,
+        float dt, float speed,
+        float * __restrict__ weights,
+        float * __restrict__ lengths,
+        float * __restrict__ params_pwi, // pwi: per work item
+        // state
+        float * __restrict__ state_pwi,
+        // outputs
+        float * __restrict__ tavg_pwi
+        )
+{/*}}}*/
+
+    // work id & size/*{{{*/
+    const unsigned int id = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    const unsigned int size = blockDim.x * gridDim.x * gridDim.y;/*}}}*/
+
+    // ND array accessors (TODO autogen from py shape info)/*{{{*/
+#define params(i_par) (params_pwi[(size * (i_par)) + id])
+#define state(time, i_node) (state_pwi[((time) * n_node + (i_node))*size + id])
+#define tavg(i_node) (tavg_pwi[((i_node) * size) + id])/*}}}*/
+
+    // unpack params/*{{{*/
+    //***// These are the two parameters which are usually explore in fitting in this model
+    const float global_coupling = params(1);
+    const float global_speed = params(0);/*}}}*/
+
+    // derived/*{{{*/
+    const float rec_n = 1.0f / n_node;
+    //***// The speed affects the delay and speed_value is a parameter which is usually explored in fitting ***
+    const float rec_speed_dt = 1.0f / global_speed / (dt);
+    //***// This is a parameter specific to the Kuramoto model
+    const float omega = 60.0 * 2.0 * M_PI_F / 1e3;
+    //***// This is a parameter for the stochastic integration step, you can leave this constant for the moment
+    const float sig = sqrt(dt) * sqrt(2.0 * 1e-5);/*}}}*/ //-->noise sigma value
+
+    curandState s;
+    curand_init(id * (blockDim.x * gridDim.x * gridDim.y), 0, 0, &s);
+
+    //***// This is only initialization of the observable
+    for (unsigned int i_node = 0; i_node < n_node; i_node++)
+        tavg(i_node) = 0.0f;
+
+    //***// This is the loop over time, should stay always the same
+    for (unsigned int t = i_step; t < (i_step + n_step); t++)
+    {
+    //***// This is the loop over nodes, which also should stay the same
+        for (unsigned int i_node = threadIdx.y; i_node < n_node; i_node+=blockDim.y)
+        {
+            //***// We here gather the current state of the node
+            float theta_i = state(t % NH, i_node);
+            //***// This variable is used to traverse the weights and lengths matrix, which is really just a vector. It is just a displacement.
+            unsigned int i_n = i_node * n_node;
+            float sum = 0.0f;
+
+           //***// For all nodes that are not the current node (i_node) sum the coupling
+            for (unsigned int j_node = 0; j_node < n_node; j_node++)
+            {
+            //***// Get the weight of the coupling between node i and node j
+                float wij = weights[i_n + j_node]; // nb. not coalesced
+                if (wij == 0.0)
+                    continue;
+                //***// Get the delay between node i and node j
+                unsigned int dij = lengths[i_n + j_node] * rec_speed_dt;
+                //***// Get the state of node j which is delayed by dij
+                float theta_j = state((t - dij + NH) % NH, j_node);
+                //***// Sum it all together using the coupling function. This is a kuramoto coupling so: a * sin(pre_syn - post_syn)
+                coupling_value += wij * sin(theta_j - theta_i);
+            } // j_node
+
+            //***// This is actually the integration step and the update in the state of the node
+            theta_i += dt * (omega + global_coupling * rec_n * coupling_value);
+            //***// We add some noise if noise is selected
+            theta_i += sig * curand_normal2(&s).x;
+            //***// Wrap it within the limits of the model (0-2pi)
+            theta_i = wrap_2_pi_(theta_i);
+            //***// Update the state
+            state((t + 1) % NH, i_node) = theta_i;
+            //***// Update the observable
+            tavg(i_node) = sin(theta_i);
+
+            // sync across warps executing nodes for single sim, before going on to next time step
+            __syncthreads();
+
+        } // for i_node
+    } // for t
+
+// cleanup macros/*{{{*/
+#undef params
+#undef state
+#undef tavg/*}}}*/
+
+} // kernel integrate
\ No newline at end of file
diff --git a/NeuroML/models/CUDA/kuramoto_network_template.c b/NeuroML/models/CUDA/kuramoto_network_template.c
new file mode 100644
index 0000000..4eead4f
--- /dev/null
+++ b/NeuroML/models/CUDA/kuramoto_network_template.c
@@ -0,0 +1,128 @@
+#define PI_2 (2 * M_PI_F)
+
+// buffer length defaults to the argument to the integrate kernel
+// but if it's known at compile time, it can be provided which allows
+// compiler to change i%n to i&(n-1) if n is a power of two.
+#ifndef NH
+#define NH nh
+#endif
+
+#ifndef WARP_SIZE
+#define WARP_SIZE 32
+#endif
+
+#include <stdio.h> // for printf
+#include <curand_kernel.h>
+#include <curand.h>
+
+__device__ float wrap_2_pi_(float x)/*{{{*/
+{
+    bool neg_mask = x < 0.0f;
+    bool pos_mask = !neg_mask;
+    // fmodf diverges 51% of time
+    float pos_val = fmodf(x, PI_2);
+    float neg_val = PI_2 - fmodf(-x, PI_2);
+    return neg_mask * neg_val + pos_mask * pos_val;
+}/*}}}*/
+
+__device__ float wrap_2_pi(float x) // not divergent/*{{{*/
+{
+    bool lt_0 = x < 0.0f;
+    bool gt_2pi = x > PI_2;
+    return (x + PI_2)*lt_0 + x*(!lt_0)*(!gt_2pi) + (x - PI_2)*gt_2pi;
+}/*}}}*/
+
+__global__ void integrate(/*{{{*/
+        // config
+        unsigned int i_step, unsigned int n_node, unsigned int nh, unsigned int n_step, unsigned int n_params,
+        float dt, float speed,
+        float * __restrict__ weights,
+        float * __restrict__ lengths,
+        float * __restrict__ params_pwi, // pwi: per work item
+        // state
+        float * __restrict__ state_pwi,
+        // outputs
+        float * __restrict__ tavg_pwi
+        )
+{/*}}}*/
+
+    // work id & size/*{{{*/
+    const unsigned int id = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    const unsigned int size = blockDim.x * gridDim.x * gridDim.y;/*}}}*/
+
+    // ND array accessors (TODO autogen from py shape info)/*{{{*/
+#define params(i_par) (params_pwi[(size * (i_par)) + id])
+#define state(time, i_node) (state_pwi[((time) * n_node + (i_node))*size + id])
+#define tavg(i_node) (tavg_pwi[((i_node) * size) + id])/*}}}*/
+
+    // unpack params/*{{{*/
+    //***// These are the two parameters which are usually explore in fitting in this model
+    const <TYPE_1> <PARAMETER_1> = params(1);
+    const <TYPE_2> <PARAMETER_2> = params(0);/*}}}*/
+
+    // derived/*{{{*/
+    const float <RECN_NAME> = <RECN_VALUE>;
+    //***// The speed affects the delay and speed_value is a parameter which is usually explored in fitting ***
+    const float <RECSPEED_NAME> = <RECSPEED_VALUE>;
+    //***// This is a parameter specific to the Kuramoto model
+    const float <OMEGA_NAME> = <OMEGA_VALUE>;
+    //***// This is a parameter for the stochastic integration step, you can leave this constant for the moment
+    const float <NOISE_NAME> = <NOISE_VALUE>;/*}}}*/ //-->noise sigma value
+
+    curandState s;
+    curand_init(id * (blockDim.x * gridDim.x * gridDim.y), 0, 0, &s);
+
+    //***// This is only initialization of the observable
+    for (unsigned int i_node = 0; i_node < n_node; i_node++)
+        tavg(i_node) = 0.0f;
+
+    //***// This is the loop over time, should stay always the same
+    for (unsigned int t = i_step; t < (i_step + n_step); t++)
+    {
+    //***// This is the loop over nodes, which also should stay the same
+        for (unsigned int i_node = threadIdx.y; i_node < n_node; i_node+=blockDim.y)
+        {
+            //***// We here gather the current state of the node
+            float theta_i = state(t % NH, i_node);
+            //***// This variable is used to traverse the weights and lengths matrix, which is really just a vector. It is just a displacement.
+            unsigned int i_n = i_node * n_node;
+            float sum = 0.0f;
+
+           //***// For all nodes that are not the current node (i_node) sum the coupling
+            for (unsigned int j_node = 0; j_node < n_node; j_node++)
+            {
+            //***// Get the weight of the coupling between node i and node j
+                float wij = weights[i_n + j_node]; // nb. not coalesced
+                if (wij == 0.0)
+                    continue;
+                //***// Get the delay between node i and node j
+                unsigned int dij = lengths[i_n + j_node] * rec_speed_dt;
+                //***// Get the state of node j which is delayed by dij
+                float theta_j = state((t - dij + NH) % NH, j_node);
+                //***// Sum it all together using the coupling function. This is a kuramoto coupling so: a * sin(pre_syn - post_syn)
+                <COUPLING_FUNCTION_NAME> <COUPLING_FUNCTION_REDUCE> <COUPLING_FUNCTION_VALUE>;
+            } // j_node
+
+            //***// This is actually the integration step and the update in the state of the node
+            <MODEL_STATE> += <INTEGRATION_SCHEME> * <MODEL_DYNAMICS>;
+            //***// We add some noise if noise is selected
+            <MODEL_STATE> += <NOISE_PARAM> * curand_normal2(&s).x;
+            //***// Wrap it within the limits of the model (0-2pi)
+            <MODEL_STATE> = wrap_<MODEL_LIMIT>_(<MODEL_STATE>);
+            //***// Update the state
+            state((t + 1) % NH, i_node) = <MODEL_STATE>;
+            //***// Update the observable
+            tavg(i_node) = <OBSERV_OUTPUT>;
+
+            // sync across warps executing nodes for single sim, before going on to next time step
+            __syncthreads();
+
+        } // for i_node
+    } // for t
+
+// cleanup macros/*{{{*/
+#undef params
+#undef state
+#undef tavg/*}}}*/
+
+} // kernel integrate
\ No newline at end of file
diff --git a/NeuroML/models/CUDA/network_2d.c b/NeuroML/models/CUDA/network_2d.c
new file mode 100755
index 0000000..e84e491
--- /dev/null
+++ b/NeuroML/models/CUDA/network_2d.c
@@ -0,0 +1,142 @@
+#include <stdio.h> // for printf
+#define PI_2 (2 * M_PI_F)
+
+// buffer length defaults to the argument to the integrate kernel
+// but if it's known at compile time, it can be provided which allows
+// compiler to change i%n to i&(n-1) if n is a power of two.
+#ifndef NH
+#define NH nh
+#endif
+
+#ifndef WARP_SIZE
+#define WARP_SIZE 32
+#endif
+
+
+#include <curand_kernel.h>
+#include <curand.h>
+
+__device__ float wrap_2_pi_(float x)/*{{{*/
+{
+    bool neg_mask = x < 0.0f;
+    bool pos_mask = !neg_mask;
+    // fmodf diverges 51% of time
+    float pos_val = fmodf(x, PI_2);
+    float neg_val = PI_2 - fmodf(-x, PI_2);
+    return neg_mask * neg_val + pos_mask * pos_val;
+}/*}}}*/
+
+__device__ float wrap_2_pi(float x) // not divergent/*{{{*/
+{
+    bool lt_0 = x < 0.0f;
+    bool gt_2pi = x > PI_2;
+    return (x + PI_2)*lt_0 + x*(!lt_0)*(!gt_2pi) + (x - PI_2)*gt_2pi;
+}/*}}}*/
+
+
+const float tau = 1.0;
+const float I = 0.0;
+const float a = -2.0;
+const float b = -10.0;
+const float c = 0.0;
+const float d = 0.02;
+const float e = 3.0;
+const float f = 1.0;
+const float g = 0.0;
+const float beta = 1.0;
+const float alpha = 1.0;
+const float gam = 1.0;
+
+
+__global__ void integrate_2d(
+        // config
+        unsigned int i_step, unsigned int n_node, unsigned int nh, unsigned int n_step, unsigned int n_params,
+        float dt, float speed,
+        float * weights,
+        float * lengths,
+        float * params_pwi, // pwi: per work item
+        // state
+        float * state_pwi,
+        // outputs
+        float * tavg_pwi
+        )
+{
+    // work id & size
+    const unsigned int id = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    const unsigned int size = blockDim.x * gridDim.x * gridDim.y;
+
+    // ND array accessors (TODO autogen from py shape info)
+#define params(i_par) (params_pwi[(size * (i_par)) + id])
+#define state(time, i_node) (state_pwi[((time) *2 * n_node + (i_node))*(size) + id])
+#define tavg(i_node) (tavg_pwi[((i_node) * size) + id])
+
+    // unpack params
+    
+
+    // derived
+    const float sig = 0.0001; //params(0);//0.001;//sqrt(dt) * sqrt(2.0 * 1e-3);
+    const float rec_speed_dt = params(0);
+    const float G = params(1);
+    const float lc = 0.0;
+    
+    curandState s;
+    curand_init(id + (unsigned int) clock64(), 0, 0, &s);
+ 
+    double derivV = 0.0;
+    double derivW = 0.0;
+    double V = 0.0;
+    double W = 0.0;
+   
+    double sum = 0.0;
+    float wij = 0.0f;
+    float V_j = 0.0;
+    unsigned int dij = 0;
+    
+
+    for (unsigned int i_node = 0; i_node < n_node; i_node++){
+        tavg(i_node) = 0.0f;
+        if (i_step == 0){
+            state(i_step, i_node) = 0.001;
+        }
+    }
+
+    for (unsigned int t = i_step; t < (i_step + n_step); t++)
+    {
+        for (unsigned int i_node = 0; i_node < n_node; i_node++)
+        {
+            sum = 0.0f;
+            V = state((t) % nh, i_node);
+            W = state((t) % nh, i_node + n_node);
+            for (unsigned int j_node = 0; j_node < n_node; j_node++)
+            {
+                float wij = weights[(i_node*n_node) + j_node]; // nb. not coalesced
+                if (wij == 0.0)
+                    continue;
+                dij = lengths[(i_node*n_node) + j_node] * rec_speed_dt;
+                V_j = state((t - dij + NH) % NH, j_node);
+                sum += wij * sin(V_j - V);
+            }
+            sum = G*sum; // Global coupling
+
+            derivV = d * tau * (alpha * W - f * powf(V,3) + e * powf(V,2) + g * V + gam * I + gam * sum + lc * V);
+            derivW = d * (a + b * V + c * powf(V,2) - beta * W) / tau;
+            V = (V)+(dt*(sig * curand_normal(&s)))+((derivV));
+            W = (W)+(dt*(sig * curand_normal(&s)))+((derivW));
+            if(V>4) V = 4;
+            if(W>6) W = 6;
+            if(V<-2) V = -2;
+            if(W<-6) W = -6;
+            state((t+1) % nh, i_node) = V;
+            state((t+1) % nh, i_node+(n_node)) = W;
+            tavg(i_node) = V;
+
+            // sync across warps executing nodes for single sim, before going on to next time step
+            __syncthreads();
+        } // for i_node
+    } // for t
+    // cleanup macros/*{{{*/
+    #undef params
+    #undef state
+    #undef tavg/*}}}*/
+} // kernel integrate
+// vim: sw=4 sts=4 ts=8 et ai
diff --git a/NeuroML/models/CUDA/network_rww.c b/NeuroML/models/CUDA/network_rww.c
new file mode 100644
index 0000000..9dc0a83
--- /dev/null
+++ b/NeuroML/models/CUDA/network_rww.c
@@ -0,0 +1,157 @@
+#include <stdio.h> // for printf
+#define PI_2 (2 * M_PI_F)
+
+// buffer length defaults to the argument to the integrate kernel
+// but if it's known at compile time, it can be provided which allows
+// compiler to change i%n to i&(n-1) if n is a power of two.
+#ifndef NH
+#define NH nh
+#endif
+
+#ifndef WARP_SIZE
+#define WARP_SIZE 32
+#endif
+
+
+#include <curand_kernel.h>
+#include <curand.h>
+
+__device__ float wrap_2_pi_(float x)/*{{{*/
+{
+    bool neg_mask = x < 0.0f;
+    bool pos_mask = !neg_mask;
+    // fmodf diverges 51% of time
+    float pos_val = fmodf(x, PI_2);
+    float neg_val = PI_2 - fmodf(-x, PI_2);
+    return neg_mask * neg_val + pos_mask * pos_val;
+}/*}}}*/
+
+__device__ float wrap_2_pi(float x) // not divergent/*{{{*/
+{
+    bool lt_0 = x < 0.0f;
+    bool gt_2pi = x > PI_2;
+    return (x + PI_2)*lt_0 + x*(!lt_0)*(!gt_2pi) + (x - PI_2)*gt_2pi;
+}/*}}}*/
+
+
+const float w_plus=1.4f;
+const float a_E=310.0f;
+const float b_E=125.0f;
+const float d_E=0.154f;
+const float a_I=615.0f;
+const float b_I=177.0f;
+const float d_I=0.087f;
+const float gamma_E=0.641f / 1000.0f;
+const float tau_E=100.0f;
+const float tau_I=10.0f;
+const float I_0=0.382f;
+const float w_E=1.0f;
+const float w_I=0.7f;
+const float gamma_I= 1.0f / 1000.0f;
+const float min_d_E = (-1.0f * d_E);
+const float min_d_I = (-1.0f * d_I);
+const float imintau_E = (-1.0f / tau_E);
+const float imintau_I = (-1.0f / tau_I);
+const float w_E__I_0 = (w_E * I_0);
+const float w_I__I_0 = (w_I * I_0);
+
+__global__ void integrate_wongwang(
+        // config
+        unsigned int i_step, unsigned int n_node, unsigned int nh, unsigned int n_step, unsigned int n_params,
+        float dt, float speed,
+        float * weights,
+        float * lengths,
+        float * params_pwi, // pwi: per work item
+        // state
+        float * state_pwi,
+        // outputs
+        float * tavg_pwi
+        )
+{
+    // const int i_step_dev = i_step;
+    // const int n_node_dev = n_node;
+    // work id & size
+    const unsigned int id = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    const unsigned int size = blockDim.x * gridDim.x * gridDim.y;
+
+    // ND array accessors (TODO autogen from py shape info)
+#define params(i_par) (params_pwi[(size * (i_par)) + id])
+#define state(time, i_node) (state_pwi[((time) *2 * n_node + (i_node))*(size) + id])
+#define tavg(i_node) (tavg_pwi[((i_node) * size) + id])
+
+    // unpack params
+    const float G = params(0);
+    const float J_NMDA = 0.15;//params(0);
+    const float JI= 1.0;
+    const float G_J_NMDA = G*J_NMDA;
+    // derived
+    
+    const float w_plus__J_NMDA = (w_plus * J_NMDA);
+    const float sig = params(1);//0.001;//sqrt(dt) * sqrt(2.0 * 1e-3);
+    // We have three variables which could be changed here. Actually 4
+    // G (the global coupling), sigma (the noise), J_NMDA(the excitatory synaptic coupling) and J_i(the inner inhibition for each region)
+    // For now we are making things simple and only change two parameters, G and J_NMDA.
+
+    curandState s;
+    curand_init(id + (unsigned int) clock64(), 0, 0, &s);
+ 
+    double tmp_I_E;
+    double tmp_H_E;
+    double tmp_I_I;
+    double tmp_H_I;
+    double sum;
+    double S_E = 0.0;
+    double S_I = 0.0;
+    
+
+    for (unsigned int i_node = 0; i_node < n_node; i_node++){
+        tavg(i_node) = 0.0f;
+        if (i_step == 0){
+            state(i_step, i_node) = 0.001;
+        }
+    }
+
+    for (unsigned int t = i_step; t < (i_step + n_step); t++)
+    {
+        for (unsigned int i_node = 0; i_node < n_node; i_node++)
+        {
+            sum = 0.0f;
+            S_E = state((t) % nh, i_node);
+            S_I = state((t) % nh, i_node + n_node);
+            for (unsigned int j_node = 0; j_node < n_node; j_node++)
+            {
+                //we are not considering delays in this model
+                float wij = G_J_NMDA*weights[(i_node*n_node) + j_node]; // nb. not coalesced
+                if (wij == 0.0)
+                    continue;
+                sum += wij * state((t) % nh, j_node); //of J
+            }
+            // external Input set to 0, no task evoked activity
+            tmp_I_E = JI*S_I; // Inner inhibition set to 1
+            tmp_I_E = sum - tmp_I_E ;    
+            tmp_I_E = ((w_E__I_0)+(w_plus__J_NMDA * S_E)) + tmp_I_E ;
+            tmp_I_E = a_E * tmp_I_E - b_E;
+            tmp_H_E = tmp_I_E/(1.0-exp(min_d_E * tmp_I_E));
+            tmp_I_I = (a_I*(((w_I__I_0)+(J_NMDA * S_E))-(S_I)))-b_I;
+            tmp_H_I = tmp_I_I/(1.0-exp(min_d_I*tmp_I_I));
+
+            S_E = (S_E)+(dt*(sig * curand_normal(&s)))+(dt*((imintau_E* S_E)+(tmp_H_E*((1-S_E)*gamma_E))));
+            S_I = (S_I)+(dt*(sig * curand_normal(&s)))+(dt*((imintau_I* S_I)+(tmp_H_I*gamma_I)));
+            if(S_E>1) S_E = 1;
+            if(S_I>1) S_I = 1;
+            if(S_E<0) S_E = 0;
+            if(S_I<0) S_I = 0;
+            state((t+1) % nh, i_node) = S_E;
+            state((t+1) % nh, i_node+(n_node)) = S_I;
+            tavg(i_node) = S_E;
+
+            // sync across warps executing nodes for single sim, before going on to next time step
+            __syncthreads();
+        } // for i_node
+    } // for t
+    // cleanup macros/*{{{*/
+    #undef params
+    #undef state
+    #undef tavg/*}}}*/
+} // kernel integrate
+// vim: sw=4 sts=4 ts=8 et ai
diff --git a/NeuroML/models/CUDA/rateB_kuramoto.c b/NeuroML/models/CUDA/rateB_kuramoto.c
new file mode 100644
index 0000000..da2cdb8
--- /dev/null
+++ b/NeuroML/models/CUDA/rateB_kuramoto.c
@@ -0,0 +1,128 @@
+#define PI_2 (2 * M_PI_F)
+
+// buffer length defaults to the argument to the integrate kernel
+// but if it's known at compile time, it can be provided which allows
+// compiler to change i%n to i&(n-1) if n is a power of two.
+#ifndef NH
+#define NH nh
+#endif
+
+#ifndef WARP_SIZE
+#define WARP_SIZE 32
+#endif
+
+#include <stdio.h> // for printf
+#include <curand_kernel.h>
+#include <curand.h>
+
+__device__ float wrap_2_pi_(float x)/*{{{*/
+{
+    bool neg_mask = x < 0.0f;
+    bool pos_mask = !neg_mask;
+    // fmodf diverges 51% of time
+    float pos_val = fmodf(x, PI_2);
+    float neg_val = PI_2 - fmodf(-x, PI_2);
+    return neg_mask * neg_val + pos_mask * pos_val;
+}/*}}}*/
+
+__device__ float wrap_2_pi(float x) // not divergent/*{{{*/
+{
+    bool lt_0 = x < 0.0f;
+    bool gt_2pi = x > PI_2;
+    return (x + PI_2)*lt_0 + x*(!lt_0)*(!gt_2pi) + (x - PI_2)*gt_2pi;
+}/*}}}*/
+
+__global__ void integrate(/*{{{*/
+        // config
+        unsigned int i_step, unsigned int n_node, unsigned int nh, unsigned int n_step, unsigned int n_params,
+        float dt, float speed,
+        float * __restrict__ weights,
+        float * __restrict__ lengths,
+        float * __restrict__ params_pwi, // pwi: per work item
+        // state
+        float * __restrict__ state_pwi,
+        // outputs
+        float * __restrict__ tavg_pwi
+        )
+{/*}}}*/
+
+    // work id & size/*{{{*/
+    const unsigned int id = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    const unsigned int size = blockDim.x * gridDim.x * gridDim.y;/*}}}*/
+
+    // ND array accessors (TODO autogen from py shape info)/*{{{*/
+#define params(i_par) (params_pwi[(size * (i_par)) + id])
+#define state(time, i_node) (state_pwi[((time) * n_node + (i_node))*size + id])
+#define tavg(i_node) (tavg_pwi[((i_node) * size) + id])/*}}}*/
+
+    // unpack params/*{{{*/
+    //***// These are the two parameters which are usually explore in fitting in this model
+    const float global_coupling = params(1);
+    const float global_speed = params(0);/*}}}*/
+
+    // derived/*{{{*/
+    const float rec_n = 1.0f / n_node;
+    //***// The speed affects the delay and speed_value is a parameter which is usually explored in fitting ***
+    const float rec_speed_dt = 1.0f / global_speed / (dt);
+    //***// This is a parameter specific to the Kuramoto model
+    const float omega = 60.0 * 2.0 * M_PI_F / 1e3;
+    //***// This is a parameter for the stochastic integration step, you can leave this constant for the moment
+    const float sig = sqrt(dt) * sqrt(2.0 * 1e-5);/*}}}*/ //-->noise sigma value
+
+    curandState s;
+    curand_init(id * (blockDim.x * gridDim.x * gridDim.y), 0, 0, &s);
+
+    //***// This is only initialization of the observable
+    for (unsigned int i_node = 0; i_node < n_node; i_node++)
+        tavg(i_node) = 0.0f;
+
+    //***// This is the loop over time, should stay always the same
+    for (unsigned int t = i_step; t < (i_step + n_step); t++)
+    {
+    //***// This is the loop over nodes, which also should stay the same
+        for (unsigned int i_node = threadIdx.y; i_node < n_node; i_node+=blockDim.y)
+        {
+            //***// We here gather the current state of the node
+            float theta_i = state(t % NH, i_node);
+            //***// This variable is used to traverse the weights and lengths matrix, which is really just a vector. It is just a displacement.
+            unsigned int i_n = i_node * n_node;
+            float sum = 0.0f;
+
+           //***// For all nodes that are not the current node (i_node) sum the coupling
+            for (unsigned int j_node = 0; j_node < n_node; j_node++)
+            {
+            //***// Get the weight of the coupling between node i and node j
+                float wij = weights[i_n + j_node]; // nb. not coalesced
+                if (wij == 0.0)
+                    continue;
+                //***// Get the delay between node i and node j
+                unsigned int dij = lengths[i_n + j_node] * rec_speed_dt;
+                //***// Get the state of node j which is delayed by dij
+                float theta_j = state((t - dij + NH) % NH, j_node);
+                //***// Sum it all together using the coupling function. This is a kuramoto coupling so: a * sin(pre_syn - post_syn)
+                coupling_value += wij * sin(theta_j - theta_i);
+            } // j_node
+
+            //***// This is actually the integration step and the update in the state of the node
+            theta_i += dt * (omega + global_coupling * rec_n * coupling_value);
+            //***// We add some noise if noise is selected
+            theta_i += sig * curand_normal2(&s).x;
+            //***// Wrap it within the limits of the model (0-2pi)
+            theta_i = wrap_2_pi_(theta_i);
+            //***// Update the state
+            state((t + 1) % NH, i_node) = theta_i;
+            //***// Update the observable
+            tavg(i_node) = sin(theta_i);
+
+            // sync across warps executing nodes for single sim, before going on to next time step
+            __syncthreads();
+
+        } // for i_node
+    } // for t
+
+// cleanup macros/*{{{*/
+#undef params
+#undef state
+#undef tavg/*}}}*/
+
+} // kernel integrate
\ No newline at end of file
diff --git a/NeuroML/unittests/validate_output_equality.py b/NeuroML/unittests/validate_output_equality.py
new file mode 100644
index 0000000..3c8f952
--- /dev/null
+++ b/NeuroML/unittests/validate_output_equality.py
@@ -0,0 +1,13 @@
+import unittest
+import filecmp
+
+
+class TestEqualFiles(unittest.TestCase):
+    def test_something(self):
+        fp_cuda = '../models/CUDA/rateB_kuramoto.c'
+        fp_golden = '../models/CUDA/kuramoto_network.c'
+        self.assertTrue(filecmp.cmp(fp_golden, fp_cuda, shallow=False))
+
+
+if __name__ == '__main__':
+    unittest.main()