diff --git a/src/CartLatticeAccess.hpp.Rt b/src/CartLatticeAccess.hpp.Rt
index 5a72e1c24..0b1f2d071 100644
--- a/src/CartLatticeAccess.hpp.Rt
+++ b/src/CartLatticeAccess.hpp.Rt
@@ -262,21 +262,23 @@ class CartLatticeAccess {
   const z_t z;
   const flag_t nt;
   const range_int<0,1,0,1> nx, ny, nz;
+  const CartLatticeContainer* container;
   public:
-  CudaDeviceFunction CartLatticeAccess(const int& x_, const int& y_, const int& z_) :
+  CudaDeviceFunction CartLatticeAccess(const int& x_, const int& y_, const int& z_, const CartLatticeContainer& container_) :
     x(x_),y(y_),z(z_),
-    nt(fetchNodeType(constContainer, x, y, z)),
-    nx(constContainer.nx),ny(constContainer.ny),nz(constContainer.nz)
+    nt(fetchNodeType(container_, x, y, z)),
+    nx(container_.nx),ny(container_.ny),nz(container_.nz),
+    container(&container_)
     { }
-  CudaDeviceFunction real_t getX() const { return constContainer.px + x; }
-  CudaDeviceFunction real_t getY() const { return constContainer.py + y; }
-  CudaDeviceFunction real_t getZ() const { return constContainer.pz + z; }
+  CudaDeviceFunction real_t getX() const { return container->px + x; }
+  CudaDeviceFunction real_t getY() const { return container->py + y; }
+  CudaDeviceFunction real_t getZ() const { return container->pz + z; }
   CudaDeviceFunction flag_t getNodeType() const { return nt; }
 
   CudaDeviceFunction inline cut_t getQ(const int& d) const  {
-    if (constContainer.Q == NULL) return NO_CUT;
+    if (!container->Q) return NO_CUT;
     size_t i = ((((size_t)d)*nz+z)*ny+y)*nx+x;
-    return constContainer.Q[i];
+    return container->Q[i];
   }
 
 <?R for (f in rows(Fields)) { ?>
@@ -299,7 +301,7 @@ template <class dx_t, class dy_t, class dz_t>
 CudaDeviceFunction real_t CartLatticeAccess< x_t, y_t, z_t >::load_<?%s f$nicename ?> (const dx_t & dx, const dy_t & dy, const dz_t & dz) const
 {
   storage_t ret; <?R
-  con = make.context("constContainer.in");
+  con = make.context("container->in");
   p = PV(c("x","y","z"));
   dp = PV(c("dx","dy","dz"));
   if (f$minx == f$maxx) dp[1] = f$minx
@@ -318,7 +320,7 @@ template <class N>
 CudaDeviceFunction void CartLatticeAccess< x_t, y_t, z_t >::pop<?%s s$suffix ?>(N & node) const
 {
 	storage_t val; <?R
-  con = make.context("constContainer.in",pocket=TRUE);
+  con = make.context("container->in",pocket=TRUE);
   for (d in rows(Density)[s$load.densities]) {
     f = rows(Fields)[[match(d$field, Fields$name)]]
     dp = c(-d$dx, -d$dy, -d$dz)
@@ -332,7 +334,7 @@ template <class N>
 CudaDeviceFunction void CartLatticeAccess< x_t, y_t, z_t >::push<?%s s$suffix ?>(N & node) const
 {
   storage_t val; <?R
-  con = make.context("constContainer.out",pocket=TRUE);
+  con = make.context("container->out",pocket=TRUE);
   for (f in rows(Fields)[s$save.fields]) { ?>
   val = <?%s real_to_storage(paste("node",f$name,sep="."),f) ?>; <?R
     con=save.field("val", f, p, con)
@@ -344,7 +346,7 @@ template < class x_t, class y_t, class z_t >
 template <class N>
 CudaDeviceFunction void CartLatticeAccess< x_t, y_t, z_t >::push<?%s s$suffix ?>_adj(N & node) const
 { <?R
-  con = make.context("constContainer.adjout",pocket=TRUE);
+  con = make.context("container->adjout",pocket=TRUE);
   for (d in rows(Density)[s$load.densities]) {
     f = rows(Fields)[[match(d$field, Fields$name)]]
     dp = c(-d$dx, -d$dy, -d$dz)
@@ -357,7 +359,7 @@ template < class x_t, class y_t, class z_t >
 template <class N>
 CudaDeviceFunction void CartLatticeAccess< x_t, y_t, z_t >::pop<?%s s$suffix ?>_adj(N & node) const
 { <?R
-  con = make.context("constContainer.adjin",pocket=TRUE);
+  con = make.context("container->adjin",pocket=TRUE);
   for (f in rows(Fields)[s$save.fields]) {
     val = paste("node",f$adjoint_name,sep=".")
     con = field.access( d=val, f, p, pattern="put", access="getsum", MContext=con)
diff --git a/src/CartLatticeExecutor.hpp.Rt b/src/CartLatticeExecutor.hpp.Rt
deleted file mode 100644
index bfe9d74be..000000000
--- a/src/CartLatticeExecutor.hpp.Rt
+++ /dev/null
@@ -1,327 +0,0 @@
-<?R
-        source("conf.R")
-c_header();
-?>
-/** \file CartLatticeExecutor.hpp
-*/
-
-#include "CartLatticeExecutor.h"
-#include "CartLatticeAccess.hpp"
-#include "Node.hpp"
-#include "GetThreads.h"
-
-using CartLatticeAccessAll      = AccessComposite< CartLatticeAccess< range_int<0,0,-1,1>, range_int<0,0,-1,1>, range_int<0,0,-1,1> > >;
-using CartLatticeAccessInterior = AccessComposite< CartLatticeAccess<
-  range_int< <?%d BorderMargin$max[1] ?>,0,<?%d BorderMargin$min[1]-1 ?>,1>,
-  range_int< <?%d BorderMargin$max[2] ?>,0,<?%d BorderMargin$min[2]-1 ?>,1>,
-  range_int< <?%d BorderMargin$max[3] ?>,0,<?%d BorderMargin$min[3]-1 ?>,1> > >;
-
-template<class T>
-static void CopyObjToConst(const T& host_obj, T& const_symbol) {
-  CudaCopyToConstant("copy to GPU constant memory", const_symbol, &host_obj, sizeof(T));
-}
-
-void CartLatticeExecutor::CopyToConst() const {
-  CopyObjToConst(container, constContainer);
-  CopyObjToConst(data, constData);
-}
-
-/// Main Kernel
-/**
-  iterates over all elements and runs them with RunElement function.
-  constContainer.dx/dy is to calculate only internal nodes
-*/
-template < eOperationType I, eCalculateGlobals G, eStage S >
-struct InteriorExecutor {
-  CudaDeviceFunction void Execute() const {
-    using LA = CartLatticeAccessInterior;
-    using N  = Node<LA, I, G, S>;
-    int x_ = CudaThread.x + CudaBlock.z*CudaNumberOfThreads.x + <?%d BorderMargin$max[1] ?>;
-    int y_ = CudaThread.y + CudaBlock.x*CudaNumberOfThreads.y + <?%d BorderMargin$max[2] ?>;
-    int z_ = CudaBlock.y                                      + <?%d BorderMargin$max[3] ?>;
-    if (y_ < constContainer.ny - <?%d -BorderMargin$min[2] ?>) {
-#ifndef GRID3D
-      for (; x_ < constContainer.nx; x_ += CudaNumberOfThreads.x)
-#endif
-      {
-        LA acc(x_,y_,z_);
-        N now(acc);
-        now.RunElement();
-      }
-    }
-  }
-};
-
-/// Border Kernel
-/**
-  iterates over border elements and runs them with RunElement function
-*/
-template < eOperationType I, eCalculateGlobals G, eStage S >
-struct BorderExecutor {
-  CudaDeviceFunction void Execute() const {
-    using LA = CartLatticeAccessAll;
-    using N  = Node<LA, I, G, S>;
-    int x_ = CudaThread.x + CudaBlock.z*CudaNumberOfThreads.x + <?%d BorderMargin$max[1] ?>;
-    int a_ = CudaThread.y + CudaBlock.x*CudaNumberOfThreads.y;
-    int y_,z_;
-    switch (CudaBlock.y) { <?R
-    i = 0
-    if (BorderMargin$max[2] > BorderMargin$min[2]) for (y in BorderMargin$min[2]:BorderMargin$max[2]) if (y != 0) { ?>
-    case <?%d i ?>:
-      z_ = a_; <?R
-      if (y > 0) { ?>
-        y_ = <?%d y - 1 ?>; <?R
-      } else if (y < 0) { ?>
-        y_ = constContainer.ny - <?%d -y ?>; <?R
-      } ?>
-      if (z_ >= constContainer.nz) return;
-      break; <?R
-      i = i + 1;
-      }
-      if (BorderMargin$max[3] > BorderMargin$min[3]) for (z in BorderMargin$min[3]:BorderMargin$max[3]) if (z != 0) { ?>
-    case <?%d i ?>:
-      y_ = a_ + <?%d BorderMargin$max[2] ?>; <?R
-      if (z > 0) { ?>
-        z_ = <?%d z - 1 ?>; <?R
-      } else if (z < 0) { ?>
-      z_ = constContainer.nz - <?%d -z ?>; <?R
-      } ?>
-      if (y_ >= constContainer.ny - <?%d -BorderMargin$min[2] ?>) return;
-      break; <?R
-      i = i + 1;
-      } ?>
-    default:
-      assert(CudaThread.y < <?%d i ?>);
-      y_ = 0;z_ = 0;
-      break;
-    }
-
-#ifndef GRID3D
-    for (; x_ < constContainer.nx; x_ += CudaNumberOfThreads.x)
-#endif
-    {
-      LA acc(x_,y_,z_);
-      N now(acc);
-      now.RunElement();
-    }
-  }
-};
-
-template <class E>
-CudaGlobalFunction void Kernel() {
-  E e;
-  e.Execute();
-}
-
-/// Run the interior kernel
-/**
-  Dispatch the kernel running RunElement on all interior elements of the lattice
-  \param stream CUDA Stream to which add the kernel run
-*/
-template <class EX>
-void RunInteriorImpl(const CartLatticeContainer& container, CudaStream_t stream) {
-  const int nx = container.nx;
-  const int ny = container.ny;
-  const int nz = container.nz;
-  dim3 thr = ThreadNumber< EX >::threads();
-  dim3 blx;
-#ifdef GRID3D
-  blx.z = nx/thr.x;
-#else
-  blx.z = 1;
-#endif
-  int totx = ny - <?%d BorderMargin$max[2]-BorderMargin$min[2] ?>;
-  blx.x = ceiling_div(totx, thr.y);
-  int toty = nz - <?%d BorderMargin$max[3]-BorderMargin$min[3] ?>;
-  blx.y = toty;
-  CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream);
-};
-
-/// Run the border kernel
-/**
-  Dispatch the kernel running RunElement on all border elements of the Lattice
-  \param stream CUDA Stream to which add the kernel run
-*/
-template <class EX>
-void RunBorderImpl(const CartLatticeContainer& container, CudaStream_t stream) {
-  const int nx = container.nx;
-  const int ny = container.ny;
-  const int nz = container.nz;
-<?R
-  thy = BorderMargin$max[2] - BorderMargin$min[2] + BorderMargin$max[3] - BorderMargin$min[3]
-  blx = "nz"
-  if (BorderMargin$max[3] != 0 || BorderMargin$min[3] != 0) blx = "max(ny,nz)"
-  if (thy > 0) {
-?>
-  dim3 thr = ThreadNumber< EX >::threads();
-  dim3 blx;
-#ifdef GRID3D
-  blx.z = nx/thr.x;
-#else
-  blx.z = 1;
-#endif
-  int totx = <?%s blx ?>;
-  blx.x = ceiling_div(totx, thr.y);
-  blx.y = <?%d thy ?>;
-  CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream);
-<?R } ?>
-};
-
-template < eOperationType I, eCalculateGlobals G, eStage S >
-void CartLatticeExecutor::RunInterior(CudaStream_t stream) const {
-  RunInteriorImpl< InteriorExecutor< I, G, S > >(container, stream);
-}
-
-template < eOperationType I, eCalculateGlobals G, eStage S >
-void CartLatticeExecutor::RunBorder(CudaStream_t stream) const {
-  RunBorderImpl< BorderExecutor< I, G, S > >(container, stream);
-}
-
-/// Old function for graphics output
-/**
-  calculates the color for one node
-*/
-template < eOperationType I, eCalculateGlobals G, eStage S >
-CudaDeviceFunction void CartLatticeExecutor::NodeToColor(int x, int y, int z, uchar4 *optr) {
-  using LA = CartLatticeAccessAll;
-  using N  = Node<LA, I, G, S>;
-
-  int offset = x + y * constContainer.nx;
-  float l = 0.0; float w = 0.0;
-  int r = 0,g = 0,b = 0;
-  if (x < 0 || y < 0 || z < 0) return;
-  if (x >= constContainer.nx || y >= constContainer.ny || z >= constContainer.nz) return;
-
-  {
-    LA acc(x,y,z);
-    N now(acc);
-    acc.pop(now);
-    float2 v = now.Color();
-    l = v.x;
-    w = v.y;
-  }
-
-  if (ISFINITE(l)) {
-    l = l * 111;
-    if (               (l <-111)) {r = 255; g = 255; b = 255; }
-    if ((l >= -111) && (l < -11)) {r = 255*(-l-11)/100; g = 255; b = 255; }
-    if ((l >=  -11) && (l <  -1)) {r = 0; g = (255*(-l-1))/10; b = 255; }
-    if ((l >=   -1) && (l <   0)) {r = 0; g = 0; b = 255*(-l); }
-    if ((l >=    0) && (l <   1)) {r = 255*l; g = 0; b = 0; }
-    if ((l >=    1) && (l <  11)) {r = 255; g = 255*(l-1)/10; b = 0; }
-    if ((l >=   11) && (l < 111)) {r = 255; g = 255; b = 255*(l-11)/100; }
-    if ((l >=  111)             ) {r = 255; g = 255; b = 255; }
-    r=r*w;
-    g=g*w + (1-w)*255;
-    b=b*w;
-  } else {
-    r = 255;
-    g = 0;
-    b = 255;
-  }
-  optr[offset].x = r;
-  optr[offset].y = g;
-  optr[offset].z = b;
-  optr[offset].w = 255;
-}
-
-/// Kernel for graphics output
-CudaGlobalFunction void ColorKernel(uchar4 *optr, int z)
-{
-  CartLatticeExecutor::NodeToColor< Primal, NoGlobals, Get >(
-    CudaThread.x+CudaBlock.x*CudaNumberOfThreads.x,
-    CudaBlock.y,
-    z,
-    optr);
-}
-
-/// Runs kernel for rendering graphics
-/**
-  Runs the kernel for rendering graphics
-  \param optr 4-component graphics buffer
-*/
-void CartLatticeExecutor::Color(uchar4 *optr) {
-  CopyToConst();
-  CudaKernelRun(ColorKernel, dim3(floor(container.nx / X_BLOCK), container.ny, 1), dim3(X_BLOCK), optr, container.nz / 2);
-}
-
-// Quantity getters
-<?R
-        for (q in rows(Quantities))
-        {
-                ifdef(q$adjoint); ?>
-/// Calculate quantity [<?%s q$comment ?>] kernel
-/**
-  Kernel to calculate quantity <?%s q$name ?> (<?%s q$comment ?>) over a region
-  \param r Lattice region to calculate the quantity
-  \param tab buffor to put the calculated result
-  \param scale Scale to rescale the result (for units)
-*/
-CudaGlobalFunction void get<?%s q$name ?>(lbRegion r, <?%s q$type ?> * tab, real_t scale)
-{
-  using LA = CartLatticeAccessAll;
-  int x = CudaBlock.x+r.dx;
-  int y = CudaBlock.y+r.dy;
-  int z = CudaBlock.z+r.dz;
-  LA acc(x,y,z); <?R
-    if (q$adjoint) { ?>
-      Node< LA, Adjoint, NoGlobals, Get > now(acc); <?R
-    } else { ?>
-      Node< LA, Primal, NoGlobals, Get > now(acc); <?R
-    }?>
-    <?%s q$type ?> w;
-//  if (now.NodeType) {
-      acc.pop(now); <?R
-      if (q$adjoint) { ?>
-        acc.pop_adj(now); <?R
-      } ?>
-      w = now.get<?%s q$name ?>(); <?R
-      if (q$type == "vector_t") {
-        for (coef in c("x","y","z")) { ?>
-          w.<?%s coef ?> *= scale; <?R
-        }
-      } else { ?>
-        w *= scale; <?R
-      } ?>
-//  } else { <?R
-      if (q$type == "vector_t") {
-        for (coef in c("x","y","z")) { ?>
-//        w.<?%s coef ?> = nan(""); <?R
-        }
-      } else { ?>
-//      w = nan(""); <?R
-      } ?>
-//  }
-  tab[r.offset(x,y,z)] = w;
-}
-
-<?R
-  }
-  ifdef();
-?>
-
-/// Init Settings with 0 in GPU constant memory
-void initSettings() {
-  real_t val = 0;
-<?R for (v in rows(Settings)) {
-  if (is.na(v$derived)) { ?>
-    CudaCopyToConstant("<?%s v$name ?>", <?%s v$name ?>, &val, sizeof(real_t)); <?R
-  }} ?>
-}
-
-/// Set Setting in GPU constant memory
-/**
-  Sets a Setting in the constant memory of GPU
-  \param i Index of the Setting
-  \param tmp value of the Setting
-*/
-void setConstSetting(int i, real_t tmp) {
-  switch (i) {
-<?R
-        for (v in rows(Settings)) if (is.na(v$derived)) { ?>
-  case <?%s v$Index ?>:
-    CudaCopyToConstant("<?%s v$name ?>", <?%s v$name ?>, &tmp, sizeof(real_t));
-    break; <?R
-    } ?>
-  }
-}
diff --git a/src/CartLatticeExecutor.h.Rt b/src/CartLatticeLauncher.h.Rt
similarity index 54%
rename from src/CartLatticeExecutor.h.Rt
rename to src/CartLatticeLauncher.h.Rt
index 213186dad..1b4eb99c7 100644
--- a/src/CartLatticeExecutor.h.Rt
+++ b/src/CartLatticeLauncher.h.Rt
@@ -2,7 +2,7 @@
         source("conf.R")
 c_header();
 ?>
-/** \file CartLatticeExecutor.h
+/** \file CartLatticeLauncher.h
 */
 
 #ifndef CARTLATTICEEXECUTOR_H
@@ -11,7 +11,7 @@ c_header();
 #include "CartLatticeContainer.h"
 #include "LatticeData.hpp"
 
-struct CartLatticeExecutor {
+struct CartLatticeLauncher {
   CartLatticeContainer container;
   LatticeData          data;
 
@@ -19,15 +19,7 @@ struct CartLatticeExecutor {
   void RunBorder(CudaStream_t stream) const;
   template < eOperationType I, eCalculateGlobals G, eStage S >
   void RunInterior(CudaStream_t stream) const;
-  void Color(uchar4 *out_ptr);
-
-  void CopyToConst() const;
-
-  template < eOperationType I, eCalculateGlobals G, eStage S >
-  static CudaDeviceFunction void NodeToColor(int x, int y, int z, uchar4 *optr);
-};
-
-CudaGlobalFunction void ColorKernel(uchar4 *optr, int z);
+  void Color(uchar4 *out_ptr) const;
 
 <?R
 for (q in rows(Quantities)) { ifdef(q$adjoint);
@@ -37,13 +29,11 @@ for (q in rows(Quantities)) { ifdef(q$adjoint);
           node = "Node"
         }
 ?>
-CudaGlobalFunction void get<?%s q$name ?>(lbRegion r, <?%s q$type ?> * tab, real_t scale); <?R
-  for (tp in c("float","double"))
-  { ?>
-CudaGlobalFunction void get<?%s q$name ?>_<?%s tp ?>(lbRegion , <?%s tp ?> *, int);
+  void GetQuantity<?%s q$name ?>(lbRegion r, <?%s q$type ?>* tab, real_t scale) const;
+  void SampleQuantity<?%s q$name ?>(lbRegion r, <?%s q$type ?>* tab, real_t scale) const;
 <?R
-  }
 }
 ifdef() ?>
+};
 
 #endif // CARTLATTICEEXECUTOR_H
diff --git a/src/CartLatticeLauncher.hpp.Rt b/src/CartLatticeLauncher.hpp.Rt
new file mode 100644
index 000000000..0fbad39ff
--- /dev/null
+++ b/src/CartLatticeLauncher.hpp.Rt
@@ -0,0 +1,340 @@
+<?R
+        source("conf.R")
+c_header();
+?>
+/** \file CartLatticeLauncher.hpp
+*/
+
+#include "CartLatticeLauncher.h"
+#include "CartLatticeAccess.hpp"
+#include "Node.hpp"
+#include "GetThreads.h"
+
+using CartLatticeAccessAll      = AccessComposite< CartLatticeAccess< range_int<0,0,-1,1>, range_int<0,0,-1,1>, range_int<0,0,-1,1> > >;
+using CartLatticeAccessInterior = AccessComposite< CartLatticeAccess<
+  range_int< <?%d BorderMargin$max[1] ?>,0,<?%d BorderMargin$min[1]-1 ?>,1>,
+  range_int< <?%d BorderMargin$max[2] ?>,0,<?%d BorderMargin$min[2]-1 ?>,1>,
+  range_int< <?%d BorderMargin$max[3] ?>,0,<?%d BorderMargin$min[3]-1 ?>,1> > >;
+
+/// Main Kernel
+/**
+  iterates over all elements and runs them with RunElement function.
+  container.dx/dy is to calculate only internal nodes
+*/
+template < eOperationType I, eCalculateGlobals G, eStage S >
+class CartInteriorExecutor {
+  CartLatticeContainer container;
+  LatticeData          data;
+
+public:
+  CartInteriorExecutor(const CartLatticeContainer& container_, const LatticeData& data_) : container(container_), data(data_) {}
+
+  CudaHostFunction LaunchParams ComputeLaunchParams(dim3 thr) const {
+    const int nx = container.nx;
+    const int ny = container.ny;
+    const int nz = container.nz;
+    dim3 blx;
+#ifdef GRID3D
+    blx.z = nx/thr.x;
+#else
+    blx.z = 1;
+#endif
+    int totx = ny - <?%d BorderMargin$max[2]-BorderMargin$min[2] ?>;
+    blx.x = ceiling_div(totx, thr.y);
+    int toty = nz - <?%d BorderMargin$max[3]-BorderMargin$min[3] ?>;
+    blx.y = toty;
+    return LaunchParams{blx, thr};
+  }
+
+  CudaDeviceFunction void Execute() const {
+    using LA = CartLatticeAccessInterior;
+    using N  = Node<LA, I, G, S>;
+    int x_ = CudaThread.x + CudaBlock.z*CudaNumberOfThreads.x + <?%d BorderMargin$max[1] ?>;
+    int y_ = CudaThread.y + CudaBlock.x*CudaNumberOfThreads.y + <?%d BorderMargin$max[2] ?>;
+    int z_ = CudaBlock.y                                      + <?%d BorderMargin$max[3] ?>;
+    if (y_ < container.ny - <?%d -BorderMargin$min[2] ?>) {
+#ifndef GRID3D
+      for (; x_ < container.nx; x_ += CudaNumberOfThreads.x)
+#endif
+      {
+        LA acc(x_, y_, z_, container);
+        N now(acc, data);
+        now.RunElement();
+      }
+    }
+  }
+};
+
+/// Border Kernel
+/**
+  iterates over border elements and runs them with RunElement function
+*/
+template < eOperationType I, eCalculateGlobals G, eStage S >
+class CartBorderExecutor {
+  CartLatticeContainer container;
+  LatticeData          data;
+
+public:
+  CartBorderExecutor(const CartLatticeContainer& container_, const LatticeData& data_) : container(container_), data(data_) {}
+
+  CudaHostFunction LaunchParams ComputeLaunchParams(dim3 thr) const {
+    const int nx = container.nx;
+    const int ny = container.ny;
+    const int nz = container.nz;
+<?R
+    thy = BorderMargin$max[2] - BorderMargin$min[2] + BorderMargin$max[3] - BorderMargin$min[3]
+    blx = "nz"
+    if (BorderMargin$max[3] != 0 || BorderMargin$min[3] != 0) blx = "max(ny,nz)"
+    if (thy > 0) {
+?>
+    dim3 blx;
+#ifdef GRID3D
+    blx.z = nx/thr.x;
+#else
+    blx.z = 1;
+#endif
+    int totx = <?%s blx ?>;
+    blx.x = ceiling_div(totx, thr.y);
+    blx.y = <?%d thy ?>;
+    return LaunchParams{blx, thr};
+<?R } ?>
+  }
+
+  CudaDeviceFunction void Execute() const {
+    int x_ = CudaThread.x + CudaBlock.z*CudaNumberOfThreads.x + <?%d BorderMargin$max[1] ?>;
+    int a_ = CudaThread.y + CudaBlock.x*CudaNumberOfThreads.y;
+    int y_,z_;
+    switch (CudaBlock.y) { <?R
+    i = 0
+    if (BorderMargin$max[2] > BorderMargin$min[2]) for (y in BorderMargin$min[2]:BorderMargin$max[2]) if (y != 0) { ?>
+    case <?%d i ?>:
+      z_ = a_; <?R
+      if (y > 0) { ?>
+        y_ = <?%d y - 1 ?>; <?R
+      } else if (y < 0) { ?>
+        y_ = container.ny - <?%d -y ?>; <?R
+      } ?>
+      if (z_ >= container.nz) return;
+      break; <?R
+      i = i + 1;
+      }
+      if (BorderMargin$max[3] > BorderMargin$min[3]) for (z in BorderMargin$min[3]:BorderMargin$max[3]) if (z != 0) { ?>
+    case <?%d i ?>:
+      y_ = a_ + <?%d BorderMargin$max[2] ?>; <?R
+      if (z > 0) { ?>
+        z_ = <?%d z - 1 ?>; <?R
+      } else if (z < 0) { ?>
+      z_ = container.nz - <?%d -z ?>; <?R
+      } ?>
+      if (y_ >= container.ny - <?%d -BorderMargin$min[2] ?>) return;
+      break; <?R
+      i = i + 1;
+      } ?>
+    default:
+      assert(CudaThread.y < <?%d i ?>);
+      y_ = 0;z_ = 0;
+      break;
+    }
+
+    using LA = CartLatticeAccessAll;
+    using N  = Node<LA, I, G, S>;
+
+#ifndef GRID3D
+    for (; x_ < container.nx; x_ += CudaNumberOfThreads.x)
+#endif
+    {
+      LA acc(x_, y_, z_, container);
+      N now(acc, data);
+      now.RunElement();
+    }
+  }
+};
+
+template < eOperationType I, eCalculateGlobals G, eStage S >
+void CartLatticeLauncher::RunInterior(CudaStream_t stream) const {
+  const CartInteriorExecutor< I, G, S > executor(container, data);
+  LaunchExecutorAsync(executor, stream);
+}
+
+template < eOperationType I, eCalculateGlobals G, eStage S >
+void CartLatticeLauncher::RunBorder(CudaStream_t stream) const {
+  const CartBorderExecutor< I, G, S > executor(container, data);
+  LaunchExecutorAsync(executor, stream);
+}
+
+
+template < eOperationType I, eCalculateGlobals G, eStage S >
+class CartColorExecutor{
+  CartLatticeContainer container;
+  LatticeData          data;
+
+  uchar4* optr;
+  int     z;
+
+public:
+  CartColorExecutor(const CartLatticeContainer& container_, const LatticeData& data_, uchar4* optr_, int z_)
+    : container(container_), data(data_), optr(optr_), z(z_)
+  {}
+
+  CudaHostFunction LaunchParams ComputeLaunchParams(dim3) const {
+    return LaunchParams{dim3(floor(container.nx / X_BLOCK), container.ny, 1), dim3(X_BLOCK)};
+  }
+
+  CudaDeviceFunction void Execute() const {
+    using LA = CartLatticeAccessAll;
+    using N  = Node<LA, I, G, S>;
+
+    const int x = CudaThread.x + CudaBlock.x * CudaNumberOfThreads.x;
+    const int y = CudaBlock.y;
+
+    int offset = x + y * container.nx;
+    float l = 0.; float w = 0.;
+    int r = 0, g = 0, b = 0;
+    if (x < 0 || y < 0 || z < 0 || x >= container.nx || y >= container.ny || z >= container.nz)
+      return;
+
+    LA acc(x, y, z, container);
+    N now(acc, data);
+    acc.pop(now);
+    float2 v = now.Color();
+    l = v.x;
+    w = v.y;
+
+    if (ISFINITE(l)) {
+      l = l * 111;
+      if (               (l <-111)) {r = 255; g = 255; b = 255; }
+      if ((l >= -111) && (l < -11)) {r = 255*(-l-11)/100; g = 255; b = 255; }
+      if ((l >=  -11) && (l <  -1)) {r = 0; g = (255*(-l-1))/10; b = 255; }
+      if ((l >=   -1) && (l <   0)) {r = 0; g = 0; b = 255*(-l); }
+      if ((l >=    0) && (l <   1)) {r = 255*l; g = 0; b = 0; }
+      if ((l >=    1) && (l <  11)) {r = 255; g = 255*(l-1)/10; b = 0; }
+      if ((l >=   11) && (l < 111)) {r = 255; g = 255; b = 255*(l-11)/100; }
+      if ((l >=  111)             ) {r = 255; g = 255; b = 255; }
+      r=r*w;
+      g=g*w + (1-w)*255;
+      b=b*w;
+    } else {
+      r = 255;
+      g = 0;
+      b = 255;
+    }
+
+    optr[offset].x = r;
+    optr[offset].y = g;
+    optr[offset].z = b;
+    optr[offset].w = 255;
+  }
+};
+
+/// Runs kernel for rendering graphics
+/**
+  Runs the kernel for rendering graphics
+  \param optr 4-component graphics buffer
+*/
+void CartLatticeLauncher::Color(uchar4 *optr) const {
+  const CartColorExecutor< Primal, NoGlobals, Get > executor(container, data, optr, container.nz / 2);
+  LaunchExecutor(executor);
+}
+
+// Quantity getters
+<?R for (q in rows(Quantities)) { ifdef(q$adjoint); ?>
+
+class GetQuantityExecutor<?%s q$name ?>Base {
+protected:
+  CartLatticeContainer container;
+  LatticeData          data;
+
+  lbRegion        small;
+  <?%s q$type ?>* buf;
+  real_t          scale;
+
+public:
+  GetQuantityExecutor<?%s q$name ?>Base(const CartLatticeContainer& container_, const LatticeData& data_, const lbRegion& small_, <?%s q$type ?>* buf_, real_t scale_)
+    : container(container_), data(data_), small(small_), buf(buf_), scale(scale_)
+  {}
+
+  CudaDeviceFunction void Execute() const {
+    using LA = CartLatticeAccessAll;
+    int x = CudaBlock.x + small.dx;
+    int y = CudaBlock.y + small.dy;
+    int z = CudaBlock.z + small.dz;
+    LA acc(x, y, z, container); <?R
+    if (q$adjoint) { ?>
+      Node< LA, Adjoint, NoGlobals, Get > now(acc, data); <?R
+    } else { ?>
+      Node< LA, Primal, NoGlobals, Get > now(acc, data); <?R
+    }?>
+    <?%s q$type ?> w;
+//  if (now.NodeType) {
+      acc.pop(now); <?R
+      if (q$adjoint) { ?>
+        acc.pop_adj(now); <?R
+      } ?>
+      w = now.get<?%s q$name ?>(); <?R
+      if (q$type == "vector_t") {
+        for (coef in c("x","y","z")) { ?>
+          w.<?%s coef ?> *= scale; <?R
+        }
+      } else { ?>
+        w *= scale; <?R
+      } ?>
+//  } else { <?R
+      if (q$type == "vector_t") {
+        for (coef in c("x","y","z")) { ?>
+//        w.<?%s coef ?> = nan(""); <?R
+        }
+      } else { ?>
+//      w = nan(""); <?R
+      } ?>
+//  }
+    buf[small.offset(x,y,z)] = w;
+  }
+};
+
+class GetQuantityExecutor<?%s q$name ?> : public GetQuantityExecutor<?%s q$name ?>Base {
+public:
+  template<class... Args>
+  GetQuantityExecutor<?%s q$name ?>(Args&&... args) : GetQuantityExecutor<?%s q$name ?>Base(std::forward<Args>(args)...) {}
+
+  CudaHostFunction LaunchParams ComputeLaunchParams(dim3) const {
+    return LaunchParams{dim3(small.nx, small.ny, small.nz), dim3(1)};
+  }
+};
+
+class GetQuantitySampleExecutor<?%s q$name ?> : public GetQuantityExecutor<?%s q$name ?>Base {
+public:
+  template<class... Args>
+  GetQuantitySampleExecutor<?%s q$name ?>(Args&&... args) : GetQuantityExecutor<?%s q$name ?>Base(std::forward<Args>(args)...) {}
+
+  CudaHostFunction LaunchParams ComputeLaunchParams(dim3) const {
+    return LaunchParams{dim3(small.nx, small.ny), dim3(1)};
+  }
+};
+
+/// Calculate quantity [<?%s q$comment ?>] kernel
+/**
+  Kernel to calculate quantity <?%s q$name ?> (<?%s q$comment ?>) over a region
+  \param r Lattice region to calculate the quantity
+  \param tab buffer to put the calculated result
+  \param scale Scale to rescale the result (for units)
+*/
+void CartLatticeLauncher::GetQuantity<?%s q$name ?>(lbRegion r, <?%s q$type ?>* tab, real_t scale) const {
+  const GetQuantityExecutor<?%s q$name ?> executor(container, data, r, tab, scale);
+  LaunchExecutor(executor);
+}
+
+/// Sample quantity [<?%s q$comment ?>] kernel
+/**
+  Kernel to sample quantity <?%s q$name ?> (<?%s q$comment ?>) over a region
+  \param r Lattice region to sample the quantity
+  \param tab buffer to put the sampled result
+  \param scale Scale to rescale the result (for units)
+*/
+void CartLatticeLauncher::SampleQuantity<?%s q$name ?>(lbRegion r, <?%s q$type ?>* tab, real_t scale) const {
+  const GetQuantitySampleExecutor<?%s q$name ?> executor(container, data, r, tab, scale);
+  LaunchExecutor(executor);
+}
+
+<?R
+}
+ifdef(); ?>
diff --git a/src/GetThreads.h b/src/GetThreads.h
index c35c19cdf..0e2f7e8fc 100644
--- a/src/GetThreads.h
+++ b/src/GetThreads.h
@@ -1,8 +1,7 @@
 #include "Global.h"
-#include <typeinfo>
 #include "cross.h"
 
-template <class E> CudaGlobalFunction void Kernel();
+#include <typeinfo>
 
 std::string cxx_demangle(std::string str);
 
@@ -10,17 +9,23 @@ inline int ceiling_div(const int & x, const int & y) {
   return x / y + (x % y != 0);
 }
 
-/// Get maximal number of threads for all the kernels on runtime
-template < class T > int GetThreads() {
-	dim3 ret;
-	CudaFuncAttributes * attr = new CudaFuncAttributes;
-	CudaFuncGetAttributes(attr, Kernel<T>) ;
-	debug1( "[%d] Constant mem:%ld\n", D_MPI_RANK, attr->constSizeBytes);
-	debug1( "[%d] Local    mem:%ld\n", D_MPI_RANK, attr->localSizeBytes);
-	debug1( "[%d] Max  threads:%d\n", D_MPI_RANK, attr->maxThreadsPerBlock);
-	debug1( "[%d] Reg   Number:%d\n", D_MPI_RANK, attr->numRegs);
-	debug1( "[%d] Shared   mem:%ld\n", D_MPI_RANK, attr->sharedSizeBytes);
-	return attr->maxThreadsPerBlock;
+template <class EX>
+CudaGlobalFunction void Kernel(const EX executor) {
+  executor.Execute();
+}
+
+/// Get maximum number of threads for all kernels at runtime
+template < class EX >
+int GetThreads() {
+	CudaFuncAttributes attr;
+    auto attr_ptr = &attr;
+	CudaFuncGetAttributes(attr_ptr, Kernel< EX >) ;
+	debug1( "[%d] Constant mem:%ld\n", D_MPI_RANK, attr.constSizeBytes);
+	debug1( "[%d] Local    mem:%ld\n", D_MPI_RANK, attr.localSizeBytes);
+	debug1( "[%d] Max  threads:%d\n", D_MPI_RANK, attr.maxThreadsPerBlock);
+	debug1( "[%d] Reg   Number:%d\n", D_MPI_RANK, attr.numRegs);
+	debug1( "[%d] Shared   mem:%ld\n", D_MPI_RANK, attr.sharedSizeBytes);
+	return attr.maxThreadsPerBlock;
 }
 
 class ThreadNumberCalculatorBase {
@@ -43,11 +48,11 @@ class ThreadNumberCalculatorBase {
   void print();
 };
 
-template < class T > class ThreadNumberCalculator : public ThreadNumberCalculatorBase {
+template < class EX > class ThreadNumberCalculator : public ThreadNumberCalculatorBase {
   public:
   virtual void Init() {
-    name = cxx_demangle(typeid(T).name());
-    maxthr = GetThreads< T >();
+    name = cxx_demangle(typeid(EX).name());
+    maxthr = GetThreads< EX >();
     thr.z = 1;
     int val = maxthr;
     if (maxthr < X_BLOCK) {
@@ -63,15 +68,37 @@ template < class T > class ThreadNumberCalculator : public ThreadNumberCalculato
   };
 };
 
-template < class T > class ThreadNumber {
-  typedef ThreadNumberCalculator< T > calc_t;
+template < class EX > class ThreadNumber {
+  typedef ThreadNumberCalculator< EX > calc_t;
   static calc_t calc;
   public:
   static inline dim3 threads() { return calc.threads(); }
 };
 
-template < class T > ThreadNumberCalculator<T> ThreadNumber<T>::calc;
+template < class EX >
+ThreadNumberCalculator< EX > ThreadNumber< EX >::calc;
 
 /// Initialize Thread/Block number variables
 int InitDim();
 
+struct LaunchParams{
+  dim3 blx, thr;
+};
+
+template<class EX>
+LaunchParams ComputeLaunchParams(const EX& executor) {
+  const auto threads = ThreadNumber< EX >::threads();
+  return executor.ComputeLaunchParams(threads);
+}
+
+template<class EX>
+void LaunchExecutor(const EX& executor) {
+  const auto exec_params = ComputeLaunchParams(executor);
+  CudaKernelRun(Kernel< EX >, exec_params.blx, exec_params.thr, executor);
+}
+
+template<class EX>
+void LaunchExecutorAsync(const EX& executor, CudaStream_t stream) {
+  const auto exec_params = ComputeLaunchParams(executor);
+  CudaKernelRunAsync(Kernel< EX >, exec_params.blx, exec_params.thr, stream, executor);
+}
diff --git a/src/Lattice.cu.Rt b/src/Lattice.cpp.Rt
similarity index 88%
rename from src/Lattice.cu.Rt
rename to src/Lattice.cpp.Rt
index ff0354b02..a87bdeb3b 100644
--- a/src/Lattice.cu.Rt
+++ b/src/Lattice.cpp.Rt
@@ -57,9 +57,9 @@ void Lattice::setPosition(double px_, double py_, double pz_)
 	px = px_;
 	py = py_;
 	pz = pz_;
-	executor.container.px = px + 0.5 + region.dx;
-	executor.container.py = py + 0.5 + region.dy;
-	executor.container.pz = pz + 0.5 + region.dz;
+	launcher.container.px = px + 0.5 + region.dx;
+	launcher.container.py = py + 0.5 + region.dy;
+	launcher.container.pz = pz + 0.5 + region.dz;
 }
 
 /// Calculation of the offset from X, Y and Z
@@ -95,8 +95,8 @@ Lattice::Lattice(lbRegion _region, MPIInfo mpi_, int ns) : zSet(ZONESETTINGS, ZO
 {
 	DEBUG_M;
 	model = &my_model;
-    AllocContainer(executor.container, _region.nx, _region.ny, _region.nz);
-    executor.data.Alloc();
+    AllocContainer(launcher.container, _region.nx, _region.ny, _region.nz);
+    launcher.data.Alloc();
 	reverse_save=0;
 	Record_Iter = 0;
 	Iter = 0;
@@ -108,8 +108,8 @@ Lattice::Lattice(lbRegion _region, MPIInfo mpi_, int ns) : zSet(ZONESETTINGS, ZO
 	Snaps = new FTabs[nSnaps];
 	iSnaps = new int[maxSnaps];
 	setPosition(0.0,0.0,0.0);
-	executor.data.iter = 0;
-	executor.data.reset_iter = 0;
+	launcher.data.iter = 0;
+	launcher.data.reset_iter = 0;
 	DEBUG_M;
 	for (int i=0; i < nSnaps; i++) {
 		Snaps[i].PreAlloc(_region.nx,_region.ny,_region.nz);
@@ -128,10 +128,10 @@ Lattice::Lattice(lbRegion _region, MPIInfo mpi_, int ns) : zSet(ZONESETTINGS, ZO
 	CudaAllocFinalize();
 	DEBUG_M;
 
-	executor.container.in = Snaps[0];
-	executor.container.out = Snaps[1];
+	launcher.container.in = Snaps[0];
+	launcher.container.out = Snaps[1];
 #ifdef ADJOINT
-	executor.container.adjout = aSnaps[0];
+	launcher.container.adjout = aSnaps[0];
 #endif
 	aSnap = 0;
 	for (int i=0; i < SETTINGS; i++) {
@@ -140,17 +140,17 @@ Lattice::Lattice(lbRegion _region, MPIInfo mpi_, int ns) : zSet(ZONESETTINGS, ZO
         CudaStreamCreate(&kernelStream);
         CudaStreamCreate(&inStream);
         CudaStreamCreate(&outStream);
-        executor.data.ZoneSettings = zSet.gpuTab;
-        executor.data.ConstZoneSettings = zSet.gpuConst;
-        executor.data.ZoneIndex = 0;
+        launcher.data.ZoneSettings = zSet.gpuTab;
+        launcher.data.ConstZoneSettings = zSet.gpuConst;
+        launcher.data.ZoneIndex = 0;
 <?R for (s in rows(ZoneSettings)) { ?>
 //        zSet.set(<?%s s$Index ?>, -1, units.alt("<?%s s$default ?>"));
 <?R } ?>
         ZoneIter = 0;
 	particle_data_size_max = 0;
-	SC.InitFinder(executor.data.solidfinder);
-	executor.data.particle_data = NULL;
-	executor.data.particle_data_size = 0;
+	SC.InitFinder(launcher.data.solidfinder);
+	launcher.data.particle_data = NULL;
+	launcher.data.particle_data_size = 0;
 	SC.balls = &RFI;
 	RFI.name = "TCLB";
 }
@@ -405,28 +405,28 @@ void Lattice::CopyInParticles() {
 		RFI.SendParticles();
 	DEBUG_PROF_POP();
 	if (RFI.size() > particle_data_size_max) {
-		if (executor.data.particle_data != NULL) CudaFree(executor.data.particle_data);
+		if (launcher.data.particle_data != NULL) CudaFree(launcher.data.particle_data);
 		particle_data_size_max = RFI.size();
-		CudaMalloc(&executor.data.particle_data, RFI.mem_size());
+		CudaMalloc(&launcher.data.particle_data, RFI.mem_size());
 	}
-	executor.data.particle_data_size = RFI.size();
+	launcher.data.particle_data_size = RFI.size();
 	for (size_t i=0; i<RFI.size(); i++){
 		RFI.RawData(i, RFI_DATA_FORCE+0) = 0;
 		RFI.RawData(i, RFI_DATA_FORCE+1) = 0;
 		RFI.RawData(i, RFI_DATA_FORCE+2) = 0;
 	}
 	if (RFI.mem_size() > 0) {
-		CudaMemcpyAsync(executor.data.particle_data, RFI.Particles(), RFI.mem_size(), CudaMemcpyHostToDevice, kernelStream);
+		CudaMemcpyAsync(launcher.data.particle_data, RFI.Particles(), RFI.mem_size(), CudaMemcpyHostToDevice, kernelStream);
 	}
 	DEBUG_PROF_PUSH("Tree Build");
 		SC.Build();
 	DEBUG_PROF_POP();
-	SC.CopyToGPU(executor.data.solidfinder, kernelStream);
+	SC.CopyToGPU(launcher.data.solidfinder, kernelStream);
 }
 
 void Lattice::CopyOutParticles() {
 	if (RFI.mem_size() > 0) {
-		CudaMemcpyAsync(RFI.Particles(), executor.data.particle_data, RFI.mem_size(), CudaMemcpyDeviceToHost, kernelStream);
+		CudaMemcpyAsync(RFI.Particles(), launcher.data.particle_data, RFI.mem_size(), CudaMemcpyDeviceToHost, kernelStream);
 	}
     CudaStreamSynchronize(kernelStream);
 	DEBUG_PROF_PUSH("Testing particles for NaNs");
@@ -452,15 +452,15 @@ void Lattice::SetFirstTabs(int tab0, int tab1) {
 	to = mpi.node[mpi.rank].<?%s m$side ?>;
 	if (mpi.rank != to) {
 		gpuin[i] = Snaps[tab1].<?%s m$name ?>;
-		gpuout[i] = executor.container.out.<?%s m$name ?> = gpubuf[i];
+		gpuout[i] = launcher.container.out.<?%s m$name ?> = gpubuf[i];
 		nodeout[i] = to;
 		nodein[i] = from;
 		i ++;
 	} else {
-		executor.container.out.<?%s m$name ?> = Snaps[tab1].<?%s m$name ?>;
+		launcher.container.out.<?%s m$name ?> = Snaps[tab1].<?%s m$name ?>;
 	} <?R
 	} ?>
-	executor.container.in = Snaps[tab0];
+	launcher.container.in = Snaps[tab0];
 }
 
 <?R for (a in rows(Actions)) { ?>
@@ -481,8 +481,8 @@ void Lattice::<?%s a$FunName ?>(int tab0, int tab1, int iter_type)
 	ZoneIter = (Iter + Record_Iter) % zSet.getLen();
 
 	debug1("ZoneIter: %d (in <?%s a$FunName ?>)\n", ZoneIter);
-	executor.data.ZoneIndex = ZoneIter;
-	executor.data.MaxZones = zSet.MaxZones;
+	launcher.data.ZoneIndex = ZoneIter;
+	launcher.data.MaxZones = zSet.MaxZones;
 	SetFirstTabs(tab0, tab1); <?R
 	old_stage_level = 0
 	action_stages = Stages[a$stages,,drop=FALSE]
@@ -499,32 +499,31 @@ void Lattice::<?%s a$FunName ?>(int tab0, int tab1, int iter_type)
 <?R if (old_stage_level > 0) { ?>
 	MPIStream_B();
 	CudaDeviceSynchronize();
-	executor.container.in = Snaps[tab1]; <?R
+	launcher.container.in = Snaps[tab1]; <?R
     }
     old_stage_level = old_stage_level + 1
 ?>
-    executor.CopyToConst();
 	DEBUG_PROF_PUSH("Calculation");
 	switch(iter_type & ITER_INTEG){
 	case ITER_NO:
-		executor.RunBorder< Primal, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
+		launcher.RunBorder< Primal, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
 	case ITER_GLOBS:
-		executor.RunBorder< Primal, IntegrateGlobals, <?%s stage$name ?> >(kernelStream); break;
+		launcher.RunBorder< Primal, IntegrateGlobals, <?%s stage$name ?> >(kernelStream); break;
 #ifdef ADJOINT
 	case ITER_OBJ:
-		executor.RunBorder< Primal, OnlyObjective, <?%s stage$name ?> >(kernelStream); break;
+		launcher.RunBorder< Primal, OnlyObjective, <?%s stage$name ?> >(kernelStream); break;
 #endif
 	}
     CudaStreamSynchronize(kernelStream);
     MPIStream_A();
 	switch(iter_type & ITER_INTEG){
 	case ITER_NO:
-		executor.RunInterior< Primal, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
+		launcher.RunInterior< Primal, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
 	case ITER_GLOBS:
-		executor.RunInterior< Primal, IntegrateGlobals, <?%s stage$name ?> >(kernelStream); break;
+		launcher.RunInterior< Primal, IntegrateGlobals, <?%s stage$name ?> >(kernelStream); break;
 #ifdef ADJOINT
 	case ITER_OBJ:
-		executor.RunInterior< Primal, OnlyObjective, <?%s stage$name ?> >(kernelStream); break;
+		launcher.RunInterior< Primal, OnlyObjective, <?%s stage$name ?> >(kernelStream); break;
 #endif
 	}
 	DEBUG_PROF_POP();
@@ -555,8 +554,8 @@ inline void Lattice::<?%s a$FunName ?>_Adj(int tab0, int tab1, int adjtab0, int
 	int i=0;
 	debug1("[%d] Iteration_Adj %d -> %d type: %d\n", D_MPI_RANK, adjtab0, adjtab1, iter_type);
 	ZoneIter = (Iter + Record_Iter) % zSet.getLen();
-	executor.data.ZoneIndex = ZoneIter;
-	executor.data.MaxZones = zSet.MaxZones;
+	launcher.data.ZoneIndex = ZoneIter;
+	launcher.data.MaxZones = zSet.MaxZones;
 
 	debug1("ZoneIter: %d (in <?%s a$FunName ?>_Adj)\n", ZoneIter);
 <?R
@@ -566,35 +565,32 @@ inline void Lattice::<?%s a$FunName ?>_Adj(int tab0, int tab1, int adjtab0, int
 	from = mpi.node[mpi.rank].<?%s m$side ?>;
 	if (mpi.rank != to) {
 		gpuout[i] = aSnaps[adjtab0].<?%s m$name ?>;
-		gpuin[i] = executor.container.adjin.<?%s m$name ?> = gpubuf[i];
+		gpuin[i] = launcher.container.adjin.<?%s m$name ?> = gpubuf[i];
 		nodeout[i] = to;
 		nodein[i] = from;
 		i ++;
 	} else {
-		executor.container.adjin.<?%s m$name ?> = aSnaps[adjtab0].<?%s m$name ?>;
+		launcher.container.adjin.<?%s m$name ?> = aSnaps[adjtab0].<?%s m$name ?>;
 	}
-	executor.container.in.<?%s m$name ?> = Snaps[tab0].<?%s m$name ?>;
-	executor.container.adjout.<?%s m$name ?> = aSnaps[adjtab1].<?%s m$name ?>;
+	launcher.container.in.<?%s m$name ?> = Snaps[tab0].<?%s m$name ?>;
+	launcher.container.adjout.<?%s m$name ?> = aSnaps[adjtab1].<?%s m$name ?>;
 <?R
 	}
 
 	for (s in a$stages) {
                  stage = Stages[s,,drop=F] ?>
 
-
-        executor.CopyToConst();
-
         MPIStream_A();
 
 	switch(iter_type & ITER_INTEG){
 	case ITER_NO:
-	        executor.RunInterior< Adjoint, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
+	        launcher.RunInterior< Adjoint, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
 	case ITER_GLOBS:
-	        executor.RunInterior< Adjoint, IntegrateGlobals, <?%s stage$name ?> > (kernelStream); break;
+	        launcher.RunInterior< Adjoint, IntegrateGlobals, <?%s stage$name ?> > (kernelStream); break;
 	case ITER_NO | ITER_STEADY:
-	        executor.RunInterior< SteadyAdjoint, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
+	        launcher.RunInterior< SteadyAdjoint, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
 	case ITER_GLOBS | ITER_STEADY:
-	        executor.RunInterior< SteadyAdjoint, IntegrateGlobals, <?%s stage$name ?> > (kernelStream); break;
+	        launcher.RunInterior< SteadyAdjoint, IntegrateGlobals, <?%s stage$name ?> > (kernelStream); break;
 	}
 
         MPIStream_B();
@@ -602,13 +598,13 @@ inline void Lattice::<?%s a$FunName ?>_Adj(int tab0, int tab1, int adjtab0, int
 	DEBUG_M;
 	switch(iter_type & ITER_INTEG){
 	case ITER_NO:
-	        executor.RunBorder< Adjoint, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
+	        launcher.RunBorder< Adjoint, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
 	case ITER_GLOBS:
-	        executor.RunBorder< Adjoint, IntegrateGlobals, <?%s stage$name ?> > (kernelStream); break;
+	        launcher.RunBorder< Adjoint, IntegrateGlobals, <?%s stage$name ?> > (kernelStream); break;
 	case ITER_NO | ITER_STEADY:
-	        executor.RunBorder< SteadyAdjoint, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
+	        launcher.RunBorder< SteadyAdjoint, NoGlobals, <?%s stage$name ?> > (kernelStream); break;
 	case ITER_GLOBS | ITER_STEADY:
-	        executor.RunBorder< SteadyAdjoint, IntegrateGlobals, <?%s stage$name ?> > (kernelStream); break;
+	        launcher.RunBorder< SteadyAdjoint, IntegrateGlobals, <?%s stage$name ?> > (kernelStream); break;
 	}
 	DEBUG_M;
         CudaDeviceSynchronize();
@@ -635,8 +631,8 @@ inline void Lattice::<?%s a$FunName ?>_Opt(int tab0, int tab1, int adjtab0, int
 #ifdef ADJOINT
         <?%s a$FunName ?>(tab0, tab1, iter_type);
         <?%s a$FunName ?>_Adj(tab0, tab1, adjtab0, adjtab1, iter_type | ITER_STEADY);
-        executor.RunInterior< Optimize, NoGlobals, <?%s stage$name ?> > (kernelStream);
-        executor.RunBorder< Optimize, NoGlobals, <?%s stage$name ?> > (kernelStream);
+        launcher.RunInterior< Optimize, NoGlobals, <?%s stage$name ?> > (kernelStream);
+        launcher.RunBorder< Optimize, NoGlobals, <?%s stage$name ?> > (kernelStream);
         CudaDeviceSynchronize();
 #else
 	ERROR("This model doesn't have adjoint!\n");
@@ -791,7 +787,7 @@ Lattice::~Lattice()
 {
 	RFI.Close();
     CudaAllocFreeAll();
-	FreeContainer(executor.container);
+	FreeContainer(launcher.container);
 	for (int i=0; i<nSnaps; i++) {
 		Snaps[i].Free();
 	}
@@ -804,7 +800,7 @@ Lattice::~Lattice()
         Renders graphics in the GUI version
 */
 void Lattice::Color(uchar4 * ptr) {
-	executor.Color(ptr);
+	launcher.Color(ptr);
 }
 
 /// Initialization of lattice nodes
@@ -818,19 +814,19 @@ void Lattice::Init()
 	iSnaps[0]=0;
 	Record_Iter=0;
 	Iter = 0;
-	executor.data.iter = 0;
+	launcher.data.iter = 0;
 	Snap=0;
    MPI_Barrier(MPMD.local);
    Action_Init(1,0,ITER_NO);
    MPI_Barrier(MPMD.local);
 	Iter = 0;
-	executor.data.iter = 0;
+	launcher.data.iter = 0;
 	clearGlobals();
 }
 
 void Lattice::GenerateST() {
 	ST.Generate();
-	ST.CopyToGPU(executor.data.ST);
+	ST.CopyToGPU(launcher.data.ST);
 }
 
 
@@ -916,7 +912,7 @@ void Lattice::Iterate(int niter, int iter_type)
 	        case ITER_ADJOINT:
 			for (i=0; i< niter; i++) {
 				if ((last_glob) && (i == niter -1)) {
-					executor.data.clearGlobals();
+					launcher.data.clearGlobals();
 					iter_type = iter_type | ITER_GLOBS;
 				}
 				Record_Iter -= 1;
@@ -931,7 +927,7 @@ void Lattice::Iterate(int niter, int iter_type)
                 case ITER_NORM:
 			if (last_glob) {
 				IterateTill(niter+Record_Iter-1, iter_type);
-				executor.data.clearGlobals();
+				launcher.data.clearGlobals();
 				IterateTill(Record_Iter+1, iter_type | ITER_GLOBS);
 			} else {
 				IterateTill(niter+Record_Iter, iter_type);
@@ -948,7 +944,7 @@ void Lattice::Iterate(int niter, int iter_type)
 	                iter_type |= ITER_STEADY;
 			for (i=0; i< niter; i++) {
 				if ((last_glob) && (i == niter -1)) {
-					executor.data.clearGlobals();
+					launcher.data.clearGlobals();
 					iter_type |= ITER_GLOBS;
 				}
 				Iteration_Adj(Snap, (Snap+1) % 2, aSnap, (aSnap+1) % 2, iter_type);
@@ -957,23 +953,23 @@ void Lattice::Iterate(int niter, int iter_type)
                 case ITER_NORM:
 			for (i=0; i< niter; i++) {
 				if ((last_glob) && (i == niter -1)) {
-					executor.data.clearGlobals();
+					launcher.data.clearGlobals();
 					iter_type |= ITER_GLOBS;
 				}
 				Iteration(Snap, (Snap+1) % 2, iter_type);
 				Iter ++;
-				executor.data.iter ++;
+				launcher.data.iter ++;
 			}
 			break;
                 case ITER_OPT:
 			for (i=0; i< niter; i++) {
 				if ((last_glob) && (i == niter -1)) {
-					executor.data.clearGlobals();
+					launcher.data.clearGlobals();
 					iter_type |= ITER_GLOBS;
 				}
 				Iteration_Opt(Snap, (Snap+1) % 2, aSnap, (aSnap+1) % 2, iter_type);
 				Iter ++;
-				executor.data.iter ++;
+				launcher.data.iter ++;
 			}
                         break;
 		}
@@ -1013,12 +1009,12 @@ void Lattice::IterateAction(int action, int niter, int iter_type)
                 case ITER_NORM:
 			for (i=0; i< niter; i++) {
 				if ((last_glob) && (i == niter -1)) {
-					executor.data.clearGlobals();
+					launcher.data.clearGlobals();
 					iter_type |= ITER_GLOBS;
 				}
 				RunAction(action, Snap, (Snap+1) % 2, iter_type);
 				Iter ++;
-				executor.data.iter ++;
+				launcher.data.iter ++;
 			}
 			break;
                 case ITER_OPT:
@@ -1049,14 +1045,14 @@ void Lattice::FlagOverwrite(flag_t * mask, lbRegion over)
 {
 	if (region.isEqual(over)) {
 		output("overwriting all flags\n");
-		CudaMemcpy(executor.container.NodeType, mask, sizeof(flag_t)*region.sizeL(), CudaMemcpyHostToDevice);
+		CudaMemcpy(launcher.container.NodeType, mask, sizeof(flag_t)*region.sizeL(), CudaMemcpyHostToDevice);
 	} else {
 		lbRegion inter = region.intersect(over);
 	        int x = inter.dx;
 	        for (int y = inter.dy; y<inter.dy+inter.ny; y++)
 	        for (int z = inter.dz; z<inter.dz+inter.nz; z++)
 	        {
-	                CudaMemcpy2D(&executor.container.NodeType[region.offsetL(x,y,z)], sizeof(flag_t), &mask[over.offsetL(x,y,z)], sizeof(flag_t), sizeof(flag_t), inter.nx, CudaMemcpyHostToDevice);
+	                CudaMemcpy2D(&launcher.container.NodeType[region.offsetL(x,y,z)], sizeof(flag_t), &mask[over.offsetL(x,y,z)], sizeof(flag_t), sizeof(flag_t), inter.nx, CudaMemcpyHostToDevice);
 	        }
 	}
 }
@@ -1076,7 +1072,7 @@ static void ContainerActivateCuts(CartLatticeContainer& container){
 void Lattice::CutsOverwrite(cut_t * Q, lbRegion over)
 {
 	if (Q == NULL) return;
-    ContainerActivateCuts(executor.container);
+    ContainerActivateCuts(launcher.container);
 	lbRegion inter = region.intersect(over);
         int x = inter.dx;
 	size_t regsize = region.size();
@@ -1085,7 +1081,7 @@ void Lattice::CutsOverwrite(cut_t * Q, lbRegion over)
         for (int z = inter.dz; z<inter.dz+inter.nz; z++)
         for (int d = 0; d<26; d++)
         {
-                CudaMemcpy2D(&executor.container.Q[region.offsetL(x,y,z)+d*regsize], sizeof(cut_t), &Q[over.offsetL(x,y,z)+oversize*d], sizeof(cut_t), sizeof(cut_t), inter.nx, CudaMemcpyHostToDevice);
+                CudaMemcpy2D(&launcher.container.Q[region.offsetL(x,y,z)+d*regsize], sizeof(cut_t), &Q[over.offsetL(x,y,z)+oversize*d], sizeof(cut_t), sizeof(cut_t), inter.nx, CudaMemcpyHostToDevice);
         }
 
 }
@@ -1096,14 +1092,14 @@ void Lattice::GetFlags(lbRegion over, flag_t * NodeType)
 	size_t offset;
 	lbRegion inter = region.intersect(over);
 	if (region.isEqual(over)) {
-                CudaMemcpy(NodeType, executor.container.NodeType, sizeof(flag_t)*region.sizeL(), CudaMemcpyDeviceToHost);
+                CudaMemcpy(NodeType, launcher.container.NodeType, sizeof(flag_t)*region.sizeL(), CudaMemcpyDeviceToHost);
         } else {
 	        int x = inter.dx;
 	        for (int y = inter.dy; y<inter.dy+inter.ny; y++)
 	        for (int z = inter.dz; z<inter.dz+inter.nz; z++)
 	        {
 			offset = region.offsetL(x,y,z);
-			CudaMemcpy2D(&NodeType[over.offsetL(x,y,z)], sizeof(flag_t), &executor.container.NodeType[offset], sizeof(flag_t), sizeof(flag_t), inter.nx, CudaMemcpyDeviceToHost);
+			CudaMemcpy2D(&NodeType[over.offsetL(x,y,z)], sizeof(flag_t), &launcher.container.NodeType[offset], sizeof(flag_t), sizeof(flag_t), inter.nx, CudaMemcpyDeviceToHost);
 		}
 	}
 }
@@ -1215,7 +1211,6 @@ void Lattice::GetQuantity(int quant, lbRegion over, real_t * tab, real_t scale)
 
 
 <?R for (q in rows(Quantities)) { ifdef(q$adjoint); ?>
-
 /// Get [<?%s q$comment ?>]
 /**
         Retrive the values of the Quantity <?%s q$name ?> (<?%s q$comment ?>)
@@ -1223,20 +1218,20 @@ void Lattice::GetQuantity(int quant, lbRegion over, real_t * tab, real_t scale)
 */
 void Lattice::Get<?%s q$name ?>(lbRegion over, <?%s q$type ?> * tab, real_t scale)
 {
-	executor.container.in = Snaps[Snap];
-	<?R if (q$adjoint) { ?> executor.container.adjin = aSnaps[aSnap]; <?R } ?>
-        executor.CopyToConst();
+	launcher.container.in = Snaps[Snap];
+	<?R if (q$adjoint) { ?> launcher.container.adjin = aSnaps[aSnap]; <?R } ?>
 
 	lbRegion inter = region.intersect(over);
 	if (inter.size()==0) return;
 	<?%s q$type ?> * buf=NULL;
 	CudaMalloc((void**)&buf, inter.sizeL()*sizeof(<?%s q$type ?>));
-        {	lbRegion small = inter;
-		small.dx -= region.dx;
-		small.dy -= region.dy;
-		small.dz -= region.dz;
-		CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny,small.nz) , dim3(1) , small, buf, scale);
-		CudaMemcpy(tab, buf, small.sizeL()*sizeof(<?%s q$type ?>), CudaMemcpyDeviceToHost);
+    {
+        lbRegion small = inter;
+        small.dx -= region.dx;
+        small.dy -= region.dy;
+        small.dz -= region.dz;
+        launcher.GetQuantity<?%s q$name ?>(small, buf, scale);
+        CudaMemcpy(tab, buf, small.sizeL()*sizeof(<?%s q$type ?>), CudaMemcpyDeviceToHost);
 	}
 	CudaFree(buf);
 }
@@ -1298,7 +1293,7 @@ void Lattice::pop_settings()
 */
 void Lattice::getGlobals(real_t * tab) {
         real_t tabl[ GLOBALS ];
-        executor.data.getGlobals(tabl); <?R
+        launcher.data.getGlobals(tabl); <?R
         by(Globals,Globals$op,function(G) { n = nrow(G); ?>
                 MPI_Reduce(
                         &tabl[<?%s G$Index[1] ?>],
@@ -1331,7 +1326,7 @@ void Lattice::calcGlobals() {
 ?>
 	tab[<?%s Globals$Index[Globals$name == "Objective"] ?>] += obj;
 	for (int i=0; i< GLOBALS ; i++) globals[i] += tab[i];
-	executor.data.clearGlobals();
+	launcher.data.clearGlobals();
 }
 
 /// Clear the internal globals table
@@ -1359,12 +1354,19 @@ double Lattice::getObjective() {
 /**
         Set a specific Setting
         \param i Index of the Setting
-        \param tmp Value of the Setting
+        \param val Value of the Setting
+*/
+void Lattice::setSetting(int i, real_t val) {
+  launcher.data.settings[i] = val;
+}
+
+/// Set a Setting
+/**
+        Get a specific Setting
+        \param i Index of the Setting
 */
-void Lattice::setSetting(int i, real_t tmp) {
-	settings[i] = tmp;
-        debug1("[%d] Settings %d to %f\n", D_MPI_RANK,i, tmp);
-	setConstSetting(i,tmp);
+real_t Lattice::getSetting(int i) {
+  return launcher.data.settings[i];
 }
 
 void Lattice::SetSetting(const Model::Setting& set, real_t val) {
@@ -1378,7 +1380,7 @@ void Lattice::SetSetting(const Model::Setting& set, real_t val) {
 }
 
 void Lattice::resetAverage() {
-	executor.data.reset_iter = executor.data.iter;
+	launcher.data.reset_iter = launcher.data.iter;
         <?R for (f in rows(Fields))  if (f$average) { ?>
           CudaMemset(&Snaps[Snap].block14[<?%s f$Index ?>*region.sizeL()],0,region.sizeL()*sizeof(real_t));
 	 <?R } ?>
@@ -1386,11 +1388,11 @@ void Lattice::resetAverage() {
 <?R for (q in rows(Quantities)) { ifdef(q$adjoint); ?>
 void Lattice::GetSample<?%s q$name ?>(lbRegion over, real_t scale,real_t* buf)
 {
-       	executor.container.in = Snaps[Snap];
-       	<?R if (q$adjoint) { ?> executor.container.adjin = aSnaps[aSnap]; <?R } ?>
-        executor.CopyToConst();
-	lbRegion small = region.intersect(over);
-	CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny) , dim3(1) , small, (<?%s q$type?>*) buf, scale);
+    launcher.container.in = Snaps[Snap];
+<?R if (q$adjoint) { ?>
+    launcher.container.adjin = aSnaps[aSnap]; <?R } ?>
+    lbRegion small = region.intersect(over);
+    launcher.SampleQuantity<?%s q$name ?>(small, (<?%s q$type ?>*)buf, scale);
 }
 <?R } ;ifdef() ?>
 void Lattice::updateAllSamples(){
@@ -1401,7 +1403,7 @@ void Lattice::updateAllSamples(){
 	 	 if (sample->quant->in("<?%s q$name ?>"))
 		{
                         double v = sample->units.alt("<?%s q$unit ?>");
-			GetSample<?%s q$name ?>(sample->spoints[j].location,1/v,&sample->gpu_buffer[sample->location["<?%s q$name ?>"]+(executor.data.iter - sample->startIter)*sample->size + sample->totalIter*j*sample->size]);
+			GetSample<?%s q$name ?>(sample->spoints[j].location,1/v,&sample->gpu_buffer[sample->location["<?%s q$name ?>"]+(launcher.data.iter - sample->startIter)*sample->size + sample->totalIter*j*sample->size]);
 		}
 		<?R }; ifdef() ?>
 	} }
diff --git a/src/Lattice.h.Rt b/src/Lattice.h.Rt
index ef2d15f99..14eddf4d8 100644
--- a/src/Lattice.h.Rt
+++ b/src/Lattice.h.Rt
@@ -12,7 +12,7 @@
 #include "Sampler.h"
 #include "SolidContainer.h"
 #include "Lists.h"
-#include "CartLatticeExecutor.h"
+#include "CartLatticeLauncher.h"
 
 class lbRegion;
 
@@ -39,7 +39,7 @@ const int maxSnaps=33;
 class Lattice {
 private:
   Model_m my_model;
-  CartLatticeExecutor executor; ///< Main execution context, including the lattice container, data, etc.
+  CartLatticeLauncher launcher; ///< Main execution context, including the lattice container, data, etc.
   storage_t *mpiin[27], *mpiout[27]; ///< MPI Buffers
   storage_t *gpuin[27], *gpuout[27], *gpubuf[27], *gpubuf2[27]; ///< GPU Buffers
   size_t bufsize[27]; ///< Sizes of the Buffers
@@ -187,6 +187,7 @@ void GetQuantity(int quant, lbRegion over, real_t * tab, real_t scale);
   double getObjective();
   void resetAverage();
   void setSetting(int i, real_t tmp);
+  real_t getSetting(int i);
   void SetSetting(const Model::Setting& set, real_t val);
   void GenerateST();
 };
diff --git a/src/LatticeData.hpp.Rt b/src/LatticeData.hpp.Rt
index 4bb1ae89f..a8d3f75c1 100644
--- a/src/LatticeData.hpp.Rt
+++ b/src/LatticeData.hpp.Rt
@@ -10,21 +10,11 @@
 
 #include "SyntheticTurbulence.h"
 #include "SolidContainer.h"
-
-#ifndef SETTINGS_H
-<?R
-	for (v in rows(Settings)) { ?>
-    CudaExternConstantMemory(real_t <?%s v$name ?>); <?R
-	} ?>
-    void initSettings();
-    void setConstSetting(int i, real_t tmp);
-
-#define SETTINGS_H 1
-#endif
-
 #include "Consts.h"
 #include "cross.h"
 
+#include <array>
+
 struct LatticeData {
     size_t particle_data_size;
     real_t *particle_data;
@@ -37,31 +27,30 @@ struct LatticeData {
     real_t **ZoneSettings;
     real_t *ConstZoneSettings;
     STWaveSet ST;
+    real_t settings[SETTINGS]; ///< settings initialized to zero
 
-    void CopyToConst() const;
-
-    CudaDeviceFunction real_t ZoneSetting(const int &s, const int &z) {
+    CudaDeviceFunction real_t ZoneSetting(const int &s, const int &z) const {
         const int i = s + ZONESETTINGS * z;
         const real_t *w = ZoneSettings[i];
         if (w == NULL) return ConstZoneSettings[i];
         return w[ZoneIndex];
     }
 
-    CudaDeviceFunction real_t ZoneSetting_DT(const int &s, const int &z) {
+    CudaDeviceFunction real_t ZoneSetting_DT(const int &s, const int &z) const {
         const int i = s + ZONESETTINGS * z;
         const real_t *w = ZoneSettings[i + DT_OFFSET];
         if (w == NULL) return 0;
         return w[ZoneIndex];
     }
 
-    CudaDeviceFunction real_t *ZoneSettingGrad(const int &s, const int &z) {
+    CudaDeviceFunction real_t *ZoneSettingGrad(const int &s, const int &z) const {
         const int i = s + ZONESETTINGS * z;
         real_t *w = ZoneSettings[i + GRAD_OFFSET];
         if (w == NULL) return &ConstZoneSettings[i + GRAD_OFFSET];
         return &w[ZoneIndex];
     }
 
-    CudaDeviceFunction vector_t getST(real_t x, real_t y, real_t z) {
+    CudaDeviceFunction vector_t getST(real_t x, real_t y, real_t z) const {
         return calc(ST, x, y, z);
     }
 
diff --git a/src/Node.hpp.Rt b/src/Node.hpp.Rt
index 5f91bb01b..71e833601 100644
--- a/src/Node.hpp.Rt
+++ b/src/Node.hpp.Rt
@@ -7,6 +7,8 @@
         Defines Node based on supplied Dynamics
 */
 
+#include "Particle.hpp"
+
 <?R
 	paste0_s = function(...) if (any(sapply(list(...),length) < 1)) character(0) else paste0(...)
 	macros = NULL
@@ -20,16 +22,16 @@
 	XYZ = c("X","Y","Z")
 	AddMacro(XYZ, paste0_s("acc.get", XYZ,"()"))
 	AddMacro("NodeType", paste0_s("acc.getNodeType()"))
-	AddMacro("Time", paste0_s("(constData.iter)"))
-	AddMacro("SyntheticTurbulence(x__,y__,z__)","constData.getST(x__,y__,z__)")
-	AddMacro("average_iter","(constData.iter - constData.reset_iter)")
+	AddMacro("Time", paste0_s("(data->iter)"))
+	AddMacro("SyntheticTurbulence(x__,y__,z__)","data->getST(x__,y__,z__)")
+	AddMacro("average_iter","(data->iter - data->reset_iter)")
 	P = expand.grid(x=0:2,y=0:2,z=0:2)
 	Q = paste("Q",P$x,P$y,P$z,sep="")[-1]
 	AddMacro(Q, paste0_s("(acc.getQ(",seq_along(Q)-1,"))"))
 	s = ZoneSettings[! as.logical(ZoneSettings$preload),,drop=FALSE]
-	AddMacro(s$name, paste0_s("constData.ZoneSetting(", s$Index, ", acc.getNodeType() >> ZONE_SHIFT)"))
+	AddMacro(s$name, paste0_s("data->ZoneSetting(", s$Index, ", acc.getNodeType() >> ZONE_SHIFT)"))
 	s = ZoneSettings
-	AddMacro(paste0_s(s$name,"_DT"), paste0_s("constData.ZoneSetting_DT(", s$Index, ", acc.getNodeType() >> ZONE_SHIFT)"))
+	AddMacro(paste0_s(s$name,"_DT"), paste0_s("data->ZoneSetting_DT(", s$Index, ", acc.getNodeType() >> ZONE_SHIFT)"))
 	g = Globals
 	g$opstr = ifelse(g$op == "SUM", "Add", "Max")
 	AddMacro(paste0_s("AddTo", g$name, "(x__)"), paste0_s("glob.", g$opstr, "ToGlobal<", g$Index, ">(x__, acc.getNodeType())"))
@@ -37,6 +39,8 @@
 	AddMacro(paste0_s("Iam",n$name),paste0_s("((acc.getNodeType() & ", n$groupIndex, ") == ", n$Index,")"))
 	n = NodeTypeGroups
 	AddMacro(paste0_s("Iam",n$name),paste0_s("(acc.getNodeType() & ", n$Index, ")"))
+	s = Settings[is.na(Settings$derived), ]
+    AddMacro(s$name, paste0_s("data->settings[",s$Index,"]"))
 ?>
 
 template <eCalculateGlobals G>
@@ -44,6 +48,7 @@ struct CalculateGlobals {};
 
 template <>
 struct CalculateGlobals<NoGlobals> {
+    CudaDeviceFunction CalculateGlobals(const LatticeData&) {}
 	template <int I>
 	CudaDeviceFunction inline void AddToGlobal(const real_t& x, const flag_t& NodeType) {}
 	template <int I>
@@ -53,17 +58,18 @@ struct CalculateGlobals<NoGlobals> {
 
 template <>
 struct CalculateGlobals<IntegrateGlobals> {
-	real_t globals[GLOBALS];
-	CudaDeviceFunction CalculateGlobals() {
-		for (int i=0; i<GLOBALS; i++) {
-            globals[i] = 0.0;
-		}
-	}
+    real_t globals[GLOBALS];
+    const LatticeData* data;
+
+    CudaDeviceFunction CalculateGlobals(const LatticeData& data_) : data(&data_) {
+      for (int i=0; i<GLOBALS; i++)
+        globals[i] = 0.0;
+    }
 	template <int I>
 	CudaDeviceFunction inline void AddToGlobal(const real_t& x, const flag_t& NodeType) {
 		globals[I] = globals[I] + x;
 		if (I < SUM_GLOBALS) {
-			globals[GLOBALS_Objective] = globals[GLOBALS_Objective] + constData.ZoneSetting(I + IN_OBJ_OFFSET, NodeType >> ZONE_SHIFT);
+			globals[GLOBALS_Objective] = globals[GLOBALS_Objective] + data->ZoneSetting(I + IN_OBJ_OFFSET, NodeType >> ZONE_SHIFT);
 		}
 	}
 	template <int I>
@@ -73,9 +79,9 @@ struct CalculateGlobals<IntegrateGlobals> {
 	CudaDeviceFunction void inline Glob() {
 		for (int i=0; i<GLOBALS; i++) {
 			if (i < SUM_GLOBALS) {
-            	CudaAtomicAddReduceWarp(&constData.Globals[i], globals[i]);
+            	CudaAtomicAddReduceWarp(&data->Globals[i], globals[i]);
 			} else {
-				CudaAtomicMaxReduceWarp(&constData.Globals[i], globals[i]);
+				CudaAtomicMaxReduceWarp(&data->Globals[i], globals[i]);
 			}
 		}
 	}
@@ -84,25 +90,28 @@ struct CalculateGlobals<IntegrateGlobals> {
 template <>
 struct CalculateGlobals<OnlyObjective> {
 	real_t obj;
-	CudaDeviceFunction CalculateGlobals() {
-		obj = 0.0;
-	}
+	const LatticeData* data;
+
+	CudaDeviceFunction CalculateGlobals(const LatticeData& data_) : obj(0.), data(&data_) {}
 	template <int I>
 	CudaDeviceFunction inline void AddToGlobal(const real_t& x, const flag_t& NodeType) {
 		if (I < SUM_GLOBALS) {
-			obj = obj + constData.ZoneSetting(I + IN_OBJ_OFFSET, NodeType >> ZONE_SHIFT);
+			obj = obj + data->ZoneSetting(I + IN_OBJ_OFFSET, NodeType >> ZONE_SHIFT);
 		}
 	}
 	template <int I>
 	CudaDeviceFunction inline void MaxToGlobal(const real_t& x, const flag_t& NodeType) {}
 	CudaDeviceFunction void inline Glob() {
-        CudaAtomicAddReduceWarp(&constData.Globals[GLOBALS_Objective], obj);
+        CudaAtomicAddReduceWarp(&data->Globals[GLOBALS_Objective], obj);
 	}
 };
 
 struct CalculateGlobalsAdjoint {
 	real_t duals[ZONESETTINGS];
 	real_t duals_dt[ZONESETTINGS];
+	const LatticeData* data;
+
+    CudaDeviceFunction CalculateGlobalsAdjoint(const LatticeData& data_) : data(&data_) {}
 	CudaDeviceFunction CalculateGlobalsAdjoint() {
 		for (int i=0; i<ZONESETTINGS; i++) {
             duals[i] = 0.0;
@@ -115,25 +124,42 @@ struct CalculateGlobalsAdjoint {
 	CudaDeviceFunction inline real_t& DTBRef() { return duals_dt[I]; }
 	CudaDeviceFunction void inline Glob(const flag_t& NodeType) {
         int z = NodeType >> ZONE_SHIFT;
-		for (int nz = 0; nz < constData.MaxZones; nz++) if (CudaSyncWarpOr(nz == z)) {
+		for (int nz = 0; nz < data->MaxZones; nz++) if (CudaSyncWarpOr(nz == z)) {
 			for (int i=0; i<ZONESETTINGS; i++) {
 				real_t val;
 				val = (nz == z) ? duals[i] : 0.0f;
-    	        CudaAtomicAddReduceWarp(constData.ZoneSettingGrad( i , nz), val);
+    	        CudaAtomicAddReduceWarp(data->ZoneSettingGrad( i , nz), val);
 				val = (nz == z) ? duals_dt[i] : 0.0f;
-	            CudaAtomicAddReduceWarp(constData.ZoneSettingGrad( i + DT_OFFSET, nz), val);
+	            CudaAtomicAddReduceWarp(data->ZoneSettingGrad( i + DT_OFFSET, nz), val);
 			}
 		}
 	}
 };
 
+template <class T>
+CudaDeviceFunction T ParticleIteratorXBlockT(real_t x, real_t y, real_t z, const LatticeData* data) {
+	real_t mar = PART_MAR;
+	real_t point[3] = {x,y,z};
+	real_t lower[3] = {x-CudaThread.x-mar,y-mar,z-mar};
+	real_t upper[3] = {x-CudaThread.x+CudaNumberOfThreads.x-1.0f+mar,y+mar,z+mar};
+	return T(data->solidfinder, data->particle_data, point, lower, upper);
+}
+
+template <class T>
+CudaDeviceFunction T ParticleIteratorT(real_t x, real_t y, real_t z, const LatticeData* data) {
+	real_t mar = PART_MAR;
+	real_t point[3] = {x,y,z};
+	real_t lower[3] = {x-mar,y-mar,z-mar};
+	real_t upper[3] = {x+mar,y+mar,z+mar};
+	return T(data->solidfinder, data->particle_data, point, lower, upper);
+}
 
 #ifndef NODE_HPP
 #define NODE_HPP
 
 template < class LA, eOperationType I, eCalculateGlobals G, eStage S> struct Node {
         CudaDeviceFunction inline void RunElement(){};
-		CudaDeviceFunction inline Node(const LA& acc_) {};
+		CudaDeviceFunction inline Node(const LA& acc_, const LatticeData&) {};
 };
 
 <?R
@@ -151,16 +177,17 @@ for (tp in rows(Dispatch)) {
 ?>
 template < class LA > struct Node < LA, <?%s T1 ?>, <?%s T2 ?>, <?%s T3 ?> > {
 	const LA& acc;
+	const LatticeData* data;
 	CalculateGlobals< <?%s T2 ?> > glob;
 <?R
 		for(setting in rows(ZoneSettings)) if (setting$preload) { ?>
 	real_t <?%s setting$name ?>; <?R
 		}
 ?>
-	CudaDeviceFunction inline Node(const LA& acc_):acc(acc_) {
+	CudaDeviceFunction Node(const LA& acc_, const LatticeData& data_) : acc(acc_), data(&data_), glob(data_) {
 		int z = NodeType >> ZONE_SHIFT; <?R
 		for(setting in rows(ZoneSettings)) if (setting$preload) { ?>
-		<?%s setting$name ?> =  constData.ZoneSetting(<?%s setting$Index ?>, z); <?R
+		<?%s setting$name ?> =  data->ZoneSetting(<?%s setting$Index ?>, z); <?R
 		} ?>
 	};
 	CudaDeviceFunction void inline Glob() {
@@ -273,7 +300,21 @@ template < class LA > struct Node < LA, <?%s T1 ?>, <?%s T2 ?>, <?%s T3 ?> > {
 		ExecElement();
 		SaveElement();
 		Glob();
-	} <?R
+	}
+#ifdef USE_ADDOPP
+    CudaDeviceFunction set_found_t_s SyncParticleIterator(real_t x, real_t y, real_t z) {
+        return ParticleIteratorT< set_found_t_s >(x, y, z, data);
+    }
+#else
+    CudaDeviceFunction set_found_t_s SyncParticleIterator(real_t x, real_t y, real_t z) {
+        return ParticleIteratorXBlockT< set_found_t_s >(x, y, z, data);
+    }
+#endif
+
+    CudaDeviceFunction set_found_t_i ParticleIterator(real_t x, real_t y, real_t z) {
+        return ParticleIteratorT< set_found_t_i >(x, y, z, data);
+    }
+	<?R
         } ########################################################################################### ?>
 }; <?R
 } 
diff --git a/src/Particle.hpp b/src/Particle.hpp
index c04eb98be..daa117ee6 100644
--- a/src/Particle.hpp
+++ b/src/Particle.hpp
@@ -5,7 +5,7 @@
 #define OPP_SYNC 0x04
 
 struct Particle {
-//	vector_t pos, vel, angvel;
+    real_t* particle_data;
 	vector_t cvel, diff;
 	real_t rad; real_t dist;
 	CudaDeviceFunction bool in() {
@@ -15,18 +15,19 @@ struct Particle {
 
 struct ParticleI : Particle {
 	size_t i;
-	CudaDeviceFunction ParticleI(const size_t& i_, const real_t node[3]): i(i_) {
+	CudaDeviceFunction ParticleI(real_t* particle_data_, const size_t& i_, const real_t node[3]) : i(i_) {
+        particle_data = particle_data_;
 		vector_t pos, vel, angvel;
-		rad = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_R];
-		pos.x = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_POS+0];
-		pos.y = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_POS+1];
-		pos.z = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_POS+2];
-		vel.x = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_VEL+0];
-		vel.y = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_VEL+1];
-		vel.z = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_VEL+2];
-		angvel.x = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_ANGVEL+0];
-		angvel.y = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_ANGVEL+1];
-		angvel.z = constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_ANGVEL+2];
+		rad = particle_data[i*RFI_DATA_SIZE+RFI_DATA_R];
+		pos.x = particle_data[i*RFI_DATA_SIZE+RFI_DATA_POS+0];
+		pos.y = particle_data[i*RFI_DATA_SIZE+RFI_DATA_POS+1];
+		pos.z = particle_data[i*RFI_DATA_SIZE+RFI_DATA_POS+2];
+		vel.x = particle_data[i*RFI_DATA_SIZE+RFI_DATA_VEL+0];
+		vel.y = particle_data[i*RFI_DATA_SIZE+RFI_DATA_VEL+1];
+		vel.z = particle_data[i*RFI_DATA_SIZE+RFI_DATA_VEL+2];
+		angvel.x = particle_data[i*RFI_DATA_SIZE+RFI_DATA_ANGVEL+0];
+		angvel.y = particle_data[i*RFI_DATA_SIZE+RFI_DATA_ANGVEL+1];
+		angvel.z = particle_data[i*RFI_DATA_SIZE+RFI_DATA_ANGVEL+2];
 		diff.x = pos.x - node[0];
 		diff.y = pos.y - node[1];
 		diff.z = pos.z - node[2];
@@ -41,7 +42,7 @@ template <int SYNC>
 struct ParticleS : ParticleI {
 	vector_t force;
 	vector_t moment;
-	CudaDeviceFunction ParticleS(const size_t& i_, const real_t node[3]) : ParticleI(i_,node) {
+	CudaDeviceFunction ParticleS(real_t* particle_data_, const size_t& i_, const real_t node[3]) : ParticleI(particle_data_, i_, node) {
 		force.x = 0;
 		force.y = 0;
 		force.z = 0;
@@ -62,37 +63,37 @@ struct ParticleS : ParticleI {
 
 template <>
 CudaDeviceFunction ParticleS< NO_SYNC >::~ParticleS() {
-	CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+0],force.x);
-	CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+1],force.y);
-	CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+2],force.z);
-	CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+0],moment.x);
-	CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+1],moment.y);
-	CudaAtomicAdd(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+2],moment.z);
+	CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+0],force.x);
+	CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+1],force.y);
+	CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+2],force.z);
+	CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+0],moment.x);
+	CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+1],moment.y);
+	CudaAtomicAdd(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+2],moment.z);
 }
 
 template <>
 CudaDeviceFunction ParticleS< WARP_SYNC >::~ParticleS() {
 	real_t val[6] = {force.x,force.y,force.z,moment.x,moment.y,moment.z};
-	CudaAtomicAddReduceWarpArr<6>(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE],val);
+	CudaAtomicAddReduceWarpArr<6>(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE],val);
 }
 
 #ifdef CROSS_HAS_ADDOPP
 template <>
 CudaDeviceFunction ParticleS< OPP_SYNC >::~ParticleS() {
 	real_t val[6] = {force.x,force.y,force.z,moment.x,moment.y,moment.z};
-	CudaAtomicAddReduceOppArr<6>(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE],val);
+	CudaAtomicAddReduceOppArr<6>(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE],val);
 }
 #endif
 
 
 template <>
 CudaDeviceFunction ParticleS< BLOCK_SYNC >::~ParticleS() {
-	CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+0],force.x);
-	CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+1],force.y);
-	CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+2],force.z);
-	CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+0],moment.x);
-	CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+1],moment.y);
-	CudaAtomicAddReduce(&constData.particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+2],moment.z);
+	CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+0],force.x);
+	CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+1],force.y);
+	CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_FORCE+2],force.z);
+	CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+0],moment.x);
+	CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+1],moment.y);
+	CudaAtomicAddReduce(&particle_data[i*RFI_DATA_SIZE+RFI_DATA_MOMENT+2],moment.z);
 }
 
 
@@ -104,46 +105,10 @@ CudaDeviceFunction ParticleS< BLOCK_SYNC >::~ParticleS() {
 
 #ifdef SOLID_CACHE
 	typedef typename solidcontainer_t::cache_set_found_t< ParticleS< PART_SYNC >, SOLID_CACHE > set_found_t;
-#else
-	typedef typename solidcontainer_t::set_found_t< ParticleS< PART_SYNC > > set_found_t;
-#endif
-
-template <class T>
-CudaDeviceFunction T ParticleIteratorXBlockT(real_t x, real_t y, real_t z) {
-	real_t mar = PART_MAR;
-	real_t point[3] = {x,y,z};
-	real_t lower[3] = {x-CudaThread.x-mar,y-mar,z-mar};
-	real_t upper[3] = {x-CudaThread.x+CudaNumberOfThreads.x-1.0f+mar,y+mar,z+mar};
-	return T(constData.solidfinder, point, lower, upper);
-}
-
-template <class T>
-CudaDeviceFunction T ParticleIteratorT(real_t x, real_t y, real_t z) {
-	real_t mar = PART_MAR;
-	real_t point[3] = {x,y,z};
-	real_t lower[3] = {x-mar,y-mar,z-mar};
-	real_t upper[3] = {x+mar,y+mar,z+mar};
-	return T(constData.solidfinder, point, lower, upper);
-}
-
-#ifdef SOLID_CACHE
-	typedef typename solidcontainer_t::cache_set_found_t< ParticleS< PART_SYNC >, SOLID_CACHE > set_found_t_s;
+    typedef typename solidcontainer_t::cache_set_found_t< ParticleS< PART_SYNC >, SOLID_CACHE > set_found_t_s;
 	typedef typename solidcontainer_t::cache_set_found_t< ParticleI, SOLID_CACHE > set_found_t_i;
 #else
-	typedef typename solidcontainer_t::set_found_t< ParticleS< PART_SYNC > > set_found_t_s;
-	typedef typename solidcontainer_t::set_found_t< ParticleI > set_found_t_i;
-#endif
-
-#ifdef USE_ADDOPP
-	CudaDeviceFunction set_found_t_s SyncParticleIterator(real_t x, real_t y, real_t z) {
-		return ParticleIteratorT< set_found_t_s >(x,y,z);
-	}
-#else
-	CudaDeviceFunction set_found_t_s SyncParticleIterator(real_t x, real_t y, real_t z) {
-		return ParticleIteratorXBlockT< set_found_t_s >(x,y,z);
-	}
+	typedef typename solidcontainer_t::set_found_t< ParticleS< PART_SYNC > > set_found_t;
+    typedef typename solidcontainer_t::set_found_t< ParticleS< PART_SYNC > > set_found_t_s;
+    typedef typename solidcontainer_t::set_found_t< ParticleI > set_found_t_i;
 #endif
-
-CudaDeviceFunction set_found_t_i ParticleIterator(real_t x, real_t y, real_t z) {
-	return ParticleIteratorT< set_found_t_i >(x,y,z);
-}
diff --git a/src/Region.h b/src/Region.h
index 9103b82fd..4ab72f2df 100644
--- a/src/Region.h
+++ b/src/Region.h
@@ -25,16 +25,16 @@ class lbRegion {
     if (ret.nx <= 0 || ret.ny <= 0 || ret.nz <= 0) { ret.nx = ret.ny = ret.nz = 0; };
     return ret;
   };
-  inline int offset(int x,int y) {
+  int offset(int x,int y) const {
     return (x-dx) + (y-dy) * nx;
   };
   inline void print() {
     printf("Region: %dx%dx%d + %d,%d,%d\n", nx,ny,nz,dx,dy,dz);
   };
-  CudaHostFunction CudaDeviceFunction inline int offset(int x,int y,int z) {
+  CudaHostFunction CudaDeviceFunction int offset(int x,int y,int z) const {
     return (x-dx) + (y-dy) * nx + (z-dz) * nx * ny;
   };
-  CudaHostFunction CudaDeviceFunction inline size_t offsetL(int x,int y,int z) {
+  CudaHostFunction CudaDeviceFunction size_t offsetL(int x,int y,int z) const {
     return (x-dx) + (y-dy) * nx + (z-dz) * nx * ny;
   };
 
diff --git a/src/SolidAll.h b/src/SolidAll.h
index 7126b6d2e..e8b16a767 100644
--- a/src/SolidAll.h
+++ b/src/SolidAll.h
@@ -9,6 +9,7 @@ class SolidAll {
     template <class T>
     class set_found_t {
         const finder_t& finder;
+        real_t* particle_data;
         real_t point[3];
         class iterator_t {
             const set_found_t * set;
@@ -16,7 +17,7 @@ class SolidAll {
             CudaDeviceFunction iterator_t(const set_found_t& set_, const addr_t& i_) : set(&set_), i(i_) { };
             friend class set_found_t;
         public:
-            CudaDeviceFunction T operator* () { return T(i,set->point); }
+            CudaDeviceFunction T operator* () { return T(set->particle_data, i, set->point); }
             CudaDeviceFunction iterator_t& operator++() { 
                 ++i;
 	            return *this;
@@ -27,7 +28,7 @@ class SolidAll {
             typedef iterator_t iterator;
             CudaDeviceFunction inline iterator begin() { return iterator(*this, 0); };
             CudaDeviceFunction inline iterator end()   { return iterator(*this, finder.size); };
-            CudaDeviceFunction inline set_found_t(const finder_t& finder_, const real_t point_[], const real_t lower[], const real_t upper[]): finder(finder_) {
+            CudaDeviceFunction inline set_found_t(const finder_t& finder_, real_t* particle_data_, const real_t point_[], const real_t lower[], const real_t upper[]) : finder(finder_),  particle_data(particle_data_) {
                 for (int i=0;i<3;i++) { point[i]=point_[i]; }
             };
     };
diff --git a/src/SolidGrid.h b/src/SolidGrid.h
index 0ce721077..fa345b74d 100644
--- a/src/SolidGrid.h
+++ b/src/SolidGrid.h
@@ -19,6 +19,7 @@ class SolidGrid {
     template <class T>
     class set_found_t {
         const finder_t& finder;
+        real_t* particle_data;
         real_t point[3];
         int mins[3];
         int maxs[3];
@@ -62,7 +63,7 @@ class SolidGrid {
             CudaDeviceFunction iterator_t() : set(NULL) { balli = -1; };
             friend class set_found_t;
         public:
-            CudaDeviceFunction T operator* () { return T(balli,set->point); }
+            CudaDeviceFunction T operator* () { return T(set->particle_data, balli, set->point); }
             CudaDeviceFunction iterator_t& operator++() { 
                 ++d;
                 go();
@@ -74,7 +75,7 @@ class SolidGrid {
             typedef iterator_t iterator;
             CudaDeviceFunction inline iterator begin() { return iterator(*this); };
             CudaDeviceFunction inline iterator end()   { return iterator(); };
-            CudaDeviceFunction inline set_found_t(const finder_t& finder_, const real_t point_[], const real_t lower[], const real_t upper[]): finder(finder_) {
+            CudaDeviceFunction inline set_found_t(const finder_t& finder_, real_t* particle_data_, const real_t point_[], const real_t lower[], const real_t upper[]): finder(finder_),  particle_data(particle_data_) {
                 for (int i=0;i<3;i++) { point[i]=point_[i]; }
                 real_t d = 0.5*finder.delta;
                 for (int k=0; k<3; k++) {
@@ -87,6 +88,7 @@ class SolidGrid {
     };
     template <class T, int MAX_CACHE>
     class cache_set_found_t {
+        real_t* particle_data;
         real_t point[3];
         size_t cache_size;
         tr_addr_t cache[MAX_CACHE];
@@ -96,7 +98,7 @@ class SolidGrid {
             CudaDeviceFunction iterator_t(const cache_set_found_t& set_, const size_t& i_) : set(&set_), i(i_) { };
             friend class cache_set_found_t;
         public:
-            CudaDeviceFunction T operator* () { return T(set->cache[i],set->point); }
+            CudaDeviceFunction T operator* () { return T(set->particle_data, set->cache[i], set->point); }
             CudaDeviceFunction iterator_t& operator++() { 
                 ++i;
 	            return *this;
@@ -107,7 +109,7 @@ class SolidGrid {
             typedef iterator_t iterator;
             CudaDeviceFunction inline iterator begin() { return iterator(*this, 0); };
             CudaDeviceFunction inline iterator end()   { return iterator(*this, cache_size); };
-            CudaDeviceFunction inline cache_set_found_t(const finder_t& finder, const real_t point_[], const real_t lower[], const real_t upper[]) {
+            CudaDeviceFunction inline cache_set_found_t(const finder_t& finder, real_t* particle_data_, const real_t point_[], const real_t lower[], const real_t upper[]) : particle_data(particle_data_) {
                 for (int i=0;i<3;i++) { point[i]=point_[i]; }
                 int mins[3];
                 int maxs[3];
diff --git a/src/SolidTree.cpp b/src/SolidTree.cpp
deleted file mode 100644
index 9a0238b17..000000000
--- a/src/SolidTree.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-#include "SolidTree.hpp"
-#include "RemoteForceInterface.hpp"
-
diff --git a/src/SolidTree.h b/src/SolidTree.h
index 4f04f8e0b..079ecb8bf 100644
--- a/src/SolidTree.h
+++ b/src/SolidTree.h
@@ -28,6 +28,7 @@ class SolidTree {
     template <class T>
     class set_found_t {
         const finder_t& finder;
+        real_t* particle_data;
         real_t lower[3];
         real_t upper[3];
         real_t point[3];
@@ -56,7 +57,7 @@ class SolidTree {
             CudaDeviceFunction iterator_t(): set(nullptr) { nodei = -1; return; };
             friend class set_found_t;
         public:
-            CudaDeviceFunction T operator* () { return T(i,set->point); }
+            CudaDeviceFunction T operator* () { return T(set->particle_data, i, set->point); }
             CudaDeviceFunction iterator_t& operator++() { 
            		if (nodei != -1) {
 		            nodei = set->finder.data[nodei].back;
@@ -70,12 +71,13 @@ class SolidTree {
             typedef iterator_t iterator;
             CudaDeviceFunction inline iterator begin() { return iterator(*this); };
             CudaDeviceFunction inline iterator end()   { return iterator(); };
-            CudaDeviceFunction inline set_found_t(const finder_t& finder_, const real_t point_[], const real_t lower_[], const real_t upper_[]): finder(finder_) {
+            CudaDeviceFunction inline set_found_t(const finder_t& finder_, real_t* particle_data_, const real_t point_[], const real_t lower_[], const real_t upper_[]): finder(finder_),  particle_data(particle_data_) {
                 for (int i=0;i<3;i++) { lower[i]=lower_[i]; upper[i]=upper_[i]; point[i]=point_[i]; }
             };
     };
     template <class T, int MAX_CACHE>
     class cache_set_found_t {
+        real_t* particle_data;
         real_t point[3];
         size_t cache_size;
         tr_addr_t cache[MAX_CACHE];
@@ -85,7 +87,7 @@ class SolidTree {
             CudaDeviceFunction iterator_t(const cache_set_found_t& set_, const size_t& i_) : set(&set_), i(i_) { };
             friend class cache_set_found_t;
         public:
-            CudaDeviceFunction T operator* () { return T(set->cache[i],set->point); }
+            CudaDeviceFunction T operator* () { return T(set->particle_data, set->cache[i], set->point); }
             CudaDeviceFunction iterator_t& operator++() { 
                 ++i;
 	            return *this;
@@ -96,7 +98,7 @@ class SolidTree {
             typedef iterator_t iterator;
             CudaDeviceFunction inline iterator begin() { return iterator(*this, 0); };
             CudaDeviceFunction inline iterator end()   { return iterator(*this, cache_size); };
-            CudaDeviceFunction inline cache_set_found_t(const finder_t& finder, const real_t point_[], const real_t lower[], const real_t upper[]) {
+            CudaDeviceFunction inline cache_set_found_t(const finder_t& finder, real_t* particle_data_, const real_t point_[], const real_t lower[], const real_t upper[]) : particle_data(particle_data_) {
                 for (int i=0;i<3;i++) { point[i]=point_[i]; }
                 tr_addr_t nodei = 0;
                 bool go_left = true;
diff --git a/src/Solver.cpp.Rt b/src/Solver.cpp.Rt
index 383db170c..11ef3ae8d 100644
--- a/src/Solver.cpp.Rt
+++ b/src/Solver.cpp.Rt
@@ -165,8 +165,7 @@ void MainFree( Solver *d);
 */
 	int Solver::writeLog(const char * filename)
 	{ 
-	        FILE * f = NULL;
-		double v;
+	    FILE * f = NULL;
 		double * glob = lattice->globals;
 	        if (mpi.rank == 0) {
 			int j=0;
@@ -174,22 +173,21 @@ void MainFree( Solver *d);
 	                assert( f != NULL );
 			fprintf(f,"%d, %.13le, %.13le, %d",iter, LogScales[SETTINGS+GLOBALS+ZONESETTINGS+SCALES_dt] * iter, get_walltime(), opt_iter);
 			for (int i=0; i< SETTINGS; i++) {
-				v = lattice->settings[i];
+				const double v = lattice->getSetting(i);
 				fprintf(f,", %.13le, %.13le",v,v*LogScales[j]);
 				j++;
 			}
 			for (int i=0; i< ZONESETTINGS; i++) {
-			    	for (std::map<std::string,int>::iterator it = geometry->SettingZones.begin(); it != geometry->SettingZones.end(); it++) {
+			    	for (const auto& map_entry : geometry->SettingZones) {
 			    		int ind = lattice->ZoneIter;
-			    		int zone = it->second;
-			    		v = lattice->zSet.get(i, zone, ind);
-//					v = lattice->settings[i];
-					fprintf(f,", %.13le, %.13le",v,v*LogScales[j]);
-		                }
+			    		int zone = map_entry.second;
+			    		const double v = lattice->zSet.get(i, zone, ind);
+					    fprintf(f,", %.13le, %.13le",v,v*LogScales[j]);
+		            }
 				j++;
 			}
 			for (int i=0; i< GLOBALS; i++) {
-				v = glob[i];
+				const double v = glob[i];
 				fprintf(f,", %.13le, %.13le",v,v*LogScales[j]);
 				j++;
 			}
@@ -391,9 +389,6 @@ void MainFree( Solver *d);
 		lattice->zSet.set(<?%s s$Index ?>, -1, units.alt("<?%s s$default ?>"));
 <?R } ?>
 
-		// Setting global variables
-		initSettings();
-
 		geometry = new Geometry(region, mpi.totalregion, units);
 
 		return 0;
diff --git a/src/config.mk.in b/src/config.mk.in
index aa5084468..3b83b0f56 100644
--- a/src/config.mk.in
+++ b/src/config.mk.in
@@ -14,7 +14,7 @@ EMBEDED_PYTHON = @EMBEDED_PYTHON@
 
 WITH_LAMMPS = @WITH_LAMMPS@
 
-SOURCE_CU=Global.cu Lattice.cu main.cu vtkLattice.cu vtkOutput.cu cross.cu cuda.cu CartLatticeContainer.cu Dynamics.c
+SOURCE_CU=Global.cu main.cu vtkLattice.cu vtkOutput.cu cross.cu cuda.cu CartLatticeContainer.cu Dynamics.c
 SOURCE=$(SOURCE_CU)
 HEADERS=Global.h gpu_anim.h CartLatticeContainer.h Lattice.h Region.h vtkLattice.h vtkOutput.h cross.h gl_helper.h Dynamics.h types.h pugixml.hpp pugiconfig.hpp
 
diff --git a/src/cross.h b/src/cross.h
index 7349cc07d..5d5b424e7 100644
--- a/src/cross.h
+++ b/src/cross.h
@@ -56,16 +56,16 @@
       #ifndef CROSS_HIP 
        #define CudaKernelRun(a__,b__,c__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() )
        #ifdef CROSS_SYNC
-         #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() );
+         #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() );
        #else
-         #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
+         #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
        #endif
       #else
        #define CudaKernelRun(a__,b__,c__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() )
        #ifdef CROSS_SYNC
-         #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() );
+         #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() );
        #else
-         #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
+         #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
        #endif
       #endif
       #define CudaBlock blockIdx
@@ -174,7 +174,7 @@
     #define CudaGetDeviceCount(a__) HANDLE_ERROR( hipGetDeviceCount( a__ ) )
     #define CudaDeviceReset() HANDLE_ERROR( hipDeviceReset( ) )
     #define CudaFuncAttributes hipFuncAttributes
-    #define CudaFuncGetAttributes(a__,b__) HANDLE_ERROR( hipFuncGetAttributes(a__, reinterpret_cast<const void*>(&b__)) )
+    #define CudaFuncGetAttributes(a__,b__) HANDLE_ERROR( hipFuncGetAttributes(a__, reinterpret_cast<const void*>(b__)) )
    #endif
 //    cudaError_t cudaPreAlloc(void ** ptr, size_t size);
 //    cudaError_t cudaAllocFinalize();
@@ -182,6 +182,7 @@
     CudaError HandleError( CudaError err, const char *file, int line );
     #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
     #define ISFINITE(l__) isfinite(l__)
+
   #else
     #include <assert.h>
     #include <time.h>
@@ -281,8 +282,7 @@
     extern uint3 CpuThread;
     extern uint3 CpuSize;
 
-    #include <functional>
-
+    #include <utility>
     template <typename F, typename ...P>
     inline void CPUKernelRun(F &&func, const dim3& blocks, P &&... args) {
       #pragma omp parallel for collapse(3) schedule(static)
@@ -302,7 +302,7 @@
     }
 
     template <typename F, typename ...P>
-    inline void CudaKernelRunNoWait(F &&func, const dim3& blocks, const dim3& threads, CudaStream_t stream, P &&... args) {
+    inline void CudaKernelRunAsync(F &&func, const dim3& blocks, const dim3& threads, CudaStream_t stream, P &&... args) {
       CPUKernelRun(func, blocks, std::forward<P>(args)...);
     }
 
diff --git a/src/cuda.cu.Rt b/src/cuda.cu.Rt
index ac9d9e9d3..8c18426a8 100644
--- a/src/cuda.cu.Rt
+++ b/src/cuda.cu.Rt
@@ -24,39 +24,27 @@ if (exists('Extra_Dynamics_C_Header')) {
 #include "cross.h"
 #include "cross.hpp"
 
+#ifdef ADJOINT
+	#include "ADTools.cu"
+#endif
+
+#include "CartLatticeLauncher.hpp"
+
+/*
 #include <cassert>
 CudaDeviceFunction real_t rise_nan() {
 	assert(false);
 	return NAN;
 }
 #define RISENAN rise_nan()
-
-#ifdef ADJOINT
-	#include "ADTools.cu"
-#endif
-
-/// Constant memory variables
-#include "CartLatticeContainer.h"
-#include "LatticeData.hpp"
-
-CudaDeviceFunction CudaConstantMemory CartLatticeContainer constContainer;
-CudaDeviceFunction CudaConstantMemory LatticeData          constData;
-
-// Settings <?R for (v in rows(Settings)) { ?>
-/// GPU Constant memory variable for [<?%s v$comment?>]
-CudaDeviceFunction CudaConstantMemory real_t <?%s v$name ?> = 0.0f;
-<?R } ?>
-
-// These headers depend on contant memory variables
-#include "Particle.hpp"
-#include "CartLatticeExecutor.hpp"
+*/
 
 <?R   for (tp in rows(AllKernels)[order(AllKernels$adjoint)]) {
       st = Stages[tp$Stage,,drop=FALSE]
       ifdef(tp$adjoint)
       ?>
-template void CartLatticeExecutor::RunBorder   < <?%s tp$TemplateArgs ?> > (CudaStream_t stream) const;
-template void CartLatticeExecutor::RunInterior < <?%s tp$TemplateArgs ?> > (CudaStream_t stream) const; <?R
+template void CartLatticeLauncher::RunBorder   < <?%s tp$TemplateArgs ?> > (CudaStream_t stream) const;
+template void CartLatticeLauncher::RunInterior < <?%s tp$TemplateArgs ?> > (CudaStream_t stream) const; <?R
     };
   ifdef();
 ?>
diff --git a/src/makefile.main.Rt b/src/makefile.main.Rt
index 3573cf9ba..0ab532390 100644
--- a/src/makefile.main.Rt
+++ b/src/makefile.main.Rt
@@ -17,16 +17,16 @@ all: <?%s paste(Models$name,collapse=" ") ?>
 .PHONY: all clean dummy <?%s paste(Models$name,collapse="/kernel_stats_20 ") ?>
 
 
-SOURCE_PLAN+=Global.cpp Lattice.cu vtkLattice.cpp vtkOutput.cpp cross.cu cuda.cu CartLatticeAccess.hpp
+SOURCE_PLAN+=Global.cpp Lattice.cpp vtkLattice.cpp vtkOutput.cpp cross.cu cuda.cu CartLatticeAccess.hpp
 SOURCE_PLAN+=Dynamics.c Dynamics_sp.c Solver.cpp pugixml.cpp Geometry.cpp def.cpp unit.cpp
 SOURCE_PLAN+=ZoneSettings.cpp SyntheticTurbulence.cpp Sampler.cpp
 SOURCE_PLAN+=main.cpp
-SOURCE_PLAN+=Global.h gpu_anim.h CartLatticeContainer.h LatticeData.hpp CartLatticeExecutor.h CartLatticeExecutor.hpp Lattice.h Region.h vtkLattice.h vtkOutput.h cross.h cross.hpp
+SOURCE_PLAN+=Global.h gpu_anim.h CartLatticeContainer.h LatticeData.hpp CartLatticeLauncher.h CartLatticeLauncher.hpp Lattice.h Region.h vtkLattice.h vtkOutput.h cross.h cross.hpp
 SOURCE_PLAN+=gl_helper.h Dynamics.h types.h Consts.h Solver.h pugixml.hpp pugiconfig.hpp Node.hpp
 SOURCE_PLAN+=Geometry.h def.h utils.h unit.h ZoneSettings.h SyntheticTurbulence.h Sampler.h spline.h TCLBForceGroupCommon.h
 SOURCE_PLAN+=RemoteForceInterface.cpp RemoteForceInterface.h RemoteForceInterface.hpp
 SOURCE_PLAN+=TCLBForceGroupCommon.h MPMD.hpp empty.cpp Particle.hpp lammps.cpp
-SOURCE_PLAN+=SolidTree.h SolidTree.hpp SolidTree.cpp SolidAll.h SolidGrid.h SolidGrid.hpp
+SOURCE_PLAN+=SolidTree.h SolidTree.hpp SolidAll.h SolidGrid.h SolidGrid.hpp
 SOURCE_PLAN+=SolidContainer.h
 SOURCE_PLAN+=xpath_modification.cpp xpath_modification.h
 SOURCE_PLAN+=hdf5Lattice.cpp hdf5Lattice.h
diff --git a/src/solver.vcproj b/src/solver.vcproj
index 183fa71fd..b59555b38 100644
--- a/src/solver.vcproj
+++ b/src/solver.vcproj
@@ -359,7 +359,7 @@
 			>
 		</File>
 		<File
-			RelativePath="Lattice.cu"
+			RelativePath="Lattice.cpp"
 			>
 		</File>
 		<File
diff --git a/tests/solid/main.cpp b/tests/solid/main.cpp
index f4783d68e..9676e2e9c 100644
--- a/tests/solid/main.cpp
+++ b/tests/solid/main.cpp
@@ -52,7 +52,7 @@ struct Particle {
 	double pos[3];
 	double rad;
 	double dist;
-	Particle(int i_, const real_t org[3]): balli(i_) {
+	Particle(const real_t* dummy, int i_, const real_t org[3]): balli(i_) {
 		//printf("load Particle... ");
 		dist = 0;
 		for (int i=0;i<3;i++) {
@@ -89,7 +89,7 @@ class Tester {
 		typedef typename container_t::set_found_t< Particle > set_found_t;
 		real_t lower[3] = {pos[0]-offset,pos[1]-offset,pos[2]-offset};
 		real_t upper[3] = {pos[0]+offset,pos[1]+offset,pos[2]+offset};
-		for (auto part : set_found_t(finder, pos, lower, upper)) {
+        for (auto part : set_found_t(finder, nullptr, pos, lower, upper)) {
 			if (part.dist <= part.rad+offset) ret.push_back(part.balli);
 		}
 		std::sort(ret.begin(),ret.end());