diff --git a/3rdparty/amd_bins_linux/equiw200k9.bin b/3rdparty/amd_bins_linux/equiw200k9.bin
deleted file mode 100644
index 45785dc4e..000000000
Binary files a/3rdparty/amd_bins_linux/equiw200k9.bin and /dev/null differ
diff --git a/3rdparty/amd_bins_linux/zcash/gpu/blake2bcl.h b/3rdparty/amd_bins_linux/zcash/gpu/blake2bcl.h
deleted file mode 100644
index 13cad965c..000000000
--- a/3rdparty/amd_bins_linux/zcash/gpu/blake2bcl.h
+++ /dev/null
@@ -1,150 +0,0 @@
-// Blake2-B CUDA Implementation
-// tpruvot@github July 2016
-// permission granted to use under MIT license
-// modified for use in Zcash by John Tromp September 2016
-
-/**
- * uint2 direct ops by c++ operator definitions
- */
-
-// static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
-//   return make_uint2(a.x ^ b.x, a.y ^ b.y);
-// }
-
-// uint2 ROR/ROL methods
-uint2 ROR2(const uint2 a, const int offset) {
-  uint2 result;
-  if (!offset)
-          result = a;
-  else if (offset < 32) {
-          result.y = ((a.y >> offset) | (a.x << (32 - offset)));
-          result.x = ((a.x >> offset) | (a.y << (32 - offset)));
-  } else if (offset == 32) {
-          result.y = a.x;
-          result.x = a.y;
-  } else {
-          result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
-          result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
-  }
-  return result;
-}
-
-uint2 SWAPUINT2(uint2 value) {
-  uint2 result;
-  result.x = value.y;
-  result.y = value.x;
-  return result;
-//   return make_uint2(value.y, value.x);
-}
-
-#define ROR24(u) ROR2(u,24)
-#define ROR16(u) ROR2(u,16)
-
-__constant int8_t blake2b_sigma[12][16] = {
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
-  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  } ,
-  { 11, 8,  12, 0,  5,  2,  15, 13, 10, 14, 3,  6,  7,  1,  9,  4  } ,
-  { 7,  9,  3,  1,  13, 12, 11, 14, 2,  6,  5,  10, 4,  0,  15, 8  } ,
-  { 9,  0,  5,  7,  2,  4,  10, 15, 14, 1,  11, 12, 6,  8,  3,  13 } ,
-  { 2,  12, 6,  10, 0,  11, 8,  3,  4,  13, 7,  5,  15, 14, 1,  9  } ,
-  { 12, 5,  1,  15, 14, 13, 4,  10, 0,  7,  6,  3,  9,  2,  8,  11 } ,
-  { 13, 11, 7,  14, 12, 1,  3,  9,  5,  0,  15, 4,  8,  6,  2,  10 } ,
-  { 6,  15, 14, 9,  11, 3,  0,  8,  12, 2,  13, 7,  1,  4,  10, 5  } ,
-  { 10, 2,  8,  4,  7,  6,  1,  5,  15, 11, 9,  14, 3,  12, 13, 0  } ,
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
-  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  }
-};
-
-void G(const int32_t r, const int32_t i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, uint64_t const m[16]) {
-  *a += *b + m[ blake2b_sigma[r][2*i] ];
-  ((uint2*)d)[0] = SWAPUINT2( ((uint2*)d)[0] ^ ((uint2*)a)[0] );
-  *c += *d;
-  ((uint2*)b)[0] = ROR24( ((uint2*)b)[0] ^ ((uint2*)c)[0] );
-  *a += *b + m[ blake2b_sigma[r][2*i+1] ];
-  ((uint2*)d)[0] = ROR16( ((uint2*)d)[0] ^ ((uint2*)a)[0] );
-  *c += *d;
-  ((uint2*)b)[0] = ROR2( ((uint2*)b)[0] ^ ((uint2*)c)[0], 63U);
-}
-
-#define ROUND(r) \
-  G(r, 0, &v[0], &v[4], &v[ 8], &v[12], m); \
-  G(r, 1, &v[1], &v[5], &v[ 9], &v[13], m); \
-  G(r, 2, &v[2], &v[6], &v[10], &v[14], m); \
-  G(r, 3, &v[3], &v[7], &v[11], &v[15], m); \
-  G(r, 4, &v[0], &v[5], &v[10], &v[15], m); \
-  G(r, 5, &v[1], &v[6], &v[11], &v[12], m); \
-  G(r, 6, &v[2], &v[7], &v[ 8], &v[13], m); \
-  G(r, 7, &v[3], &v[4], &v[ 9], &v[14], m);
-
-void blake2b_gpu_hash(blake2b_state *state, uint32_t idx, uint8_t *hash, uint32_t outlen) {
-  const uint32_t leb = idx;
-  *(uint32_t*)(state->buf + state->buflen) = leb;
-  state->buflen += 4;
-  state->counter += state->buflen;
-  for (unsigned i = 0; i < BLAKE2B_BLOCKBYTES - state->buflen; i++)
-    state->buf[i+state->buflen] = 0;  
-
-  uint64_t *d_data = (uint64_t *)state->buf;
-  uint64_t m[16];
-
-  m[0] = d_data[0];
-  m[1] = d_data[1];
-  m[2] = d_data[2];
-  m[3] = d_data[3];
-  m[4] = d_data[4];
-  m[5] = d_data[5];
-  m[6] = d_data[6];
-  m[7] = d_data[7];
-  m[8] = d_data[8];
-  m[9] = d_data[9];
-  m[10] = d_data[10];
-  m[11] = d_data[11];
-  m[12] = d_data[12];
-  m[13] = d_data[13];
-  m[14] = d_data[14];
-  m[15] = d_data[15];
-
-  uint64_t v[16];
-
-  v[0] = state->h[0];
-  v[1] = state->h[1];
-  v[2] = state->h[2];
-  v[3] = state->h[3];
-  v[4] = state->h[4];
-  v[5] = state->h[5];
-  v[6] = state->h[6];
-  v[7] = state->h[7];
-  v[8] = 0x6a09e667f3bcc908;
-  v[9] = 0xbb67ae8584caa73b;
-  v[10] =  0x3c6ef372fe94f82b;
-  v[11] = 0xa54ff53a5f1d36f1;
-  v[12] = 0x510e527fade682d1 ^ state->counter;
-  v[13] = 0x9b05688c2b3e6c1f;
-  v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
-  v[15] = 0x5be0cd19137e2179;
-
-  ROUND( 0 );
-  ROUND( 1 );
-  ROUND( 2 );
-  ROUND( 3 );
-  ROUND( 4 );
-  ROUND( 5 );
-  ROUND( 6 );
-  ROUND( 7 );
-  ROUND( 8 );
-  ROUND( 9 );
-  ROUND( 10 );
-  ROUND( 11 );
-  
-  state->h[0] ^= v[0] ^ v[ 8];
-  state->h[1] ^= v[1] ^ v[ 9];
-  state->h[2] ^= v[2] ^ v[10];
-  state->h[3] ^= v[3] ^ v[11];
-  state->h[4] ^= v[4] ^ v[12];
-  state->h[5] ^= v[5] ^ v[13];
-  state->h[6] ^= v[6] ^ v[14];
-  state->h[7] ^= v[7] ^ v[15];
-
-  for (unsigned i = 0; i < outlen; i++)
-    hash[i] = ((uint8_t*)state->h)[i];
-}
diff --git a/3rdparty/amd_bins_linux/zcash/gpu/common.h b/3rdparty/amd_bins_linux/zcash/gpu/common.h
deleted file mode 100644
index 22ba9548e..000000000
--- a/3rdparty/amd_bins_linux/zcash/gpu/common.h
+++ /dev/null
@@ -1,156 +0,0 @@
-#if defined(__OPENCL_HOST__)
-#define __global
-#include "../blake2.h"
-#else
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef short int16_t;
-typedef ushort uint16_t;
-typedef int int32_t;
-typedef uint uint32_t;
-typedef long int64_t;
-typedef ulong uint64_t;
-
-#if defined(_MSC_VER)
-#define ALIGN(x) __declspec(align(x))
-#else
-#define ALIGN(x) __attribute__ ((__aligned__(x)))
-#endif
-
-enum blake2b_constant
-{
-  BLAKE2B_BLOCKBYTES = 128,
-  BLAKE2B_OUTBYTES   = 64,
-  BLAKE2B_KEYBYTES   = 64,
-  BLAKE2B_SALTBYTES  = 16,
-  BLAKE2B_PERSONALBYTES = 16
-};
-
-#pragma pack(push, 1)
-ALIGN( 64 ) typedef struct __blake2b_state {
-  uint64_t h[8];
-  uint8_t  buf[BLAKE2B_BLOCKBYTES];
-  uint16_t counter;
-  uint8_t  buflen;
-  uint8_t  lastblock;
-} blake2b_state;
-#pragma pack(pop)
-#endif
-
-#define COLLISION_BIT_LENGTH (WN / (WK+1))
-#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
-#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
-
-
-#define NDIGITS   (WK+1)
-#define DIGITBITS (WN/(NDIGITS))
-#define PROOFSIZE (1u<<WK)
-#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
-#define BASE (1u<<DIGITBITS)
-#define NHASHES (2u*BASE)
-#define HASHESPERBLAKE (512/WN)
-#define HASHOUT (HASHESPERBLAKE*WN/8)
-
-// 2_log of number of buckets
-#define BUCKBITS  (DIGITBITS-RESTBITS)
-
-// number of buckets
-#define NBUCKETS (1<<BUCKBITS)
-// 2_log of number of slots per bucket
-#define SLOTBITS (RESTBITS+1+1)
-// number of slots per bucket
-#define NSLOTS (1u<<SLOTBITS)
-// number of per-xhash slots
-#define XFULL 16
-// SLOTBITS mask
-#define SLOTMASK (NSLOTS-1)
-// number of possible values of xhash (rest of n) bits
-#define NRESTS (1u<<RESTBITS)
-// number of blocks of hashes extracted from single 512 bit blake2b output
-#define NBLOCKS ((NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE)
-// nothing larger found in 100000 runs
-#define MAXSOLS 8
-
-#define WORDS(bits)     ((bits + 31) / 32)
-#define HASHWORDS0 WORDS(WN - DIGITBITS + RESTBITS)
-#define HASHWORDS1 WORDS(WN - 2*DIGITBITS + RESTBITS)
-
-typedef uint32_t proof[PROOFSIZE];
-
-// tree  = | xhash(RESTBITS)    | slotid1(SLOTBITS) | slotid0(SLOTBITS) | bucketid(BUCKBITS) |
-// index = | bucketid(BUCKBITS) | slotid0(SLOTBITS) |
-typedef uint32_t tree;
-
-typedef union hashunit {
-  uint32_t word;
-  uint8_t bytes[4];
-} hashunit;
-
-typedef struct slot0 {
-  tree attr;
-  hashunit hash[HASHWORDS0];
-} slot0;
-
-typedef struct slot1 {
-  tree attr;
-  hashunit hash[HASHWORDS1];
-} slot1;
-
-// a bucket is NSLOTS treenodes
-typedef slot0 bucket0[NSLOTS];
-typedef slot1 bucket1[NSLOTS];
-// the N-bit hash consists of K+1 n-bit "digits"
-// each of which corresponds to a layer of NBUCKETS buckets
-typedef bucket0 digit0[NBUCKETS];
-typedef bucket1 digit1[NBUCKETS];
-
-// manages hash and tree data
-typedef struct htalloc {
-  __global bucket0 *trees0[(WK+1)/2];
-  __global bucket1 *trees1[WK/2];
-} htalloc;
-
-typedef uint32_t bsizes[NBUCKETS];
-
-
-typedef struct htlayout {
-  htalloc hta;
-  uint32_t prevhashunits;
-  uint32_t nexthashunits;
-  uint32_t dunits;
-  uint32_t prevbo;
-  uint32_t nextbo;
-} htlayout;
-
-#if RESTBITS <= 6
-  typedef uint8_t xslot;
-#else
-  typedef uint16_t xslot;
-#endif
-
-typedef struct collisiondata {
-#ifdef XBITMAP
-#if NSLOTS > 64
-#error cant use XBITMAP with more than 64 slots
-#endif
-  uint64_t xhashmap[NRESTS];
-  uint64_t xmap;
-#else
-  xslot nxhashslots[NRESTS];
-  xslot xhashslots[NRESTS][XFULL];
-  xslot *xx;
-  uint32_t n0;
-  uint32_t n1;
-#endif
-  uint32_t s0;
-} collisiondata;
-
-
-typedef struct equi {
-  blake2b_state blake_ctx;
-  htalloc hta;
-  __global bsizes *nslots;
-  __global proof *sols;
-  uint32_t nsols;
-  uint32_t nthreads;
-} equi;
diff --git a/3rdparty/amd_bins_linux/zcash/gpu/equihash.cl b/3rdparty/amd_bins_linux/zcash/gpu/equihash.cl
deleted file mode 100644
index 213a8e4d6..000000000
--- a/3rdparty/amd_bins_linux/zcash/gpu/equihash.cl
+++ /dev/null
@@ -1,1038 +0,0 @@
-#include "common.h"
-
-#include "blake2bcl.h"
-
-#define tree0_ptr(heap, r) ((__global bucket0 *)(heap + r))
-#define tree1_ptr(heap, r) ((__global bucket1 *)(heap + r))
-
-uint32_t tree_bucket(tree t)
-{
-  const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
-  return t & bucketMask;
-}
-
-uint32_t tree_slotid0(tree t)
-{
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-  return (t >> BUCKBITS) & SLOTMASK;
-}
-
-uint32_t tree_slotid1(tree t)
-{
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-  return (t >> (BUCKBITS+SLOTBITS)) & SLOTMASK;
-}
-
-uint32_t tree_xhash(tree t)
-{
-  return t >> (2*SLOTBITS + BUCKBITS);
-}
-
-uint32_t tree_getindex(const tree t)
-{
-  const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-  return ((t & bucketMask) << SLOTBITS) | ((t & (slotMask << BUCKBITS)) >> BUCKBITS);  
-}
-
-void tree_setindex(tree *t, uint32_t idx)
-{
-  const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-
-  (*t) &= ~(bucketMask | (slotMask << BUCKBITS));
-  (*t) |= (idx >> SLOTBITS);
-  (*t) |= ((idx & slotMask) << BUCKBITS);
-}
-
-void tree_setxhash(tree *t, uint32_t xhash)
-{
-  const uint32_t xhashMask = ((1u << RESTBITS)-1);
-  (*t) &= ~(xhashMask << (2*SLOTBITS + BUCKBITS));
-  (*t) |= (xhash << (2*SLOTBITS + BUCKBITS));
-}
-
-tree tree_create3(uint32_t bucketId, uint32_t s0, uint32_t s1)
-{
-  return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS));
-}
-
-tree tree_create4(uint32_t bucketId, uint32_t s0, uint32_t s1, uint32_t xhash)
-{
-  return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)) | (xhash << (2*SLOTBITS+BUCKBITS));;
-}
-
-// size (in bytes) of hash in round 0 <= r < WK
-uint32_t hashsize(const uint32_t r)
-{
-#ifdef XINTREE
-  const uint32_t hashbits = WN - (r+1) * DIGITBITS;
-#else
-  const uint32_t hashbits = WN - (r+1) * DIGITBITS + RESTBITS;
-#endif
-  return (hashbits + 7) / 8;
-}
-
-uint32_t hashwords(uint32_t bytes)
-{
-  return (bytes + 3) / 4;
-}
-
-htlayout htlayout_create_2(uint32_t r)
-{
-  htlayout R;
-  R.prevhashunits = 0;
-  R.dunits = 0;
-  
-  uint32_t nexthashbytes = hashsize(r);
-  R.nexthashunits = hashwords(nexthashbytes);
-  
-  R.prevbo = 0;
-  R.nextbo = R.nexthashunits * sizeof(hashunit) - nexthashbytes; // 0-3
-  if (r) {
-    uint32_t prevhashbytes = hashsize(r-1);
-    R.prevhashunits = hashwords(prevhashbytes);
-    R.prevbo = R.prevhashunits * sizeof(hashunit) - prevhashbytes; // 0-3
-    R.dunits = R.prevhashunits - R.nexthashunits;
-  }
-  
-  return R;
-}
-
-uint32_t htlayout_getxhash0(uint32_t prevbo, __global const slot0 *pslot)
-{
-#ifdef XINTREE
-  return tree_xhash(pslot->attr);
-#elif WN == 200 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] >> 4;
-#elif WN == 200 && RESTBITS == 8
-  return (pslot->hash->bytes[prevbo] & 0xf) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
-#elif WN == 144 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] & 0xf;
-#elif WN == 200 && RESTBITS == 6
-  return (pslot->hash->bytes[prevbo] & 0x3) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
-#else
-#error non implemented
-#endif
-}
-
-uint32_t htlayout_getxhash1(uint32_t prevbo, __global const slot1 *pslot)
-{
-#ifdef XINTREE
-  return tree_xhash(pslot->attr);
-#elif WN == 200 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] & 0xf;
-#elif WN == 200 && RESTBITS == 8
-  return pslot->hash->bytes[prevbo];
-#elif WN == 144 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] & 0xf;
-#elif WN == 200 && RESTBITS == 6
-  return pslot->hash->bytes[prevbo] & 0x3f;
-#else
-#error non implemented
-#endif  
-}
-
-bool htlayout_equal(uint32_t prevhashunits, __global const hashunit *hash0, __global const hashunit *hash1)
-{
-  return hash0[prevhashunits-1].word == hash1[prevhashunits-1].word;
-}
-
-void collisiondata_clear(collisiondata *data) 
-{
-#ifdef XBITMAP
-  // memset(xhashmap, 0, NRESTS * sizeof(u64));
-  for (unsigned i = 0; i < NRESTS; i++)
-    data->xhashmap[i] = 0;
-#else
-  // memset(nxhashslots, 0, NRESTS * sizeof(xslot));
-  for (unsigned i = 0; i < NRESTS; i++)
-    data->nxhashslots[i] = 0;
-#endif
-}
-
-bool collisiondata_addslot(collisiondata *data, uint32_t s1, uint32_t xh)
-{
-#ifdef XBITMAP
-  data->xmap = data->xhashmap[xh];
-  data->xhashmap[xh] |= (uint64_t)1 << s1;
-  data->s0 = ~0;
-  return true;
-#else
-  data->n1 = (uint32_t)data->nxhashslots[xh]++;
-  if (data->n1 >= XFULL)
-    return false;
-  data->xx = data->xhashslots[xh];
-  data->xx[data->n1] = s1;
-  data->n0 = 0;
-  return true;
-#endif
-}
-
-bool collisiondata_nextcollision(collisiondata *data)
-{
-#ifdef XBITMAP
-  return data->xmap != 0;
-#else
-  return data->n0 < data->n1;
-#endif
-}
-
-uint64_t __ffsll(uint64_t x)
-{
-  return x ? (64 - clz(x & -x)) : 0;
-}
-
-uint32_t collisiondata_slot(collisiondata *data) {
-#ifdef XBITMAP
-  const uint32_t ffs = __ffsll(xmap);
-  data->s0 += ffs;
-  data->xmap >>= ffs;
-  return data->s0;
-#else
-  return (uint32_t)data->xx[data->n0++];
-#endif
-}
-
-uint32_t equi_getnslots(__global bsizes *nslots, const uint32_t r, const uint32_t bid)
-{
-  __global uint32_t *nslot = &nslots[r&1][bid];
-  const uint32_t n = min(*nslot, NSLOTS);
-  *nslot = 0;
-  return n;
-}
-
-void equi_orderindices(__global uint32_t *indices, uint32_t size)
-{
-  if (indices[0] > indices[size]) {
-    for (uint32_t i = 0; i < size; i++) {
-      const uint32_t tmp = indices[i];
-      indices[i] = indices[size+i];
-      indices[size+i] = tmp;
-    }
-  }
-}
-
-void local_orderindices(uint32_t *indices, uint32_t size)
-{
-  if (indices[0] > indices[size]) {
-    for (uint32_t i = 0; i < size; i++) {
-      const uint32_t tmp = indices[i];
-      indices[i] = indices[size+i];
-      indices[size+i] = tmp;
-    }
-  }
-}
-
-
-void equi_listindices1(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 0;
-  indices[0]    = tree_getindex((*buck)[tree_slotid0(t)].attr);
-  indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr);
-  equi_orderindices(indices, size);
-}
-
-void equi_listindices2(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 1;
-  equi_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
-
-void equi_listindices3(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 2;
-  equi_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
-
-void equi_listindices4(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 3;
-  equi_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
- 
-void equi_listindices5(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 4;
-  equi_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-  
-void equi_listindices6(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 5;
-  equi_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-  
-void equi_listindices7(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 6;
-  equi_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-
-void equi_listindices8(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 7;
-  equi_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-
-void equi_listindices9(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)];
-  const uint32_t size = 1 << 8;
-  equi_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
-
-void local_listindices1(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 0;
-  indices[0]    = tree_getindex((*buck)[tree_slotid0(t)].attr);
-  indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr);
-  local_orderindices(indices, size);
-}
-
-void local_listindices2(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 1;
-  local_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
-
-void local_listindices3(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 2;
-  local_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
-
-void local_listindices4(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 3;
-  local_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
- 
-void local_listindices5(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 4;
-  local_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-  
-void local_listindices6(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 5;
-  local_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-  
-void local_listindices7(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 6;
-  local_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-
-void local_listindices8(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 7;
-  local_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-
-void local_listindices9(__global uint32_t *heap0,
-                        __global uint32_t *heap1,
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)];
-  const uint32_t size = 1 << 8;
-  local_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
-
-// proper dupe test is a little costly on GPU, so allow false negatives
-bool equi_probdupe(uint32_t *prf) {
-  unsigned short susp[PROOFSIZE];
-  for (unsigned i = 0; i < PROOFSIZE; i++)
-    susp[i] = 0xFFFF;
-    
-  for (unsigned i = 0; i < PROOFSIZE; i++) {
-    uint32_t bin = prf[i] & (PROOFSIZE-1);
-    unsigned short msb = prf[i] >> WK;
-    if (msb == susp[bin])
-      return true;
-    susp[bin] = msb;
-  }
-  
-  return false;
-}
-
-void equi_candidate(__global uint32_t *heap0,
-                    __global uint32_t *heap1,
-                    __global proof *sols,
-                    __global uint32_t *nsols,
-                    const tree t)
-{
-  proof prf;
-#if WK==9
-  local_listindices9(heap0, heap1, t, (uint32_t*)&prf);
-#elif WK==5
-  local_listindices5(heap0, heap1, t, (uint32_t*)&prf);
-#else
-#error not implemented
-#endif
-  if (equi_probdupe(prf))
-    return;
-  uint32_t soli = atomic_inc(nsols);
-  if (soli < MAXSOLS)
-#if WK==9
-    equi_listindices9(heap0, heap1, t, sols[soli]);
-#elif WK==5
-    equi_listindices5(heap0, heap1, t, sols[soli]);
-#else
-#error not implemented
-#endif
-}
-
-
-__kernel void digitH(__global blake2b_state *blake2bState,
-                     __global const uint32_t *heap0,
-                     __global bsizes *nslots)
-{
-  uint8_t hash[HASHOUT];
-  blake2b_state state;
-  // equi::htlayout htl(eq, 0);
-  htlayout htl = htlayout_create_2(0);
-  const uint32_t hashbytes = hashsize(0);
-  // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t block = id; block < NBLOCKS; block += get_global_size(0)) {
-    state = *blake2bState;
-    blake2b_gpu_hash(&state, block, hash, HASHOUT);
-    for (uint32_t i = 0; i < HASHESPERBLAKE; i++) {
-      const uint8_t *ph = hash + i * WN/8;
-#if BUCKBITS == 16 && RESTBITS == 4
-      const uint32_t bucketid = ((uint32_t)ph[0] << 8) | ph[1];
-#ifdef XINTREE
-      const uint32_t xhash = ph[2] >> 4;
-#endif
-#elif BUCKBITS == 14 && RESTBITS == 6
-      const uint32_t bucketid = ((uint32_t)ph[0] << 6) | ph[1] >> 2;
-#elif BUCKBITS == 12 && RESTBITS == 8
-      const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4;
-#elif BUCKBITS == 20 && RESTBITS == 4
-      const uint32_t bucketid = ((((uint32_t)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
-#ifdef XINTREE
-      const uint32_t xhash = ph[2] & 0xf;
-#endif
-#elif BUCKBITS == 12 && RESTBITS == 4
-      const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4;
-      const uint32_t xhash = ph[1] & 0xf;
-#else
-#error not implemented
-#endif
-      const uint32_t slot = atomic_inc(&nslots[0][bucketid]);
-      if (slot >= NSLOTS)
-        continue;
-      tree leaf;
-      tree_setindex(&leaf, block*HASHESPERBLAKE+i);
-#ifdef XINTREE
-      tree_setxhash(&leaf, xhash);
-#endif
-      __global slot0 *s = &tree0_ptr(heap0, 0)[bucketid][slot];
-      s->attr = leaf;
-      
-      // memcpy(s.hash->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes);
-      for (unsigned i = 0; i < hashbytes; i++)
-        ((__global uint8_t*)s->hash->bytes+htl.nextbo)[i] = ((uint8_t*)(ph+WN/8-hashbytes))[i];
-    }
-  }
-}
-
-__kernel void digitOdd(const uint32_t r,
-                       __global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       __global bsizes *nslots)
-{
-  // equi::htlayout htl(eq, r);
-//   htlayout htl = htlayout_create(eq, r);
-  htlayout htl = htlayout_create_2(r);  
-  collisiondata cd;
-  
-  // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t id = get_global_id(0);
-  
-  for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    // cd.clear();
-    collisiondata_clear(&cd);
-//     __global slot0 *buck = htl.hta.trees0[(r-1)/2][bucketid]; // optimize by updating previous buck?!
-    __global slot0 *buck = tree0_ptr(heap0, (r-1)/2)[bucketid]; // optimize by updating previous buck?!    
-    uint32_t bsize = equi_getnslots(nslots, r-1, bucketid);       // optimize by putting bucketsize with block?!
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;          // optimize by updating previous pslot1?!
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE)
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
-                            | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
-                    | (xhash = bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
-        xhash &= 0xf;
-#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8)
-                           | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 2
-                           | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 6;
-#else
-#error not implemented
-#endif
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-#ifdef XINTREE
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-#else
-        tree xort = tree_create3(bucketid, s0, s1);
-#endif
-//         __global slot1 *xs = &htl.hta.trees1[r/2][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, r/2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        for (uint32_t i = htl.dunits; i < htl.prevhashunits; i++)
-          xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
-      }
-    }
-  }
-}
-
-
-__kernel void digitEven(const uint32_t r,
-                        __global uint32_t *heap0,
-                        __global uint32_t *heap1,
-                        __global bsizes *nslots)
-{
-  // equi::htlayout htl(eq, r);
-//   htlayout htl = htlayout_create(eq, r);
-  htlayout htl = htlayout_create_2(r);
-  collisiondata cd;
-  
-  // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t id = get_global_id(0);
-  
-  for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    // cd.clear();
-    collisiondata_clear(&cd);    
-//     __global slot1 *buck = htl.hta.trees1[(r-1)/2][bucketid]; // OPTIMIZE BY UPDATING PREVIOUS
-     __global slot1 *buck = tree1_ptr(heap1, (r-1)/2)[bucketid]; // OPTIMIZE BY UPDATING PREVIOUS
-    uint32_t bsize = equi_getnslots(nslots, r-1, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;          // OPTIMIZE BY UPDATING PREVIOUS
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE)
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
-                            | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
-                            | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
-#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
-                          | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 6)
-                          | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 2;
-#else
-#error not implemented
-#endif
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-#ifdef XINTREE
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-#else
-        tree xort = tree_create3(bucketid, s0, s1);
-#endif
-//         __global slot0 *xs = &htl.hta.trees0[r/2][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, r/2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        for (uint32_t i=htl.dunits; i < htl.prevhashunits; i++)
-          xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
-      }
-    }
-  }
-}
-
-
-#ifdef UNROLL
-
-__kernel void digit_1(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots)
-{
-  htlayout htl = htlayout_create_2(1);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-    __global slot0 *buck = tree0_ptr(heap0, 0)[bucketid];
-    uint32_t bsize = equi_getnslots(nslots, 0, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[0][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 0)[xorbucketid][xorslot];
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-        xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
-        xs->hash[4].word = pslot0->hash[5].word ^ pslot1->hash[5].word;
-      }
-    }
-  }
-}
-__kernel void digit_2(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(2);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[0][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 0)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 1, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))  
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-        // __global slot0 *xs = &htl.hta.trees0[1][xorbucketid][xorslot];
-         __global slot0 *xs = &tree0_ptr(heap0, 1)[xorbucketid][xorslot];
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
-        xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-        xs->hash[4].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
-      }
-    }
-  }
-}
-__kernel void digit_3(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(3);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot0 *buck = htl.hta.trees0[1][bucketid];
-    __global slot0 *buck = tree0_ptr(heap0, 1)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 2, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))  
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[1][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 1)[xorbucketid][xorslot];
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-        xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
-      }
-    }
-  }
-}
-__kernel void digit_4(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(4);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[1][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 1)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 3, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot0 *xs = &htl.hta.trees0[2][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, 2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
-        xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-      }
-    }
-  }
-}
-__kernel void digit_5(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(5);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot0 *buck = htl.hta.trees0[2][bucketid];
-    __global slot0 *buck = tree0_ptr(heap0, 2)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 4, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[2][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-      }
-    }
-  }
-}
-__kernel void digit_6(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(6);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[2][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 2)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 5, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot0 *xs = &htl.hta.trees0[3][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, 3)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-      }
-    }
-  }
-}
-__kernel void digit_7(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(7);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot0 *buck = htl.hta.trees0[3][bucketid];
-    __global slot0 *buck = tree0_ptr(heap0, 3)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 6, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[3][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 3)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
-        xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-      }
-    }
-  }
-}
-__kernel void digit_8(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(8);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[3][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 3)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 7, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;          // OPTIMIZE BY UPDATING PREVIOUS
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot0 *xs = &htl.hta.trees0[4][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, 4)[xorbucketid][xorslot];     
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-      }
-    }
-  }
-}
-#endif //UNROLL
-
-__kernel void digitK(__global uint32_t *heap0,
-                     __global uint32_t *heap1,
-                     __global bsizes *nslots,
-                     __global proof *sols,
-                     __global uint32_t *nsols) {
-  collisiondata cd;
-  htlayout htl = htlayout_create_2(WK);
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-    __global slot0 *buck = tree0_ptr(heap0, (WK-1)/2)[bucketid];
-    uint32_t bsize = equi_getnslots(nslots, WK-1, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) // assume WK odd
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) {
-          tree xort = tree_create3(bucketid, s0, s1);
-          equi_candidate(heap0, heap1, sols, nsols, xort);
-        }
-      }
-    }
-  }
-}
diff --git a/3rdparty/amd_bins_windows/equiw200k9.bin b/3rdparty/amd_bins_windows/equiw200k9.bin
deleted file mode 100644
index 868842f93..000000000
Binary files a/3rdparty/amd_bins_windows/equiw200k9.bin and /dev/null differ
diff --git a/3rdparty/amd_bins_windows/zcash/gpu/blake2bcl.h b/3rdparty/amd_bins_windows/zcash/gpu/blake2bcl.h
deleted file mode 100644
index 13cad965c..000000000
--- a/3rdparty/amd_bins_windows/zcash/gpu/blake2bcl.h
+++ /dev/null
@@ -1,150 +0,0 @@
-// Blake2-B CUDA Implementation
-// tpruvot@github July 2016
-// permission granted to use under MIT license
-// modified for use in Zcash by John Tromp September 2016
-
-/**
- * uint2 direct ops by c++ operator definitions
- */
-
-// static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
-//   return make_uint2(a.x ^ b.x, a.y ^ b.y);
-// }
-
-// uint2 ROR/ROL methods
-uint2 ROR2(const uint2 a, const int offset) {
-  uint2 result;
-  if (!offset)
-          result = a;
-  else if (offset < 32) {
-          result.y = ((a.y >> offset) | (a.x << (32 - offset)));
-          result.x = ((a.x >> offset) | (a.y << (32 - offset)));
-  } else if (offset == 32) {
-          result.y = a.x;
-          result.x = a.y;
-  } else {
-          result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
-          result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
-  }
-  return result;
-}
-
-uint2 SWAPUINT2(uint2 value) {
-  uint2 result;
-  result.x = value.y;
-  result.y = value.x;
-  return result;
-//   return make_uint2(value.y, value.x);
-}
-
-#define ROR24(u) ROR2(u,24)
-#define ROR16(u) ROR2(u,16)
-
-__constant int8_t blake2b_sigma[12][16] = {
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
-  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  } ,
-  { 11, 8,  12, 0,  5,  2,  15, 13, 10, 14, 3,  6,  7,  1,  9,  4  } ,
-  { 7,  9,  3,  1,  13, 12, 11, 14, 2,  6,  5,  10, 4,  0,  15, 8  } ,
-  { 9,  0,  5,  7,  2,  4,  10, 15, 14, 1,  11, 12, 6,  8,  3,  13 } ,
-  { 2,  12, 6,  10, 0,  11, 8,  3,  4,  13, 7,  5,  15, 14, 1,  9  } ,
-  { 12, 5,  1,  15, 14, 13, 4,  10, 0,  7,  6,  3,  9,  2,  8,  11 } ,
-  { 13, 11, 7,  14, 12, 1,  3,  9,  5,  0,  15, 4,  8,  6,  2,  10 } ,
-  { 6,  15, 14, 9,  11, 3,  0,  8,  12, 2,  13, 7,  1,  4,  10, 5  } ,
-  { 10, 2,  8,  4,  7,  6,  1,  5,  15, 11, 9,  14, 3,  12, 13, 0  } ,
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
-  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  }
-};
-
-void G(const int32_t r, const int32_t i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, uint64_t const m[16]) {
-  *a += *b + m[ blake2b_sigma[r][2*i] ];
-  ((uint2*)d)[0] = SWAPUINT2( ((uint2*)d)[0] ^ ((uint2*)a)[0] );
-  *c += *d;
-  ((uint2*)b)[0] = ROR24( ((uint2*)b)[0] ^ ((uint2*)c)[0] );
-  *a += *b + m[ blake2b_sigma[r][2*i+1] ];
-  ((uint2*)d)[0] = ROR16( ((uint2*)d)[0] ^ ((uint2*)a)[0] );
-  *c += *d;
-  ((uint2*)b)[0] = ROR2( ((uint2*)b)[0] ^ ((uint2*)c)[0], 63U);
-}
-
-#define ROUND(r) \
-  G(r, 0, &v[0], &v[4], &v[ 8], &v[12], m); \
-  G(r, 1, &v[1], &v[5], &v[ 9], &v[13], m); \
-  G(r, 2, &v[2], &v[6], &v[10], &v[14], m); \
-  G(r, 3, &v[3], &v[7], &v[11], &v[15], m); \
-  G(r, 4, &v[0], &v[5], &v[10], &v[15], m); \
-  G(r, 5, &v[1], &v[6], &v[11], &v[12], m); \
-  G(r, 6, &v[2], &v[7], &v[ 8], &v[13], m); \
-  G(r, 7, &v[3], &v[4], &v[ 9], &v[14], m);
-
-void blake2b_gpu_hash(blake2b_state *state, uint32_t idx, uint8_t *hash, uint32_t outlen) {
-  const uint32_t leb = idx;
-  *(uint32_t*)(state->buf + state->buflen) = leb;
-  state->buflen += 4;
-  state->counter += state->buflen;
-  for (unsigned i = 0; i < BLAKE2B_BLOCKBYTES - state->buflen; i++)
-    state->buf[i+state->buflen] = 0;  
-
-  uint64_t *d_data = (uint64_t *)state->buf;
-  uint64_t m[16];
-
-  m[0] = d_data[0];
-  m[1] = d_data[1];
-  m[2] = d_data[2];
-  m[3] = d_data[3];
-  m[4] = d_data[4];
-  m[5] = d_data[5];
-  m[6] = d_data[6];
-  m[7] = d_data[7];
-  m[8] = d_data[8];
-  m[9] = d_data[9];
-  m[10] = d_data[10];
-  m[11] = d_data[11];
-  m[12] = d_data[12];
-  m[13] = d_data[13];
-  m[14] = d_data[14];
-  m[15] = d_data[15];
-
-  uint64_t v[16];
-
-  v[0] = state->h[0];
-  v[1] = state->h[1];
-  v[2] = state->h[2];
-  v[3] = state->h[3];
-  v[4] = state->h[4];
-  v[5] = state->h[5];
-  v[6] = state->h[6];
-  v[7] = state->h[7];
-  v[8] = 0x6a09e667f3bcc908;
-  v[9] = 0xbb67ae8584caa73b;
-  v[10] =  0x3c6ef372fe94f82b;
-  v[11] = 0xa54ff53a5f1d36f1;
-  v[12] = 0x510e527fade682d1 ^ state->counter;
-  v[13] = 0x9b05688c2b3e6c1f;
-  v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
-  v[15] = 0x5be0cd19137e2179;
-
-  ROUND( 0 );
-  ROUND( 1 );
-  ROUND( 2 );
-  ROUND( 3 );
-  ROUND( 4 );
-  ROUND( 5 );
-  ROUND( 6 );
-  ROUND( 7 );
-  ROUND( 8 );
-  ROUND( 9 );
-  ROUND( 10 );
-  ROUND( 11 );
-  
-  state->h[0] ^= v[0] ^ v[ 8];
-  state->h[1] ^= v[1] ^ v[ 9];
-  state->h[2] ^= v[2] ^ v[10];
-  state->h[3] ^= v[3] ^ v[11];
-  state->h[4] ^= v[4] ^ v[12];
-  state->h[5] ^= v[5] ^ v[13];
-  state->h[6] ^= v[6] ^ v[14];
-  state->h[7] ^= v[7] ^ v[15];
-
-  for (unsigned i = 0; i < outlen; i++)
-    hash[i] = ((uint8_t*)state->h)[i];
-}
diff --git a/3rdparty/amd_bins_windows/zcash/gpu/common.h b/3rdparty/amd_bins_windows/zcash/gpu/common.h
deleted file mode 100644
index 22ba9548e..000000000
--- a/3rdparty/amd_bins_windows/zcash/gpu/common.h
+++ /dev/null
@@ -1,156 +0,0 @@
-#if defined(__OPENCL_HOST__)
-#define __global
-#include "../blake2.h"
-#else
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef short int16_t;
-typedef ushort uint16_t;
-typedef int int32_t;
-typedef uint uint32_t;
-typedef long int64_t;
-typedef ulong uint64_t;
-
-#if defined(_MSC_VER)
-#define ALIGN(x) __declspec(align(x))
-#else
-#define ALIGN(x) __attribute__ ((__aligned__(x)))
-#endif
-
-enum blake2b_constant
-{
-  BLAKE2B_BLOCKBYTES = 128,
-  BLAKE2B_OUTBYTES   = 64,
-  BLAKE2B_KEYBYTES   = 64,
-  BLAKE2B_SALTBYTES  = 16,
-  BLAKE2B_PERSONALBYTES = 16
-};
-
-#pragma pack(push, 1)
-ALIGN( 64 ) typedef struct __blake2b_state {
-  uint64_t h[8];
-  uint8_t  buf[BLAKE2B_BLOCKBYTES];
-  uint16_t counter;
-  uint8_t  buflen;
-  uint8_t  lastblock;
-} blake2b_state;
-#pragma pack(pop)
-#endif
-
-#define COLLISION_BIT_LENGTH (WN / (WK+1))
-#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
-#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
-
-
-#define NDIGITS   (WK+1)
-#define DIGITBITS (WN/(NDIGITS))
-#define PROOFSIZE (1u<<WK)
-#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
-#define BASE (1u<<DIGITBITS)
-#define NHASHES (2u*BASE)
-#define HASHESPERBLAKE (512/WN)
-#define HASHOUT (HASHESPERBLAKE*WN/8)
-
-// 2_log of number of buckets
-#define BUCKBITS  (DIGITBITS-RESTBITS)
-
-// number of buckets
-#define NBUCKETS (1<<BUCKBITS)
-// 2_log of number of slots per bucket
-#define SLOTBITS (RESTBITS+1+1)
-// number of slots per bucket
-#define NSLOTS (1u<<SLOTBITS)
-// number of per-xhash slots
-#define XFULL 16
-// SLOTBITS mask
-#define SLOTMASK (NSLOTS-1)
-// number of possible values of xhash (rest of n) bits
-#define NRESTS (1u<<RESTBITS)
-// number of blocks of hashes extracted from single 512 bit blake2b output
-#define NBLOCKS ((NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE)
-// nothing larger found in 100000 runs
-#define MAXSOLS 8
-
-#define WORDS(bits)     ((bits + 31) / 32)
-#define HASHWORDS0 WORDS(WN - DIGITBITS + RESTBITS)
-#define HASHWORDS1 WORDS(WN - 2*DIGITBITS + RESTBITS)
-
-typedef uint32_t proof[PROOFSIZE];
-
-// tree  = | xhash(RESTBITS)    | slotid1(SLOTBITS) | slotid0(SLOTBITS) | bucketid(BUCKBITS) |
-// index = | bucketid(BUCKBITS) | slotid0(SLOTBITS) |
-typedef uint32_t tree;
-
-typedef union hashunit {
-  uint32_t word;
-  uint8_t bytes[4];
-} hashunit;
-
-typedef struct slot0 {
-  tree attr;
-  hashunit hash[HASHWORDS0];
-} slot0;
-
-typedef struct slot1 {
-  tree attr;
-  hashunit hash[HASHWORDS1];
-} slot1;
-
-// a bucket is NSLOTS treenodes
-typedef slot0 bucket0[NSLOTS];
-typedef slot1 bucket1[NSLOTS];
-// the N-bit hash consists of K+1 n-bit "digits"
-// each of which corresponds to a layer of NBUCKETS buckets
-typedef bucket0 digit0[NBUCKETS];
-typedef bucket1 digit1[NBUCKETS];
-
-// manages hash and tree data
-typedef struct htalloc {
-  __global bucket0 *trees0[(WK+1)/2];
-  __global bucket1 *trees1[WK/2];
-} htalloc;
-
-typedef uint32_t bsizes[NBUCKETS];
-
-
-typedef struct htlayout {
-  htalloc hta;
-  uint32_t prevhashunits;
-  uint32_t nexthashunits;
-  uint32_t dunits;
-  uint32_t prevbo;
-  uint32_t nextbo;
-} htlayout;
-
-#if RESTBITS <= 6
-  typedef uint8_t xslot;
-#else
-  typedef uint16_t xslot;
-#endif
-
-typedef struct collisiondata {
-#ifdef XBITMAP
-#if NSLOTS > 64
-#error cant use XBITMAP with more than 64 slots
-#endif
-  uint64_t xhashmap[NRESTS];
-  uint64_t xmap;
-#else
-  xslot nxhashslots[NRESTS];
-  xslot xhashslots[NRESTS][XFULL];
-  xslot *xx;
-  uint32_t n0;
-  uint32_t n1;
-#endif
-  uint32_t s0;
-} collisiondata;
-
-
-typedef struct equi {
-  blake2b_state blake_ctx;
-  htalloc hta;
-  __global bsizes *nslots;
-  __global proof *sols;
-  uint32_t nsols;
-  uint32_t nthreads;
-} equi;
diff --git a/3rdparty/amd_bins_windows/zcash/gpu/equihash.cl b/3rdparty/amd_bins_windows/zcash/gpu/equihash.cl
deleted file mode 100644
index 213a8e4d6..000000000
--- a/3rdparty/amd_bins_windows/zcash/gpu/equihash.cl
+++ /dev/null
@@ -1,1038 +0,0 @@
-#include "common.h"
-
-#include "blake2bcl.h"
-
-#define tree0_ptr(heap, r) ((__global bucket0 *)(heap + r))
-#define tree1_ptr(heap, r) ((__global bucket1 *)(heap + r))
-
-uint32_t tree_bucket(tree t)
-{
-  const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
-  return t & bucketMask;
-}
-
-uint32_t tree_slotid0(tree t)
-{
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-  return (t >> BUCKBITS) & SLOTMASK;
-}
-
-uint32_t tree_slotid1(tree t)
-{
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-  return (t >> (BUCKBITS+SLOTBITS)) & SLOTMASK;
-}
-
-uint32_t tree_xhash(tree t)
-{
-  return t >> (2*SLOTBITS + BUCKBITS);
-}
-
-uint32_t tree_getindex(const tree t)
-{
-  const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-  return ((t & bucketMask) << SLOTBITS) | ((t & (slotMask << BUCKBITS)) >> BUCKBITS);  
-}
-
-void tree_setindex(tree *t, uint32_t idx)
-{
-  const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-
-  (*t) &= ~(bucketMask | (slotMask << BUCKBITS));
-  (*t) |= (idx >> SLOTBITS);
-  (*t) |= ((idx & slotMask) << BUCKBITS);
-}
-
-void tree_setxhash(tree *t, uint32_t xhash)
-{
-  const uint32_t xhashMask = ((1u << RESTBITS)-1);
-  (*t) &= ~(xhashMask << (2*SLOTBITS + BUCKBITS));
-  (*t) |= (xhash << (2*SLOTBITS + BUCKBITS));
-}
-
-tree tree_create3(uint32_t bucketId, uint32_t s0, uint32_t s1)
-{
-  return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS));
-}
-
-tree tree_create4(uint32_t bucketId, uint32_t s0, uint32_t s1, uint32_t xhash)
-{
-  return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)) | (xhash << (2*SLOTBITS+BUCKBITS));;
-}
-
-// size (in bytes) of hash in round 0 <= r < WK
-uint32_t hashsize(const uint32_t r)
-{
-#ifdef XINTREE
-  const uint32_t hashbits = WN - (r+1) * DIGITBITS;
-#else
-  const uint32_t hashbits = WN - (r+1) * DIGITBITS + RESTBITS;
-#endif
-  return (hashbits + 7) / 8;
-}
-
-uint32_t hashwords(uint32_t bytes)
-{
-  return (bytes + 3) / 4;
-}
-
-htlayout htlayout_create_2(uint32_t r)
-{
-  htlayout R;
-  R.prevhashunits = 0;
-  R.dunits = 0;
-  
-  uint32_t nexthashbytes = hashsize(r);
-  R.nexthashunits = hashwords(nexthashbytes);
-  
-  R.prevbo = 0;
-  R.nextbo = R.nexthashunits * sizeof(hashunit) - nexthashbytes; // 0-3
-  if (r) {
-    uint32_t prevhashbytes = hashsize(r-1);
-    R.prevhashunits = hashwords(prevhashbytes);
-    R.prevbo = R.prevhashunits * sizeof(hashunit) - prevhashbytes; // 0-3
-    R.dunits = R.prevhashunits - R.nexthashunits;
-  }
-  
-  return R;
-}
-
-uint32_t htlayout_getxhash0(uint32_t prevbo, __global const slot0 *pslot)
-{
-#ifdef XINTREE
-  return tree_xhash(pslot->attr);
-#elif WN == 200 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] >> 4;
-#elif WN == 200 && RESTBITS == 8
-  return (pslot->hash->bytes[prevbo] & 0xf) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
-#elif WN == 144 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] & 0xf;
-#elif WN == 200 && RESTBITS == 6
-  return (pslot->hash->bytes[prevbo] & 0x3) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
-#else
-#error non implemented
-#endif
-}
-
-uint32_t htlayout_getxhash1(uint32_t prevbo, __global const slot1 *pslot)
-{
-#ifdef XINTREE
-  return tree_xhash(pslot->attr);
-#elif WN == 200 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] & 0xf;
-#elif WN == 200 && RESTBITS == 8
-  return pslot->hash->bytes[prevbo];
-#elif WN == 144 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] & 0xf;
-#elif WN == 200 && RESTBITS == 6
-  return pslot->hash->bytes[prevbo] & 0x3f;
-#else
-#error non implemented
-#endif  
-}
-
-bool htlayout_equal(uint32_t prevhashunits, __global const hashunit *hash0, __global const hashunit *hash1)
-{
-  return hash0[prevhashunits-1].word == hash1[prevhashunits-1].word;
-}
-
-void collisiondata_clear(collisiondata *data) 
-{
-#ifdef XBITMAP
-  // memset(xhashmap, 0, NRESTS * sizeof(u64));
-  for (unsigned i = 0; i < NRESTS; i++)
-    data->xhashmap[i] = 0;
-#else
-  // memset(nxhashslots, 0, NRESTS * sizeof(xslot));
-  for (unsigned i = 0; i < NRESTS; i++)
-    data->nxhashslots[i] = 0;
-#endif
-}
-
-bool collisiondata_addslot(collisiondata *data, uint32_t s1, uint32_t xh)
-{
-#ifdef XBITMAP
-  data->xmap = data->xhashmap[xh];
-  data->xhashmap[xh] |= (uint64_t)1 << s1;
-  data->s0 = ~0;
-  return true;
-#else
-  data->n1 = (uint32_t)data->nxhashslots[xh]++;
-  if (data->n1 >= XFULL)
-    return false;
-  data->xx = data->xhashslots[xh];
-  data->xx[data->n1] = s1;
-  data->n0 = 0;
-  return true;
-#endif
-}
-
-bool collisiondata_nextcollision(collisiondata *data)
-{
-#ifdef XBITMAP
-  return data->xmap != 0;
-#else
-  return data->n0 < data->n1;
-#endif
-}
-
-uint64_t __ffsll(uint64_t x)
-{
-  return x ? (64 - clz(x & -x)) : 0;
-}
-
-uint32_t collisiondata_slot(collisiondata *data) {
-#ifdef XBITMAP
-  const uint32_t ffs = __ffsll(xmap);
-  data->s0 += ffs;
-  data->xmap >>= ffs;
-  return data->s0;
-#else
-  return (uint32_t)data->xx[data->n0++];
-#endif
-}
-
-uint32_t equi_getnslots(__global bsizes *nslots, const uint32_t r, const uint32_t bid)
-{
-  __global uint32_t *nslot = &nslots[r&1][bid];
-  const uint32_t n = min(*nslot, NSLOTS);
-  *nslot = 0;
-  return n;
-}
-
-void equi_orderindices(__global uint32_t *indices, uint32_t size)
-{
-  if (indices[0] > indices[size]) {
-    for (uint32_t i = 0; i < size; i++) {
-      const uint32_t tmp = indices[i];
-      indices[i] = indices[size+i];
-      indices[size+i] = tmp;
-    }
-  }
-}
-
-void local_orderindices(uint32_t *indices, uint32_t size)
-{
-  if (indices[0] > indices[size]) {
-    for (uint32_t i = 0; i < size; i++) {
-      const uint32_t tmp = indices[i];
-      indices[i] = indices[size+i];
-      indices[size+i] = tmp;
-    }
-  }
-}
-
-
-void equi_listindices1(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 0;
-  indices[0]    = tree_getindex((*buck)[tree_slotid0(t)].attr);
-  indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr);
-  equi_orderindices(indices, size);
-}
-
-void equi_listindices2(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 1;
-  equi_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
-
-void equi_listindices3(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 2;
-  equi_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
-
-void equi_listindices4(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 3;
-  equi_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
- 
-void equi_listindices5(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 4;
-  equi_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-  
-void equi_listindices6(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 5;
-  equi_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-  
-void equi_listindices7(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 6;
-  equi_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-
-void equi_listindices8(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 7;
-  equi_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-
-void equi_listindices9(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)];
-  const uint32_t size = 1 << 8;
-  equi_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
-
-void local_listindices1(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 0;
-  indices[0]    = tree_getindex((*buck)[tree_slotid0(t)].attr);
-  indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr);
-  local_orderindices(indices, size);
-}
-
-void local_listindices2(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 1;
-  local_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
-
-void local_listindices3(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 2;
-  local_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
-
-void local_listindices4(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 3;
-  local_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
- 
-void local_listindices5(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 4;
-  local_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-  
-void local_listindices6(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 5;
-  local_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-  
-void local_listindices7(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 6;
-  local_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-
-void local_listindices8(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 7;
-  local_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-
-void local_listindices9(__global uint32_t *heap0,
-                        __global uint32_t *heap1,
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)];
-  const uint32_t size = 1 << 8;
-  local_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
-
-// proper dupe test is a little costly on GPU, so allow false negatives
-bool equi_probdupe(uint32_t *prf) {
-  unsigned short susp[PROOFSIZE];
-  for (unsigned i = 0; i < PROOFSIZE; i++)
-    susp[i] = 0xFFFF;
-    
-  for (unsigned i = 0; i < PROOFSIZE; i++) {
-    uint32_t bin = prf[i] & (PROOFSIZE-1);
-    unsigned short msb = prf[i] >> WK;
-    if (msb == susp[bin])
-      return true;
-    susp[bin] = msb;
-  }
-  
-  return false;
-}
-
-void equi_candidate(__global uint32_t *heap0,
-                    __global uint32_t *heap1,
-                    __global proof *sols,
-                    __global uint32_t *nsols,
-                    const tree t)
-{
-  proof prf;
-#if WK==9
-  local_listindices9(heap0, heap1, t, (uint32_t*)&prf);
-#elif WK==5
-  local_listindices5(heap0, heap1, t, (uint32_t*)&prf);
-#else
-#error not implemented
-#endif
-  if (equi_probdupe(prf))
-    return;
-  uint32_t soli = atomic_inc(nsols);
-  if (soli < MAXSOLS)
-#if WK==9
-    equi_listindices9(heap0, heap1, t, sols[soli]);
-#elif WK==5
-    equi_listindices5(heap0, heap1, t, sols[soli]);
-#else
-#error not implemented
-#endif
-}
-
-
-__kernel void digitH(__global blake2b_state *blake2bState,
-                     __global const uint32_t *heap0,
-                     __global bsizes *nslots)
-{
-  uint8_t hash[HASHOUT];
-  blake2b_state state;
-  // equi::htlayout htl(eq, 0);
-  htlayout htl = htlayout_create_2(0);
-  const uint32_t hashbytes = hashsize(0);
-  // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t block = id; block < NBLOCKS; block += get_global_size(0)) {
-    state = *blake2bState;
-    blake2b_gpu_hash(&state, block, hash, HASHOUT);
-    for (uint32_t i = 0; i < HASHESPERBLAKE; i++) {
-      const uint8_t *ph = hash + i * WN/8;
-#if BUCKBITS == 16 && RESTBITS == 4
-      const uint32_t bucketid = ((uint32_t)ph[0] << 8) | ph[1];
-#ifdef XINTREE
-      const uint32_t xhash = ph[2] >> 4;
-#endif
-#elif BUCKBITS == 14 && RESTBITS == 6
-      const uint32_t bucketid = ((uint32_t)ph[0] << 6) | ph[1] >> 2;
-#elif BUCKBITS == 12 && RESTBITS == 8
-      const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4;
-#elif BUCKBITS == 20 && RESTBITS == 4
-      const uint32_t bucketid = ((((uint32_t)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
-#ifdef XINTREE
-      const uint32_t xhash = ph[2] & 0xf;
-#endif
-#elif BUCKBITS == 12 && RESTBITS == 4
-      const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4;
-      const uint32_t xhash = ph[1] & 0xf;
-#else
-#error not implemented
-#endif
-      const uint32_t slot = atomic_inc(&nslots[0][bucketid]);
-      if (slot >= NSLOTS)
-        continue;
-      tree leaf;
-      tree_setindex(&leaf, block*HASHESPERBLAKE+i);
-#ifdef XINTREE
-      tree_setxhash(&leaf, xhash);
-#endif
-      __global slot0 *s = &tree0_ptr(heap0, 0)[bucketid][slot];
-      s->attr = leaf;
-      
-      // memcpy(s.hash->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes);
-      for (unsigned i = 0; i < hashbytes; i++)
-        ((__global uint8_t*)s->hash->bytes+htl.nextbo)[i] = ((uint8_t*)(ph+WN/8-hashbytes))[i];
-    }
-  }
-}
-
-__kernel void digitOdd(const uint32_t r,
-                       __global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       __global bsizes *nslots)
-{
-  // equi::htlayout htl(eq, r);
-//   htlayout htl = htlayout_create(eq, r);
-  htlayout htl = htlayout_create_2(r);  
-  collisiondata cd;
-  
-  // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t id = get_global_id(0);
-  
-  for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    // cd.clear();
-    collisiondata_clear(&cd);
-//     __global slot0 *buck = htl.hta.trees0[(r-1)/2][bucketid]; // optimize by updating previous buck?!
-    __global slot0 *buck = tree0_ptr(heap0, (r-1)/2)[bucketid]; // optimize by updating previous buck?!    
-    uint32_t bsize = equi_getnslots(nslots, r-1, bucketid);       // optimize by putting bucketsize with block?!
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;          // optimize by updating previous pslot1?!
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE)
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
-                            | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
-                    | (xhash = bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
-        xhash &= 0xf;
-#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8)
-                           | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 2
-                           | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 6;
-#else
-#error not implemented
-#endif
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-#ifdef XINTREE
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-#else
-        tree xort = tree_create3(bucketid, s0, s1);
-#endif
-//         __global slot1 *xs = &htl.hta.trees1[r/2][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, r/2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        for (uint32_t i = htl.dunits; i < htl.prevhashunits; i++)
-          xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
-      }
-    }
-  }
-}
-
-
-__kernel void digitEven(const uint32_t r,
-                        __global uint32_t *heap0,
-                        __global uint32_t *heap1,
-                        __global bsizes *nslots)
-{
-  // equi::htlayout htl(eq, r);
-//   htlayout htl = htlayout_create(eq, r);
-  htlayout htl = htlayout_create_2(r);
-  collisiondata cd;
-  
-  // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t id = get_global_id(0);
-  
-  for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    // cd.clear();
-    collisiondata_clear(&cd);    
-//     __global slot1 *buck = htl.hta.trees1[(r-1)/2][bucketid]; // OPTIMIZE BY UPDATING PREVIOUS
-     __global slot1 *buck = tree1_ptr(heap1, (r-1)/2)[bucketid]; // OPTIMIZE BY UPDATING PREVIOUS
-    uint32_t bsize = equi_getnslots(nslots, r-1, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;          // OPTIMIZE BY UPDATING PREVIOUS
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE)
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
-                            | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
-                            | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
-#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
-                          | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 6)
-                          | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 2;
-#else
-#error not implemented
-#endif
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-#ifdef XINTREE
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-#else
-        tree xort = tree_create3(bucketid, s0, s1);
-#endif
-//         __global slot0 *xs = &htl.hta.trees0[r/2][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, r/2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        for (uint32_t i=htl.dunits; i < htl.prevhashunits; i++)
-          xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
-      }
-    }
-  }
-}
-
-
-#ifdef UNROLL
-
-__kernel void digit_1(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots)
-{
-  htlayout htl = htlayout_create_2(1);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-    __global slot0 *buck = tree0_ptr(heap0, 0)[bucketid];
-    uint32_t bsize = equi_getnslots(nslots, 0, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[0][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 0)[xorbucketid][xorslot];
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-        xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
-        xs->hash[4].word = pslot0->hash[5].word ^ pslot1->hash[5].word;
-      }
-    }
-  }
-}
-__kernel void digit_2(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(2);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[0][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 0)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 1, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))  
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-        // __global slot0 *xs = &htl.hta.trees0[1][xorbucketid][xorslot];
-         __global slot0 *xs = &tree0_ptr(heap0, 1)[xorbucketid][xorslot];
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
-        xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-        xs->hash[4].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
-      }
-    }
-  }
-}
-__kernel void digit_3(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(3);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot0 *buck = htl.hta.trees0[1][bucketid];
-    __global slot0 *buck = tree0_ptr(heap0, 1)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 2, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))  
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[1][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 1)[xorbucketid][xorslot];
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-        xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
-      }
-    }
-  }
-}
-__kernel void digit_4(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(4);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[1][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 1)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 3, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot0 *xs = &htl.hta.trees0[2][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, 2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
-        xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-      }
-    }
-  }
-}
-__kernel void digit_5(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(5);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot0 *buck = htl.hta.trees0[2][bucketid];
-    __global slot0 *buck = tree0_ptr(heap0, 2)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 4, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[2][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-      }
-    }
-  }
-}
-__kernel void digit_6(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(6);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[2][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 2)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 5, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot0 *xs = &htl.hta.trees0[3][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, 3)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-      }
-    }
-  }
-}
-__kernel void digit_7(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(7);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot0 *buck = htl.hta.trees0[3][bucketid];
-    __global slot0 *buck = tree0_ptr(heap0, 3)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 6, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[3][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 3)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
-        xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-      }
-    }
-  }
-}
-__kernel void digit_8(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(8);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[3][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 3)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 7, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;          // OPTIMIZE BY UPDATING PREVIOUS
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot0 *xs = &htl.hta.trees0[4][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, 4)[xorbucketid][xorslot];     
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-      }
-    }
-  }
-}
-#endif //UNROLL
-
-__kernel void digitK(__global uint32_t *heap0,
-                     __global uint32_t *heap1,
-                     __global bsizes *nslots,
-                     __global proof *sols,
-                     __global uint32_t *nsols) {
-  collisiondata cd;
-  htlayout htl = htlayout_create_2(WK);
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-    __global slot0 *buck = tree0_ptr(heap0, (WK-1)/2)[bucketid];
-    uint32_t bsize = equi_getnslots(nslots, WK-1, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) // assume WK odd
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) {
-          tree xort = tree_create3(bucketid, s0, s1);
-          equi_candidate(heap0, heap1, sols, nsols, xort);
-        }
-      }
-    }
-  }
-}
diff --git a/3rdparty/amd_silentarmy_kernels/zcash/gpu/kernel.cl b/3rdparty/amd_silentarmy_kernels/zcash/gpu/kernel.cl
deleted file mode 100644
index 0fdc74d83..000000000
--- a/3rdparty/amd_silentarmy_kernels/zcash/gpu/kernel.cl
+++ /dev/null
@@ -1,555 +0,0 @@
-# 1 "input.cl"
-# 1 "<built-in>"
-# 1 "<command-line>"
-# 1 "/usr/include/stdc-predef.h" 1 3 4
-# 1 "<command-line>" 2
-# 1 "input.cl"
-# 1 "param.h" 1
-# 60 "param.h"
-typedef struct sols_s
-{
-    uint nr;
-    uint likely_invalids;
-    uchar valid[2000];
-    uint values[2000][(1 << 9)];
-} sols_t;
-# 2 "input.cl" 2
-# 36 "input.cl"
-__constant ulong blake_iv[] =
-{
-    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
-    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-};
-
-
-
-
-__kernel
-void kernel_init_ht(__global char *ht)
-{
-    uint tid = get_global_id(0);
-    *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0;
-}
-# 80 "input.cl"
-uint ht_store(uint round, __global char *ht, uint i,
-        ulong xi0, ulong xi1, ulong xi2, ulong xi3)
-{
-    uint row;
-    __global char *p;
-    uint cnt;
-# 111 "input.cl"
-    if (!(round % 2))
- row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);
-    else
- row = ((xi0 & 0xf0000) >> 0) |
-     ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
-     ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-
-
-
-    xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
-    xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
-    xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
-    p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;
-    cnt = atomic_inc((__global uint *)p);
-    if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9))
-        return 1;
-    p += cnt * 32 + (8 + ((round) / 2) * 4);
-
-    *(__global uint *)(p - 4) = i;
-    if (round == 0 || round == 1)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global ulong *)(p + 16) = xi2;
-      }
-    else if (round == 2)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global uint *)(p + 16) = xi2;
-      }
-    else if (round == 3 || round == 4)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
-
-      }
-    else if (round == 5)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global uint *)(p + 8) = xi1;
-      }
-    else if (round == 6 || round == 7)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
-      }
-    else if (round == 8)
-      {
-
- *(__global uint *)(p + 0) = xi0;
-      }
-    return 0;
-}
-# 188 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-void kernel_round0(__global ulong *blake_state, __global char *ht,
-        __global uint *debug)
-{
-    uint tid = get_global_id(0);
-    ulong v[16];
-    uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
-    uint input = tid * inputs_per_thread;
-    uint input_end = (tid + 1) * inputs_per_thread;
-    uint dropped = 0;
-    while (input < input_end)
-      {
-
-
-        ulong word1 = (ulong)input << 32;
-
-        v[0] = blake_state[0];
-        v[1] = blake_state[1];
-        v[2] = blake_state[2];
-        v[3] = blake_state[3];
-        v[4] = blake_state[4];
-        v[5] = blake_state[5];
-        v[6] = blake_state[6];
-        v[7] = blake_state[7];
-        v[8] = blake_iv[0];
-        v[9] = blake_iv[1];
-        v[10] = blake_iv[2];
-        v[11] = blake_iv[3];
-        v[12] = blake_iv[4];
-        v[13] = blake_iv[5];
-        v[14] = blake_iv[6];
-        v[15] = blake_iv[7];
-
-        v[12] ^= 140 + 4 ;
-
-        v[14] ^= -1;
-
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-
-
-        ulong h[7];
-        h[0] = blake_state[0] ^ v[0] ^ v[8];
-        h[1] = blake_state[1] ^ v[1] ^ v[9];
-        h[2] = blake_state[2] ^ v[2] ^ v[10];
-        h[3] = blake_state[3] ^ v[3] ^ v[11];
-        h[4] = blake_state[4] ^ v[4] ^ v[12];
-        h[5] = blake_state[5] ^ v[5] ^ v[13];
-        h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
-
-
-
-        dropped += ht_store(0, ht, input * 2,
-                h[0],
-                h[1],
-                h[2],
-                h[3]);
-        dropped += ht_store(0, ht, input * 2 + 1,
-                (h[3] >> 8) | (h[4] << (64 - 8)),
-                (h[4] >> 8) | (h[5] << (64 - 8)),
-                (h[5] >> 8) | (h[6] << (64 - 8)),
-                (h[6] >> 8));
-
-
-
-
-        input++;
-      }
-
-
-
-
-}
-# 415 "input.cl"
-uint xor_and_store(uint round, __global char *ht_dst, uint row,
- uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
-{
-    ulong xi0, xi1, xi2;
-
-
-
-    if (round == 1 || round == 2)
-      {
-
- xi0 = *(a++) ^ *(b++);
- xi1 = *(a++) ^ *(b++);
- xi2 = *a ^ *b;
- if (round == 2)
-   {
-
-     xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
-     xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
-     xi2 = (xi2 >> 8);
-   }
-      }
-    else if (round == 3)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *a++ ^ *b++;
- xi2 = *(__global uint *)a ^ *(__global uint *)b;
-      }
-    else if (round == 4 || round == 5)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *a ^ *b;
- xi2 = 0;
- if (round == 4)
-   {
-
-     xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
-     xi1 = (xi1 >> 8);
-   }
-      }
-    else if (round == 6)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *(__global uint *)a ^ *(__global uint *)b;
- xi2 = 0;
- if (round == 6)
-   {
-
-     xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
-     xi1 = (xi1 >> 8);
-   }
-      }
-    else if (round == 7 || round == 8)
-      {
-
- xi0 = *a ^ *b;
- xi1 = 0;
- xi2 = 0;
- if (round == 8)
-   {
-
-     xi0 = (xi0 >> 8);
-   }
-      }
-
-
-    if (!xi0 && !xi1)
- return 0;
-
-
-
-    return ht_store(round, ht_dst, ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)),
-     xi0, xi1, xi2, 0);
-}
-
-
-
-
-
-void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
- __global uint *debug)
-{
-    uint tid = get_global_id(0);
-    uint tlid = get_local_id(0);
-    __global char *p;
-    uint cnt;
-    uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)];
-    uchar mask;
-    uint i, j;
-
-
-    ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 3];
-    uint nr_coll = 0;
-    uint n;
-    uint dropped_coll, dropped_stor;
-    __global ulong *a, *b;
-    uint xi_offset;
-
-    xi_offset = (8 + ((round - 1) / 2) * 4);
-# 524 "input.cl"
-    mask = 0;
-
-
-
-    p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32);
-    cnt = *(__global uint *)p;
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9));
-    p += xi_offset;
-    for (i = 0; i < cnt; i++, p += 32)
-        first_words[i] = *(__global uchar *)p;
-
-    nr_coll = 0;
-    dropped_coll = 0;
-    for (i = 0; i < cnt; i++)
-        for (j = i + 1; j < cnt; j++)
-            if ((first_words[i] & mask) ==
-      (first_words[j] & mask))
-              {
-
-                if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
-                    dropped_coll++;
-                else
-
-
-                    collisions[nr_coll++] =
-   ((ushort)j << 8) | ((ushort)i & 0xff);
-
-
-
-              }
-
-    dropped_stor = 0;
-    for (n = 0; n < nr_coll; n++)
-      {
-        i = collisions[n] & 0xff;
-        j = collisions[n] >> 8;
-        a = (__global ulong *)
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + i * 32 + xi_offset);
-        b = (__global ulong *)
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + j * 32 + xi_offset);
- dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
-      }
-    if (round < 8)
-
- *(__global uint *)(ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0;
-
-
-
-
-}
-# 585 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); }
-
-
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-void kernel_round8(__global char *ht_src, __global char *ht_dst,
- __global uint *debug, __global sols_t *sols)
-{
-    uint tid = get_global_id(0);
-    equihash_round(8, ht_src, ht_dst, debug);
-    if (!tid)
- sols->nr = sols->likely_invalids = 0;
-}
-
-uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
-{
-    return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 +
-     slot * 32 + xi_offset - 4);
-}
-
-void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
- uint round)
-{
-    __global char *ht = htabs[round % 2];
-    uint i = nr_inputs - 1;
-    uint j = nr_inputs * 2 - 1;
-    uint xi_offset = (8 + ((round) / 2) * 4);
-    do
-      {
- ins[j] = expand_ref(ht, xi_offset,
-  (ins[i] >> 12), ((ins[i] >> 6) & 0x3f));
- ins[j - 1] = expand_ref(ht, xi_offset,
-  (ins[i] >> 12), (ins[i] & 0x3f));
- if (!i)
-     break ;
- i--;
- j -= 2;
-      }
-    while (1);
-}
-
-
-
-
-void potential_sol(__global char **htabs, __global sols_t *sols,
- uint ref0, uint ref1)
-{
-    uint sol_i;
-    uint nr_values;
-    sol_i = atomic_inc(&sols->nr);
-    if (sol_i >= 2000)
- return ;
-    sols->valid[sol_i] = 0;
-    nr_values = 0;
-    sols->values[sol_i][nr_values++] = ref0;
-    sols->values[sol_i][nr_values++] = ref1;
-    uint round = 9 - 1;
-    do
-      {
- round--;
- expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
- nr_values *= 2;
-      }
-    while (round > 0);
-    sols->valid[sol_i] = 1;
-}
-
-
-
-
-__kernel
-void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
-{
-    uint tid = get_global_id(0);
-    __global char *htabs[2] = { ht0, ht1 };
-    uint ht_i = (9 - 1) % 2;
-    uint cnt;
-    uint xi_offset = (8 + ((9 - 1) / 2) * 4);
-    uint i, j;
-    __global char *a, *b;
-    uint ref_i, ref_j;
-
-
-    ulong collisions[5];
-    uint coll;
-
-
-
-    uint mask = 0xffffff;
-
-
-
-    a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;
-    cnt = *(__global uint *)a;
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9));
-    coll = 0;
-    a += xi_offset;
-    for (i = 0; i < cnt; i++, a += 32)
- for (j = i + 1, b = a + 32; j < cnt; j++, b += 32)
-     if (((*(__global uint *)a) & mask) ==
-      ((*(__global uint *)b) & mask))
-       {
-  ref_i = *(__global uint *)(a - 4);
-  ref_j = *(__global uint *)(b - 4);
-  if (coll < sizeof (collisions) / sizeof (*collisions))
-      collisions[coll++] = ((ulong)ref_i << 32) | ref_j;
-  else
-      atomic_inc(&sols->likely_invalids);
-       }
-    if (!coll)
- return ;
-    for (i = 0; i < coll; i++)
- potential_sol(htabs, sols, collisions[i] >> 32,
-  collisions[i] & 0xffffffff);
-}
diff --git a/3rdparty/kernel_R92xx_linux_14.4/equiw200k9.bin b/3rdparty/kernel_R92xx_linux_14.4/equiw200k9.bin
deleted file mode 100644
index 45785dc4e..000000000
Binary files a/3rdparty/kernel_R92xx_linux_14.4/equiw200k9.bin and /dev/null differ
diff --git a/3rdparty/kernel_RX4xx_windows_16.7/equiw200k9.bin b/3rdparty/kernel_RX4xx_windows_16.7/equiw200k9.bin
deleted file mode 100644
index 868842f93..000000000
Binary files a/3rdparty/kernel_RX4xx_windows_16.7/equiw200k9.bin and /dev/null differ
diff --git a/3rdparty/silentarmy/16_kernel.cl b/3rdparty/silentarmy/16_kernel.cl
deleted file mode 100644
index b7d23e4bd..000000000
--- a/3rdparty/silentarmy/16_kernel.cl
+++ /dev/null
@@ -1,526 +0,0 @@
-# 1 "input.cl"
-# 1 "<built-in>"
-# 1 "<command-line>"
-# 1 "/usr/include/stdc-predef.h" 1 3 4
-# 1 "<command-line>" 2
-# 1 "input.cl"
-# 1 "param.h" 1
-# 60 "param.h"
-typedef struct sols_s
-{
-    uint nr;
-    uint likely_invalidss;
-    uchar valid[2000];
-    uint values[2000][(1 << 9)];
-} sols_t;
-# 2 "input.cl" 2
-# 35 "input.cl"
-__constant ulong blake_iv[] =
-{
-    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
-    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-};
-
-
-
-
-__kernel
-void kernel_init_ht(__global char *ht)
-{
-    uint tid = get_global_id(0);
-    *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32) = 0;
-}
-# 79 "input.cl"
-uint ht_store(uint round, __global char *ht, uint i,
-        ulong xi0, ulong xi1, ulong xi2, ulong xi3)
-{
-    uint row;
-    __global char *p;
-    uint cnt;
-
-    if (!(round % 2))
- row = (xi0 & 0xffff);
-    else
-
-
-
-
- row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
-     ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-# 119 "input.cl"
-    xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
-    xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
-    xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
-    p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32;
-    cnt = atomic_inc((__global uint *)p);
-    if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3))
-        return 1;
-    p += cnt * 32 + (8 + ((round) / 2) * 4);
-
-    *(__global uint *)(p - 4) = i;
-    if (round == 0 || round == 1)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global ulong *)(p + 16) = xi2;
-      }
-    else if (round == 2)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global uint *)(p + 16) = xi2;
-      }
-    else if (round == 3 || round == 4)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
-
-      }
-    else if (round == 5)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global uint *)(p + 8) = xi1;
-      }
-    else if (round == 6 || round == 7)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
-      }
-    else if (round == 8)
-      {
-
- *(__global uint *)(p + 0) = xi0;
-      }
-    return 0;
-}
-# 187 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-void kernel_round0(__global ulong *blake_state, __global char *ht,
-        __global uint *debug)
-{
-    uint tid = get_global_id(0);
-    ulong v[16];
-    uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
-    uint input = tid * inputs_per_thread;
-    uint input_end = (tid + 1) * inputs_per_thread;
-    uint dropped = 0;
-    while (input < input_end)
-      {
-
-
-        ulong word1 = (ulong)input << 32;
-
-        v[0] = blake_state[0];
-        v[1] = blake_state[1];
-        v[2] = blake_state[2];
-        v[3] = blake_state[3];
-        v[4] = blake_state[4];
-        v[5] = blake_state[5];
-        v[6] = blake_state[6];
-        v[7] = blake_state[7];
-        v[8] = blake_iv[0];
-        v[9] = blake_iv[1];
-        v[10] = blake_iv[2];
-        v[11] = blake_iv[3];
-        v[12] = blake_iv[4];
-        v[13] = blake_iv[5];
-        v[14] = blake_iv[6];
-        v[15] = blake_iv[7];
-
-        v[12] ^= 140 + 4 ;
-
-        v[14] ^= -1;
-
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-
-
-        ulong h[7];
-        h[0] = blake_state[0] ^ v[0] ^ v[8];
-        h[1] = blake_state[1] ^ v[1] ^ v[9];
-        h[2] = blake_state[2] ^ v[2] ^ v[10];
-        h[3] = blake_state[3] ^ v[3] ^ v[11];
-        h[4] = blake_state[4] ^ v[4] ^ v[12];
-        h[5] = blake_state[5] ^ v[5] ^ v[13];
-        h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
-
-
-
-        dropped += ht_store(0, ht, input * 2,
-                h[0],
-                h[1],
-                h[2],
-                h[3]);
-        dropped += ht_store(0, ht, input * 2 + 1,
-                (h[3] >> 8) | (h[4] << (64 - 8)),
-                (h[4] >> 8) | (h[5] << (64 - 8)),
-                (h[5] >> 8) | (h[6] << (64 - 8)),
-                (h[6] >> 8));
-
-
-
-
-        input++;
-      }
-
-
-
-
-}
-# 409 "input.cl"
-uint xor_and_store(uint round, __global char *ht_dst, uint row, /* XOR the words of two slots (a, b) and hand the result to ht_store() */
- uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
-{
-    ulong xi0, xi1, xi2;
-    /* The number of bytes read from a and b shrinks as round grows: 24, 20, 16, 12, then 8. */
-    /* Returns 0 when xi0 and xi1 are both zero, otherwise whatever ht_store() returns. */
-    /* No branch handles round values outside 1..8, so xi0..xi2 stay unassigned for them; callers in this view pass 1..8 only. */
-    if (round == 1 || round == 2)
-      {
-    /* Rounds 1-2: three full 64-bit words from each slot. */
-
- xi0 = *(a++) ^ *(b++);
- xi1 = *(a++) ^ *(b++);
- xi2 = *a ^ *b;
-      }
-    else if (round == 3)
-      {
-    /* Round 3: two 64-bit words plus one 32-bit word. */
- xi0 = *a++ ^ *b++;
- xi1 = *a++ ^ *b++;
- xi2 = *(__global uint *)a ^ *(__global uint *)b;
-      }
-    else if (round == 4 || round == 5)
-      {
-    /* Rounds 4-5: two 64-bit words. */
- xi0 = *a++ ^ *b++;
- xi1 = *a ^ *b;
- xi2 = 0;
-      }
-    else if (round == 6)
-      {
-    /* Round 6: one 64-bit word plus one 32-bit word. */
- xi0 = *a++ ^ *b++;
- xi1 = *(__global uint *)a ^ *(__global uint *)b;
- xi2 = 0;
-      }
-    else if (round == 7 || round == 8)
-      {
-    /* Rounds 7-8: a single 64-bit word. */
- xi0 = *a ^ *b;
- xi1 = 0;
- xi2 = 0;
-      }
-
-    /* Nothing is stored when the first two result words are zero; xi2 is not examined. */
-    if (!xi0 && !xi1)
- return 0;
-
-    /* The value passed as ht_store()'s i argument packs row into bits 16 and up, slot_b into bits 8-15 and slot_a into bits 0-7. */
-    /* ht_store() for this file is defined above this view; the fourth word is always passed as 0. */
-    return ht_store(round, ht_dst, ((row << 16) | ((slot_b & 0xff) << 8) | (slot_a & 0xff)),
-     xi0, xi1, xi2, 0);
-}
-
-
-
-
-
-void equihash_round(uint round, __global char *ht_src, __global char *ht_dst, /* One work-item processes one row of ht_src: find slot pairs whose masked first byte matches and pass each pair to xor_and_store() */
- __global uint *debug)
-{
-    uint tid = get_global_id(0); /* used as the row index */
-    uint tlid = get_local_id(0); /* not read anywhere in this function as shown */
-    __global char *p;
-    uint cnt;
-    uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 16)) * 3)]; /* the size expression evaluates to 96 */
-    uchar mask;
-    uint i, j;
-
-    /* The array below holds 192 entries (96 * 2); each entry is a (j, i) slot pair packed into 16 bits. */
-    ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 2];
-    uint nr_coll = 0;
-    uint n;
-    uint dropped_coll, dropped_stor; /* both are only incremented here; neither is read back in this function */
-    __global ulong *a, *b;
-    uint xi_offset;
-    /* Byte offset inside a 32-byte slot at which the words start: 8, 8, 12, 12, ... for rounds 1, 2, 3, 4, ... */
-    xi_offset = (8 + ((round - 1) / 2) * 4);
-
-    /* Even rounds compare the low nibble of the first byte, odd rounds the high nibble. */
-    mask = ((!(round % 2)) ? 0x0f : 0xf0);
-# 499 "input.cl"
-    p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32); /* row stride is 96 slots * 32 bytes */
-    cnt = *(__global uint *)p; /* the first 32-bit word of the row is read as the slot count */
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 16)) * 3)); /* clamp to 96 so first_words[] is never overrun */
-    p += xi_offset;
-    for (i = 0; i < cnt; i++, p += 32)
-        first_words[i] = *(__global uchar *)p;
-
-    nr_coll = 0;
-    dropped_coll = 0;
-    for (i = 0; i < cnt; i++) /* all pairs i < j within the row */
-        for (j = i + 1; j < cnt; j++)
-            if ((first_words[i] & mask) ==
-      (first_words[j] & mask))
-              {
-                /* Pairs beyond the capacity of collisions[] are counted and discarded. */
-                if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
-                    dropped_coll++;
-                else
-                    /* j goes in the high byte, i in the low byte; both are < 96 after the clamp above. */
-
-                    collisions[nr_coll++] =
-   ((ushort)j << 8) | ((ushort)i & 0xff);
-
-
-
-              }
-    /* On even rounds the word pointers start one byte further into the slot. */
-    uint adj = (!(round % 2)) ? 1 : 0;
-
-    dropped_stor = 0;
-    for (n = 0; n < nr_coll; n++)
-      {
-        i = collisions[n] & 0xff;
-        j = collisions[n] >> 8;
-        a = (__global ulong *) /* NOTE(review): row base + i * 32 + xi_offset + adj is not always a multiple of 8; confirm the target tolerates unaligned 64-bit loads */
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32 + i * 32 + xi_offset
-      + adj);
-        b = (__global ulong *)
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32 + j * 32 + xi_offset
-      + adj);
- dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
-      }
-    /* debug is never written in the code shown. */
-
-
-
-}
-# 557 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); } /* kernel_round1..kernel_round8: one kernel entry point per round; each only forwards its arguments to equihash_round() with a literal round number and requires a work-group size of 64x1x1 */
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round8(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(8, ht_src, ht_dst, debug); }
-
-uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) /* Return the 32-bit word stored 4 bytes before xi_offset in the given slot of the given row */
-{
-    return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32 + /* row stride: 96 slots of 32 bytes */
-     slot * 32 + xi_offset - 4); /* presumably the reference written by ht_store() just before the words; that function is outside this view */
-}
-
-void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs, /* Expand ins[0..nr_inputs-1] in place into 2 * nr_inputs entries: each input becomes the two words expand_ref() returns for its two slots */
- uint round)
-{
-    __global char *ht = htabs[round % 2]; /* the table is chosen by round parity */
-    uint i = nr_inputs - 1; /* assumes nr_inputs >= 1; with 0 this unsigned subtraction wraps */
-    uint j = nr_inputs * 2 - 1;
-    uint xi_offset = (8 + ((round) / 2) * 4);
-    do /* walk from the last input to the first so that writes at 2*i and 2*i+1 never clobber an input not yet read */
-      {
- ins[j] = expand_ref(ht, xi_offset, /* each input is decoded as row = bits 16 and up, one slot = bits 8-15, other slot = bits 0-7 */
-  (ins[i] >> 16), ((ins[i] >> 8) & 0xff));
- ins[j - 1] = expand_ref(ht, xi_offset,
-  (ins[i] >> 16), (ins[i] & 0xff));
- if (!i)
-     break ;
- i--;
- j -= 2;
-      }
-    while (1);
-}
-
-
-
-
-void potential_sol(__global char **htabs, __global sols_t *sols, /* Claim an entry in sols and fill it by repeatedly expanding the two starting references ref0 and ref1 */
- uint ref0, uint ref1)
-{
-    uint sol_i;
-    uint nr_values;
-    sol_i = atomic_inc(&sols->nr); /* the value returned is used as this call's entry index */
-    if (sol_i >= 2000) /* entries at index 2000 and above are discarded; sols->nr has still been incremented */
- return ;
-    sols->valid[sol_i] = 0; /* cleared while the entry is being filled */
-    nr_values = 0;
-    sols->values[sol_i][nr_values++] = ref0;
-    sols->values[sol_i][nr_values++] = ref1;
-    uint round = 9 - 1;
-    do /* eight passes, round = 7 down to 0; nr_values doubles each pass, from 2 to 512 */
-      {
- round--;
- expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
- nr_values *= 2;
-      }
-    while (round > 0);
-    sols->valid[sol_i] = 1; /* set once the expansion has finished */
-}
-
-
-
-
-__kernel
-void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols) /* One work-item per row: find slot pairs whose low 24 bits match and expand each pair through potential_sol() */
-{
-    uint tid = get_global_id(0); /* used as the row index */
-    __global char *htabs[2] = { ht0, ht1 };
-    uint ht_i = (9 - 1) % 2; /* evaluates to 0, so rows are read from ht0 */
-    uint cnt;
-    uint xi_offset = (8 + ((9 - 1) / 2) * 4); /* evaluates to 24 */
-    uint i, j;
-    __global char *a, *b;
-    uint ref_i, ref_j;
-
-    /* At most 5 matching pairs are kept per row; each is packed as (ref_i << 32) | ref_j. */
-    ulong collisions[5];
-    uint coll;
-
-
-    /* Only the low 24 bits of the 32-bit word at xi_offset are compared. */
-    uint mask = 0xffffff;
-
-    /* NOTE(review): mem_fence only orders this work-item's own memory accesses, so nothing visible here stops another work-item */
-    /* from incrementing sols->nr before work-item 0 resets it; confirm the host also zeroes these fields or otherwise orders this. */
-    if (tid == 0)
- sols->nr = sols->likely_invalidss = 0;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32; /* row stride: 96 slots of 32 bytes */
-    cnt = *(__global uint *)a; /* the first 32-bit word of the row is read as the slot count */
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 16)) * 3)); /* clamp to 96 slots */
-    coll = 0;
-    a += xi_offset;
-    for (i = 0; i < cnt; i++, a += 32) /* all pairs i < j; a and b advance one 32-byte slot per step */
- for (j = i + 1, b = a + 32; j < cnt; j++, b += 32)
-     if (((*(__global uint *)a) & mask) ==
-      ((*(__global uint *)b) & mask))
-       {
-  ref_i = *(__global uint *)(a - 4); /* the 32-bit word immediately before each slot's data is taken as its reference */
-  ref_j = *(__global uint *)(b - 4);
-  if (coll < sizeof (collisions) / sizeof (*collisions))
-      collisions[coll++] = ((ulong)ref_i << 32) | ref_j;
-  else
-      atomic_inc(&sols->likely_invalidss); /* pairs beyond the fifth are only counted */
-       }
-    if (!coll)
- return ;
-    for (i = 0; i < coll; i++) /* unpack each stored pair back into its two references */
- potential_sol(htabs, sols, collisions[i] >> 32,
-  collisions[i] & 0xffffffff);
-}
diff --git a/3rdparty/silentarmy/19_kernel.cl b/3rdparty/silentarmy/19_kernel.cl
deleted file mode 100644
index fd0f29a7a..000000000
--- a/3rdparty/silentarmy/19_kernel.cl
+++ /dev/null
@@ -1,531 +0,0 @@
-# 1 "input.cl"
-# 1 "<built-in>"
-# 1 "<command-line>"
-# 1 "/usr/include/stdc-predef.h" 1 3 4
-# 1 "<command-line>" 2
-# 1 "input.cl"
-# 1 "param.h" 1
-# 60 "param.h"
-typedef struct sols_s /* Result buffer with room for 2000 entries; the code that fills it for this file lies outside this view */
-{
-    uint nr; /* presumably the number of entries claimed so far - TODO confirm against kernel_sols in this file */
-    uint likely_invalidss; /* presumably a count of discarded candidates - TODO confirm */
-    uchar valid[2000]; /* one flag per entry */
-    uint values[2000][(1 << 9)]; /* 512 32-bit values per entry */
-} sols_t;
-# 2 "input.cl" 2
-# 35 "input.cl"
-__constant ulong blake_iv[] = /* Eight 64-bit constants copied into v[8]..v[15] by kernel_round0; the values match the BLAKE2b initialization vector in RFC 7693 */
-{
-    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
-    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-};
-
-
-
-
-__kernel
-void kernel_init_ht(__global char *ht) /* One work-item per row: zero the 32-bit word at the start of row tid, which ht_store() uses as the row's slot counter */
-{
-    uint tid = get_global_id(0);
-    *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32) = 0; /* row stride: 36 slots (4 * 9) of 32 bytes each */
-}
-# 79 "input.cl"
-uint ht_store(uint round, __global char *ht, uint i, /* Append reference i and the words xi0..xi3 (shifted right by 16 bits) to a row chosen from the low bits of xi0; returns 1 if that row is full, else 0 */
-        ulong xi0, ulong xi1, ulong xi2, ulong xi3)
-{
-    uint row;
-    __global char *p;
-    uint cnt;
-# 103 "input.cl"
-    if (!(round % 2)) /* even rounds: row = bits 0-15 of xi0, with bits 21-23 moved down to bits 16-18 */
- row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5);
-    else /* odd rounds: a 19-bit row assembled from nibble-swapped pieces of the low 24 bits of xi0 */
- row = ((xi0 & 0xe0000) >> 1) |
-     ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
-     ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-# 119 "input.cl"
-    xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); /* drop the low 16 bits of the xi0..xi3 sequence, carrying bits in from the next word */
-    xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
-    xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
-    p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32; /* row stride: 36 slots of 32 bytes */
-    cnt = atomic_inc((__global uint *)p); /* the value returned is used as the slot index; the counter is the first 32-bit word of the row */
-    if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9)) /* row already holds 36 entries: report a drop (the counter has still been incremented) */
-        return 1;
-    p += cnt * 32 + (8 + ((round) / 2) * 4); /* the start of the words inside the slot moves 4 bytes further every two rounds */
-    /* The reference is written in the 4 bytes immediately before the words. */
-    *(__global uint *)(p - 4) = i;
-    if (round == 0 || round == 1)
-      {
-    /* Rounds 0-1: 24 bytes are written. */
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global ulong *)(p + 16) = xi2;
-      }
-    else if (round == 2)
-      {
-    /* Round 2: 20 bytes are written. */
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global uint *)(p + 16) = xi2;
-      }
-    else if (round == 3 || round == 4)
-      {
-    /* Rounds 3-4: 16 bytes are written. */
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
-
-      }
-    else if (round == 5)
-      {
-    /* Round 5: 12 bytes are written. */
- *(__global ulong *)(p + 0) = xi0;
- *(__global uint *)(p + 8) = xi1;
-      }
-    else if (round == 6 || round == 7)
-      {
-    /* Rounds 6-7: 8 bytes are written. */
- *(__global ulong *)(p + 0) = xi0;
-      }
-    else if (round == 8)
-      {
-    /* Round 8: 4 bytes are written. */
- *(__global uint *)(p + 0) = xi0;
-      }
-    return 0;
-}
-# 187 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-void kernel_round0(__global ulong *blake_state, __global char *ht,
-        __global uint *debug)
-{
-    uint tid = get_global_id(0);
-    ulong v[16];
-    uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
-    uint input = tid * inputs_per_thread;
-    uint input_end = (tid + 1) * inputs_per_thread;
-    uint dropped = 0;
-    while (input < input_end)
-      {
-
-
-        ulong word1 = (ulong)input << 32;
-
-        v[0] = blake_state[0];
-        v[1] = blake_state[1];
-        v[2] = blake_state[2];
-        v[3] = blake_state[3];
-        v[4] = blake_state[4];
-        v[5] = blake_state[5];
-        v[6] = blake_state[6];
-        v[7] = blake_state[7];
-        v[8] = blake_iv[0];
-        v[9] = blake_iv[1];
-        v[10] = blake_iv[2];
-        v[11] = blake_iv[3];
-        v[12] = blake_iv[4];
-        v[13] = blake_iv[5];
-        v[14] = blake_iv[6];
-        v[15] = blake_iv[7];
-
-        v[12] ^= 140 + 4 ;
-
-        v[14] ^= -1;
-
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-
-
-        ulong h[7];
-        h[0] = blake_state[0] ^ v[0] ^ v[8];
-        h[1] = blake_state[1] ^ v[1] ^ v[9];
-        h[2] = blake_state[2] ^ v[2] ^ v[10];
-        h[3] = blake_state[3] ^ v[3] ^ v[11];
-        h[4] = blake_state[4] ^ v[4] ^ v[12];
-        h[5] = blake_state[5] ^ v[5] ^ v[13];
-        h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
-
-
-
-        dropped += ht_store(0, ht, input * 2,
-                h[0],
-                h[1],
-                h[2],
-                h[3]);
-        dropped += ht_store(0, ht, input * 2 + 1,
-                (h[3] >> 8) | (h[4] << (64 - 8)),
-                (h[4] >> 8) | (h[5] << (64 - 8)),
-                (h[5] >> 8) | (h[6] << (64 - 8)),
-                (h[6] >> 8));
-
-
-
-
-        input++;
-      }
-
-
-
-
-}
-# 409 "input.cl"
-uint xor_and_store(uint round, __global char *ht_dst, uint row,
- uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
-{
-    ulong xi0, xi1, xi2;
-
-
-
-    if (round == 1 || round == 2)
-      {
-
-
- xi0 = *(a++) ^ *(b++);
- xi1 = *(a++) ^ *(b++);
- xi2 = *a ^ *b;
-      }
-    else if (round == 3)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *a++ ^ *b++;
- xi2 = *(__global uint *)a ^ *(__global uint *)b;
-      }
-    else if (round == 4 || round == 5)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *a ^ *b;
- xi2 = 0;
-      }
-    else if (round == 6)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *(__global uint *)a ^ *(__global uint *)b;
- xi2 = 0;
-      }
-    else if (round == 7 || round == 8)
-      {
-
- xi0 = *a ^ *b;
- xi1 = 0;
- xi2 = 0;
-      }
-
-
-    if (!xi0 && !xi1)
- return 0;
-
-
-
-    return ht_store(round, ht_dst, ((row << 13) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)),
-     xi0, xi1, xi2, 0);
-}
-
-
-
-
-
-void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
- __global uint *debug)
-{
-    uint tid = get_global_id(0);
-    uint tlid = get_local_id(0);
-    __global char *p;
-    uint cnt;
-    uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 19)) * 9)];
-    uchar mask;
-    uint i, j;
-
-
-    ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 2];
-    uint nr_coll = 0;
-    uint n;
-    uint dropped_coll, dropped_stor;
-    __global ulong *a, *b;
-    uint xi_offset;
-
-    xi_offset = (8 + ((round - 1) / 2) * 4);
-
-
-
-
-
-
-    mask = ((!(round % 2)) ? 0x01 : 0x10);
-
-
-
-
-
-    p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32);
-    cnt = *(__global uint *)p;
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 19)) * 9));
-    p += xi_offset;
-    for (i = 0; i < cnt; i++, p += 32)
-        first_words[i] = *(__global uchar *)p;
-
-    nr_coll = 0;
-    dropped_coll = 0;
-    for (i = 0; i < cnt; i++)
-        for (j = i + 1; j < cnt; j++)
-            if ((first_words[i] & mask) ==
-      (first_words[j] & mask))
-              {
-
-                if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
-                    dropped_coll++;
-                else
-
-
-                    collisions[nr_coll++] =
-   ((ushort)j << 8) | ((ushort)i & 0xff);
-
-
-
-              }
-
-    uint adj = (!(round % 2)) ? 1 : 0;
-
-    dropped_stor = 0;
-    for (n = 0; n < nr_coll; n++)
-      {
-        i = collisions[n] & 0xff;
-        j = collisions[n] >> 8;
-        a = (__global ulong *)
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32 + i * 32 + xi_offset
-      + adj);
-        b = (__global ulong *)
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32 + j * 32 + xi_offset
-      + adj);
- dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
-      }
-
-
-
-
-}
-# 557 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round8(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(8, ht_src, ht_dst, debug); }
-
-uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
-{
-    return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32 +
-     slot * 32 + xi_offset - 4);
-}
-
-void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
- uint round)
-{
-    __global char *ht = htabs[round % 2];
-    uint i = nr_inputs - 1;
-    uint j = nr_inputs * 2 - 1;
-    uint xi_offset = (8 + ((round) / 2) * 4);
-    do
-      {
- ins[j] = expand_ref(ht, xi_offset,
-  (ins[i] >> 13), ((ins[i] >> 6) & 0x3f));
- ins[j - 1] = expand_ref(ht, xi_offset,
-  (ins[i] >> 13), (ins[i] & 0x3f));
- if (!i)
-     break ;
- i--;
- j -= 2;
-      }
-    while (1);
-}
-
-
-
-
-void potential_sol(__global char **htabs, __global sols_t *sols,
- uint ref0, uint ref1)
-{
-    uint sol_i;
-    uint nr_values;
-    sol_i = atomic_inc(&sols->nr);
-    if (sol_i >= 2000)
- return ;
-    sols->valid[sol_i] = 0;
-    nr_values = 0;
-    sols->values[sol_i][nr_values++] = ref0;
-    sols->values[sol_i][nr_values++] = ref1;
-    uint round = 9 - 1;
-    do
-      {
- round--;
- expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
- nr_values *= 2;
-      }
-    while (round > 0);
-    sols->valid[sol_i] = 1;
-}
-
-
-
-
-__kernel
-void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
-{
-    uint tid = get_global_id(0);
-    __global char *htabs[2] = { ht0, ht1 };
-    uint ht_i = (9 - 1) % 2;
-    uint cnt;
-    uint xi_offset = (8 + ((9 - 1) / 2) * 4);
-    uint i, j;
-    __global char *a, *b;
-    uint ref_i, ref_j;
-
-
-    ulong collisions[5];
-    uint coll;
-
-
-
-    uint mask = 0xffffff;
-
-
-
-    if (tid == 0)
- sols->nr = sols->likely_invalidss = 0;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32;
-    cnt = *(__global uint *)a;
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 19)) * 9));
-    coll = 0;
-    a += xi_offset;
-    for (i = 0; i < cnt; i++, a += 32)
- for (j = i + 1, b = a + 32; j < cnt; j++, b += 32)
-     if (((*(__global uint *)a) & mask) ==
-      ((*(__global uint *)b) & mask))
-       {
-  ref_i = *(__global uint *)(a - 4);
-  ref_j = *(__global uint *)(b - 4);
-  if (coll < sizeof (collisions) / sizeof (*collisions))
-      collisions[coll++] = ((ulong)ref_i << 32) | ref_j;
-  else
-      atomic_inc(&sols->likely_invalidss);
-       }
-    if (!coll)
- return ;
-    for (i = 0; i < coll; i++)
- potential_sol(htabs, sols, collisions[i] >> 32,
-  collisions[i] & 0xffffffff);
-}
diff --git a/3rdparty/silentarmy/kernel.cl b/3rdparty/silentarmy/kernel.cl
deleted file mode 100644
index 2099bd049..000000000
--- a/3rdparty/silentarmy/kernel.cl
+++ /dev/null
@@ -1,526 +0,0 @@
-# 1 "input.cl"
-# 1 "<built-in>"
-# 1 "<command-line>"
-# 1 "/usr/include/stdc-predef.h" 1 3 4
-# 1 "<command-line>" 2
-# 1 "input.cl"
-# 1 "param.h" 1
-# 60 "param.h"
-typedef struct sols_s
-{
-    uint nr;
-    uint likely_invalidss;
-    uchar valid[2000];
-    uint values[2000][(1 << 9)];
-} sols_t;
-# 2 "input.cl" 2
-# 35 "input.cl"
-__constant ulong blake_iv[] =
-{
-    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
-    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-};
-
-
-
-
-__kernel
-void kernel_init_ht(__global char *ht)
-{
-    uint tid = get_global_id(0);
-    *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32) = 0;
-}
-# 79 "input.cl"
-uint ht_store(uint round, __global char *ht, uint i,
-        ulong xi0, ulong xi1, ulong xi2, ulong xi3)
-{
-    uint row;
-    __global char *p;
-    uint cnt;
-# 110 "input.cl"
-    if (!(round % 2))
- row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);
-    else
- row = ((xi0 & 0xf0000) >> 0) |
-     ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
-     ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-
-
-
-    xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
-    xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
-    xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
-    p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32;
-    cnt = atomic_inc((__global uint *)p);
-    if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13))
-        return 1;
-    p += cnt * 32 + (8 + ((round) / 2) * 4);
-
-    *(__global uint *)(p - 4) = i;
-    if (round == 0 || round == 1)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global ulong *)(p + 16) = xi2;
-      }
-    else if (round == 2)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global uint *)(p + 16) = xi2;
-      }
-    else if (round == 3 || round == 4)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
-
-      }
-    else if (round == 5)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
- *(__global uint *)(p + 8) = xi1;
-      }
-    else if (round == 6 || round == 7)
-      {
-
- *(__global ulong *)(p + 0) = xi0;
-      }
-    else if (round == 8)
-      {
-
- *(__global uint *)(p + 0) = xi0;
-      }
-    return 0;
-}
-# 187 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-void kernel_round0(__global ulong *blake_state, __global char *ht,
-        __global uint *debug)
-{
-    uint tid = get_global_id(0);
-    ulong v[16];
-    uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0);
-    uint input = tid * inputs_per_thread;
-    uint input_end = (tid + 1) * inputs_per_thread;
-    uint dropped = 0;
-    while (input < input_end)
-      {
-
-
-        ulong word1 = (ulong)input << 32;
-
-        v[0] = blake_state[0];
-        v[1] = blake_state[1];
-        v[2] = blake_state[2];
-        v[3] = blake_state[3];
-        v[4] = blake_state[4];
-        v[5] = blake_state[5];
-        v[6] = blake_state[6];
-        v[7] = blake_state[7];
-        v[8] = blake_iv[0];
-        v[9] = blake_iv[1];
-        v[10] = blake_iv[2];
-        v[11] = blake_iv[3];
-        v[12] = blake_iv[4];
-        v[13] = blake_iv[5];
-        v[14] = blake_iv[6];
-        v[15] = blake_iv[7];
-
-        v[12] ^= 140 + 4 ;
-
-        v[14] ^= -1;
-
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-
-
-        ulong h[7];
-        h[0] = blake_state[0] ^ v[0] ^ v[8];
-        h[1] = blake_state[1] ^ v[1] ^ v[9];
-        h[2] = blake_state[2] ^ v[2] ^ v[10];
-        h[3] = blake_state[3] ^ v[3] ^ v[11];
-        h[4] = blake_state[4] ^ v[4] ^ v[12];
-        h[5] = blake_state[5] ^ v[5] ^ v[13];
-        h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
-
-
-
-        dropped += ht_store(0, ht, input * 2,
-                h[0],
-                h[1],
-                h[2],
-                h[3]);
-        dropped += ht_store(0, ht, input * 2 + 1,
-                (h[3] >> 8) | (h[4] << (64 - 8)),
-                (h[4] >> 8) | (h[5] << (64 - 8)),
-                (h[5] >> 8) | (h[6] << (64 - 8)),
-                (h[6] >> 8));
-
-
-
-
-        input++;
-      }
-
-
-
-
-}
-# 409 "input.cl"
-uint xor_and_store(uint round, __global char *ht_dst, uint row,
- uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
-{
-    ulong xi0, xi1, xi2;
-
-
-
-    if (round == 1 || round == 2)
-      {
-
-
- xi0 = *(a++) ^ *(b++);
- xi1 = *(a++) ^ *(b++);
- xi2 = *a ^ *b;
-      }
-    else if (round == 3)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *a++ ^ *b++;
- xi2 = *(__global uint *)a ^ *(__global uint *)b;
-      }
-    else if (round == 4 || round == 5)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *a ^ *b;
- xi2 = 0;
-      }
-    else if (round == 6)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *(__global uint *)a ^ *(__global uint *)b;
- xi2 = 0;
-      }
-    else if (round == 7 || round == 8)
-      {
-
- xi0 = *a ^ *b;
- xi1 = 0;
- xi2 = 0;
-      }
-
-
-    if (!xi0 && !xi1)
- return 0;
-
-
-
-    return ht_store(round, ht_dst, ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)),
-     xi0, xi1, xi2, 0);
-}
-
-
-
-
-
-void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
- __global uint *debug)
-{
-    uint tid = get_global_id(0);
-    uint tlid = get_local_id(0);
-    __global char *p;
-    uint cnt;
-    uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 20)) * 13)];
-    uchar mask;
-    uint i, j;
-
-
-    ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 2];
-    uint nr_coll = 0;
-    uint n;
-    uint dropped_coll, dropped_stor;
-    __global ulong *a, *b;
-    uint xi_offset;
-
-    xi_offset = (8 + ((round - 1) / 2) * 4);
-# 495 "input.cl"
-    mask = 0;
-
-
-
-    p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32);
-    cnt = *(__global uint *)p;
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 13));
-    p += xi_offset;
-    for (i = 0; i < cnt; i++, p += 32)
-        first_words[i] = *(__global uchar *)p;
-
-    nr_coll = 0;
-    dropped_coll = 0;
-    for (i = 0; i < cnt; i++)
-        for (j = i + 1; j < cnt; j++)
-            if ((first_words[i] & mask) ==
-      (first_words[j] & mask))
-              {
-
-                if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
-                    dropped_coll++;
-                else
-
-
-                    collisions[nr_coll++] =
-   ((ushort)j << 8) | ((ushort)i & 0xff);
-
-
-
-              }
-
-    uint adj = (!(round % 2)) ? 1 : 0;
-
-    dropped_stor = 0;
-    for (n = 0; n < nr_coll; n++)
-      {
-        i = collisions[n] & 0xff;
-        j = collisions[n] >> 8;
-        a = (__global ulong *)
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32 + i * 32 + xi_offset
-      + adj);
-        b = (__global ulong *)
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32 + j * 32 + xi_offset
-      + adj);
- dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
-      }
-
-
-
-
-}
-# 557 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round8(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(8, ht_src, ht_dst, debug); }
-
-uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
-{
-    return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32 +
-     slot * 32 + xi_offset - 4);
-}
-
-void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
- uint round)
-{
-    __global char *ht = htabs[round % 2];
-    uint i = nr_inputs - 1;
-    uint j = nr_inputs * 2 - 1;
-    uint xi_offset = (8 + ((round) / 2) * 4);
-    do
-      {
- ins[j] = expand_ref(ht, xi_offset,
-  (ins[i] >> 12), ((ins[i] >> 6) & 0x3f));
- ins[j - 1] = expand_ref(ht, xi_offset,
-  (ins[i] >> 12), (ins[i] & 0x3f));
- if (!i)
-     break ;
- i--;
- j -= 2;
-      }
-    while (1);
-}
-
-
-
-
-void potential_sol(__global char **htabs, __global sols_t *sols,
- uint ref0, uint ref1)
-{
-    uint sol_i;
-    uint nr_values;
-    sol_i = atomic_inc(&sols->nr);
-    if (sol_i >= 2000)
- return ;
-    sols->valid[sol_i] = 0;
-    nr_values = 0;
-    sols->values[sol_i][nr_values++] = ref0;
-    sols->values[sol_i][nr_values++] = ref1;
-    uint round = 9 - 1;
-    do
-      {
- round--;
- expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
- nr_values *= 2;
-      }
-    while (round > 0);
-    sols->valid[sol_i] = 1;
-}
-
-
-
-
-__kernel
-void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
-{
-    uint tid = get_global_id(0);
-    __global char *htabs[2] = { ht0, ht1 };
-    uint ht_i = (9 - 1) % 2;
-    uint cnt;
-    uint xi_offset = (8 + ((9 - 1) / 2) * 4);
-    uint i, j;
-    __global char *a, *b;
-    uint ref_i, ref_j;
-
-
-    ulong collisions[5];
-    uint coll;
-
-
-
-    uint mask = 0xffffff;
-
-
-
-    if (tid == 0)
- sols->nr = sols->likely_invalidss = 0;
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-    a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32;
-    cnt = *(__global uint *)a;
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 13));
-    coll = 0;
-    a += xi_offset;
-    for (i = 0; i < cnt; i++, a += 32)
- for (j = i + 1, b = a + 32; j < cnt; j++, b += 32)
-     if (((*(__global uint *)a) & mask) ==
-      ((*(__global uint *)b) & mask))
-       {
-  ref_i = *(__global uint *)(a - 4);
-  ref_j = *(__global uint *)(b - 4);
-  if (coll < sizeof (collisions) / sizeof (*collisions))
-      collisions[coll++] = ((ulong)ref_i << 32) | ref_j;
-  else
-      atomic_inc(&sols->likely_invalidss);
-       }
-    if (!coll)
- return ;
-    for (i = 0; i < coll; i++)
- potential_sol(htabs, sols, collisions[i] >> 32,
-  collisions[i] & 0xffffffff);
-}
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..f9066756a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,198 @@
+project(nheqminer)
+cmake_minimum_required(VERSION 3.5)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") #  -Wall
+
+## Enable solvers here
+#### older slower
+option(USE_CPU_TROMP "USE CPU_TROMP" OFF)
+option(USE_CUDA_TROMP "USE CUDA_TROMP" OFF)
+#### faster
+option(USE_CPU_XENONCAT "USE CPU_XENONCAT" ON)
+option(USE_CUDA_DJEZO "USE CUDA_DJEZO" ON)
+
+## Add solvers here
+if (USE_CPU_TROMP)
+    add_definitions(-DUSE_CPU_TROMP)
+    message("-- USE_CPU_TROMP DEFINED")
+endif()
+if (USE_CPU_XENONCAT)
+    add_definitions(-DUSE_CPU_XENONCAT)
+    message("-- USE_CPU_XENONCAT DEFINED")
+endif()
+if (USE_CUDA_TROMP)
+    add_definitions(-DUSE_CUDA_TROMP)
+    message("-- USE_CUDA_TROMP DEFINED")
+endif()
+if (USE_CUDA_DJEZO)
+    add_definitions(-DUSE_CUDA_DJEZO)
+    message("-- USE_CUDA_DJEZO DEFINED")
+endif()
+
+
+########
+# LINUX
+if(CMAKE_COMPILER_IS_GNUCXX)
+#    # use native cpu features
+#    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -fPIC")
+#    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -fPIC")
+
+#    # optimizations
+#    add_definitions(-O3)
+
+    # build 64-bit binaries with SSE2 enabled
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -msse2")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse2")
+    # optimizations
+    add_definitions(-O2)
+endif()
+
+# Common
+include_directories(${nheqminer_SOURCE_DIR}/nheqminer)
+
+# BOOST
+#find_package(Threads REQUIRED COMPONENTS)
+# compile boost statically
+set(Boost_USE_STATIC_LIBS ON)
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+#set(BUILD_SHARED_LIBRARIES OFF)
+#set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++ -static")
+find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
+
+if (Boost_FOUND)
+  # From the official documentation:
+  # "Add include directories to the build. [...] If the SYSTEM option is given,
+  # the compiler will be told the directories are meant as system include
+  # directories on some platforms (signalling this setting might achieve effects
+  # such as the compiler skipping warnings [...])."
+  include_directories (SYSTEM ${Boost_INCLUDE_DIR})
+
+  # From the official documentation:
+  # "Specify directories in which the linker will look for libraries. [...] Note
+  # that this command is rarely necessary. Library locations returned by
+  # find_package() and find_library() are absolute paths. Pass these absolute
+  # library file paths directly to the target_link_libraries() command. CMake
+  # will ensure the linker finds them."
+  link_directories (${Boost_LIBRARY_DIRS})
+else()
+    message("Boost_FOUND NOT FOUND")
+endif ()
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/../)
+
+set(SOURCE_FILES
+    # sources
+    nheqminer/amount.cpp
+    nheqminer/api.cpp
+    nheqminer/arith_uint256.cpp
+    nheqminer/crypto/sha256.cpp
+    nheqminer/json/json_spirit_reader.cpp
+    nheqminer/json/json_spirit_value.cpp
+    nheqminer/json/json_spirit_writer.cpp
+    nheqminer/libstratum/ZcashStratum.cpp
+    nheqminer/main.cpp
+    nheqminer/primitives/block.cpp
+    nheqminer/speed.cpp
+    nheqminer/uint256.cpp
+    nheqminer/utilstrencodings.cpp
+    # headers
+    nheqminer/amount.h
+    nheqminer/api.hpp
+    nheqminer/arith_uint256.h
+    nheqminer/crypto/sha256.h
+    nheqminer/hash.h
+    nheqminer/json/json_spirit.h
+    nheqminer/json/json_spirit_error_position.h
+    nheqminer/json/json_spirit_reader.h
+    nheqminer/json/json_spirit_reader_template.h
+    nheqminer/json/json_spirit_stream_reader.h
+    nheqminer/json/json_spirit_utils.h
+    nheqminer/json/json_spirit_value.h
+    nheqminer/json/json_spirit_writer.h
+    nheqminer/json/json_spirit_writer_template.h
+    nheqminer/libstratum/StratumClient.cpp
+    nheqminer/libstratum/StratumClient.h
+    nheqminer/libstratum/ZcashStratum.cpp
+    nheqminer/libstratum/ZcashStratum.h
+    nheqminer/primitives/block.h
+    nheqminer/primitives/transaction.h
+    nheqminer/script/script.h
+    nheqminer/serialize.h
+    nheqminer/speed.hpp
+    nheqminer/streams.h
+    nheqminer/support/allocators/zeroafterfree.h
+    nheqminer/tinyformat.h
+    nheqminer/uint252.h
+    nheqminer/uint256.h
+    nheqminer/utilstrencodings.h
+    nheqminer/version.h
+    nheqminer/zcash/JoinSplit.hpp
+    nheqminer/zcash/NoteEncryption.hpp
+    nheqminer/zcash/Proof.hpp
+    nheqminer/zcash/Zcash.h
+    nheqminer/SolverStub.h # just a stub
+
+    nheqminer/AvailableSolvers.h
+    nheqminer/ISolver.h
+    nheqminer/Solver.h
+    nheqminer/MinerFactory.h
+    nheqminer/MinerFactory.cpp
+
+    # make same path on windows
+    #blake shared
+    # src
+    blake2/blake2bx.cpp
+    # headers
+    blake2/blake2.h
+    blake2/blake2b-load-sse2.h
+    blake2/blake2b-load-sse41.h
+    blake2/blake2b-round.h
+    blake2/blake2-config.h
+    blake2/blake2-impl.h
+    blake2/blake2-round.h
+    )
+
+#set(LIBS ${LIBS} ${Threads_LIBRARIES} ${Boost_LIBRARIES})
+set(LIBS ${LIBS} ${Boost_LIBRARIES})
+
+message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
+message("-- LIBS: ${LIBS}")
+
+if (USE_CPU_TROMP)
+    add_subdirectory(cpu_tromp)
+endif()
+if (USE_CPU_XENONCAT)
+    add_subdirectory(cpu_xenoncat)
+endif()
+if (USE_CUDA_TROMP)
+    add_subdirectory(cuda_tromp)
+endif()
+if (USE_CUDA_DJEZO)
+    add_subdirectory(cuda_djezo)
+endif()
+
+#add_subdirectory(cpu_xenoncat)
+
+ADD_EXECUTABLE(${PROJECT_NAME} ${SOURCE_FILES})
+
+#target_link_libraries(${PROJECT_NAME} ${LIBS} ${CUDA_LIBRARIES} )
+target_link_libraries(${PROJECT_NAME} ${CMAKE_THREAD_LIBS_INIT} ${LIBS} )
+
+# link libs
+if (USE_CPU_TROMP)
+    target_link_libraries(${PROJECT_NAME} cpu_tromp)
+endif()
+if (USE_CPU_XENONCAT)
+    add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL )
+    set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx1.o" )
+    add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL )
+    set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx2.o" )
+    target_link_libraries(${PROJECT_NAME} cpu_xenoncat xenoncat_avx1 xenoncat_avx2)
+endif()
+if (USE_CUDA_TROMP)
+    target_link_libraries(${PROJECT_NAME} cuda_tromp)
+endif()
+if (USE_CUDA_DJEZO)
+    target_link_libraries(${PROJECT_NAME} cuda_djezo)
+endif()
+
diff --git a/Linux_cmake/nheqminer_AMD/CMakeLists.txt b/Linux_cmake/nheqminer_AMD/CMakeLists.txt
deleted file mode 100644
index 40c1dabb2..000000000
--- a/Linux_cmake/nheqminer_AMD/CMakeLists.txt
+++ /dev/null
@@ -1,163 +0,0 @@
-project(nheqminer_AMD)
-cmake_minimum_required(VERSION 2.8)
-
-#aux_source_directory(. SRC_LIST)
-#add_executable(${PROJECT_NAME} ${SRC_LIST})
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-# LINUX
-if(CMAKE_COMPILER_IS_GNUCXX)
-    # use native cpu features
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
-    # optimizations
-    add_definitions(-O3)
-endif()
-
-# Common
-include_directories(${nheqminer_SOURCE_DIR})
-
-add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
-
-find_package(Threads REQUIRED COMPONENTS)
-find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
-
-if (Boost_FOUND)
-  # From the offical documentation:
-  # Add include directories to the build. [...] If the SYSTEM option is given,
-  # the compiler will be told the directories are meant as system include
-  # directories on some platforms (signalling this setting might achieve effects
-  # such as the compiler skipping warnings [...])."
-  include_directories (SYSTEM ${Boost_INCLUDE_DIR})
-
-  # From the offical documentation:
-  # "Specify directories in which the linker will look for libraries. [...] Note
-  # that this command is rarely necessary. Library locations returned by
-  # find_package() and find_library() are absolute paths. Pass these absolute
-  # library file paths directly to the target_link_libraries() command. CMake
-  # will ensure the linker finds them."
-  link_directories (${Boost_LIBRARY_DIRS})
-else()
-    message("Boost_FOUND NOT FOUND")
-endif ()
-
-## Add solvers here
-#add_definitions(-DUSE_CPU_XENONCAT)
-#add_definitions(-DUSE_CPU_TROMP)
-add_definitions(-DUSE_OCL_XMP)
-add_definitions(-DUSE_OCL_SILENTARMY)
-
-#add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL )
-#set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx1.o" )
-
-#add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL )
-#set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx2.o" )
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/)
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../ocl_device_utils/)
-
-# OCL INC DIR
-include_directories(${OPENCL_INCLUDE_DIRECTORY})
-
-
-set(SOURCE_FILES
-    # sources
-    ../../nheqminer/amount.cpp
-    ../../nheqminer/api.cpp
-    ../../nheqminer/arith_uint256.cpp
-    ../../nheqminer/crypto/sha256.cpp
-    ../../nheqminer/json/json_spirit_reader.cpp
-    ../../nheqminer/json/json_spirit_value.cpp
-    ../../nheqminer/json/json_spirit_writer.cpp
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/main.cpp
-    ../../nheqminer/primitives/block.cpp
-    ../../nheqminer/speed.cpp
-    ../../nheqminer/uint256.cpp
-    ../../nheqminer/utilstrencodings.cpp
-    # headers
-    ../../nheqminer/amount.h
-    ../../nheqminer/api.hpp
-    ../../nheqminer/arith_uint256.h
-    ../../nheqminer/crypto/sha256.h
-    ../../nheqminer/hash.h
-    ../../nheqminer/json/json_spirit.h
-    ../../nheqminer/json/json_spirit_error_position.h
-    ../../nheqminer/json/json_spirit_reader.h
-    ../../nheqminer/json/json_spirit_reader_template.h
-    ../../nheqminer/json/json_spirit_stream_reader.h
-    ../../nheqminer/json/json_spirit_utils.h
-    ../../nheqminer/json/json_spirit_value.h
-    ../../nheqminer/json/json_spirit_writer.h
-    ../../nheqminer/json/json_spirit_writer_template.h
-    ../../nheqminer/libstratum/StratumClient.cpp
-    ../../nheqminer/libstratum/StratumClient.h
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/libstratum/ZcashStratum.h
-    ../../nheqminer/primitives/block.h
-    ../../nheqminer/primitives/transaction.h
-    ../../nheqminer/script/script.h
-    ../../nheqminer/serialize.h
-    ../../nheqminer/speed.hpp
-    ../../nheqminer/streams.h
-    ../../nheqminer/support/allocators/zeroafterfree.h
-    ../../nheqminer/tinyformat.h
-    ../../nheqminer/uint252.h
-    ../../nheqminer/uint256.h
-    ../../nheqminer/utilstrencodings.h
-    ../../nheqminer/version.h
-    ../../nheqminer/zcash/JoinSplit.hpp
-    ../../nheqminer/zcash/NoteEncryption.hpp
-    ../../nheqminer/zcash/Proof.hpp
-    ../../nheqminer/zcash/Zcash.h
-    ../../nheqminer/SolverStub.h # just a stub
-
-    ## cpu tromp
-    #../../cpu_tromp/blake2/blake2bx.cpp
-    #../../cpu_tromp/cpu_tromp.cpp
-    #../../cpu_tromp/blake2/blake2-config.h
-    #../../cpu_tromp/blake2/blake2-impl.h
-    #../../cpu_tromp/blake2/blake2-round.h
-    #../../cpu_tromp/blake2/blake2.h
-    #../../cpu_tromp/blake2/blake2b-load-sse2.h
-    #../../cpu_tromp/blake2/blake2b-load-sse41.h
-    #../../cpu_tromp/blake2/blake2b-round.h
-    #../../cpu_tromp/cpu_tromp.hpp
-    #../../cpu_tromp/equi.h
-    #../../cpu_tromp/equi_miner.h
-#
-    ## cpu xenocat
-    #../../cpu_xenoncat/cpu_xenoncat.hpp
-    #../../cpu_xenoncat/xenoncat.cpp
-#
-    # AMD ocl_device_utils
-    ../../ocl_device_utils/cl_ext.hpp
-    ../../ocl_device_utils/ocl_device_utils.h
-    ../../ocl_device_utils/ocl_device_utils.cpp
-    ../../ocl_device_utils/OpenCLDevice.h
-    ../../ocl_device_utils/opencl.h
-    ../../ocl_device_utils/opencl.cpp
-    # AMD ocl_xpm
-    ../../ocl_xpm/ocl_xmp.hpp
-    ../../ocl_xpm/ocl_xmp.cpp
-    ../../ocl_xpm/zcash/gpu/common.h
-    ../../cpu_tromp/blake2/blake2bx.cpp
-    # AMD ocl_silentarmy
-    ../../ocl_silentarmy/ocl_silentarmy.hpp
-    ../../ocl_silentarmy/param.h
-    ../../ocl_silentarmy/sa_blake.h
-    ../../ocl_silentarmy/ocl_silentarmy.cpp
-    ../../ocl_silentarmy/sa_blake.cpp
-    )
-
-#add_executable(${PROJECT_NAME} ${SRC_LIST})
-
-set(LIBS ${LIBS} ${Boost_LIBRARIES} ${OPENCL_LIBRARY})
-
-#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
-#message("-- LIBS: ${LIBS}")
-
-add_executable(${PROJECT_NAME} ${SOURCE_FILES})
-#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
-target_link_libraries(${PROJECT_NAME} ${LIBS} ${CMAKE_THREAD_LIBS_INIT})
diff --git a/Linux_cmake/nheqminer_cpu/CMakeLists.txt b/Linux_cmake/nheqminer_cpu/CMakeLists.txt
deleted file mode 100644
index d4269b444..000000000
--- a/Linux_cmake/nheqminer_cpu/CMakeLists.txt
+++ /dev/null
@@ -1,137 +0,0 @@
-project(nheqminer_cpu)
-cmake_minimum_required(VERSION 2.8)
-
-#aux_source_directory(. SRC_LIST)
-#add_executable(${PROJECT_NAME} ${SRC_LIST})
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-# LINUX
-if(CMAKE_COMPILER_IS_GNUCXX)
-    # use native cpu features
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
-    # optimizations
-    add_definitions(-O3)
-endif()
-
-# Common
-include_directories(${nheqminer_SOURCE_DIR})
-
-add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
-
-find_package(Threads REQUIRED COMPONENTS)
-find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
-
-if (Boost_FOUND)
-  # From the offical documentation:
-  # Add include directories to the build. [...] If the SYSTEM option is given,
-  # the compiler will be told the directories are meant as system include
-  # directories on some platforms (signalling this setting might achieve effects
-  # such as the compiler skipping warnings [...])."
-  include_directories (SYSTEM ${Boost_INCLUDE_DIR})
-
-  # From the offical documentation:
-  # "Specify directories in which the linker will look for libraries. [...] Note
-  # that this command is rarely necessary. Library locations returned by
-  # find_package() and find_library() are absolute paths. Pass these absolute
-  # library file paths directly to the target_link_libraries() command. CMake
-  # will ensure the linker finds them."
-  link_directories (${Boost_LIBRARY_DIRS})
-else()
-    message("Boost_FOUND NOT FOUND")
-endif ()
-
-## Add solvers here
-add_definitions(-DUSE_CPU_XENONCAT)
-add_definitions(-DUSE_CPU_TROMP)
-
-add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL )
-set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx1.o" )
-
-add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL )
-set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx2.o" )
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/)
-
-set(SOURCE_FILES
-    # sources
-    ../../nheqminer/amount.cpp
-    ../../nheqminer/api.cpp
-    ../../nheqminer/arith_uint256.cpp
-    ../../nheqminer/crypto/sha256.cpp
-    ../../nheqminer/json/json_spirit_reader.cpp
-    ../../nheqminer/json/json_spirit_value.cpp
-    ../../nheqminer/json/json_spirit_writer.cpp
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/main.cpp
-    ../../nheqminer/primitives/block.cpp
-    ../../nheqminer/speed.cpp
-    ../../nheqminer/uint256.cpp
-    ../../nheqminer/utilstrencodings.cpp
-    # headers
-    ../../nheqminer/amount.h
-    ../../nheqminer/api.hpp
-    ../../nheqminer/arith_uint256.h
-    ../../nheqminer/crypto/sha256.h
-    ../../nheqminer/hash.h
-    ../../nheqminer/json/json_spirit.h
-    ../../nheqminer/json/json_spirit_error_position.h
-    ../../nheqminer/json/json_spirit_reader.h
-    ../../nheqminer/json/json_spirit_reader_template.h
-    ../../nheqminer/json/json_spirit_stream_reader.h
-    ../../nheqminer/json/json_spirit_utils.h
-    ../../nheqminer/json/json_spirit_value.h
-    ../../nheqminer/json/json_spirit_writer.h
-    ../../nheqminer/json/json_spirit_writer_template.h
-    ../../nheqminer/libstratum/StratumClient.cpp
-    ../../nheqminer/libstratum/StratumClient.h
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/libstratum/ZcashStratum.h
-    ../../nheqminer/primitives/block.h
-    ../../nheqminer/primitives/transaction.h
-    ../../nheqminer/script/script.h
-    ../../nheqminer/serialize.h
-    ../../nheqminer/speed.hpp
-    ../../nheqminer/streams.h
-    ../../nheqminer/support/allocators/zeroafterfree.h
-    ../../nheqminer/tinyformat.h
-    ../../nheqminer/uint252.h
-    ../../nheqminer/uint256.h
-    ../../nheqminer/utilstrencodings.h
-    ../../nheqminer/version.h
-    ../../nheqminer/zcash/JoinSplit.hpp
-    ../../nheqminer/zcash/NoteEncryption.hpp
-    ../../nheqminer/zcash/Proof.hpp
-    ../../nheqminer/zcash/Zcash.h
-    ../../nheqminer/SolverStub.h # just a stub
-
-    # cpu tromp
-    ../../cpu_tromp/blake2/blake2bx.cpp
-    ../../cpu_tromp/cpu_tromp.cpp
-    ../../cpu_tromp/blake2/blake2-config.h
-    ../../cpu_tromp/blake2/blake2-impl.h
-    ../../cpu_tromp/blake2/blake2-round.h
-    ../../cpu_tromp/blake2/blake2.h
-    ../../cpu_tromp/blake2/blake2b-load-sse2.h
-    ../../cpu_tromp/blake2/blake2b-load-sse41.h
-    ../../cpu_tromp/blake2/blake2b-round.h
-    ../../cpu_tromp/cpu_tromp.hpp
-    ../../cpu_tromp/equi.h
-    ../../cpu_tromp/equi_miner.h
-
-    # cpu xenocat
-    ../../cpu_xenoncat/cpu_xenoncat.hpp
-    ../../cpu_xenoncat/xenoncat.cpp
-    )
-
-#add_executable(${PROJECT_NAME} ${SRC_LIST})
-
-set(LIBS ${LIBS} ${Boost_LIBRARIES})
-
-#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
-#message("-- LIBS: ${LIBS}")
-
-add_executable(${PROJECT_NAME} ${SOURCE_FILES})
-#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
-target_link_libraries(${PROJECT_NAME} ${LIBS} ${CMAKE_THREAD_LIBS_INIT} xenoncat_avx1 xenoncat_avx2 )
diff --git a/Linux_cmake/nheqminer_cpu_tromp/CMakeLists.txt b/Linux_cmake/nheqminer_cpu_tromp/CMakeLists.txt
deleted file mode 100644
index 1fe33e5e6..000000000
--- a/Linux_cmake/nheqminer_cpu_tromp/CMakeLists.txt
+++ /dev/null
@@ -1,143 +0,0 @@
-project(nheqminer_cpu_tromp)
-cmake_minimum_required(VERSION 2.8)
-
-#aux_source_directory(. SRC_LIST)
-#add_executable(${PROJECT_NAME} ${SRC_LIST})
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-# LINUX
-if(CMAKE_COMPILER_IS_GNUCXX)
-    # use native cpu features
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
-    # optimizations
-    add_definitions(-O3)
-endif()
-
-# Common
-include_directories(${nheqminer_SOURCE_DIR})
-
-add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
-
-find_package(Threads REQUIRED COMPONENTS)
-find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
-
-if (Boost_FOUND)
-  # From the offical documentation:
-  # Add include directories to the build. [...] If the SYSTEM option is given,
-  # the compiler will be told the directories are meant as system include
-  # directories on some platforms (signalling this setting might achieve effects
-  # such as the compiler skipping warnings [...])."
-  include_directories (SYSTEM ${Boost_INCLUDE_DIR})
-
-  # From the offical documentation:
-  # "Specify directories in which the linker will look for libraries. [...] Note
-  # that this command is rarely necessary. Library locations returned by
-  # find_package() and find_library() are absolute paths. Pass these absolute
-  # library file paths directly to the target_link_libraries() command. CMake
-  # will ensure the linker finds them."
-  link_directories (${Boost_LIBRARY_DIRS})
-else()
-    message("Boost_FOUND NOT FOUND")
-endif ()
-
-## Add solvers here
-add_definitions(-DUSE_CPU_TROMP)
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/)
-
-set(SOURCE_FILES
-    # sources
-    ../../nheqminer/amount.cpp
-    ../../nheqminer/api.cpp
-    ../../nheqminer/arith_uint256.cpp
-    ../../nheqminer/crypto/sha256.cpp
-    ../../nheqminer/json/json_spirit_reader.cpp
-    ../../nheqminer/json/json_spirit_value.cpp
-    ../../nheqminer/json/json_spirit_writer.cpp
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/main.cpp
-    ../../nheqminer/primitives/block.cpp
-    ../../nheqminer/speed.cpp
-    ../../nheqminer/uint256.cpp
-    ../../nheqminer/utilstrencodings.cpp
-    # headers
-    ../../nheqminer/amount.h
-    ../../nheqminer/api.hpp
-    ../../nheqminer/arith_uint256.h
-    ../../nheqminer/crypto/sha256.h
-    ../../nheqminer/hash.h
-    ../../nheqminer/json/json_spirit.h
-    ../../nheqminer/json/json_spirit_error_position.h
-    ../../nheqminer/json/json_spirit_reader.h
-    ../../nheqminer/json/json_spirit_reader_template.h
-    ../../nheqminer/json/json_spirit_stream_reader.h
-    ../../nheqminer/json/json_spirit_utils.h
-    ../../nheqminer/json/json_spirit_value.h
-    ../../nheqminer/json/json_spirit_writer.h
-    ../../nheqminer/json/json_spirit_writer_template.h
-    ../../nheqminer/libstratum/StratumClient.cpp
-    ../../nheqminer/libstratum/StratumClient.h
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/libstratum/ZcashStratum.h
-    ../../nheqminer/primitives/block.h
-    ../../nheqminer/primitives/transaction.h
-    ../../nheqminer/script/script.h
-    ../../nheqminer/serialize.h
-    ../../nheqminer/speed.hpp
-    ../../nheqminer/streams.h
-    ../../nheqminer/support/allocators/zeroafterfree.h
-    ../../nheqminer/tinyformat.h
-    ../../nheqminer/uint252.h
-    ../../nheqminer/uint256.h
-    ../../nheqminer/utilstrencodings.h
-    ../../nheqminer/version.h
-    ../../nheqminer/zcash/JoinSplit.hpp
-    ../../nheqminer/zcash/NoteEncryption.hpp
-    ../../nheqminer/zcash/Proof.hpp
-    ../../nheqminer/zcash/Zcash.h
-    ../../nheqminer/SolverStub.h # just a stub
-
-    # cpu tromp
-    ../../cpu_tromp/blake2/blake2bx.cpp
-    ../../cpu_tromp/cpu_tromp.cpp
-    ../../cpu_tromp/blake2/blake2-config.h
-    ../../cpu_tromp/blake2/blake2-impl.h
-    ../../cpu_tromp/blake2/blake2-round.h
-    ../../cpu_tromp/blake2/blake2.h
-    ../../cpu_tromp/blake2/blake2b-load-sse2.h
-    ../../cpu_tromp/blake2/blake2b-load-sse41.h
-    ../../cpu_tromp/blake2/blake2b-round.h
-    ../../cpu_tromp/cpu_tromp.hpp
-    ../../cpu_tromp/equi.h
-    ../../cpu_tromp/equi_miner.h
-    )
-
-#if(USE_CPU_TROMP)
-#    set(SOURCE_FILES ${SOURCE_FILES}
-#        ../../cpu_tromp/blake2/blake2bx.cpp
-#        ../../cpu_tromp/cpu_tromp.cpp
-#        ../../cpu_tromp/blake2/blake2-config.h
-#        ../../cpu_tromp/blake2/blake2-impl.h
-#        ../../cpu_tromp/blake2/blake2-round.h
-#        ../../cpu_tromp/blake2/blake2.h
-#        ../../cpu_tromp/blake2/blake2b-load-sse2.h
-#        ../../cpu_tromp/blake2/blake2b-load-sse41.h
-#        ../../cpu_tromp/blake2/blake2b-round.h
-#        ../../cpu_tromp/cpu_tromp.hpp
-#        ../../cpu_tromp/equi.h
-#        ../../cpu_tromp/equi_miner.h
-#        )
-#endif()
-
-#add_executable(${PROJECT_NAME} ${SRC_LIST})
-
-set(LIBS ${LIBS} ${Boost_LIBRARIES})
-
-#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
-#message("-- LIBS: ${LIBS}")
-
-add_executable(${PROJECT_NAME} ${SOURCE_FILES})
-#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
-target_link_libraries(${PROJECT_NAME} ${LIBS} ${CMAKE_THREAD_LIBS_INIT})
diff --git a/Linux_cmake/nheqminer_cpu_xenoncat/CMakeLists.txt b/Linux_cmake/nheqminer_cpu_xenoncat/CMakeLists.txt
deleted file mode 100644
index 25d58c633..000000000
--- a/Linux_cmake/nheqminer_cpu_xenoncat/CMakeLists.txt
+++ /dev/null
@@ -1,122 +0,0 @@
-project(nheqminer_cpu_xenoncat)
-cmake_minimum_required(VERSION 2.8)
-
-#aux_source_directory(. SRC_LIST)
-#add_executable(${PROJECT_NAME} ${SRC_LIST})
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-# LINUX
-if(CMAKE_COMPILER_IS_GNUCXX)
-    # use native cpu features
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
-    # optimizations
-    add_definitions(-O3)
-endif()
-
-# Common
-include_directories(${nheqminer_SOURCE_DIR})
-
-add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
-
-find_package(Threads REQUIRED COMPONENTS)
-find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
-
-if (Boost_FOUND)
-  # From the offical documentation:
-  # Add include directories to the build. [...] If the SYSTEM option is given,
-  # the compiler will be told the directories are meant as system include
-  # directories on some platforms (signalling this setting might achieve effects
-  # such as the compiler skipping warnings [...])."
-  include_directories (SYSTEM ${Boost_INCLUDE_DIR})
-
-  # From the offical documentation:
-  # "Specify directories in which the linker will look for libraries. [...] Note
-  # that this command is rarely necessary. Library locations returned by
-  # find_package() and find_library() are absolute paths. Pass these absolute
-  # library file paths directly to the target_link_libraries() command. CMake
-  # will ensure the linker finds them."
-  link_directories (${Boost_LIBRARY_DIRS})
-else()
-    message("Boost_FOUND NOT FOUND")
-endif ()
-
-## Add solvers here
-add_definitions(-DUSE_CPU_XENONCAT)
-
-add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL )
-set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx1.o" )
-
-add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL )
-set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx2.o" )
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/)
-
-set(SOURCE_FILES
-    # sources
-    ../../nheqminer/amount.cpp
-    ../../nheqminer/api.cpp
-    ../../nheqminer/arith_uint256.cpp
-    ../../nheqminer/crypto/sha256.cpp
-    ../../nheqminer/json/json_spirit_reader.cpp
-    ../../nheqminer/json/json_spirit_value.cpp
-    ../../nheqminer/json/json_spirit_writer.cpp
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/main.cpp
-    ../../nheqminer/primitives/block.cpp
-    ../../nheqminer/speed.cpp
-    ../../nheqminer/uint256.cpp
-    ../../nheqminer/utilstrencodings.cpp
-    # headers
-    ../../nheqminer/amount.h
-    ../../nheqminer/api.hpp
-    ../../nheqminer/arith_uint256.h
-    ../../nheqminer/crypto/sha256.h
-    ../../nheqminer/hash.h
-    ../../nheqminer/json/json_spirit.h
-    ../../nheqminer/json/json_spirit_error_position.h
-    ../../nheqminer/json/json_spirit_reader.h
-    ../../nheqminer/json/json_spirit_reader_template.h
-    ../../nheqminer/json/json_spirit_stream_reader.h
-    ../../nheqminer/json/json_spirit_utils.h
-    ../../nheqminer/json/json_spirit_value.h
-    ../../nheqminer/json/json_spirit_writer.h
-    ../../nheqminer/json/json_spirit_writer_template.h
-    ../../nheqminer/libstratum/StratumClient.cpp
-    ../../nheqminer/libstratum/StratumClient.h
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/libstratum/ZcashStratum.h
-    ../../nheqminer/primitives/block.h
-    ../../nheqminer/primitives/transaction.h
-    ../../nheqminer/script/script.h
-    ../../nheqminer/serialize.h
-    ../../nheqminer/speed.hpp
-    ../../nheqminer/streams.h
-    ../../nheqminer/support/allocators/zeroafterfree.h
-    ../../nheqminer/tinyformat.h
-    ../../nheqminer/uint252.h
-    ../../nheqminer/uint256.h
-    ../../nheqminer/utilstrencodings.h
-    ../../nheqminer/version.h
-    ../../nheqminer/zcash/JoinSplit.hpp
-    ../../nheqminer/zcash/NoteEncryption.hpp
-    ../../nheqminer/zcash/Proof.hpp
-    ../../nheqminer/zcash/Zcash.h
-    ../../nheqminer/SolverStub.h # just a stub
-
-    # cpu xenocat
-    ../../cpu_xenoncat/cpu_xenoncat.hpp
-    ../../cpu_xenoncat/xenoncat.cpp
-    )
-
-#add_executable(${PROJECT_NAME} ${SRC_LIST})
-
-set(LIBS ${LIBS} ${Boost_LIBRARIES})
-
-#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
-#message("-- LIBS: ${LIBS}")
-
-add_executable(${PROJECT_NAME} ${SOURCE_FILES})
-#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
-target_link_libraries(${PROJECT_NAME} ${LIBS} ${CMAKE_THREAD_LIBS_INIT} xenoncat_avx1 xenoncat_avx2 )
diff --git a/Linux_cmake/nheqminer_cuda_tromp/CMakeLists.txt b/Linux_cmake/nheqminer_cuda_tromp/CMakeLists.txt
deleted file mode 100644
index 1853b22de..000000000
--- a/Linux_cmake/nheqminer_cuda_tromp/CMakeLists.txt
+++ /dev/null
@@ -1,170 +0,0 @@
-project(nheqminer_cuda_tromp)
-cmake_minimum_required(VERSION 2.8)
-
-option(ENABLE_CUDA "Enable the cuda build" ON)
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-# LINUX
-if(CMAKE_COMPILER_IS_GNUCXX)
-    # use native cpu features
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
-    # optimizations
-    add_definitions(-O3)
-endif()
-
-# Common
-include_directories(${nheqminer_SOURCE_DIR})
-
-add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK)
-
-#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
-
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
-
-add_definitions(-DHIST)
-#add_definitions(-DXINTREE)
-#add_definitions(-DUNROLL)
-
-list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
-
-
-FIND_PACKAGE(CUDA REQUIRED)
-if(COMPUTE AND (COMPUTE GREATER 0))
-        LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
-else(COMPUTE AND (COMPUTE GREATER 0))
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};; -gencode arch=compute_20,code=sm_21; -gencode arch=compute_30,code=sm_30; -gencode arch=compute_35,code=sm_35; -gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_61,code=sm_61 )
-endif(COMPUTE AND (COMPUTE GREATER 0))
-
-include_directories(${CUDA_INCLUDE_DIRS})
-
-find_package(Threads REQUIRED COMPONENTS)
-find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
-
-if(CUDA_FOUND)
-message("CUDA FOUND")
-else()
-message("CUDA NOT FOUND")
-endif()
-
-if (Boost_FOUND)
-  # From the offical documentation:
-  # Add include directories to the build. [...] If the SYSTEM option is given,
-  # the compiler will be told the directories are meant as system include
-  # directories on some platforms (signalling this setting might achieve effects
-  # such as the compiler skipping warnings [...])."
-  include_directories (SYSTEM ${Boost_INCLUDE_DIR})
-
-  # From the offical documentation:
-  # "Specify directories in which the linker will look for libraries. [...] Note
-  # that this command is rarely necessary. Library locations returned by
-  # find_package() and find_library() are absolute paths. Pass these absolute
-  # library file paths directly to the target_link_libraries() command. CMake
-  # will ensure the linker finds them."
-  link_directories (${Boost_LIBRARY_DIRS})
-else()
-    message("Boost_FOUND NOT FOUND")
-endif ()
-
-## Add solvers here
-#add_definitions(-DUSE_CPU_XENONCAT)
-#add_definitions(-DUSE_CPU_TROMP)
-add_definitions(-DUSE_CUDA_TROMP)
-
-#add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL )
-#set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx1.o" )
-
-#add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL )
-#set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx2.o" )
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/)
-
-set(SOURCE_FILES
-    # sources
-    ../../nheqminer/amount.cpp
-    ../../nheqminer/api.cpp
-    ../../nheqminer/arith_uint256.cpp
-    ../../nheqminer/crypto/sha256.cpp
-    ../../nheqminer/json/json_spirit_reader.cpp
-    ../../nheqminer/json/json_spirit_value.cpp
-    ../../nheqminer/json/json_spirit_writer.cpp
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/main.cpp
-    ../../nheqminer/primitives/block.cpp
-    ../../nheqminer/speed.cpp
-    ../../nheqminer/uint256.cpp
-    ../../nheqminer/utilstrencodings.cpp
-    # headers
-    ../../nheqminer/amount.h
-    ../../nheqminer/api.hpp
-    ../../nheqminer/arith_uint256.h
-    ../../nheqminer/crypto/sha256.h
-    ../../nheqminer/hash.h
-    ../../nheqminer/json/json_spirit.h
-    ../../nheqminer/json/json_spirit_error_position.h
-    ../../nheqminer/json/json_spirit_reader.h
-    ../../nheqminer/json/json_spirit_reader_template.h
-    ../../nheqminer/json/json_spirit_stream_reader.h
-    ../../nheqminer/json/json_spirit_utils.h
-    ../../nheqminer/json/json_spirit_value.h
-    ../../nheqminer/json/json_spirit_writer.h
-    ../../nheqminer/json/json_spirit_writer_template.h
-    ../../nheqminer/libstratum/StratumClient.cpp
-    ../../nheqminer/libstratum/StratumClient.h
-    ../../nheqminer/libstratum/ZcashStratum.cpp
-    ../../nheqminer/libstratum/ZcashStratum.h
-    ../../nheqminer/primitives/block.h
-    ../../nheqminer/primitives/transaction.h
-    ../../nheqminer/script/script.h
-    ../../nheqminer/serialize.h
-    ../../nheqminer/speed.hpp
-    ../../nheqminer/streams.h
-    ../../nheqminer/support/allocators/zeroafterfree.h
-    ../../nheqminer/tinyformat.h
-    ../../nheqminer/uint252.h
-    ../../nheqminer/uint256.h
-    ../../nheqminer/utilstrencodings.h
-    ../../nheqminer/version.h
-    ../../nheqminer/zcash/JoinSplit.hpp
-    ../../nheqminer/zcash/NoteEncryption.hpp
-    ../../nheqminer/zcash/Proof.hpp
-    ../../nheqminer/zcash/Zcash.h
-    ../../nheqminer/SolverStub.h # just a stub
-
-#    # cpu tromp
-#    ../../cpu_tromp/blake2/blake2bx.cpp
-#    ../../cpu_tromp/cpu_tromp.cpp
-#    ../../cpu_tromp/blake2/blake2-config.h
-#    ../../cpu_tromp/blake2/blake2-impl.h
-#    ../../cpu_tromp/blake2/blake2-round.h
-#    ../../cpu_tromp/blake2/blake2.h
-#    ../../cpu_tromp/blake2/blake2b-load-sse2.h
-#    ../../cpu_tromp/blake2/blake2b-load-sse41.h
-#    ../../cpu_tromp/blake2/blake2b-round.h
-#    ../../cpu_tromp/cpu_tromp.hpp
-#    ../../cpu_tromp/equi.h
-#    ../../cpu_tromp/equi_miner.h
-
-#    # cpu xenocat
-#    ../../cpu_xenoncat/cpu_xenoncat.hpp
-#    ../../cpu_xenoncat/xenoncat.cpp
-
-    # cuda tromp
-    ../../cuda_tromp/cuda_tromp.hpp
-    ../../cuda_tromp/cuda_tromp.cpp
-    ../../cuda_tromp/eqcuda.hpp
-    ../../cuda_tromp/equi_miner.cu
-    ../../cpu_tromp/blake2/blake2bx.cpp
-    )
-
-#add_executable(${PROJECT_NAME} ${SRC_LIST})
-set(LIBS ${LIBS} ${Threads_LIBRARIES} ${Boost_LIBRARIES})
-
-#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}")
-#message("-- LIBS: ${LIBS}")
-
-#add_executable(${PROJECT_NAME} ${SOURCE_FILES})
-#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
-CUDA_ADD_EXECUTABLE(${PROJECT_NAME} ${SOURCE_FILES})
-target_link_libraries(${PROJECT_NAME} ${LIBS} ${CUDA_LIBRARIES} )
diff --git a/README.md b/README.md
index 5b4a9ad79..e13bb1845 100644
--- a/README.md
+++ b/README.md
@@ -1,71 +1,58 @@
 # Build instructions:
 
 ### Dependencies:
-  - Boost 1.54+
+  - Boost 1.62+
 
 ## Windows:
 
 Windows builds made by us are available here: https://github.com/nicehash/nheqminer/releases
 
 Download and install:
-- [AMD APP SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/) (if not needed remove **USE_OCL_XMP** from **nheqminer** Preprocessor definitions under Properties > C/C++ > Preprocessor)
-- [CUDA SDK](https://developer.nvidia.com/cuda-downloads) (if not needed remove **USE_CUDA_TROMP** from **nheqminer** Preprocessor definitions under Properties > C/C++ > Preprocessor)
+- [CUDA SDK](https://developer.nvidia.com/cuda-downloads) (if not needed remove **USE_CUDA_TROMP** and **USE_CUDA_DJEZO** from **nheqminer** Preprocessor definitions under Properties > C/C++ > Preprocessor)
 - Visual Studio 2013 Community: https://www.visualstudio.com/en-us/news/releasenotes/vs2013-community-vs
-- Visual Studio Update 5 installed
+- [Visual Studio Update 5](https://www.microsoft.com/en-us/download/details.aspx?id=48129) installed
 - 64 bit version only
 
 Open **nheqminer.sln** under **nheqminer/nheqminer.sln** and build. You will have to build ReleaseSSE2 cpu_tromp project first, then Release7.5 cuda_tromp project, then select Release and build all.
 
+### Enabled solvers: 
+  - USE_CPU_TROMP
+  - USE_CPU_XENONCAT
+  - USE_CUDA_TROMP
+  - USE_CUDA_DJEZO
 
-## Linux
+If you don't want to build with all solvers you can go to **nheqminer Properties > C/C++ > Preprocessor > Preprocessor Definitions** and remove the solver you don't need.
 
+## Linux
 Work in progress.
-
-Working solvers CPU_TROMP, CPU_XENONCAT, CUDA_TROMP, OCL_XMP, OCL_SILENTARMY
-
-## Linux (Ubuntu 14.04 / 16.04) Build CPU_XENONCAT:
-
- - Open terminal and run the following commands:
-   - `sudo apt-get install cmake build-essential libboost-all-dev`
-   - `git clone -b Linux https://github.com/nicehash/nheqminer.git`
-   - `cd nheqminer/cpu_xenoncat/Linux/asm/`
-   - `sh assemble.sh`
-   - `cd ../../../Linux_cmake/nheqminer_cpu`
-   - `cmake .`
-   - `make -j $(nproc)`
-
-## Linux (Ubuntu 14.04 / 16.04) Build CUDA_TROMP:
-
- - Open terminal and run the following commands:
-   - **Ubuntu 14.04**:
-     - `wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_8.0.44-1_amd64.deb`
-     - `sudo dpkg -i cuda-repo-ubuntu1404_8.0.44-1_amd64.deb`
-   - **Ubuntu 16.04**:
-     - `wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.44-1_amd64.deb`
-     - `sudo dpkg -i cuda-repo-ubuntu1604_8.0.44-1_amd64.deb`
-   - `sudo apt-get update`
-   - `sudo apt-get install cuda`
-   - `sudo apt-get install cuda-toolkit-8-0`
-   - `sudo apt-get install cmake build-essential libboost-all-dev`
-   - `git clone -b Linux https://github.com/nicehash/nheqminer.git`
-   - `cd nheqminer/Linux_cmake/nheqminer_cuda_tromp && cmake . && make -j $(nproc)`
-   - or specify your compute version for example 50 like so `cd nheqminer/Linux_cmake/nheqminer_cuda_tromp && cmake COMPUTE=50 . && make`
-
-## Linux (16.04) Build OCL_XMP, OCL_SILENTARMY:
-
- - Open terminal and run the following commands:
-   - [AMD APP SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/)
-   - and make sure you have the [AMD drivers](http://support.amd.com/en-us/download) installed
-   - install them to the default paths
-   - `sudo apt-get install mesa-common-dev`
-   - `sudo apt-get install cmake build-essential libboost-all-dev`
-   - `git clone -b Linux https://github.com/nicehash/nheqminer.git`
-   - `cd nheqminer/Linux_cmake/nheqminer_AMD && cmake . -DOPENCL_LIBRARY=/usr/lib/x86_64-linux-gnu/libOpenCL.so -DOPENCL_INCLUDE_DIRECTORY=/opt/AMDAPPSDK-3.0/include && make -j $(nproc)`
-   - `cp ../../3rdparty/amd_bins_linux/* -r .`
-   - `cp ../../3rdparty/amd_silentarmy_kernels/* -r .`
-
-   
-
+Working solvers CPU_TROMP, CPU_XENONCAT, CUDA_TROMP, CUDA_DJEZO
+
+### General instructions:
+  - Install CUDA SDK v8 (make sure you have cuda libraries in **LD_LIBRARY_PATH** and cuda toolkit bins in **PATH**)
+    - example on Ubuntu:
+    - LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda-8.0/lib64:/usr/local/cuda-8.0/lib64/stubs"
+    - PATH="$PATH:/usr/local/cuda-8.0/"
+    - PATH="$PATH:/usr/local/cuda-8.0/bin"
+
+  - Use Boost 1.62+ (if it is not available from the repos you will have to download and build it yourself)
+  - CMake v3.5 (if it is not available from the repos you will have to download and build it yourself)
+  - Currently only static building is supported (CPU_XENONCAT, CUDA_DJEZO are enabled by default, check **CMakeLists.txt** in **nheqminer** root folder)
+  - If not on Ubuntu make sure you have **fasm** installed and accessible in **PATH**
+  - After that open the terminal and run the following commands:
+    - `git clone https://github.com/nicehash/nheqminer.git`
+    - Generating asm object file:
+      - **On Ubuntu**:
+        - `cd nheqminer/cpu_xenoncat/asm_linux/`
+        - `sh assemble.sh`
+      - **bundled fasm not compatible**:
+        - delete/replace the bundled fasm binary (inside **nheqminer/cpu_xenoncat/asm_linux/** directory) with a fasm binary compatible with your distro
+        - `cd nheqminer/cpu_xenoncat/asm_linux/`
+        - `sh assemble.sh`
+    - `cd ../../../`
+    - `mkdir build && cd build`
+    - `cmake ../nheqminer`
+    - `make -j $(nproc)`
+    
 # Run instructions:
 
 Parameters: 
diff --git a/blake2/blake2-config.h b/blake2/blake2-config.h
new file mode 100644
index 000000000..3524209bf
--- /dev/null
+++ b/blake2/blake2-config.h
@@ -0,0 +1,72 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_CONFIG_H__
+#define __BLAKE2_CONFIG_H__
+
+// Compiler-predefined feature macros; these don't work everywhere (MSVC x64 defines _M_AMD64 and _M_X64)
+#if (defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64))
+#define HAVE_SSE2
+#endif
+
+#if defined(__SSSE3__)
+#define HAVE_SSSE3
+#endif
+
+#if defined(__SSE4_1__)
+#define HAVE_SSE41
+#endif
+
+#if defined(__AVX__)
+#define HAVE_AVX
+#endif
+
+#if defined(__XOP__)
+#define HAVE_XOP
+#endif
+
+
+#ifdef HAVE_AVX2
+#ifndef HAVE_AVX
+#define HAVE_AVX
+#endif
+#endif
+
+#ifdef HAVE_XOP
+#ifndef HAVE_AVX
+#define HAVE_AVX
+#endif
+#endif
+
+#ifdef HAVE_AVX
+#ifndef HAVE_SSE41
+#define HAVE_SSE41
+#endif
+#endif
+
+#ifdef HAVE_SSE41
+#ifndef HAVE_SSSE3
+#define HAVE_SSSE3
+#endif
+#endif
+
+#ifdef HAVE_SSSE3
+#define HAVE_SSE2
+#endif
+
+#if !defined(HAVE_SSE2)
+#error "This code requires at least SSE2."
+#endif
+
+#endif
+
diff --git a/blake2/blake2-impl.h b/blake2/blake2-impl.h
new file mode 100644
index 000000000..16219dbcb
--- /dev/null
+++ b/blake2/blake2-impl.h
@@ -0,0 +1,136 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_IMPL_H__
+#define __BLAKE2_IMPL_H__
+
+#include <stdint.h>
+#include <string.h> /* memcpy, size_t */
+static inline uint32_t load32( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  uint32_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
+#else
+  const uint8_t *p = ( const uint8_t * )src;
+  uint32_t w = *p++;
+  w |= ( uint32_t )( *p++ ) <<  8;
+  w |= ( uint32_t )( *p++ ) << 16;
+  w |= ( uint32_t )( *p++ ) << 24;
+  return w;
+#endif
+}
+
+static inline uint64_t load64( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  uint64_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
+#else
+  const uint8_t *p = ( const uint8_t * )src;
+  uint64_t w = *p++;
+  w |= ( uint64_t )( *p++ ) <<  8;
+  w |= ( uint64_t )( *p++ ) << 16;
+  w |= ( uint64_t )( *p++ ) << 24;
+  w |= ( uint64_t )( *p++ ) << 32;
+  w |= ( uint64_t )( *p++ ) << 40;
+  w |= ( uint64_t )( *p++ ) << 48;
+  w |= ( uint64_t )( *p++ ) << 56;
+  return w;
+#endif
+}
+
+static inline void store32( void *dst, uint32_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  memcpy(dst, &w, sizeof w);
+#else
+  uint8_t *p = ( uint8_t * )dst;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w;
+#endif
+}
+
+static inline void store64( void *dst, uint64_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  memcpy(dst, &w, sizeof w);
+#else
+  uint8_t *p = ( uint8_t * )dst;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w;
+#endif
+}
+
+static inline uint64_t load48( const void *src )
+{
+  const uint8_t *p = ( const uint8_t * )src;
+  uint64_t w = *p++;
+  w |= ( uint64_t )( *p++ ) <<  8;
+  w |= ( uint64_t )( *p++ ) << 16;
+  w |= ( uint64_t )( *p++ ) << 24;
+  w |= ( uint64_t )( *p++ ) << 32;
+  w |= ( uint64_t )( *p++ ) << 40;
+  return w;
+}
+
+static inline void store48( void *dst, uint64_t w )
+{
+  uint8_t *p = ( uint8_t * )dst;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w;
+}
+
+static inline uint32_t rotl32( const uint32_t w, const unsigned c )
+{ /* Rotate w left by c mod 32; masking keeps c == 0 from shifting by 32 (undefined in C). */
+  return ( w << ( c & 31 ) ) | ( w >> ( ( 32 - c ) & 31 ) );
+}
+
+static inline uint64_t rotl64( const uint64_t w, const unsigned c )
+{ /* Rotate w left by c mod 64; masking keeps c == 0 from shifting by 64 (undefined in C). */
+  return ( w << ( c & 63 ) ) | ( w >> ( ( 64 - c ) & 63 ) );
+}
+
+static inline uint32_t rotr32( const uint32_t w, const unsigned c )
+{ /* Rotate w right by c mod 32; masking keeps c == 0 from shifting by 32 (undefined in C). */
+  return ( w >> ( c & 31 ) ) | ( w << ( ( 32 - c ) & 31 ) );
+}
+
+static inline uint64_t rotr64( const uint64_t w, const unsigned c )
+{ /* Rotate w right by c mod 64; masking keeps c == 0 from shifting by 64 (undefined in C). */
+  return ( w >> ( c & 63 ) ) | ( w << ( ( 64 - c ) & 63 ) );
+}
+
+/* prevents compiler optimizing out memset() */
+static inline void secure_zero_memory( void *v, size_t n )
+{
+  volatile uint8_t *p = ( volatile uint8_t * )v;
+  while( n-- ) *p++ = 0;
+}
+
+#endif
+
diff --git a/blake2/blake2-round.h b/blake2/blake2-round.h
new file mode 100644
index 000000000..400ed2034
--- /dev/null
+++ b/blake2/blake2-round.h
@@ -0,0 +1,85 @@
+#define _mm_roti_epi64(x, c) \
+	(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
+	: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
+	: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
+	: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
+	: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	row1l = _mm_add_epi64(row1l, row2l); \
+	row1h = _mm_add_epi64(row1h, row2h); \
+	\
+	row4l = _mm_xor_si128(row4l, row1l); \
+	row4h = _mm_xor_si128(row4h, row1h); \
+	\
+	row4l = _mm_roti_epi64(row4l, -32); \
+	row4h = _mm_roti_epi64(row4h, -32); \
+	\
+	row3l = _mm_add_epi64(row3l, row4l); \
+	row3h = _mm_add_epi64(row3h, row4h); \
+	\
+	row2l = _mm_xor_si128(row2l, row3l); \
+	row2h = _mm_xor_si128(row2h, row3h); \
+	\
+	row2l = _mm_roti_epi64(row2l, -24); \
+	row2h = _mm_roti_epi64(row2h, -24); \
+ 
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	row1l = _mm_add_epi64(row1l, row2l); \
+	row1h = _mm_add_epi64(row1h, row2h); \
+	\
+	row4l = _mm_xor_si128(row4l, row1l); \
+	row4h = _mm_xor_si128(row4h, row1h); \
+	\
+	row4l = _mm_roti_epi64(row4l, -16); \
+	row4h = _mm_roti_epi64(row4h, -16); \
+	\
+	row3l = _mm_add_epi64(row3l, row4l); \
+	row3h = _mm_add_epi64(row3h, row4h); \
+	\
+	row2l = _mm_xor_si128(row2l, row3l); \
+	row2h = _mm_xor_si128(row2h, row3h); \
+	\
+	row2l = _mm_roti_epi64(row2l, -63); \
+	row2h = _mm_roti_epi64(row2h, -63); \
+
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+	t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	\
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0;    \
+	\
+	t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+	t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+	row4l = t1; \
+	row4h = t0;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+	t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	\
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0; \
+	\
+	t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+	t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+	row4l = t1; \
+	row4h = t0;
+
+#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
+	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	\
+	DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	\
+	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+	\
+	UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
diff --git a/blake2/blake2.h b/blake2/blake2.h
new file mode 100644
index 000000000..85d63866f
--- /dev/null
+++ b/blake2/blake2.h
@@ -0,0 +1,156 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_H__
+#define __BLAKE2_H__
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__ ((__aligned__(x)))
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+  enum blake2s_constant
+  {
+    BLAKE2S_BLOCKBYTES = 64,
+    BLAKE2S_OUTBYTES   = 32,
+    BLAKE2S_KEYBYTES   = 32,
+    BLAKE2S_SALTBYTES  = 8,
+    BLAKE2S_PERSONALBYTES = 8
+  };
+
+  enum blake2b_constant
+  {
+    BLAKE2B_BLOCKBYTES = 128,
+    BLAKE2B_OUTBYTES   = 64,
+    BLAKE2B_KEYBYTES   = 64,
+    BLAKE2B_SALTBYTES  = 16,
+    BLAKE2B_PERSONALBYTES = 16
+  };
+
+#pragma pack(push, 1)
+  typedef struct __blake2s_param
+  {
+    uint8_t  digest_length; // 1
+    uint8_t  key_length;    // 2
+    uint8_t  fanout;        // 3
+    uint8_t  depth;         // 4
+    uint32_t leaf_length;   // 8
+    uint8_t  node_offset[6];// 14
+    uint8_t  node_depth;    // 15
+    uint8_t  inner_length;  // 16
+    // uint8_t  reserved[0];
+    uint8_t  salt[BLAKE2S_SALTBYTES]; // 24
+    uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
+  } blake2s_param;
+
+  ALIGN( 64 ) typedef struct __blake2s_state
+  {
+    uint32_t h[8];
+    uint32_t t[2];
+    uint32_t f[2];
+    uint8_t  buf[2 * BLAKE2S_BLOCKBYTES];
+    size_t   buflen;
+    uint8_t  last_node;
+  } blake2s_state;
+
+  typedef struct __blake2b_param
+  {
+    uint8_t  digest_length; // 1
+    uint8_t  key_length;    // 2
+    uint8_t  fanout;        // 3
+    uint8_t  depth;         // 4
+    uint32_t leaf_length;   // 8
+    uint64_t node_offset;   // 16
+    uint8_t  node_depth;    // 17
+    uint8_t  inner_length;  // 18
+    uint8_t  reserved[14];  // 32
+    uint8_t  salt[BLAKE2B_SALTBYTES]; // 48
+    uint8_t  personal[BLAKE2B_PERSONALBYTES];  // 64
+  } blake2b_param;
+
+  ALIGN( 64 ) typedef struct __blake2b_state
+  {
+    uint64_t h[8];
+    uint8_t  buf[BLAKE2B_BLOCKBYTES];
+    uint16_t counter;
+    uint8_t  buflen;
+    uint8_t  lastblock;
+  } blake2b_state;
+
+  ALIGN( 64 ) typedef struct __blake2sp_state
+  {
+    blake2s_state S[8][1];
+    blake2s_state R[1];
+    uint8_t buf[8 * BLAKE2S_BLOCKBYTES];
+    size_t  buflen;
+  } blake2sp_state;
+
+  ALIGN( 64 ) typedef struct __blake2bp_state
+  {
+    blake2b_state S[4][1];
+    blake2b_state R[1];
+    uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
+    size_t  buflen;
+  } blake2bp_state;
+#pragma pack(pop)
+
+  // Streaming API
+  int blake2s_init( blake2s_state *S, const uint8_t outlen );
+  int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
+  int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );
+
+  int blake2b_init( blake2b_state *S, const uint8_t outlen );
+  int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
+  int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen );
+
+  int blake2sp_init( blake2sp_state *S, const uint8_t outlen );
+  int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen );
+
+  int blake2bp_init( blake2bp_state *S, const uint8_t outlen );
+  int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
+  int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );
+
+  // Simple API
+  int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen);
+
+  int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+
+  static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+  {
+    return blake2b( out, in, key, outlen, inlen, keylen );
+  }
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
+
diff --git a/blake2/blake2b-load-sse2.h b/blake2/blake2b-load-sse2.h
new file mode 100644
index 000000000..1ba153c87
--- /dev/null
+++ b/blake2/blake2b-load-sse2.h
@@ -0,0 +1,68 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2B_LOAD_SSE2_H__
+#define __BLAKE2B_LOAD_SSE2_H__
+/* LOAD_MSG_<round>_<step>(b0, b1): pack four of the caller's 64-bit message words m0..m15 into b0/b1 (_mm_set_epi64x takes high, low). Rounds 10/11 repeat rounds 0/1. Each body is wrapped in do { } while(0) so it expands to a single statement, like the SSE4.1 table. */
+#define LOAD_MSG_0_1(b0, b1) do { b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4); } while(0)
+#define LOAD_MSG_0_2(b0, b1) do { b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5); } while(0)
+#define LOAD_MSG_0_3(b0, b1) do { b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12); } while(0)
+#define LOAD_MSG_0_4(b0, b1) do { b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13); } while(0)
+#define LOAD_MSG_1_1(b0, b1) do { b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9); } while(0)
+#define LOAD_MSG_1_2(b0, b1) do { b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15); } while(0)
+#define LOAD_MSG_1_3(b0, b1) do { b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11); } while(0)
+#define LOAD_MSG_1_4(b0, b1) do { b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7); } while(0)
+#define LOAD_MSG_2_1(b0, b1) do { b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5); } while(0)
+#define LOAD_MSG_2_2(b0, b1) do { b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2); } while(0)
+#define LOAD_MSG_2_3(b0, b1) do { b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7); } while(0)
+#define LOAD_MSG_2_4(b0, b1) do { b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1); } while(0)
+#define LOAD_MSG_3_1(b0, b1) do { b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13); } while(0)
+#define LOAD_MSG_3_2(b0, b1) do { b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12); } while(0)
+#define LOAD_MSG_3_3(b0, b1) do { b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4); } while(0)
+#define LOAD_MSG_3_4(b0, b1) do { b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0); } while(0)
+#define LOAD_MSG_4_1(b0, b1) do { b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2); } while(0)
+#define LOAD_MSG_4_2(b0, b1) do { b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4); } while(0)
+#define LOAD_MSG_4_3(b0, b1) do { b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6); } while(0)
+#define LOAD_MSG_4_4(b0, b1) do { b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8); } while(0)
+#define LOAD_MSG_5_1(b0, b1) do { b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0); } while(0)
+#define LOAD_MSG_5_2(b0, b1) do { b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11); } while(0)
+#define LOAD_MSG_5_3(b0, b1) do { b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15); } while(0)
+#define LOAD_MSG_5_4(b0, b1) do { b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14); } while(0)
+#define LOAD_MSG_6_1(b0, b1) do { b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14); } while(0)
+#define LOAD_MSG_6_2(b0, b1) do { b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13); } while(0)
+#define LOAD_MSG_6_3(b0, b1) do { b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9); } while(0)
+#define LOAD_MSG_6_4(b0, b1) do { b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2); } while(0)
+#define LOAD_MSG_7_1(b0, b1) do { b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12); } while(0)
+#define LOAD_MSG_7_2(b0, b1) do { b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1); } while(0)
+#define LOAD_MSG_7_3(b0, b1) do { b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8); } while(0)
+#define LOAD_MSG_7_4(b0, b1) do { b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6); } while(0)
+#define LOAD_MSG_8_1(b0, b1) do { b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11); } while(0)
+#define LOAD_MSG_8_2(b0, b1) do { b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3); } while(0)
+#define LOAD_MSG_8_3(b0, b1) do { b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1); } while(0)
+#define LOAD_MSG_8_4(b0, b1) do { b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4); } while(0)
+#define LOAD_MSG_9_1(b0, b1) do { b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7); } while(0)
+#define LOAD_MSG_9_2(b0, b1) do { b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6); } while(0)
+#define LOAD_MSG_9_3(b0, b1) do { b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3); } while(0)
+#define LOAD_MSG_9_4(b0, b1) do { b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12); } while(0)
+#define LOAD_MSG_10_1(b0, b1) do { b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4); } while(0)
+#define LOAD_MSG_10_2(b0, b1) do { b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5); } while(0)
+#define LOAD_MSG_10_3(b0, b1) do { b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12); } while(0)
+#define LOAD_MSG_10_4(b0, b1) do { b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13); } while(0)
+#define LOAD_MSG_11_1(b0, b1) do { b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9); } while(0)
+#define LOAD_MSG_11_2(b0, b1) do { b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15); } while(0)
+#define LOAD_MSG_11_3(b0, b1) do { b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11); } while(0)
+#define LOAD_MSG_11_4(b0, b1) do { b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7); } while(0)
+
+
+#endif
+
diff --git a/blake2/blake2b-load-sse41.h b/blake2/blake2b-load-sse41.h
new file mode 100644
index 000000000..f6c1bc839
--- /dev/null
+++ b/blake2/blake2b-load-sse41.h
@@ -0,0 +1,402 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2B_LOAD_SSE41_H__
+#define __BLAKE2B_LOAD_SSE41_H__
+/* LOAD_MSG_<round>_<step>(b0, b1): build the two message vectors for one G step from the caller's eight __m128i values m0..m7 (two consecutive 64-bit message words each). Rounds 10/11 repeat rounds 0/1. */
+#define LOAD_MSG_0_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m1); \
+b1 = _mm_unpacklo_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_0_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m0, m1); \
+b1 = _mm_unpackhi_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_0_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m5); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_0_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m5); \
+b1 = _mm_unpackhi_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_1_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m2); \
+b1 = _mm_unpackhi_epi64(m4, m6); \
+} while(0)
+
+
+#define LOAD_MSG_1_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_alignr_epi8(m3, m7, 8); \
+} while(0)
+
+
+#define LOAD_MSG_1_3(b0, b1) \
+do \
+{ \
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+b1 = _mm_unpackhi_epi64(m5, m2); \
+} while(0)
+
+
+#define LOAD_MSG_1_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m1); \
+b1 = _mm_unpackhi_epi64(m3, m1); \
+} while(0)
+
+
+#define LOAD_MSG_2_1(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m6, m5, 8); \
+b1 = _mm_unpackhi_epi64(m2, m7); \
+} while(0)
+
+
+#define LOAD_MSG_2_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m0); \
+b1 = _mm_blend_epi16(m1, m6, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_2_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m5, m1, 0xF0); \
+b1 = _mm_unpackhi_epi64(m3, m4); \
+} while(0)
+
+
+#define LOAD_MSG_2_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m3); \
+b1 = _mm_alignr_epi8(m2, m0, 8); \
+} while(0)
+
+
+#define LOAD_MSG_3_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m3, m1); \
+b1 = _mm_unpackhi_epi64(m6, m5); \
+} while(0)
+
+
+#define LOAD_MSG_3_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m0); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_3_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m1, m2, 0xF0); \
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_3_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m3, m5); \
+b1 = _mm_unpacklo_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_4_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m2); \
+b1 = _mm_unpacklo_epi64(m1, m5); \
+} while(0)
+
+
+#define LOAD_MSG_4_2(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m0, m3, 0xF0); \
+b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_4_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m7, m5, 0xF0); \
+b1 = _mm_blend_epi16(m3, m1, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_4_4(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m6, m0, 8); \
+b1 = _mm_blend_epi16(m4, m6, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_5_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m1, m3); \
+b1 = _mm_unpacklo_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_5_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m5); \
+b1 = _mm_unpackhi_epi64(m5, m1); \
+} while(0)
+
+
+#define LOAD_MSG_5_3(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m2, m3, 0xF0); \
+b1 = _mm_unpackhi_epi64(m7, m0); \
+} while(0)
+
+
+#define LOAD_MSG_5_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m6, m2); \
+b1 = _mm_blend_epi16(m7, m4, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_6_1(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m6, m0, 0xF0); \
+b1 = _mm_unpacklo_epi64(m7, m2); \
+} while(0)
+
+
+#define LOAD_MSG_6_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m2, m7); \
+b1 = _mm_alignr_epi8(m5, m6, 8); \
+} while(0)
+
+
+#define LOAD_MSG_6_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m3); \
+b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
+} while(0)
+
+
+#define LOAD_MSG_6_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m3, m1); \
+b1 = _mm_blend_epi16(m1, m5, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_7_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m6, m3); \
+b1 = _mm_blend_epi16(m6, m1, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_7_2(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m7, m5, 8); \
+b1 = _mm_unpackhi_epi64(m0, m4); \
+} while(0)
+
+
+#define LOAD_MSG_7_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m2, m7); \
+b1 = _mm_unpacklo_epi64(m4, m1); \
+} while(0)
+
+
+#define LOAD_MSG_7_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m2); \
+b1 = _mm_unpacklo_epi64(m3, m5); \
+} while(0)
+
+
+#define LOAD_MSG_8_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m3, m7); \
+b1 = _mm_alignr_epi8(m0, m5, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m7, m4); \
+b1 = _mm_alignr_epi8(m4, m1, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_3(b0, b1) \
+do \
+{ \
+b0 = m6; \
+b1 = _mm_alignr_epi8(m5, m0, 8); \
+} while(0)
+
+
+#define LOAD_MSG_8_4(b0, b1) \
+do \
+{ \
+b0 = _mm_blend_epi16(m1, m3, 0xF0); \
+b1 = m2; \
+} while(0)
+
+
+#define LOAD_MSG_9_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_unpackhi_epi64(m3, m0); \
+} while(0)
+
+
+#define LOAD_MSG_9_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m1, m2); \
+b1 = _mm_blend_epi16(m3, m2, 0xF0); \
+} while(0)
+
+
+#define LOAD_MSG_9_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m7, m4); \
+b1 = _mm_unpackhi_epi64(m1, m6); \
+} while(0)
+
+
+#define LOAD_MSG_9_4(b0, b1) \
+do \
+{ \
+b0 = _mm_alignr_epi8(m7, m5, 8); \
+b1 = _mm_unpacklo_epi64(m6, m0); \
+} while(0)
+
+
+#define LOAD_MSG_10_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m0, m1); \
+b1 = _mm_unpacklo_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_10_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m0, m1); \
+b1 = _mm_unpackhi_epi64(m2, m3); \
+} while(0)
+
+
+#define LOAD_MSG_10_3(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m4, m5); \
+b1 = _mm_unpacklo_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_10_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpackhi_epi64(m4, m5); \
+b1 = _mm_unpackhi_epi64(m6, m7); \
+} while(0)
+
+
+#define LOAD_MSG_11_1(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m7, m2); \
+b1 = _mm_unpackhi_epi64(m4, m6); \
+} while(0)
+
+
+#define LOAD_MSG_11_2(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m5, m4); \
+b1 = _mm_alignr_epi8(m3, m7, 8); \
+} while(0)
+
+
+#define LOAD_MSG_11_3(b0, b1) \
+do \
+{ \
+b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+b1 = _mm_unpackhi_epi64(m5, m2); \
+} while(0)
+
+
+#define LOAD_MSG_11_4(b0, b1) \
+do \
+{ \
+b0 = _mm_unpacklo_epi64(m6, m1); \
+b1 = _mm_unpackhi_epi64(m3, m1); \
+} while(0)
+
+
+#endif
+
diff --git a/blake2/blake2b-round.h b/blake2/blake2b-round.h
new file mode 100644
index 000000000..3e6fd0cbe
--- /dev/null
+++ b/blake2/blake2b-round.h
@@ -0,0 +1,170 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2B_ROUND_H__
+#define __BLAKE2B_ROUND_H__
+/* 128-bit load/store through a pointer cast; the _mm_load/_mm_store forms require a 16-byte-aligned address. */
+#define LOAD(p)  _mm_load_si128( (const __m128i *)(p) )
+#define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
+/* Unaligned counterparts: usable with any address. */
+#define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
+#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
+/* Reinterpret the same 128 bits as packed floats (TOF) or back as integers (TOI); no value conversion. */
+#define TOF(reg) _mm_castsi128_ps((reg))
+#define TOI(reg) _mm_castps_si128((reg))
+/* Branch hint built on __builtin_expect (a GCC/Clang builtin). NOTE(review): no fallback for other compilers is visible here. */
+#define LIKELY(x) __builtin_expect((x),1)
+/* Microarchitecture-specific macros */
+/* _mm_roti_epi64(x, c): rotate each 64-bit lane; callers pass c = -n to rotate right by n bits. */
+/* The SSSE3 form reads the byte-shuffle masks r16/r24 from the caller's scope. Arguments and expansions are fully parenthesised. */
+#ifndef HAVE_XOP
+#ifdef HAVE_SSSE3
+#define _mm_roti_epi64(x, c) \
+    ((-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
+    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
+    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
+    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
+    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))))
+#else
+#define _mm_roti_epi64(r, c) (_mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) )))
+#endif
+#else
+/* With HAVE_XOP no fallback is defined: _mm_roti_epi64 is expected to come from the XOP intrinsics header. */
+#endif
+
+
+/* G1/G2: the two halves of the mixing step, applied to the low and high register of each row at once. Both start with row1 += message (b0/b1) + row2 and differ only in the right-rotation amounts: 32 and 24 in G1, 16 and 63 in G2. */
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  /* row4 ^= row1 */ \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  /* row4 = rotate right by 32 */ \
+  row4l = _mm_roti_epi64(row4l, (-32)); \
+  row4h = _mm_roti_epi64(row4h, (-32)); \
+  /* row3 += row4 */ \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  /* row2 ^= row3 */ \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  /* row2 = rotate right by 24 */ \
+  row2l = _mm_roti_epi64(row2l, (-24)); \
+  row2h = _mm_roti_epi64(row2h, (-24)); \
+ /* end of G1: the trailing backslash above continues the definition onto this line */
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  /* row4 ^= row1 */ \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  /* row4 = rotate right by 16 */ \
+  row4l = _mm_roti_epi64(row4l, (-16)); \
+  row4h = _mm_roti_epi64(row4h, (-16)); \
+  /* row3 += row4 */ \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  /* row2 ^= row3 */ \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  /* row2 = rotate right by 63 */ \
+  row2l = _mm_roti_epi64(row2l, (-63)); \
+  row2h = _mm_roti_epi64(row2h, (-63)); \
+ /* end of G2: the trailing backslash above continues the definition onto this line */
+#if defined(HAVE_SSSE3) /* SSSE3 path: lane rotations via _mm_alignr_epi8; t0/t1 are scratch registers from the caller's scope. row2 is rotated by one 64-bit lane first. */
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  /* row3: swap the low and high registers (rotation by two lanes) */ \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0;    \
+  /* row4: rotate by one lane in the direction opposite to row2 */ \
+  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+  row4l = t1; \
+  row4h = t0;
+/* UNDIAGONALIZE reverses the three rotations applied by DIAGONALIZE; row1 is untouched by both. */
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  /* row3: swap the registers back */ \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0; \
+  /* row4: undo its one-lane rotation */ \
+  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+  row4l = t1; \
+  row4h = t0;
+#else
+/* SSE2-only fallback: the same lane permutations, built from _mm_unpacklo/_mm_unpackhi pairs. */
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = row4l;\
+  t1 = row2l;\
+  row4l = row3l;\
+  row3l = row3h;\
+  row3h = row4l;\
+  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
+  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
+  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
+  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = row3l;\
+  row3l = row3h;\
+  row3h = t0;\
+  t0 = row2l;\
+  t1 = row4l;\
+  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
+  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
+  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
+  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
+
+#endif
+/* Pick the message-gathering macros: the SSE4.1 shuffle table when HAVE_SSE41 is set, otherwise the SSE2 _mm_set_epi64x table. */
+#if defined(HAVE_SSE41)
+#include "blake2b-load-sse41.h"
+#else
+#include "blake2b-load-sse2.h"
+#endif
+/* ROUND(r): G1+G2 on the rows as loaded, DIAGONALIZE, G1+G2 again, UNDIAGONALIZE; message words come from LOAD_MSG_<r>_1..4. Expects row*, b0, b1 (and t0, t1) in the caller's scope. */
+#define ROUND(r) \
+  LOAD_MSG_ ##r ##_1(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_2(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+  LOAD_MSG_ ##r ##_3(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_4(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
+
+#endif
+/* NOTE(review): BLAKE2_ROUND sits after the include guard's #endif and invokes G1/G2 with 8 arguments, while the definitions above take 10 (b0, b1). Expanding it would not compile against this header; confirm whether it is dead code. */
+#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
+	G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	\
+	DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	\
+	G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+	\
+	UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
diff --git a/blake2/blake2bx.cpp b/blake2/blake2bx.cpp
new file mode 100644
index 000000000..2df512e95
--- /dev/null
+++ b/blake2/blake2bx.cpp
@@ -0,0 +1,346 @@
+/*
+   BLAKE2 reference source code package - optimized C implementations
+
+   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#include "blake2-config.h"
+
+#ifdef WIN32
+#include <intrin.h>
+#endif
+
+#include <emmintrin.h>
+#if defined(HAVE_SSSE3)
+#include <tmmintrin.h>
+#endif
+#if defined(HAVE_SSE41)
+#include <smmintrin.h>
+#endif
+#if defined(HAVE_AVX)
+#include <immintrin.h>
+#endif
+#if defined(HAVE_XOP)
+#include <x86intrin.h>
+#endif
+
+#include "blake2b-round.h"
+
+
+/* BLAKE2b initialisation vector: XORed with the parameter block in blake2b_init_param() and loaded into rows 3 and 4 by blake2b_compress(). */
+ALIGN(64) static const uint64_t blake2b_IV[8] =
+{
+	0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+	0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+	0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+	0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+
+/* Zero the whole state, then seed the chaining value h with IV XOR parameter block (byte by byte). Always returns 0. */
+int blake2b_init_param(blake2b_state *S, const blake2b_param *P)
+{
+	const uint8_t *iv_bytes = (const uint8_t *)blake2b_IV;
+	const uint8_t *param_bytes = (const uint8_t *)P;
+	uint8_t *chain = (uint8_t *)S->h;
+	int idx = 0;
+
+	/* The pointers above stay valid: memset only clears the contents of *S. */
+	memset(S, 0, sizeof(blake2b_state));
+	while (idx < BLAKE2B_OUTBYTES) { chain[idx] = iv_bytes[idx] ^ param_bytes[idx]; ++idx; }
+
+	return 0;
+}
+
+/* Unkeyed, sequential BLAKE2b setup for an outlen-byte digest; returns -1 when outlen is 0 or exceeds BLAKE2B_OUTBYTES. */
+int blake2b_init(blake2b_state *S, const uint8_t outlen)
+{
+	if (outlen == 0 || outlen > BLAKE2B_OUTBYTES) return -1;
+
+	/* Positional initialiser: the order must follow blake2b_param's field order. */
+	const blake2b_param params = {
+		outlen,
+		0,
+		1,
+		1,
+		0,
+		0,
+		0,
+		0,
+		{ 0 },
+		{ 0 },
+		{ 0 }
+	};
+	return blake2b_init_param(S, &params);
+}
+/* Keyed BLAKE2b setup: seeds the state with outlen/keylen, then absorbs the key as one zero-padded block. Returns 0, or -1 on invalid arguments or failed parameter setup. */
+int blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen)
+{
+	if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1;
+
+	if ((NULL == key) || (!keylen) || keylen > BLAKE2B_KEYBYTES) return -1; /* a NULL key would be dereferenced by the memcpy below */
+
+	const blake2b_param P =
+	{
+		outlen,
+		keylen,
+		1,
+		1,
+		0,
+		0,
+		0,
+		0,
+		{ 0 },
+		{ 0 },
+		{ 0 }
+	};
+
+	/* A failed parameter setup is an error, so report it as one rather than as success. */
+	if (blake2b_init_param(S, &P) < 0) return -1;
+
+	{
+		uint8_t block[BLAKE2B_BLOCKBYTES];
+		memset(block, 0, BLAKE2B_BLOCKBYTES);
+		memcpy(block, key, keylen);
+		blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
+		secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */
+	}
+	return 0;
+}
+/* One compression step: mixes the 128-byte block into S->h with twelve ROUND()s on SSE registers, using S->counter and S->lastblock for row 4. Always returns 0. */
+static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES])
+{
+	__m128i row1l, row1h;
+	__m128i row2l, row2h;
+	__m128i row3l, row3h;
+	__m128i row4l, row4h;
+	__m128i b0, b1; /* message vectors filled by LOAD_MSG_* inside ROUND */
+	__m128i t0, t1; /* scratch registers used by DIAGONALIZE/UNDIAGONALIZE */
+#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
+	const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); /* shuffle mask read by _mm_roti_epi64 for the 16-bit rotation */
+	const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); /* shuffle mask read by _mm_roti_epi64 for the 24-bit rotation */
+#endif
+#if defined(HAVE_SSE41) /* SSE4.1 table wants eight 128-bit vectors; the SSE2 table wants sixteen 64-bit words */
+	const __m128i m0 = LOADU(block + 00);
+	const __m128i m1 = LOADU(block + 16);
+	const __m128i m2 = LOADU(block + 32);
+	const __m128i m3 = LOADU(block + 48);
+	const __m128i m4 = LOADU(block + 64);
+	const __m128i m5 = LOADU(block + 80);
+	const __m128i m6 = LOADU(block + 96);
+	const __m128i m7 = LOADU(block + 112);
+#else
+	const uint64_t  m0 = ( ( uint64_t * )block )[ 0]; /* NOTE(review): these casts drop const and read the byte buffer as uint64_t — confirm alignment/aliasing is acceptable for all callers */
+	const uint64_t  m1 = ( ( uint64_t * )block )[ 1];
+	const uint64_t  m2 = ( ( uint64_t * )block )[ 2];
+	const uint64_t  m3 = ( ( uint64_t * )block )[ 3];
+	const uint64_t  m4 = ( ( uint64_t * )block )[ 4];
+	const uint64_t  m5 = ( ( uint64_t * )block )[ 5];
+	const uint64_t  m6 = ( ( uint64_t * )block )[ 6];
+	const uint64_t  m7 = ( ( uint64_t * )block )[ 7];
+	const uint64_t  m8 = ( ( uint64_t * )block )[ 8];
+	const uint64_t  m9 = ( ( uint64_t * )block )[ 9];
+	const uint64_t m10 = ( ( uint64_t * )block )[10];
+	const uint64_t m11 = ( ( uint64_t * )block )[11];
+	const uint64_t m12 = ( ( uint64_t * )block )[12];
+	const uint64_t m13 = ( ( uint64_t * )block )[13];
+	const uint64_t m14 = ( ( uint64_t * )block )[14];
+	const uint64_t m15 = ( ( uint64_t * )block )[15];
+#endif
+	row1l = LOADU(&S->h[0]); /* rows 1-2: current chaining value */
+	row1h = LOADU(&S->h[2]);
+	row2l = LOADU(&S->h[4]);
+	row2h = LOADU(&S->h[6]);
+	row3l = LOADU(&blake2b_IV[0]); /* rows 3-4: IV, with counter and last-block flag folded into row 4 */
+	row3h = LOADU(&blake2b_IV[2]);
+	row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter)); /* NOTE(review): _mm_set_epi32 takes 32-bit ints, so only the low 32 bits of S->counter are mixed in — confirm the counter's width */
+	row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock)); /* low 64 bits of the mask are all ones when S->lastblock is 1, zero when it is 0 */
+	ROUND(0);
+	ROUND(1);
+	ROUND(2);
+	ROUND(3);
+	ROUND(4);
+	ROUND(5);
+	ROUND(6);
+	ROUND(7);
+	ROUND(8);
+	ROUND(9);
+	ROUND(10);
+	ROUND(11);
+	row1l = _mm_xor_si128(row3l, row1l); /* feed-forward: h ^= (rows 1-2) ^ (rows 3-4) */
+	row1h = _mm_xor_si128(row3h, row1h);
+	STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
+	STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
+	row2l = _mm_xor_si128(row4l, row2l);
+	row2h = _mm_xor_si128(row4h, row2h);
+	STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
+	STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
+	return 0;
+}
+
+/* Absorb inlen bytes of in. A block is compressed only once more input is known to follow it, */
+/* so the final (possibly full) block always stays buffered for blake2b_final(). Always returns 0. */
+int blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen)
+{
+	while (inlen > 0)
+	{
+		const size_t used = S->buflen;
+		const size_t room = BLAKE2B_BLOCKBYTES - used;
+
+		if (inlen <= room)
+		{
+			/* Everything left fits: stash it and stop without compressing. */
+			memcpy(S->buf + used, in, inlen);
+			S->buflen += inlen;
+			break;
+		}
+
+		/* More input than room: top up the buffer and compress the full block. */
+		memcpy(S->buf + used, in, room);
+		S->counter += BLAKE2B_BLOCKBYTES;
+		blake2b_compress(S, S->buf);
+		S->buflen = 0;
+		in += room;
+		inlen -= room;
+	}
+	return 0;
+}
+
+/* Finalise: zero-pad the buffered tail, compress it as the last block and copy outlen bytes of h to out. Returns -1 if outlen > BLAKE2B_OUTBYTES, else 0. */
+int blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen)
+{
+	if (outlen > BLAKE2B_OUTBYTES)
+		return -1;
+
+	if (S->buflen > BLAKE2B_BLOCKBYTES) /* NOTE(review): blake2b_update() in this file never leaves more than one block buffered, so this branch looks unreachable — confirm against the size of S->buf */
+	{
+		S->counter += BLAKE2B_BLOCKBYTES;
+		blake2b_compress(S, S->buf);
+		S->buflen -= BLAKE2B_BLOCKBYTES;
+		memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen);
+	}
+
+	S->counter += S->buflen; /* only the real tail bytes are counted, not the padding */
+	S->lastblock = 1; /* read by blake2b_compress() to build the last-block mask */
+	memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
+	blake2b_compress(S, S->buf);
+	memcpy(out, &S->h[0], outlen);
+	S->lastblock = 0;
+	return 0;
+}
+
+/* One-shot BLAKE2b: hashes inlen bytes of in (keyed when key and keylen are given) into outlen bytes at out. Returns 0, or -1 on bad arguments. */
+int blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen)
+{
+	blake2b_state state[1];
+	int init_status;
+
+	/* Input and output buffers are both mandatory. */
+	if (NULL == in || NULL == out)
+		return -1;
+
+	/* Without a key pointer the call is treated as unkeyed, whatever keylen says. */
+	if (NULL == key)
+		keylen = 0;
+
+	/* Both initialisers reject an outlen of 0 or above BLAKE2B_OUTBYTES. */
+	if (0 != keylen)
+		init_status = blake2b_init_key(state, outlen, key, keylen);
+	else
+		init_status = blake2b_init(state, outlen);
+	if (init_status < 0) return -1;
+
+	blake2b_update(state, (const uint8_t *)in, inlen);
+	blake2b_final(state, out, outlen);
+	return 0;
+}
+/* crypto_hash entry point, compiled only when SUPERCOP is defined: unkeyed blake2b() with a full BLAKE2B_OUTBYTES digest. */
+#if defined(SUPERCOP)
+int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
+{
+	return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
+}
+#endif
+/* Known-answer self-test (BLAKE2B_SELFTEST only): keyed hashes of every prefix of a counting buffer are compared with blake2b_keyed_kat. */
+#if defined(BLAKE2B_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( int argc, char **argv )
+{
+	uint8_t key[BLAKE2B_KEYBYTES];
+	uint8_t buf[KAT_LENGTH];
+	size_t k = 0, b = 0, len = 0;
+
+	/* key[j] = j and buf[j] = j, each truncated to one byte */
+	while( k < BLAKE2B_KEYBYTES ) { key[k] = ( uint8_t )k; ++k; }
+	while( b < KAT_LENGTH ) { buf[b] = ( uint8_t )b; ++b; }
+
+	/* the digest of the first len bytes must equal vector number len */
+	for( ; len < KAT_LENGTH; ++len )
+	{
+		uint8_t hash[BLAKE2B_OUTBYTES];
+
+		blake2b( hash, buf, key, BLAKE2B_OUTBYTES, len, BLAKE2B_KEYBYTES );
+		if( 0 == memcmp( hash, blake2b_keyed_kat[len], BLAKE2B_OUTBYTES ) )
+			continue;
+
+		puts( "error" );
+		return -1;
+	}
+
+	puts( "ok" );
+	return 0;
+}
+#endif
+/* Variable-length BLAKE2b: hashes the 4 raw bytes of outlen (host byte order) followed by in. Up to BLAKE2B_OUTBYTES it is a single hash; longer outputs chain full-size hashes and emit half of each. Returns 0, or -1 on invalid arguments or a failed hash step. */
+int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen)
+{
+	blake2b_state blake_state;
+	if (NULL == out || 0 == outlen) return -1; /* blake2b_init() refuses outlen == 0 and would leave blake_state uninitialised */
+	if (outlen <= BLAKE2B_OUTBYTES)
+	{
+		if (blake2b_init(&blake_state, (uint8_t)outlen) < 0) return -1;
+		blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
+		blake2b_update(&blake_state, (const uint8_t *)in, inlen);
+		return blake2b_final(&blake_state, out, (uint8_t)outlen);
+	}
+	else
+	{
+		uint8_t out_buffer[BLAKE2B_OUTBYTES];
+		uint8_t in_buffer[BLAKE2B_OUTBYTES];
+		if (blake2b_init(&blake_state, BLAKE2B_OUTBYTES) < 0) return -1;
+		blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t));
+		blake2b_update(&blake_state, (const uint8_t *)in, inlen);
+		if (blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES) < 0) return -1;
+		memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+		out += BLAKE2B_OUTBYTES / 2;
+		uint32_t toproduce = outlen - BLAKE2B_OUTBYTES / 2;
+		while (toproduce > BLAKE2B_OUTBYTES)
+		{
+			memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+			if (blake2b(out_buffer, in_buffer, NULL, BLAKE2B_OUTBYTES, BLAKE2B_OUTBYTES, 0) < 0) return -1;
+			memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+			out += BLAKE2B_OUTBYTES / 2;
+			toproduce -= BLAKE2B_OUTBYTES / 2;
+		}
+		memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+		if (blake2b(out_buffer, in_buffer, NULL, (uint8_t)toproduce, BLAKE2B_OUTBYTES, 0) < 0) return -1; /* toproduce is at most BLAKE2B_OUTBYTES here */
+		memcpy(out, out_buffer, toproduce);
+	}
+	return 0;
+}
\ No newline at end of file
diff --git a/cpu_tromp/CMakeLists.txt b/cpu_tromp/CMakeLists.txt
new file mode 100644
index 000000000..8214d97ff
--- /dev/null
+++ b/cpu_tromp/CMakeLists.txt
@@ -0,0 +1,19 @@
+set(EXECUTABLE cpu_tromp)
+# Despite the variable name, the target is built as a STATIC library (see ADD_LIBRARY below).
+# Sources and headers of cpu_tromp/, resolved relative to this directory by file(GLOB).
+file(GLOB SRC_LIST
+    cpu_tromp.cpp )
+file(GLOB HEADERS
+    cpu_tromp.hpp
+	equi.h
+	equi_miner.h
+    )
+# NOTE(review): CUDA_INCLUDE_DIRS is added although only CPU sources are listed here — confirm it is needed.
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} )
+# Install the archive under lib and the listed headers under include/cpu_tromp.
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
diff --git a/cpu_tromp/equi.h b/cpu_tromp/equi.h
index 94ad0ad8a..ed91cee14 100644
--- a/cpu_tromp/equi.h
+++ b/cpu_tromp/equi.h
@@ -2,6 +2,7 @@
 // Equihash solver
 // Copyright (c) 2016-2016 John Tromp
 
+
 #include "blake2/blake2.h"
 #ifdef __APPLE__
 #include "osx_barrier.h"
@@ -131,3 +132,4 @@ int verify(u32 indices[PROOFSIZE], const char *header, const u32 headerlen, cons
   uchar hash[WN/8];
   return verifyrec(&ctx, indices, hash, WK);
 }
+
diff --git a/cpu_xenoncat/CMakeLists.txt b/cpu_xenoncat/CMakeLists.txt
new file mode 100644
index 000000000..66c698d74
--- /dev/null
+++ b/cpu_xenoncat/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(EXECUTABLE cpu_xenoncat)
+# Despite the variable name, the target is built as a STATIC library (see ADD_LIBRARY below).
+# NOTE(review): file(GLOB) yields an empty list when a pattern matches nothing — confirm xenoncat.cpp is the intended source name.
+file(GLOB SRC_LIST
+    xenoncat.cpp )
+file(GLOB HEADERS
+    cpu_xenoncat.hpp
+    )
+# NOTE(review): CUDA_INCLUDE_DIRS is added although only CPU sources are listed here — confirm it is needed.
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} )
+# Install the archive under lib and the listed header under include/cpu_xenoncat.
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
diff --git a/cpu_xenoncat/Linux/asm/t2.bin b/cpu_xenoncat/Linux/asm/t2.bin
deleted file mode 100644
index 432b9ab90..000000000
Binary files a/cpu_xenoncat/Linux/asm/t2.bin and /dev/null differ
diff --git a/cpu_xenoncat/Linux/blake2b/asm/assemble.sh b/cpu_xenoncat/Linux/blake2b/asm/assemble.sh
deleted file mode 100644
index 91990b5d0..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/assemble.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-fasm zcblake2_avx1.asm
-fasm zcblake2_avx2.asm
diff --git a/cpu_xenoncat/Linux/blake2b/asm/data_blake2b.asm b/cpu_xenoncat/Linux/blake2b/asm/data_blake2b.asm
deleted file mode 100644
index be2026b1b..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/data_blake2b.asm
+++ /dev/null
@@ -1,36 +0,0 @@
-xshufb_ror24 db 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10
-xshufb_ror16 db 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9
-xshufb_bswap8 db 7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8
-xctrinc dd 0,2, 0,2
-
-align 32
-iv dq 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
-dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
-dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
-dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
-
-s0 dq 0x6a09e667f3bcc908 xor 0x1010032, 0xbb67ae8584caa73b	;0x32=50 bytes output
-s2 dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
-s4 dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f
-s6 dq 0x1f83d9abfb41bd6b xor 0x576f50687361635a	;Personalization
-s7 dq 0x5be0cd19137e2179 xor 0x00000009000000c8	;n=200, k=9
-
-iv4xor128 dq 0x510e527fade682d1 xor 0x80, 0x9b05688c2b3e6c1f
-dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
-iv4xor144 dq 0x510e527fade682d1 xor 144, 0x9b05688c2b3e6c1f
-iv6inverted dq 0xe07c265404be4294, 0x5be0cd19137e2179
-
-align 32
-yctrinit dd 0,0, 0,1, 0,2, 0,3
-yctrinc dd 0,4, 0,4, 0,4, 0,4
-
-blake2sigma db 0,2,4,6,1,3,5,7,8,10,12,14,9,11,13,15
-db 14,4,9,13,10,8,15,6,1,0,11,5,12,2,7,3
-db 11,12,5,15,8,0,2,13,10,3,7,9,14,6,1,4
-db 7,3,13,11,9,1,12,14,2,5,4,15,6,10,0,8
-db 9,5,2,10,0,7,4,15,14,11,6,3,1,12,8,13
-db 2,6,0,8,12,10,11,3,4,7,15,1,13,5,14,9
-db 12,1,14,4,5,15,13,10,0,6,9,8,7,3,2,11
-db 13,7,12,3,11,14,1,9,5,15,8,2,0,4,6,10
-db 6,14,11,0,15,9,3,8,12,13,1,10,2,7,4,5
-db 10,8,7,1,2,4,6,5,15,9,3,13,11,14,12,0
diff --git a/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx1.asm b/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx1.asm
deleted file mode 100644
index fa3aeee8c..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx1.asm
+++ /dev/null
@@ -1,349 +0,0 @@
-macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
-{
-vpaddq xmm0,xmm0,xmm4
-vpaddq xmm1,xmm1,xmm5
-vpaddq xmm2,xmm2,xmm6
-vpaddq xmm3,xmm3,xmm7
-if m0<lim
-vpaddq xmm0,xmm0, xword [src+m0*16]
-end if
-if m1<lim
-vpaddq xmm1,xmm1, xword [src+m1*16]
-end if
-if m2<lim
-vpaddq xmm2,xmm2, xword [src+m2*16]
-end if
-if m3<lim
-vpaddq xmm3,xmm3, xword [src+m3*16]
-end if
-vpxor xmm12,xmm12,xmm0
-vpxor xmm13,xmm13,xmm1
-vpxor xmm14,xmm14,xmm2
-vpxor xmm15,xmm15,xmm3
-vpshufd xmm12,xmm12,0xB1
-vpshufd xmm13,xmm13,0xB1
-vpshufd xmm14,xmm14,0xB1
-vpshufd xmm15,xmm15,0xB1
-vpaddq xmm8,xmm8,xmm12
-vpaddq xmm9,xmm9,xmm13
-vpaddq xmm10,xmm10,xmm14
-vpaddq xmm11,xmm11,xmm15
-vpxor xmm4,xmm4,xmm8
-vpxor xmm5,xmm5,xmm9
-vpxor xmm6,xmm6,xmm10
-vpxor xmm7,xmm7,xmm11
-vmovdqa [rsp], xmm8
-vmovdqa xmm8, xword [xshufb_ror24]
-vpshufb xmm4,xmm4,xmm8
-vpshufb xmm5,xmm5,xmm8
-vpshufb xmm6,xmm6,xmm8
-vpshufb xmm7,xmm7,xmm8
-vmovdqa xmm8, [rsp]
-
-vpaddq xmm0,xmm0,xmm4
-vpaddq xmm1,xmm1,xmm5
-vpaddq xmm2,xmm2,xmm6
-vpaddq xmm3,xmm3,xmm7
-if m4<lim
-vpaddq xmm0,xmm0, xword [src+m4*16]
-end if
-if m5<lim
-vpaddq xmm1,xmm1, xword [src+m5*16]
-end if
-if m6<lim
-vpaddq xmm2,xmm2, xword [src+m6*16]
-end if
-if m7<lim
-vpaddq xmm3,xmm3, xword [src+m7*16]
-end if
-vpxor xmm12,xmm12,xmm0
-vpxor xmm13,xmm13,xmm1
-vpxor xmm14,xmm14,xmm2
-vpxor xmm15,xmm15,xmm3
-vmovdqa [rsp], xmm0
-vmovdqa xmm0, xword [xshufb_ror16]
-vpshufb xmm12,xmm12,xmm0
-vpshufb xmm13,xmm13,xmm0
-vpshufb xmm14,xmm14,xmm0
-vpshufb xmm15,xmm15,xmm0
-vpaddq xmm8,xmm8,xmm12
-vpaddq xmm9,xmm9,xmm13
-vpaddq xmm10,xmm10,xmm14
-vpaddq xmm11,xmm11,xmm15
-vpxor xmm4,xmm4,xmm8
-vpxor xmm5,xmm5,xmm9
-vpxor xmm6,xmm6,xmm10
-vpxor xmm7,xmm7,xmm11
-
-vpaddq xmm0,xmm4,xmm4
-vpsrlq xmm4,xmm4,63
-vpor xmm4,xmm4,xmm0
-vpaddq xmm0,xmm5,xmm5
-vpsrlq xmm5,xmm5,63
-vpor xmm5,xmm5,xmm0
-vpaddq xmm0,xmm6,xmm6
-vpsrlq xmm6,xmm6,63
-vpor xmm6,xmm6,xmm0
-vpaddq xmm0,xmm7,xmm7
-vpsrlq xmm7,xmm7,63
-vpor xmm7,xmm7,xmm0
-
-vmovdqa xmm0, [rsp]
-}
-
-macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
-{
-vpaddq xmm0,xmm0,xmm5
-vpaddq xmm1,xmm1,xmm6
-vpaddq xmm2,xmm2,xmm7
-vpaddq xmm3,xmm3,xmm4
-if m0<lim
-vpaddq xmm0,xmm0, xword [src+m0*16]
-end if
-if m1<lim
-vpaddq xmm1,xmm1, xword [src+m1*16]
-end if
-if m2<lim
-vpaddq xmm2,xmm2, xword [src+m2*16]
-end if
-if m3<lim
-vpaddq xmm3,xmm3, xword [src+m3*16]
-end if
-vpxor xmm15,xmm15,xmm0
-vpxor xmm12,xmm12,xmm1
-vpxor xmm13,xmm13,xmm2
-vpxor xmm14,xmm14,xmm3
-vpshufd xmm15,xmm15,0xB1
-vpshufd xmm12,xmm12,0xB1
-vpshufd xmm13,xmm13,0xB1
-vpshufd xmm14,xmm14,0xB1
-vpaddq xmm10,xmm10,xmm15
-vpaddq xmm11,xmm11,xmm12
-vpaddq xmm8,xmm8,xmm13
-vpaddq xmm9,xmm9,xmm14
-vpxor xmm5,xmm5,xmm10
-vpxor xmm6,xmm6,xmm11
-vpxor xmm7,xmm7,xmm8
-vpxor xmm4,xmm4,xmm9
-vmovdqa [rsp], xmm10
-vmovdqa xmm10, xword [xshufb_ror24]
-vpshufb xmm5,xmm5,xmm10
-vpshufb xmm6,xmm6,xmm10
-vpshufb xmm7,xmm7,xmm10
-vpshufb xmm4,xmm4,xmm10
-vmovdqa xmm10, [rsp]
-
-vpaddq xmm0,xmm0,xmm5
-vpaddq xmm1,xmm1,xmm6
-vpaddq xmm2,xmm2,xmm7
-vpaddq xmm3,xmm3,xmm4
-if m4<lim
-vpaddq xmm0,xmm0, xword [src+m4*16]
-end if
-if m5<lim
-vpaddq xmm1,xmm1, xword [src+m5*16]
-end if
-if m6<lim
-vpaddq xmm2,xmm2, xword [src+m6*16]
-end if
-if m7<lim
-vpaddq xmm3,xmm3, xword [src+m7*16]
-end if
-vpxor xmm15,xmm15,xmm0
-vpxor xmm12,xmm12,xmm1
-vpxor xmm13,xmm13,xmm2
-vpxor xmm14,xmm14,xmm3
-vmovdqa [rsp], xmm0
-vmovdqa xmm0, xword [xshufb_ror16]
-vpshufb xmm15,xmm15,xmm0
-vpshufb xmm12,xmm12,xmm0
-vpshufb xmm13,xmm13,xmm0
-vpshufb xmm14,xmm14,xmm0
-vpaddq xmm10,xmm10,xmm15
-vpaddq xmm11,xmm11,xmm12
-vpaddq xmm8,xmm8,xmm13
-vpaddq xmm9,xmm9,xmm14
-vpxor xmm5,xmm5,xmm10
-vpxor xmm6,xmm6,xmm11
-vpxor xmm7,xmm7,xmm8
-vpxor xmm4,xmm4,xmm9
-
-vpaddq xmm0,xmm5,xmm5
-vpsrlq xmm5,xmm5,63
-vpor xmm5,xmm5,xmm0
-vpaddq xmm0,xmm6,xmm6
-vpsrlq xmm6,xmm6,63
-vpor xmm6,xmm6,xmm0
-vpaddq xmm0,xmm7,xmm7
-vpsrlq xmm7,xmm7,63
-vpor xmm7,xmm7,xmm0
-vpaddq xmm0,xmm4,xmm4
-vpsrlq xmm4,xmm4,63
-vpor xmm4,xmm4,xmm0
-
-vmovdqa xmm0, [rsp]
-}
-
-macro Blake2bRounds2 lim,src
-{
-;ROUND 0
-;hR0 0,2,4,6,1,3,5,7,lim,src
-;hR1 8,10,12,14,9,11,13,15,lim,src
-
-;ROUND 1
-hR0 14,4,9,13,10,8,15,6,lim,src
-hR1 1,0,11,5,12,2,7,3,lim,src
-
-;ROUND 2
-hR0 11,12,5,15,8,0,2,13,lim,src
-hR1 10,3,7,9,14,6,1,4,lim,src
-
-;ROUND 3
-hR0 7,3,13,11,9,1,12,14,lim,src
-hR1 2,5,4,15,6,10,0,8,lim,src
-
-;ROUND 4
-hR0 9,5,2,10,0,7,4,15,lim,src
-hR1 14,11,6,3,1,12,8,13,lim,src
-
-;ROUND 5
-hR0 2,6,0,8,12,10,11,3,lim,src
-hR1 4,7,15,1,13,5,14,9,lim,src
-
-;ROUND 6
-hR0 12,1,14,4,5,15,13,10,lim,src
-hR1 0,6,9,8,7,3,2,11,lim,src
-
-;ROUND 7
-hR0 13,7,12,3,11,14,1,9,lim,src
-hR1 5,15,8,2,0,4,6,10,lim,src
-
-;ROUND 8
-hR0 6,14,11,0,15,9,3,8,lim,src
-hR1 12,13,1,10,2,7,4,5,lim,src
-
-;ROUND 9
-hR0 10,8,7,1,2,4,6,5,lim,src
-hR1 15,9,3,13,11,14,12,0,lim,src
-
-;ROUND 10
-hR0 0,2,4,6,1,3,5,7,lim,src
-hR1 8,10,12,14,9,11,13,15,lim,src
-
-;ROUND 11
-hR0 14,4,9,13,10,8,15,6,lim,src
-hR1 1,0,11,5,12,2,7,3,lim,src
-}
-
-macro Blake2beq2of2 mids, src
-{
-vmovddup xmm0, qword [mids]
-vpaddq xmm0,xmm0, xword [src+1*16]
-vmovddup xmm12, qword [mids+0x08]
-vpxor xmm12,xmm12,xmm0
-vpshufb xmm12,xmm12, xword [xshufb_ror16]
-vmovddup xmm8, qword [mids+0x10]
-vpaddq xmm8,xmm8,xmm12
-vmovddup xmm4, qword [mids+0x18]
-vpxor xmm4,xmm4,xmm8
-vpaddq xmm2,xmm4,xmm4	;xmm2 is temp
-vpsrlq xmm4,xmm4,63
-vpor xmm4,xmm4,xmm2
-
-vmovddup xmm5, qword [mids+0x20]
-vpaddq xmm0,xmm0,xmm5
-vmovddup xmm1, qword [mids+0x30]
-vpxor xmm12,xmm12,xmm1
-vpshufd xmm12,xmm12,0xB1
-vmovddup xmm13, qword [mids+0x38]
-vpaddq xmm8,xmm8,xmm13
-vmovddup xmm3, qword [mids+0x60]
-vpaddq xmm3,xmm3,xmm4
-vmovddup xmm15, qword [mids+0x48]
-vpxor xmm15,xmm15,xmm0
-vpshufd xmm15,xmm15,0xB1
-vmovddup xmm11, qword [mids+0x58]
-vpaddq xmm11,xmm11,xmm12
-vmovddup xmm7, qword [mids+0x68]
-vpxor xmm7,xmm7,xmm8
-vmovddup xmm14, qword [mids+0x40]
-vpxor xmm14,xmm14,xmm3
-vpshufd xmm14,xmm14,0xB1
-vmovddup xmm10, qword [mids+0x50]
-vpaddq xmm10,xmm10,xmm15
-vmovddup xmm6, qword [mids+0x28]
-vpxor xmm6,xmm6,xmm11
-vmovddup xmm9, qword [mids+0x70]
-vpaddq xmm9,xmm9,xmm14
-vpxor xmm5,xmm5,xmm10
-vpxor xmm4,xmm4,xmm9
-vmovdqa xmm2, xword [xshufb_ror24]	;xmm2 is temp
-vpshufb xmm5,xmm5,xmm2
-vpshufb xmm6,xmm6,xmm2
-vpshufb xmm7,xmm7,xmm2
-vpshufb xmm4,xmm4,xmm2
-vmovddup xmm2, qword [mids+0x78]
-
-vpaddq xmm0,xmm0,xmm5
-vpaddq xmm1,xmm1,xmm6
-vpaddq xmm2,xmm2,xmm7
-vpaddq xmm3,xmm3,xmm4
-vpxor xmm15,xmm15,xmm0
-vpxor xmm12,xmm12,xmm1
-vpxor xmm13,xmm13,xmm2
-vpxor xmm14,xmm14,xmm3
-vmovdqa [rsp], xmm0
-vmovdqa xmm0, xword [xshufb_ror16]
-vpshufb xmm15,xmm15,xmm0
-vpshufb xmm12,xmm12,xmm0
-vpshufb xmm13,xmm13,xmm0
-vpshufb xmm14,xmm14,xmm0
-vpaddq xmm10,xmm10,xmm15
-vpaddq xmm11,xmm11,xmm12
-vpaddq xmm8,xmm8,xmm13
-vpaddq xmm9,xmm9,xmm14
-vpxor xmm5,xmm5,xmm10
-vpxor xmm6,xmm6,xmm11
-vpxor xmm7,xmm7,xmm8
-vpxor xmm4,xmm4,xmm9
-vpaddq xmm0,xmm5,xmm5
-vpsrlq xmm5,xmm5,63
-vpor xmm5,xmm5,xmm0
-vpaddq xmm0,xmm6,xmm6
-vpsrlq xmm6,xmm6,63
-vpor xmm6,xmm6,xmm0
-vpaddq xmm0,xmm7,xmm7
-vpsrlq xmm7,xmm7,63
-vpor xmm7,xmm7,xmm0
-vpaddq xmm0,xmm4,xmm4
-vpsrlq xmm4,xmm4,63
-vpor xmm4,xmm4,xmm0
-vmovdqa xmm0, [rsp]
-
-Blake2bRounds2 2,src
-
-vpxor xmm0, xmm0, xmm8
-vpxor xmm1, xmm1, xmm9
-vpxor xmm2, xmm2, xmm10
-vpxor xmm3, xmm3, xmm11
-vpxor xmm4, xmm4, xmm12
-vpxor xmm5, xmm5, xmm13
-vpxor xmm6, xmm6, xmm14
-;vpxor xmm7, xmm7, xmm15
-vmovddup xmm8, qword [mids+0x80]
-vmovddup xmm9, qword [mids+0x88]
-vmovddup xmm10, qword [mids+0x90]
-vmovddup xmm11, qword [mids+0x98]
-vmovddup xmm12, qword [mids+0xa0]
-vmovddup xmm13, qword [mids+0xa8]
-vmovddup xmm14, qword [mids+0xb0]
-;vmovddup xmm15, qword [mids+0xb8]
-vpxor xmm0, xmm0, xmm8
-vpxor xmm1, xmm1, xmm9
-vpxor xmm2, xmm2, xmm10
-vpxor xmm3, xmm3, xmm11
-vpxor xmm4, xmm4, xmm12
-vpxor xmm5, xmm5, xmm13
-vpxor xmm6, xmm6, xmm14
-;vpxor xmm7, xmm7, xmm15
-}
diff --git a/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx2.asm b/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx2.asm
deleted file mode 100644
index 35fff17b9..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx2.asm
+++ /dev/null
@@ -1,350 +0,0 @@
-macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
-{
-vpaddq ymm0,ymm0,ymm4
-vpaddq ymm1,ymm1,ymm5
-vpaddq ymm2,ymm2,ymm6
-vpaddq ymm3,ymm3,ymm7
-if m0<lim
-vpaddq ymm0,ymm0, yword [src+m0*32]
-end if
-if m1<lim
-vpaddq ymm1,ymm1, yword [src+m1*32]
-end if
-if m2<lim
-vpaddq ymm2,ymm2, yword [src+m2*32]
-end if
-if m3<lim
-vpaddq ymm3,ymm3, yword [src+m3*32]
-end if
-vpxor ymm12,ymm12,ymm0
-vpxor ymm13,ymm13,ymm1
-vpxor ymm14,ymm14,ymm2
-vpxor ymm15,ymm15,ymm3
-vpshufd ymm12,ymm12,0xB1
-vpshufd ymm13,ymm13,0xB1
-vpshufd ymm14,ymm14,0xB1
-vpshufd ymm15,ymm15,0xB1
-vpaddq ymm8,ymm8,ymm12
-vpaddq ymm9,ymm9,ymm13
-vpaddq ymm10,ymm10,ymm14
-vpaddq ymm11,ymm11,ymm15
-vpxor ymm4,ymm4,ymm8
-vpxor ymm5,ymm5,ymm9
-vpxor ymm6,ymm6,ymm10
-vpxor ymm7,ymm7,ymm11
-vmovdqa [rsp], ymm8
-vbroadcasti128 ymm8, xword [xshufb_ror24]
-vpshufb ymm4,ymm4,ymm8
-vpshufb ymm5,ymm5,ymm8
-vpshufb ymm6,ymm6,ymm8
-vpshufb ymm7,ymm7,ymm8
-vmovdqa ymm8, [rsp]
-
-vpaddq ymm0,ymm0,ymm4
-vpaddq ymm1,ymm1,ymm5
-vpaddq ymm2,ymm2,ymm6
-vpaddq ymm3,ymm3,ymm7
-if m4<lim
-vpaddq ymm0,ymm0, yword [src+m4*32]
-end if
-if m5<lim
-vpaddq ymm1,ymm1, yword [src+m5*32]
-end if
-if m6<lim
-vpaddq ymm2,ymm2, yword [src+m6*32]
-end if
-if m7<lim
-vpaddq ymm3,ymm3, yword [src+m7*32]
-end if
-vpxor ymm12,ymm12,ymm0
-vpxor ymm13,ymm13,ymm1
-vpxor ymm14,ymm14,ymm2
-vpxor ymm15,ymm15,ymm3
-vmovdqa [rsp], ymm0
-vbroadcasti128 ymm0, xword [xshufb_ror16]
-vpshufb ymm12,ymm12,ymm0
-vpshufb ymm13,ymm13,ymm0
-vpshufb ymm14,ymm14,ymm0
-vpshufb ymm15,ymm15,ymm0
-vpaddq ymm8,ymm8,ymm12
-vpaddq ymm9,ymm9,ymm13
-vpaddq ymm10,ymm10,ymm14
-vpaddq ymm11,ymm11,ymm15
-vpxor ymm4,ymm4,ymm8
-vpxor ymm5,ymm5,ymm9
-vpxor ymm6,ymm6,ymm10
-vpxor ymm7,ymm7,ymm11
-
-vpaddq ymm0,ymm4,ymm4
-vpsrlq ymm4,ymm4,63
-vpor ymm4,ymm4,ymm0
-vpaddq ymm0,ymm5,ymm5
-vpsrlq ymm5,ymm5,63
-vpor ymm5,ymm5,ymm0
-vpaddq ymm0,ymm6,ymm6
-vpsrlq ymm6,ymm6,63
-vpor ymm6,ymm6,ymm0
-vpaddq ymm0,ymm7,ymm7
-vpsrlq ymm7,ymm7,63
-vpor ymm7,ymm7,ymm0
-
-vmovdqa ymm0, [rsp]
-}
-
-macro hR1 m0,m1,m2,m3,m4,m5,m6,m7,lim,src
-{
-vpaddq ymm0,ymm0,ymm5
-vpaddq ymm1,ymm1,ymm6
-vpaddq ymm2,ymm2,ymm7
-vpaddq ymm3,ymm3,ymm4
-if m0<lim
-vpaddq ymm0,ymm0, yword [src+m0*32]
-end if
-if m1<lim
-vpaddq ymm1,ymm1, yword [src+m1*32]
-end if
-if m2<lim
-vpaddq ymm2,ymm2, yword [src+m2*32]
-end if
-if m3<lim
-vpaddq ymm3,ymm3, yword [src+m3*32]
-end if
-vpxor ymm15,ymm15,ymm0
-vpxor ymm12,ymm12,ymm1
-vpxor ymm13,ymm13,ymm2
-vpxor ymm14,ymm14,ymm3
-vpshufd ymm15,ymm15,0xB1
-vpshufd ymm12,ymm12,0xB1
-vpshufd ymm13,ymm13,0xB1
-vpshufd ymm14,ymm14,0xB1
-vpaddq ymm10,ymm10,ymm15
-vpaddq ymm11,ymm11,ymm12
-vpaddq ymm8,ymm8,ymm13
-vpaddq ymm9,ymm9,ymm14
-vpxor ymm5,ymm5,ymm10
-vpxor ymm6,ymm6,ymm11
-vpxor ymm7,ymm7,ymm8
-vpxor ymm4,ymm4,ymm9
-vmovdqa [rsp], ymm10
-vbroadcasti128 ymm10, xword [xshufb_ror24]
-vpshufb ymm5,ymm5,ymm10
-vpshufb ymm6,ymm6,ymm10
-vpshufb ymm7,ymm7,ymm10
-vpshufb ymm4,ymm4,ymm10
-vmovdqa ymm10, [rsp]
-
-vpaddq ymm0,ymm0,ymm5
-vpaddq ymm1,ymm1,ymm6
-vpaddq ymm2,ymm2,ymm7
-vpaddq ymm3,ymm3,ymm4
-if m4<lim
-vpaddq ymm0,ymm0, yword [src+m4*32]
-end if
-if m5<lim
-vpaddq ymm1,ymm1, yword [src+m5*32]
-end if
-if m6<lim
-vpaddq ymm2,ymm2, yword [src+m6*32]
-end if
-if m7<lim
-vpaddq ymm3,ymm3, yword [src+m7*32]
-end if
-vpxor ymm15,ymm15,ymm0
-vpxor ymm12,ymm12,ymm1
-vpxor ymm13,ymm13,ymm2
-vpxor ymm14,ymm14,ymm3
-vmovdqa [rsp], ymm0
-vbroadcasti128 ymm0, xword [xshufb_ror16]
-vpshufb ymm15,ymm15,ymm0
-vpshufb ymm12,ymm12,ymm0
-vpshufb ymm13,ymm13,ymm0
-vpshufb ymm14,ymm14,ymm0
-vpaddq ymm10,ymm10,ymm15
-vpaddq ymm11,ymm11,ymm12
-vpaddq ymm8,ymm8,ymm13
-vpaddq ymm9,ymm9,ymm14
-vpxor ymm5,ymm5,ymm10
-vpxor ymm6,ymm6,ymm11
-vpxor ymm7,ymm7,ymm8
-vpxor ymm4,ymm4,ymm9
-
-vpaddq ymm0,ymm5,ymm5
-vpsrlq ymm5,ymm5,63
-vpor ymm5,ymm5,ymm0
-vpaddq ymm0,ymm6,ymm6
-vpsrlq ymm6,ymm6,63
-vpor ymm6,ymm6,ymm0
-vpaddq ymm0,ymm7,ymm7
-vpsrlq ymm7,ymm7,63
-vpor ymm7,ymm7,ymm0
-vpaddq ymm0,ymm4,ymm4
-vpsrlq ymm4,ymm4,63
-vpor ymm4,ymm4,ymm0
-
-vmovdqa ymm0, [rsp]
-}
-
-macro Blake2bRounds2 lim,src
-{
-;ROUND 0
-;hR0 0,2,4,6,1,3,5,7,lim,src
-;hR1 8,10,12,14,9,11,13,15,lim,src
-
-;ROUND 1
-hR0 14,4,9,13,10,8,15,6,lim,src
-hR1 1,0,11,5,12,2,7,3,lim,src
-
-;ROUND 2
-hR0 11,12,5,15,8,0,2,13,lim,src
-hR1 10,3,7,9,14,6,1,4,lim,src
-
-;ROUND 3
-hR0 7,3,13,11,9,1,12,14,lim,src
-hR1 2,5,4,15,6,10,0,8,lim,src
-
-;ROUND 4
-hR0 9,5,2,10,0,7,4,15,lim,src
-hR1 14,11,6,3,1,12,8,13,lim,src
-
-;ROUND 5
-hR0 2,6,0,8,12,10,11,3,lim,src
-hR1 4,7,15,1,13,5,14,9,lim,src
-
-;ROUND 6
-hR0 12,1,14,4,5,15,13,10,lim,src
-hR1 0,6,9,8,7,3,2,11,lim,src
-
-;ROUND 7
-hR0 13,7,12,3,11,14,1,9,lim,src
-hR1 5,15,8,2,0,4,6,10,lim,src
-
-;ROUND 8
-hR0 6,14,11,0,15,9,3,8,lim,src
-hR1 12,13,1,10,2,7,4,5,lim,src
-
-;ROUND 9
-hR0 10,8,7,1,2,4,6,5,lim,src
-hR1 15,9,3,13,11,14,12,0,lim,src
-
-;ROUND 10
-hR0 0,2,4,6,1,3,5,7,lim,src
-hR1 8,10,12,14,9,11,13,15,lim,src
-
-;ROUND 11
-hR0 14,4,9,13,10,8,15,6,lim,src
-hR1 1,0,11,5,12,2,7,3,lim,src
-}
-
-macro Blake2beq2of2 mids, src
-{
-vpbroadcastq ymm0, qword [mids]
-vpaddq ymm0,ymm0, yword [src+1*32]
-vpbroadcastq ymm12, qword [mids+0x08]
-vpxor ymm12,ymm12,ymm0
-vbroadcasti128 ymm2, xword [xshufb_ror16]	;ymm2 is temp
-vpshufb ymm12,ymm12,ymm2
-vpbroadcastq ymm8, qword [mids+0x10]
-vpaddq ymm8,ymm8,ymm12
-vpbroadcastq ymm4, qword [mids+0x18]
-vpxor ymm4,ymm4,ymm8
-vpaddq ymm2,ymm4,ymm4	;ymm2 is temp
-vpsrlq ymm4,ymm4,63
-vpor ymm4,ymm4,ymm2
-
-vpbroadcastq ymm5, qword [mids+0x20]
-vpaddq ymm0,ymm0,ymm5
-vpbroadcastq ymm1, qword [mids+0x30]
-vpxor ymm12,ymm12,ymm1
-vpshufd ymm12,ymm12,0xB1
-vpbroadcastq ymm13, qword [mids+0x38]
-vpaddq ymm8,ymm8,ymm13
-vpbroadcastq ymm3, qword [mids+0x60]
-vpaddq ymm3,ymm3,ymm4
-vpbroadcastq ymm15, qword [mids+0x48]
-vpxor ymm15,ymm15,ymm0
-vpshufd ymm15,ymm15,0xB1
-vpbroadcastq ymm11, qword [mids+0x58]
-vpaddq ymm11,ymm11,ymm12
-vpbroadcastq ymm7, qword [mids+0x68]
-vpxor ymm7,ymm7,ymm8
-vpbroadcastq ymm14, qword [mids+0x40]
-vpxor ymm14,ymm14,ymm3
-vpshufd ymm14,ymm14,0xB1
-vpbroadcastq ymm10, qword [mids+0x50]
-vpaddq ymm10,ymm10,ymm15
-vpbroadcastq ymm6, qword [mids+0x28]
-vpxor ymm6,ymm6,ymm11
-vpbroadcastq ymm9, qword [mids+0x70]
-vpaddq ymm9,ymm9,ymm14
-vpxor ymm5,ymm5,ymm10
-vpxor ymm4,ymm4,ymm9
-vbroadcasti128 ymm2, xword [xshufb_ror24]	;ymm2 is temp
-vpshufb ymm5,ymm5,ymm2
-vpshufb ymm6,ymm6,ymm2
-vpshufb ymm7,ymm7,ymm2
-vpshufb ymm4,ymm4,ymm2
-vpbroadcastq ymm2, qword [mids+0x78]
-
-vpaddq ymm0,ymm0,ymm5
-vpaddq ymm1,ymm1,ymm6
-vpaddq ymm2,ymm2,ymm7
-vpaddq ymm3,ymm3,ymm4
-vpxor ymm15,ymm15,ymm0
-vpxor ymm12,ymm12,ymm1
-vpxor ymm13,ymm13,ymm2
-vpxor ymm14,ymm14,ymm3
-vmovdqa [rsp], ymm0
-vbroadcasti128 ymm0, xword [xshufb_ror16]
-vpshufb ymm15,ymm15,ymm0
-vpshufb ymm12,ymm12,ymm0
-vpshufb ymm13,ymm13,ymm0
-vpshufb ymm14,ymm14,ymm0
-vpaddq ymm10,ymm10,ymm15
-vpaddq ymm11,ymm11,ymm12
-vpaddq ymm8,ymm8,ymm13
-vpaddq ymm9,ymm9,ymm14
-vpxor ymm5,ymm5,ymm10
-vpxor ymm6,ymm6,ymm11
-vpxor ymm7,ymm7,ymm8
-vpxor ymm4,ymm4,ymm9
-vpaddq ymm0,ymm5,ymm5
-vpsrlq ymm5,ymm5,63
-vpor ymm5,ymm5,ymm0
-vpaddq ymm0,ymm6,ymm6
-vpsrlq ymm6,ymm6,63
-vpor ymm6,ymm6,ymm0
-vpaddq ymm0,ymm7,ymm7
-vpsrlq ymm7,ymm7,63
-vpor ymm7,ymm7,ymm0
-vpaddq ymm0,ymm4,ymm4
-vpsrlq ymm4,ymm4,63
-vpor ymm4,ymm4,ymm0
-vmovdqa ymm0, [rsp]
-
-Blake2bRounds2 2,src
-
-vpxor ymm0, ymm0, ymm8
-vpxor ymm1, ymm1, ymm9
-vpxor ymm2, ymm2, ymm10
-vpxor ymm3, ymm3, ymm11
-vpxor ymm4, ymm4, ymm12
-vpxor ymm5, ymm5, ymm13
-vpxor ymm6, ymm6, ymm14
-;vpxor ymm7, ymm7, ymm15
-vpbroadcastq ymm8, qword [mids+0x80]
-vpbroadcastq ymm9, qword [mids+0x88]
-vpbroadcastq ymm10, qword [mids+0x90]
-vpbroadcastq ymm11, qword [mids+0x98]
-vpbroadcastq ymm12, qword [mids+0xa0]
-vpbroadcastq ymm13, qword [mids+0xa8]
-vpbroadcastq ymm14, qword [mids+0xb0]
-;vpbroadcastq ymm15, qword [mids+0xb8]
-vpxor ymm0, ymm0, ymm8
-vpxor ymm1, ymm1, ymm9
-vpxor ymm2, ymm2, ymm10
-vpxor ymm3, ymm3, ymm11
-vpxor ymm4, ymm4, ymm12
-vpxor ymm5, ymm5, ymm13
-vpxor ymm6, ymm6, ymm14
-;vpxor ymm7, ymm7, ymm15
-}
diff --git a/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx1.asm b/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx1.asm
deleted file mode 100644
index 8a61663aa..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx1.asm
+++ /dev/null
@@ -1,39 +0,0 @@
-;void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
-;hashout: hash output buffer: 2*64 bytes
-;midstate: 256 bytes from Blake2PrepareMidstate2
-;indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
-
-include "macro_blake2b_avx1.asm"
-
-Blake2Run2:
-mov rax, rsp
-sub rsp, 0x28
-and rsp, -32
-mov [rsp+0x20], rax
-
-mov [rsi+0xd4], edx
-add edx, 1
-mov [rsi+0xdc], edx
-
-Blake2beq2of2 rsi, rsi+0xc0
-
-vpunpcklqdq xmm8, xmm0, xmm1
-vpunpckhqdq xmm1, xmm0, xmm1
-vpunpcklqdq xmm10, xmm2, xmm3
-vpunpckhqdq xmm3, xmm2, xmm3
-vpunpcklqdq xmm12, xmm4, xmm5
-vpunpckhqdq xmm5, xmm4, xmm5
-vpunpcklqdq xmm14, xmm6, xmm7
-vpunpckhqdq xmm7, xmm6, xmm7
-
-vmovdqa [rdi], xmm8
-vmovdqa [rdi+0x10], xmm10
-vmovdqa [rdi+0x20], xmm12
-vmovdqa [rdi+0x30], xmm14
-vmovdqa [rdi+0x40], xmm1
-vmovdqa [rdi+0x50], xmm3
-vmovdqa [rdi+0x60], xmm5
-vmovdqa [rdi+0x70], xmm7
-
-mov rsp, [rsp+0x20]
-ret
diff --git a/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx2.asm b/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx2.asm
deleted file mode 100644
index a005e1430..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/proc_blake2_avx2.asm
+++ /dev/null
@@ -1,49 +0,0 @@
-;void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
-;hashout: hash output buffer: 4*64 bytes
-;midstate: 256 bytes from Blake2PrepareMidstate4
-;indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
-
-include "macro_blake2b_avx2.asm"
-
-Blake2Run4:
-mov rax, rsp
-sub rsp, 0x28
-and rsp, -32
-mov [rsp+0x20], rax
-
-vmovd xmm0, edx		;indexctr
-vpbroadcastd ymm0, xmm0
-vpaddd ymm0, ymm0, yword [yctrinit]
-vpblendd ymm0, ymm0, yword [rsi+0xe0], 0x55
-vmovdqa yword [rsi+0xe0], ymm0
-
-Blake2beq2of2 rsi, rsi+0xc0
-
-vpunpcklqdq ymm8, ymm0, ymm1
-vpunpckhqdq ymm9, ymm0, ymm1
-vpunpcklqdq ymm10, ymm2, ymm3
-vpunpckhqdq ymm11, ymm2, ymm3
-vpunpcklqdq ymm12, ymm4, ymm5
-vpunpckhqdq ymm13, ymm4, ymm5
-vpunpcklqdq ymm14, ymm6, ymm7
-vpunpckhqdq ymm15, ymm6, ymm7
-vperm2i128 ymm0, ymm8, ymm10, 0x20
-vperm2i128 ymm1, ymm12, ymm14, 0x20
-vperm2i128 ymm2, ymm9, ymm11, 0x20
-vperm2i128 ymm3, ymm13, ymm15, 0x20
-vperm2i128 ymm4, ymm8, ymm10, 0x31
-vperm2i128 ymm5, ymm12, ymm14, 0x31
-vperm2i128 ymm6, ymm9, ymm11, 0x31
-vperm2i128 ymm7, ymm13, ymm15, 0x31
-
-vmovdqa [rdi], ymm0
-vmovdqa [rdi+0x20], ymm1
-vmovdqa [rdi+0x40], ymm2
-vmovdqa [rdi+0x60], ymm3
-vmovdqa [rdi+0x80], ymm4
-vmovdqa [rdi+0xa0], ymm5
-vmovdqa [rdi+0xc0], ymm6
-vmovdqa [rdi+0xe0], ymm7
-
-mov rsp, [rsp+0x20]
-ret
diff --git a/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx1.asm b/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx1.asm
deleted file mode 100644
index 2cbefa3e4..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx1.asm
+++ /dev/null
@@ -1,212 +0,0 @@
-;void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
-;midstate: 256 bytes of buffer for output midstate, aligned by 32
-;input: 140 bytes header, preferably aligned by 8
-
-Blake2PrepareMidstate2:
-sub rsp, 0x188
-
-vmovdqa xmm10, xword [xshufb_ror24]
-vmovdqa xmm11, xword [xshufb_ror16]
-
-vmovdqa xmm0, xword [s0]
-vmovdqa xmm1, xword [s2]
-vmovdqa xmm2, xword [s4]
-vmovdqa xmm3, xword [s6]
-vmovdqa xmm4, xword [iv]
-vmovdqa xmm5, xword [iv+0x10]
-vmovdqa xmm6, xword [iv4xor128]
-vmovdqa xmm7, xword [iv4xor128+0x10]
-
-mov r8, rsp
-lea r9, [blake2sigma]
-lea r11, [blake2sigma+160]
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r8, 0x80
-add r9, 16
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r8, 0x80
-add r9, 16
-_LoopEhPrepare1:
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r9, 16
-cmp r9, r11
-jb _LoopEhPrepare1
-mov r8, rsp
-call _ProcBlakeRound
-add r8, 0x80
-call _ProcBlakeRound
-
-vpxor xmm0, xmm0, xmm4
-vpxor xmm1, xmm1, xmm5
-vpxor xmm2, xmm2, xmm6
-vpxor xmm3, xmm3, xmm7
-vpxor xmm0, xmm0, xword [s0]
-vpxor xmm1, xmm1, xword [s2]
-vpxor xmm2, xmm2, xword [s4]
-vpxor xmm3, xmm3, xword [s6]
-vmovdqa xword [rdi+0x80], xmm0
-vmovdqa xword [rdi+0x90], xmm1
-vmovdqa xword [rdi+0xa0], xmm2
-vmovdqa xword [rdi+0xb0], xmm3
-vmovq xmm8, [rsi+0x80]
-vpshufd xmm4, xmm8, 0x44
-vmovdqa xword [rdi+0xc0], xmm4
-vmovd xmm4, [rsi+0x88]
-vpshufd xmm4, xmm4, 0x44
-vmovdqa xword [rdi+0xd0], xmm4
-
-;Begin second message block
-vmovdqa xmm4, xword [iv]
-vmovdqa xmm5, xword [iv+0x10]
-vmovdqa xmm6, xword [iv4xor144]
-vmovdqa xmm7, xword [iv6inverted]
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, xmm8		;xmm8[63:0]=message
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufd xmm6, xmm6, 0xb1
-	vmovq [rdi+0x08], xmm6	;v12
-vpshufd xmm7, xmm7, 0xb1
-vpaddq xmm4, xmm4, xmm6
-	vmovq [rdi+0x10], xmm4	;v8
-vpaddq xmm5, xmm5, xmm7
-vpxor xmm2, xmm2, xmm4
-vpxor xmm3, xmm3, xmm5
-vpshufb xmm2, xmm2, xmm10
-	vmovq [rdi+0x18], xmm2	;v4
-vpshufb xmm3, xmm3, xmm10
-
-vpaddq xmm0, xmm0, xmm2
-	vmovq [rdi], xmm0	;v0
-vpaddq xmm1, xmm1, xmm3
-	vpextrq [rdi+0x60], xmm1, 1	;v3
-;add message (nonce, index) to xmm0 here, but we don't have
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufb xmm6, xmm6, xmm11
-vpshufb xmm7, xmm7, xmm11
-	vmovdqa xword [rdi+0x40], xmm7	;v14,15
-vpaddq xmm4, xmm4, xmm6
-	vpextrq [rdi+0x70], xmm4, 1	;v9
-vpaddq xmm5, xmm5, xmm7
-	vmovdqa xword [rdi+0x50], xmm5	;v10,11
-vpxor xmm2, xmm2, xmm4
-vpxor xmm3, xmm3, xmm5
-vpaddq xmm8, xmm2, xmm2
-vpsrlq xmm2, xmm2, 63
-vpor xmm8, xmm2, xmm8		;xmm8 takes xmm2
-vpaddq xmm2, xmm3, xmm3		;xmm2 is temp
-vpsrlq xmm3, xmm3, 63
-vpor xmm3, xmm3, xmm2
-
-vpalignr xmm2, xmm3, xmm8, 8	;xmm2 resume
-	vmovdqa xword [rdi+0x20], xmm2	;v5,6
-vpsrldq xmm3, xmm3, 8
-	vmovq [rdi+0x68], xmm3		;v7
-vpsrldq xmm7, xmm6, 8
-vpaddq xmm0, xmm0, xmm2
-	vpextrq [rdi+0x30], xmm0, 1	;v1
-vpaddq xmm1, xmm1, xmm3
-	vmovq [rdi+0x78], xmm1		;v2
-vpxor xmm7, xmm7, xmm1
-vpshufd xmm7, xmm7, 0xb1
-	vmovq [rdi+0x38], xmm7		;v13
-
-add rsp, 0x188
-ret
-
-align 16
-_ProcBlakeMsgSched:
-;rsi=src
-;r8=dst
-;r9=sigma table
-xor r10d, r10d
-_LoopBlakeMsgSched:
-movzx eax, byte [r9+r10]
-mov rax, [rsi+rax*8]
-mov [r8+r10*8], rax
-add r10d, 1
-cmp r10d, 16
-jb _LoopBlakeMsgSched
-ret
-
-align 16
-_ProcBlakeRound:
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, [r8]
-vpaddq xmm1, xmm1, [r8+0x10]
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufd xmm6, xmm6, 0xb1
-vpshufd xmm7, xmm7, 0xb1
-vpaddq xmm4, xmm4, xmm6
-vpaddq xmm5, xmm5, xmm7
-vpxor xmm2, xmm2, xmm4
-vpxor xmm3, xmm3, xmm5
-vpshufb xmm2, xmm2, xmm10
-vpshufb xmm3, xmm3, xmm10
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, [r8+0x20]
-vpaddq xmm1, xmm1, [r8+0x30]
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufb xmm9, xmm6, xmm11	;xmm9 takes xmm6
-vpshufb xmm7, xmm7, xmm11
-vpaddq xmm4, xmm4, xmm9
-vpaddq xmm5, xmm5, xmm7
-vpxor xmm2, xmm2, xmm4
-vpxor xmm3, xmm3, xmm5
-vpaddq xmm8, xmm2, xmm2
-vpsrlq xmm2, xmm2, 63
-vpor xmm8, xmm2, xmm8		;xmm8 takes xmm2
-vpaddq xmm2, xmm3, xmm3		;xmm2 is temp
-vpsrlq xmm3, xmm3, 63
-vpor xmm3, xmm3, xmm2
-
-vpalignr xmm2, xmm3, xmm8, 8	;xmm2 resume
-vpalignr xmm3, xmm8, xmm3, 8
-vpalignr xmm6, xmm9, xmm7, 8	;xmm6 resume
-vpalignr xmm7, xmm7, xmm9, 8
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, [r8+0x40]
-vpaddq xmm1, xmm1, [r8+0x50]
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufd xmm6, xmm6, 0xb1
-vpshufd xmm7, xmm7, 0xb1
-vpaddq xmm5, xmm5, xmm6
-vpaddq xmm4, xmm4, xmm7
-vpxor xmm2, xmm2, xmm5
-vpxor xmm3, xmm3, xmm4
-vpshufb xmm2, xmm2, xmm10
-vpshufb xmm3, xmm3, xmm10
-vpaddq xmm0, xmm0, xmm2
-vpaddq xmm1, xmm1, xmm3
-vpaddq xmm0, xmm0, [r8+0x60]
-vpaddq xmm1, xmm1, [r8+0x70]
-vpxor xmm6, xmm6, xmm0
-vpxor xmm7, xmm7, xmm1
-vpshufb xmm9, xmm6, xmm11	;xmm9 takes xmm6
-vpshufb xmm7, xmm7, xmm11
-vpaddq xmm5, xmm5, xmm9
-vpaddq xmm4, xmm4, xmm7
-vpxor xmm2, xmm2, xmm5
-vpxor xmm3, xmm3, xmm4
-vpaddq xmm8, xmm2, xmm2
-vpsrlq xmm2, xmm2, 63
-vpor xmm8, xmm2, xmm8		;xmm8 takes xmm2
-vpaddq xmm2, xmm3, xmm3		;xmm2 is temp
-vpsrlq xmm3, xmm3, 63
-vpor xmm3, xmm3, xmm2
-vpalignr xmm2, xmm8, xmm3, 8	;xmm2 resume
-vpalignr xmm3, xmm3, xmm8, 8
-vpalignr xmm6, xmm7, xmm9, 8	;xmm6 resume
-vpalignr xmm7, xmm9, xmm7, 8
-ret
diff --git a/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx2.asm b/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx2.asm
deleted file mode 100644
index 2447dadb5..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/proc_prepmidstate_avx2.asm
+++ /dev/null
@@ -1,166 +0,0 @@
-;void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
-;midstate: 256 bytes of buffer for output midstate, aligned by 32
-;input: 140 bytes header, preferably aligned by 8
-
-Blake2PrepareMidstate4:
-sub rsp, 0x188
-vbroadcasti128 ymm6, xword [xshufb_ror24]
-vbroadcasti128 ymm7, xword [xshufb_ror16]
-
-vmovdqa ymm0, yword [s0]
-vmovdqa ymm1, yword [s4]
-vmovdqa ymm2, yword [iv]
-vmovdqa ymm3, yword [iv4xor128]
-
-mov r8, rsp
-lea r9, [blake2sigma]
-lea r11, [blake2sigma+160]
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r8, 0x80
-add r9, 16
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r8, 0x80
-add r9, 16
-_LoopEhPrepare1:
-call _ProcBlakeMsgSched
-call _ProcBlakeRound
-add r9, 16
-cmp r9, r11
-jb _LoopEhPrepare1
-mov r8, rsp
-call _ProcBlakeRound
-add r8, 0x80
-call _ProcBlakeRound
-
-vpxor ymm0, ymm0, ymm2
-vpxor ymm1, ymm1, ymm3
-vpxor ymm0, ymm0, yword [s0]
-vpxor ymm1, ymm1, yword [s4]
-vmovdqa yword [rdi+0x80], ymm0
-vmovdqa yword [rdi+0xa0], ymm1
-vmovq xmm5, [rsi+0x80]
-vpbroadcastq ymm4, xmm5
-vmovdqa yword [rdi+0xc0], ymm4
-vmovd xmm4, [rsi+0x88]
-vpbroadcastq ymm4, xmm4
-vmovdqa yword [rdi+0xe0], ymm4
-
-;Begin second message block
-vmovdqa ymm2, yword [iv]
-vmovdqa ymm3, yword [iv4xor144]	;also loads iv6inverted
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, ymm5		;ymm5[63:0]=message
-vpxor ymm3, ymm3, ymm0
-vpshufd ymm3, ymm3, 0xb1
-	vmovq [rdi+0x08], xmm3	;v12
-vpaddq ymm2, ymm2, ymm3
-	vmovq [rdi+0x10], xmm2	;v8
-vpxor ymm1, ymm1, ymm2
-vpshufb ymm1, ymm1, ymm6
-	vmovq [rdi+0x18], xmm1	;v4
-
-vpaddq ymm0, ymm0, ymm1
-	vmovq [rdi], xmm0	;v0, v3 ready
-;add message (nonce, index) to xmm0 here, but we don't have
-vpxor ymm3, ymm3, ymm0
-vpshufb ymm3, ymm3, ymm7
-vextracti128 xmm4, ymm3, 1
-	vmovdqa xword [rdi+0x40], xmm4	;v14,15
-vpaddq ymm2, ymm2, ymm3
-	vpextrq [rdi+0x70], xmm2, 1	;v9
-vextracti128 xmm5, ymm2, 1
-	vmovdqa xword [rdi+0x50], xmm5	;v10,11
-vpxor ymm1, ymm1, ymm2
-vpaddq ymm4, ymm1, ymm1
-vpsrlq ymm1, ymm1, 63
-vpor ymm1, ymm1, ymm4
-;Valid:
-;    v1  v2  v3
-;    v5  v6  v7
-;    v9  v10 v11
-;    v13 v14 v15
-;
-;v1 v2 <- v6 v7
-;v13 <- v2
-
-vpermq ymm1, ymm1, 0x39
-	vmovdqa xword [rdi+0x20], xmm1	;v5,6
-
-vextracti128 xmm4, ymm0, 1
-vextracti128 xmm5, ymm1, 1
-	vpextrq [rdi+0x60], xmm4, 1	;v3
-	vmovq [rdi+0x68], xmm5		;v7
-
-vpsrldq xmm3, xmm3, 8
-vpaddq xmm0, xmm0, xmm1
-	vpextrq [rdi+0x30], xmm0, 1	;v1
-vpaddq xmm4, xmm4, xmm5
-	vmovq [rdi+0x78], xmm4		;v2
-vpxor xmm3, xmm3, xmm4
-vpshufd xmm3, xmm3, 0xb1
-	vmovq [rdi+0x38], xmm3		;v13
-
-add rsp, 0x188
-ret
-
-align 16
-_ProcBlakeMsgSched:
-;rsi=src
-;r8=dst
-;r9=sigma table
-xor r10d, r10d
-_LoopBlakeMsgSched:
-movzx eax, byte [r9+r10]
-mov rax, [rsi+rax*8]
-mov [r8+r10*8], rax
-add r10d, 1
-cmp r10d, 16
-jb _LoopBlakeMsgSched
-ret
-
-align 16
-_ProcBlakeRound:
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, [r8]
-vpxor ymm3, ymm3, ymm0
-vpshufd ymm3, ymm3, 0xb1
-vpaddq ymm2, ymm2, ymm3
-vpxor ymm1, ymm1, ymm2
-vpshufb ymm1, ymm1, ymm6	;ror24
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, [r8+0x20]
-vpxor ymm3, ymm3, ymm0
-vpshufb ymm3, ymm3, ymm7	;ror16
-vpaddq ymm2, ymm2, ymm3
-vpxor ymm1, ymm1, ymm2
-vpaddq ymm4, ymm1, ymm1
-vpsrlq ymm1, ymm1, 63
-vpor ymm1, ymm1, ymm4
-
-vpermq ymm1, ymm1, 0x39
-vpermq ymm2, ymm2, 0x4e
-vpermq ymm3, ymm3, 0x93
-
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, [r8+0x40]
-vpxor ymm3, ymm3, ymm0
-vpshufd ymm3, ymm3, 0xb1
-vpaddq ymm2, ymm2, ymm3
-vpxor ymm1, ymm1, ymm2
-vpshufb ymm1, ymm1, ymm6	;ror24
-vpaddq ymm0, ymm0, ymm1
-vpaddq ymm0, ymm0, [r8+0x60]
-vpxor ymm3, ymm3, ymm0
-vpshufb ymm3, ymm3, ymm7	;ror16
-vpaddq ymm2, ymm2, ymm3
-vpxor ymm1, ymm1, ymm2
-vpaddq ymm4, ymm1, ymm1
-vpsrlq ymm1, ymm1, 63
-vpor ymm1, ymm1, ymm4
-
-vpermq ymm1, ymm1, 0x93
-vpermq ymm2, ymm2, 0x4e
-vpermq ymm3, ymm3, 0x39
-ret
diff --git a/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx1.asm b/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx1.asm
deleted file mode 100644
index c918be9f6..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx1.asm
+++ /dev/null
@@ -1,11 +0,0 @@
-format elf64
-public Blake2PrepareMidstate2
-public Blake2Run2
-
-section '.text' executable align 64
-include "proc_prepmidstate_avx1.asm"
-align 16
-include "proc_blake2_avx1.asm"
-
-section '.data' writeable align 64
-include "data_blake2b.asm"
diff --git a/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx2.asm b/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx2.asm
deleted file mode 100644
index 1f455c00f..000000000
--- a/cpu_xenoncat/Linux/blake2b/asm/zcblake2_avx2.asm
+++ /dev/null
@@ -1,11 +0,0 @@
-format elf64
-public Blake2PrepareMidstate4
-public Blake2Run4
-
-section '.text' executable align 64
-include "proc_prepmidstate_avx2.asm"
-align 16
-include "proc_blake2_avx2.asm"
-
-section '.data' writeable align 64
-include "data_blake2b.asm"
diff --git a/cpu_xenoncat/Linux/blake2b/example_avx1.c b/cpu_xenoncat/Linux/blake2b/example_avx1.c
deleted file mode 100644
index 2ccded166..000000000
--- a/cpu_xenoncat/Linux/blake2b/example_avx1.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-void Blake2PrepareMidstate2(void *midstate, unsigned char *input);
-//midstate: 256 bytes of buffer for output midstate, aligned by 32
-//input: 140 bytes header, preferably aligned by 8
-
-void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr);
-//hashout: hash output buffer: 2*64 bytes
-//midstate: 256 bytes from Blake2PrepareMidstate2
-//indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574}
-
-unsigned char __attribute__((aligned(8))) testdata[140] =
-{
-    0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06, 
-    0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C, 
-    0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09, 
-    0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7, 
-    0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-    0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1, 
-    0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF, 
-    0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
-};
-//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d  out.bin
-
-int main(void)
-{
-	unsigned char midstate_a[256+32];
-	void *pmidstate = (void *) (((long) midstate_a+31L) & -32L);
-	unsigned char hashout_a[128+32];
-	unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
-	unsigned char buf[128];
-	FILE *outfile;
-	int i;
-
-	Blake2PrepareMidstate2(pmidstate, testdata);
-	outfile = fopen("out.bin", "wb");
-
-	for (i=0; i<1048576; i+=2) {
-		Blake2Run2(phashout, pmidstate, i);
-		memcpy(buf, phashout, 50);
-		memcpy(buf+50, phashout+64, 50);
-		fwrite(buf, 100, 1, outfile);	
-	}
-
-	fclose(outfile);
-
-	return 0;
-}
diff --git a/cpu_xenoncat/Linux/blake2b/example_avx2.c b/cpu_xenoncat/Linux/blake2b/example_avx2.c
deleted file mode 100644
index bbf9782d3..000000000
--- a/cpu_xenoncat/Linux/blake2b/example_avx2.c
+++ /dev/null
@@ -1,53 +0,0 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-void Blake2PrepareMidstate4(void *midstate, unsigned char *input);
-//midstate: 256 bytes of buffer for output midstate, aligned by 32
-//input: 140 bytes header, preferably aligned by 8
-
-void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr);
-//hashout: hash output buffer: 4*64 bytes
-//midstate: 256 bytes from Blake2PrepareMidstate4
-//indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572}
-
-unsigned char __attribute__((aligned(8))) testdata[140] =
-{
-    0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06, 
-    0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C, 
-    0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09, 
-    0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7, 
-    0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-    0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1, 
-    0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF, 
-    0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00
-};
-//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d  out.bin
-
-int main(void)
-{
-	unsigned char midstate_a[256+32];
-	void *pmidstate = (void *) (((long) midstate_a+31L) & -32L);
-	unsigned char hashout_a[256+32];
-	unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L);
-	unsigned char buf[256];
-	FILE *outfile;
-	int i;
-
-	Blake2PrepareMidstate4(pmidstate, testdata);
-	outfile = fopen("out.bin", "wb");
-
-	for (i=0; i<1048576; i+=4) {
-		Blake2Run4(phashout, pmidstate, i);
-		memcpy(buf, phashout, 50);
-		memcpy(buf+50, phashout+64, 50);
-		memcpy(buf+100, phashout+128, 50);
-		memcpy(buf+150, phashout+192, 50);
-		fwrite(buf, 200, 1, outfile);	
-	}
-
-	fclose(outfile);
-
-	return 0;
-}
diff --git a/cpu_xenoncat/Linux/demo/input.bin b/cpu_xenoncat/Linux/demo/input.bin
deleted file mode 100644
index 432b9ab90..000000000
Binary files a/cpu_xenoncat/Linux/demo/input.bin and /dev/null differ
diff --git a/cpu_xenoncat/Linux/demo/quickbench.c b/cpu_xenoncat/Linux/demo/quickbench.c
deleted file mode 100644
index 036f8122a..000000000
--- a/cpu_xenoncat/Linux/demo/quickbench.c
+++ /dev/null
@@ -1,78 +0,0 @@
-//compile with
-//gcc -o quickbench quickbench.c equihash_avx2.o
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <time.h>
-
-#define CONTEXT_SIZE 178033152
-#define ITERATIONS 10
-
-//Linkage with assembly
-//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver.
-//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce.
-void EhPrepare(void *context, void *input);
-int32_t EhSolver(void *context, uint32_t nonce);
-extern char testinput[];
-
-int main(void)
-{
-	void *context_alloc, *context, *context_end;
-	uint32_t *pu32;
-	uint64_t *pu64, previous_rdtsc;
-	uint8_t inputheader[144];	//140 byte header
-	FILE *infile, *outfile;
-	struct timespec time0, time1;
-	long t0, t1;
-	int32_t numsolutions, total_solutions;
-	uint32_t nonce, delta_time, total_time;
-	int i, j;
-
-	context_alloc = malloc(CONTEXT_SIZE+4096);
-	context = (void*) (((long) context_alloc+4095) & -4096);
-	context_end = context + CONTEXT_SIZE;
-
-	infile = 0;
-	infile = fopen("input.bin", "rb");
-	if (infile) {
-		puts("Reading input.bin");
-		fread(inputheader, 140, 1, infile);
-		fclose(infile);
-	} else {
-		puts("input.bin not found, use sample data (beta1 testnet block 2)");
-		memcpy(inputheader, testinput, 140);
-	}
-
-
-	EhPrepare(context, (void *) inputheader);
-
-	//Warm up, timing not taken into average
-	nonce = 0;
-	clock_gettime(CLOCK_MONOTONIC, &time0);
-	numsolutions = EhSolver(context, nonce);
-	clock_gettime(CLOCK_MONOTONIC, &time1);
-	delta_time = (uint32_t) ((time1.tv_sec * 1000000000 + time1.tv_nsec)
-			- (time0.tv_sec * 1000000000 + time0.tv_nsec))/1000000;
-	printf("(Warm up) Time: %u ms, solutions: %u\n", delta_time, numsolutions);
-
-	printf("Running %d iterations...\n", ITERATIONS);
-	nonce = 58;	//arbritary number to get 19 solutions in 10 iterations (to match 1.88 solutions per run)
-	total_time = total_solutions = 0;
-	for (i=0; i<ITERATIONS; i++) {
-		clock_gettime(CLOCK_MONOTONIC, &time0);
-		numsolutions = EhSolver(context, nonce);
-		clock_gettime(CLOCK_MONOTONIC, &time1);
-		nonce++;
-		delta_time = (uint32_t) ((time1.tv_sec * 1000000000 + time1.tv_nsec)
-				- (time0.tv_sec * 1000000000 + time0.tv_nsec))/1000000;
-		total_time += delta_time;
-		total_solutions += numsolutions;
-		printf("Time: %u ms, solutions: %u\n", delta_time, numsolutions);
-	}
-
-	printf("Average time: %d ms; %.3f Sol/s\n", total_time/ITERATIONS, (double) 1000.0*total_solutions/total_time);
-
-	free(context_alloc);
-	return 0;
-}
diff --git a/cpu_xenoncat/Linux/demo/solver.c b/cpu_xenoncat/Linux/demo/solver.c
deleted file mode 100644
index 15da58961..000000000
--- a/cpu_xenoncat/Linux/demo/solver.c
+++ /dev/null
@@ -1,128 +0,0 @@
-//compile with
-//gcc -o solver solver.c equihash_avx2.o
-//
-//./solver
-//sha256sum out2.bin
-//Expected result with default input.bin (beta1 testnet block 2),
-//257d3c3250c14978614ac169edcf72bd131a2e4c227c8d7e21a2cd6131a13dda  out2.bin
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <string.h>
-#include <time.h>
-#include <x86intrin.h>	//for rdtsc
-
-#define CONTEXT_SIZE 178033152
-
-//Linkage with assembly
-//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver.
-//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce.
-void EhPrepare(void *context, void *input);
-int32_t EhSolver(void *context, uint32_t nonce);
-extern char testinput[];
-
-//context is the memory used for Equihash computation. It should be allocated outside of SolverFunction, the size is defined by CONTEXT_SIZE, about 180MB.
-//SolverFunction API has slight overhead in mining due to missing opportunity to run EhSolver multiple times after a single EhPrepare.
-int SolverFunction(void* context, const unsigned char* input,
-	bool (*validBlock)(void*, const unsigned char*),
-	void* validBlockData,
-	bool (*cancelled)(void*),
-	void* cancelledData,
-	int numThreads,
-	int n, int k)
-{
-	int numsolutions, i;
-
-	EhPrepare(context, (void *) input);
-	numsolutions = EhSolver(context, *(uint32_t *)(input+136));
-
-	for (i=0; i<numsolutions; i++) {
-		validBlock(validBlockData, (unsigned char*)(context+1344*i));
-	}
-	return numsolutions;
-}
-
-bool validBlock(void *validBlockData, const unsigned char *solution)
-{
-	return 0;
-}
-
-bool cancelled(void *cancelledData)
-{
-	return 0;
-}
-
-int main(void)
-{
-	void *context_alloc, *context, *context_end;
-	uint32_t *pu32;
-	uint64_t *pu64, previous_rdtsc;
-	uint8_t inputheader[144];	//140 byte header
-	FILE *infile, *outfile;
-	struct timespec time0, time1;
-	uint64_t rdtsc0, rdtsc1;
-	long t0, t1;
-	int32_t numsolutions;
-	int i, j;
-	char outfilename[32];
-
-	context_alloc = malloc(CONTEXT_SIZE+4096);
-	context = (void*) (((long) context_alloc+4095) & -4096);
-	context_end = context + CONTEXT_SIZE;
-
-	//Init page tables. This is not necessary, but useful to get a more consistent single-run timing.
-	for (pu32=context; (void*) pu32<context_end; pu32+=1024)
-		*pu32 = 0;
-
-	infile = 0;
-	infile = fopen("input.bin", "rb");
-	if (infile) {
-		puts("Reading input.bin");
-		fread(inputheader, 140, 1, infile);
-		fclose(infile);
-	} else {
-		puts("input.bin not found, use sample data (beta1 testnet block 2)");
-		memcpy(inputheader, testinput, 140);
-	}
-
-	puts("Running solver...");
-	clock_gettime(CLOCK_MONOTONIC, &time0);
-	rdtsc0 = __rdtsc();
-	numsolutions = SolverFunction(context, inputheader, validBlock, 0, cancelled, 0, 1, 200, 9);
-	//EhPrepare(context, (void *) inputheader);
-	//numsolutions = EhSolver(context, *(uint32_t *)(inputheader+136));
-	clock_gettime(CLOCK_MONOTONIC, &time1);
-	rdtsc1 = __rdtsc();
-
-	//Print some debug information
-	pu64 = (uint64_t *) (context + 102408);	//Read the debug area for statistics
-	printf("BLAKE2b rdtsc: %lu\n", pu64[1]-pu64[0]);
-	previous_rdtsc = pu64[1];
-	for (i=1, j=2; i<=9; i++, j+=2) {
-		printf("Stage %u, Output pairs %u, rdtsc: %lu\n", i, (uint32_t) pu64[j+1], pu64[j]-previous_rdtsc);
-		previous_rdtsc = pu64[j];
-	}
-	printf("Number of solutions before duplicate removal: %u\n", *(uint32_t *) (context+16384));
-	printf("Duplicate removal and tree expand rdtsc: %lu\n", pu64[j]-previous_rdtsc);
-
-	printf("Number of solutions: %d\n", numsolutions);
-
-	j = numsolutions < 4 ? numsolutions : 4;
-	for (i=0; i<j; i++) {
-		sprintf(outfilename, "out%d.bin", i);
-		outfile = fopen(outfilename, "wb");
-		fwrite(context+1344*i, 1344, 1, outfile);
-		fclose(outfile);
-	}
-
-	t0 = time0.tv_sec * 1000000000 + time0.tv_nsec;
-	t1 = time1.tv_sec * 1000000000 + time1.tv_nsec;
-	printf("Time: %ld ms\n", (t1-t0)/1000000);
-	t0 = (t1-t0)/1000;
-	printf("Measure rdtsc frequency = %.3f MHz\n", (double) (rdtsc1-rdtsc0)/t0);
-
-	free(context_alloc);
-	return 0;
-}
diff --git a/cpu_xenoncat/Linux/asm/assemble.sh b/cpu_xenoncat/asm_linux/assemble.sh
old mode 100755
new mode 100644
similarity index 100%
rename from cpu_xenoncat/Linux/asm/assemble.sh
rename to cpu_xenoncat/asm_linux/assemble.sh
diff --git a/cpu_xenoncat/Linux/asm/data_blake2b.asm b/cpu_xenoncat/asm_linux/data_blake2b.asm
similarity index 100%
rename from cpu_xenoncat/Linux/asm/data_blake2b.asm
rename to cpu_xenoncat/asm_linux/data_blake2b.asm
diff --git a/cpu_xenoncat/Linux/asm/equihash_avx1.asm b/cpu_xenoncat/asm_linux/equihash_avx1.asm
similarity index 85%
rename from cpu_xenoncat/Linux/asm/equihash_avx1.asm
rename to cpu_xenoncat/asm_linux/equihash_avx1.asm
index 71c6edc83..f0ce84681 100644
--- a/cpu_xenoncat/Linux/asm/equihash_avx1.asm
+++ b/cpu_xenoncat/asm_linux/equihash_avx1.asm
@@ -1,7 +1,6 @@
 format elf64
 public EhPrepare as 'EhPrepareAVX1'
 public EhSolver as 'EhSolverAVX1'
-public testinput as 'testinputAVX1'
 
 include "struct.inc"
 include "params.inc"
@@ -14,4 +13,3 @@ include "proc_ehsolver_avx1.asm"
 
 section '.data' writeable align 64
 include "data_blake2b.asm"
-testinput file "t2.bin"
diff --git a/cpu_xenoncat/Linux/asm/equihash_avx2.asm b/cpu_xenoncat/asm_linux/equihash_avx2.asm
similarity index 85%
rename from cpu_xenoncat/Linux/asm/equihash_avx2.asm
rename to cpu_xenoncat/asm_linux/equihash_avx2.asm
index 351daf847..8582d108d 100644
--- a/cpu_xenoncat/Linux/asm/equihash_avx2.asm
+++ b/cpu_xenoncat/asm_linux/equihash_avx2.asm
@@ -1,7 +1,6 @@
 format elf64
 public EhPrepare as 'EhPrepareAVX2'
 public EhSolver as 'EhSolverAVX2'
-public testinput as 'testinputAVX2'
 
 include "struct.inc"
 include "params.inc"
@@ -14,4 +13,3 @@ include "proc_ehsolver_avx2.asm"
 
 section '.data' writeable align 64
 include "data_blake2b.asm"
-testinput file "t2.bin"
diff --git a/cpu_xenoncat/Linux/asm/fasm b/cpu_xenoncat/asm_linux/fasm
old mode 100755
new mode 100644
similarity index 100%
rename from cpu_xenoncat/Linux/asm/fasm
rename to cpu_xenoncat/asm_linux/fasm
diff --git a/cpu_xenoncat/Linux/asm/macro_blake2b_avx1.asm b/cpu_xenoncat/asm_linux/macro_blake2b_avx1.asm
similarity index 100%
rename from cpu_xenoncat/Linux/asm/macro_blake2b_avx1.asm
rename to cpu_xenoncat/asm_linux/macro_blake2b_avx1.asm
diff --git a/cpu_xenoncat/Linux/asm/macro_blake2b_avx2.asm b/cpu_xenoncat/asm_linux/macro_blake2b_avx2.asm
similarity index 100%
rename from cpu_xenoncat/Linux/asm/macro_blake2b_avx2.asm
rename to cpu_xenoncat/asm_linux/macro_blake2b_avx2.asm
diff --git a/cpu_xenoncat/Linux/asm/macro_eh.asm b/cpu_xenoncat/asm_linux/macro_eh.asm
similarity index 100%
rename from cpu_xenoncat/Linux/asm/macro_eh.asm
rename to cpu_xenoncat/asm_linux/macro_eh.asm
diff --git a/cpu_xenoncat/Linux/asm/params.inc b/cpu_xenoncat/asm_linux/params.inc
similarity index 100%
rename from cpu_xenoncat/Linux/asm/params.inc
rename to cpu_xenoncat/asm_linux/params.inc
diff --git a/cpu_xenoncat/Linux/asm/proc_ehprepare_avx1.asm b/cpu_xenoncat/asm_linux/proc_ehprepare_avx1.asm
similarity index 100%
rename from cpu_xenoncat/Linux/asm/proc_ehprepare_avx1.asm
rename to cpu_xenoncat/asm_linux/proc_ehprepare_avx1.asm
diff --git a/cpu_xenoncat/Linux/asm/proc_ehprepare_avx2.asm b/cpu_xenoncat/asm_linux/proc_ehprepare_avx2.asm
similarity index 100%
rename from cpu_xenoncat/Linux/asm/proc_ehprepare_avx2.asm
rename to cpu_xenoncat/asm_linux/proc_ehprepare_avx2.asm
diff --git a/cpu_xenoncat/Linux/asm/proc_ehsolver_avx1.asm b/cpu_xenoncat/asm_linux/proc_ehsolver_avx1.asm
similarity index 100%
rename from cpu_xenoncat/Linux/asm/proc_ehsolver_avx1.asm
rename to cpu_xenoncat/asm_linux/proc_ehsolver_avx1.asm
diff --git a/cpu_xenoncat/Linux/asm/proc_ehsolver_avx2.asm b/cpu_xenoncat/asm_linux/proc_ehsolver_avx2.asm
similarity index 100%
rename from cpu_xenoncat/Linux/asm/proc_ehsolver_avx2.asm
rename to cpu_xenoncat/asm_linux/proc_ehsolver_avx2.asm
diff --git a/cpu_xenoncat/Linux/asm/struct.inc b/cpu_xenoncat/asm_linux/struct.inc
similarity index 100%
rename from cpu_xenoncat/Linux/asm/struct.inc
rename to cpu_xenoncat/asm_linux/struct.inc
diff --git a/cpu_xenoncat/Linux/asm/struct_eh.inc b/cpu_xenoncat/asm_linux/struct_eh.inc
similarity index 100%
rename from cpu_xenoncat/Linux/asm/struct_eh.inc
rename to cpu_xenoncat/asm_linux/struct_eh.inc
diff --git a/cuda_djezo/CMakeLists.txt b/cuda_djezo/CMakeLists.txt
new file mode 100644
index 000000000..e755afb8c
--- /dev/null
+++ b/cuda_djezo/CMakeLists.txt
@@ -0,0 +1,43 @@
+set(EXECUTABLE cuda_djezo)
+
+option(ENABLE_CUDA "Enable the cuda build" ON)
+
+# Depending on the gcc version, nvcc may need -std=c++11 passed explicitly
+# (e.g. on Ubuntu 14.04 -- check the installed gcc version); if so, uncomment:
+#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11)
+
+file(GLOB SRC_LIST
+    cuda_djezo.cpp
+    equi_miner.cu )
+file(GLOB HEADERS
+    cuda_djezo.hpp
+    eqcuda.hpp
+    )
+
+
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-D_FORCE_INLINES;--disable-warnings;--ptxas-options=-v;-Xptxas=-dlcm=ca;-Xptxas=-dscm=cs; -O3)
+
+FIND_PACKAGE(CUDA REQUIRED)
+if(COMPUTE AND (COMPUTE GREATER 0))
+        LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
+else(COMPUTE AND (COMPUTE GREATER 0))
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_60,code=sm_60 )
+endif(COMPUTE AND (COMPUTE GREATER 0))
+
+if(CUDA_FOUND)
+message("CUDA FOUND")
+else()
+message("CUDA NOT FOUND")
+endif()
+
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} cuda)
+
+message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
diff --git a/cuda_djezo/LICENSE b/cuda_djezo/LICENSE
new file mode 100644
index 000000000..bb7b082bf
--- /dev/null
+++ b/cuda_djezo/LICENSE
@@ -0,0 +1,675 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+         Copyright (C) 2016-2017 NiceHash (www.nicehash.com)
+
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/cuda_djezo/blake2b.cu b/cuda_djezo/blake2b.cu
new file mode 100644
index 000000000..866c82592
--- /dev/null
+++ b/cuda_djezo/blake2b.cu
@@ -0,0 +1,336 @@
+// Blake2-B CUDA Implementation
+// tpruvot@github July 2016
+// permission granted to use under MIT license
+// modified for use in Zcash by John Tromp September 2016
+
+/**
+ * uint2 direct ops by c++ operator definitions
+ */
+// Component-wise XOR of two uint2 values.
+static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
+  uint2 r;
+  r.x = a.x ^ b.x;
+  r.y = a.y ^ b.y;
+  return r;
+}
+// Component-wise XOR of two uint4 values.
+static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) {
+	uint4 r;
+	r.x = a.x ^ b.x;
+	r.y = a.y ^ b.y;
+	r.z = a.z ^ b.z;
+	r.w = a.w ^ b.w;
+	return r;
+}
+// uint2 ROR/ROL methods
+// Rotate the 64-bit value stored in `a` right by `offset` bits.
+// `a.x` is the low 32-bit half and `a.y` the high half (see how the halves
+// are recombined in the generic path below).
+// Precondition: 0 <= offset < 64.
+__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) {
+  uint2 result;
+#if __CUDA_ARCH__ > 300
+  // Funnel-shift path. shf.r.wrap only honours the low 5 bits of the shift
+  // amount, so the operand order has to be chosen from the offset range:
+  // for offset >= 32 the two halves are exchanged before shifting. Using the
+  // ">= 32" form for a smaller offset would rotate by offset + 32 instead.
+  if (offset < 32) {
+          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
+          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+  } else /* 32 <= offset < 64 */ {
+          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+          asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+  }
+#else
+  // Generic path: plain shifts, with the 0 and 32 cases split out so that no
+  // 32-bit value is ever shifted by 32.
+  if (!offset)
+          result = a;
+  else if (offset < 32) {
+          result.y = ((a.y >> offset) | (a.x << (32 - offset)));
+          result.x = ((a.x >> offset) | (a.y << (32 - offset)));
+  } else if (offset == 32) {
+          result.y = a.x;
+          result.x = a.y;
+  } else {
+          result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
+          result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
+  }
+#endif
+  return result;
+}
+// Return `value` with its two 32-bit components exchanged.
+__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) {
+  uint2 swapped;
+  swapped.x = value.y;
+  swapped.y = value.x;
+  return swapped;
+}
+// ROR24 / ROR16: rotate-right of a 64-bit value (held as uint2) by 24 and 16 bits.
+// When compiling device code (__CUDA_ARCH__ defined) each output word is built
+// with a single __byte_perm over (a.y, a.x); otherwise both names fall back to
+// the generic ROR2 with the corresponding constant offset.
+#ifdef __CUDA_ARCH__
+// Byte-permute form of ROR2(a, 24).
+// NOTE(review): selectors rely on __byte_perm's byte numbering - confirm
+// against the CUDA math API documentation before changing them.
+__device__ __inline__ uint2 ROR24(const uint2 a) {
+  uint2 result;
+  result.x = __byte_perm(a.y, a.x, 0x2107);
+  result.y = __byte_perm(a.y, a.x, 0x6543);
+  return result;
+}
+// Byte-permute form of ROR2(a, 16); same caveat on the selectors as ROR24.
+__device__ __inline__ uint2 ROR16(const uint2 a) {
+  uint2 result;
+  result.x = __byte_perm(a.y, a.x, 0x1076);
+  result.y = __byte_perm(a.y, a.x, 0x5432);
+  return result;
+}
+#else
+#define ROR24(u) ROR2(u,24)
+#define ROR16(u) ROR2(u,16)
+#endif
+
+typedef uint64_t u64;
+
+// Message-word schedule, indexed as blake2b_sigma[round][position].
+// G() reads entries 2*i and 2*i+1 of row r to pick the two message words
+// mixed in at step i of round r. Rows 10 and 11 repeat rows 0 and 1.
+static __constant__ const int8_t blake2b_sigma[12][16] = {
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
+  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  } ,
+  { 11, 8,  12, 0,  5,  2,  15, 13, 10, 14, 3,  6,  7,  1,  9,  4  } ,
+  { 7,  9,  3,  1,  13, 12, 11, 14, 2,  6,  5,  10, 4,  0,  15, 8  } ,
+  { 9,  0,  5,  7,  2,  4,  10, 15, 14, 1,  11, 12, 6,  8,  3,  13 } ,
+  { 2,  12, 6,  10, 0,  11, 8,  3,  4,  13, 7,  5,  15, 14, 1,  9  } ,
+  { 12, 5,  1,  15, 14, 13, 4,  10, 0,  7,  6,  3,  9,  2,  8,  11 } ,
+  { 13, 11, 7,  14, 12, 1,  3,  9,  5,  0,  15, 4,  8,  6,  2,  10 } ,
+  { 6,  15, 14, 9,  11, 3,  0,  8,  12, 2,  13, 7,  1,  4,  10, 5  } ,
+  { 10, 2,  8,  4,  7,  6,  1,  5,  15, 11, 9,  14, 3,  12, 13, 0  } ,
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
+  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  }
+};
+
+// Eight 64-bit initialisation constants. blake2b_gpu_hash3 loads them into
+// v[8..15]; blake2b_gpu_hash2 uses the same values written as literals.
+__device__ __constant__
+static const u64 blake_iv[] = 
+{
+	0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
+	0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+	0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+	0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
+};
+
+// Mixing step i of round r on the four state words a, b, c, d.
+// The two message words are picked from m[] through blake2b_sigma[r][2*i]
+// and blake2b_sigma[r][2*i+1]; the rotations are done on uint2 views of the
+// 64-bit operands.
+__device__ __forceinline__
+static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) {
+  uint2 * const a2 = (uint2*)&a;
+  uint2 * const b2 = (uint2*)&b;
+  uint2 * const c2 = (uint2*)&c;
+  uint2 * const d2 = (uint2*)&d;
+
+  // First half: add message word 2*i, swap halves of d, rotate b by 24.
+  const u64 x = m[ blake2b_sigma[r][2*i] ];
+  a = a + b + x;
+  *d2 = SWAPUINT2( *d2 ^ *a2 );
+  c = c + d;
+  *b2 = ROR24( *b2 ^ *c2 );
+
+  // Second half: add message word 2*i+1, rotate d by 16 and b by 63.
+  const u64 y = m[ blake2b_sigma[r][2*i+1] ];
+  a = a + b + y;
+  *d2 = ROR16( *d2 ^ *a2 );
+  c = c + d;
+  *b2 = ROR2( *b2 ^ *c2, 63U);
+}
+
+//__device__ __forceinline__
+//static void G2(u64 &a, u64 &b, u64 &c, u64 &d, u64 x, u64 y) {
+//	a = a + b + x;
+//	((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
+//	c = c + d;
+//	((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]);
+//	a = a + b + y;
+//	((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]);
+//	c = c + d;
+//	((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
+//}
+
+// Same mixing step as G(), but the two message words are passed directly
+// as x and y instead of being looked up through the sigma table.
+__device__ __forceinline__
+static void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) {
+	uint2 * const a2 = (uint2*)&a;
+	uint2 * const b2 = (uint2*)&b;
+	uint2 * const c2 = (uint2*)&c;
+	uint2 * const d2 = (uint2*)&d;
+
+	// First half: add x, swap halves of d, rotate b by 24.
+	a = a + b + x;
+	*d2 = SWAPUINT2(*d2 ^ *a2);
+	c = c + d;
+	*b2 = ROR24(*b2 ^ *c2);
+
+	// Second half: add y, rotate d by 16 and b by 63.
+	a = a + b + y;
+	*d2 = ROR16(*d2 ^ *a2);
+	c = c + d;
+	*b2 = ROR2(*b2 ^ *c2, 63U);
+}
+
+// One full round r: four G steps over the columns of the 4x4 state v[],
+// then four over its diagonals. Expects `v` (u64[16]) and `m` (u64[16]) to be
+// in scope at the point of use. Wrapped in do { } while (0) so that
+// `ROUND(r);` behaves as a single statement, including under an unbraced if/else.
+#define ROUND(r) do { \
+  G((r), 0, v[0], v[4], v[ 8], v[12], m); \
+  G((r), 1, v[1], v[5], v[ 9], v[13], m); \
+  G((r), 2, v[2], v[6], v[10], v[14], m); \
+  G((r), 3, v[3], v[7], v[11], v[15], m); \
+  G((r), 4, v[0], v[5], v[10], v[15], m); \
+  G((r), 5, v[1], v[6], v[11], v[12], m); \
+  G((r), 6, v[2], v[7], v[ 8], v[13], m); \
+  G((r), 7, v[3], v[4], v[ 9], v[14], m); \
+} while (0)
+
+
+// Run twelve mixing rounds over the state h[0..7] for a message block whose
+// only non-zero 64-bit word is word 1 = (idx << 32) | nonce.
+// Because every other message word is zero, the rounds are written out as G2
+// calls with literal 0 arguments; `m` appears exactly once per round, in the
+// call and argument slot where blake2b_sigma selects message word 1.
+// h[0..7] are read; only h[0..6] are updated (h[7] is left unchanged).
+// NOTE(review): `u32` is not declared in this file - presumably provided by
+// the translation unit that includes it; confirm.
+__forceinline__ __device__ void blake2b_gpu_hash3(uint64_t* h, u32 idx, u32 nonce) {
+	// The single non-zero message word: idx in the high half, nonce in the low half.
+	u64 m = (u64)idx << 32 | (u64)nonce;
+
+	u64 v[16];
+
+	// Working vector: current state in v[0..7], constants in v[8..15].
+	v[0] = h[0];
+	v[1] = h[1];
+	v[2] = h[2];
+	v[3] = h[3];
+	v[4] = h[4];
+	v[5] = h[5];
+	v[6] = h[6];
+	v[7] = h[7];
+	v[8] = blake_iv[0];
+	v[9] = blake_iv[1];
+	v[10] = blake_iv[2];
+	v[11] = blake_iv[3];
+	// NOTE(review): 128 + 16 is presumably the total byte count fed to the hash - confirm.
+	v[12] = blake_iv[4] ^ (128 + 16);
+	v[13] = blake_iv[5];
+	// All bits inverted; NOTE(review): presumably the last-block flag - confirm.
+	v[14] = blake_iv[6] ^ 0xffffffffffffffff;
+	v[15] = blake_iv[7];
+
+	// mix 1
+	G2(v[0], v[4], v[8], v[12], 0, m);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 2
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], m, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 3
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, m);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+	
+	// mix 4
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, m);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 5
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, m);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 6
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], m, 0);
+
+	// mix 7
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], m, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 8
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, m);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 9
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], m, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 10
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], m, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 11
+	G2(v[0], v[4], v[8], v[12], 0, m);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 12
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], m, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// Fold both halves of the working vector back into the state.
+	// h[7] is deliberately not part of this list.
+	h[0] ^= v[0] ^ v[8];
+	h[1] ^= v[1] ^ v[9];
+	h[2] ^= v[2] ^ v[10];
+	h[3] ^= v[3] ^ v[11];
+	h[4] ^= v[4] ^ v[12];
+	h[5] ^= v[5] ^ v[13];
+	h[6] ^= v[6] ^ v[14];
+}
+
+
+// Run twelve ROUND()s over the state h[0..7] for a message block that is all
+// zero except for `idx`, stored in the second 32-bit word of m[1].
+// The constants placed in v[8..15] are the same values as the blake_iv table.
+// h[0..7] are read; only h[0..6] are updated (the h[7] update is commented out).
+// NOTE(review): `u32` is not declared in this file - presumably provided by
+// the translation unit that includes it; confirm.
+__forceinline__ __device__ void blake2b_gpu_hash2(uint64_t* h, u32 idx) {
+	u64 m[16] = { 0 };
+	u32* ptr = (u32*)&m[1];
+
+	// Second 32-bit word of message word 1.
+	ptr[1] = idx;
+
+	u64 v[16];
+
+	// Working vector: current state in v[0..7], constants in v[8..15].
+	v[0] = h[0];
+	v[1] = h[1];
+	v[2] = h[2];
+	v[3] = h[3];
+	v[4] = h[4];
+	v[5] = h[5];
+	v[6] = h[6];
+	v[7] = h[7];
+	v[8] = 0x6a09e667f3bcc908;
+	v[9] = 0xbb67ae8584caa73b;
+	v[10] = 0x3c6ef372fe94f82b;
+	v[11] = 0xa54ff53a5f1d36f1;
+	// NOTE(review): 128 + 16 is presumably the total byte count fed to the hash - confirm.
+	v[12] = 0x510e527fade682d1 ^ (128 + 16);
+	v[13] = 0x9b05688c2b3e6c1f;
+	// All bits inverted; NOTE(review): presumably the last-block flag - confirm.
+	v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
+	v[15] = 0x5be0cd19137e2179;
+
+	// ROUND() uses the local names `v` and `m` directly.
+	ROUND(0);
+	ROUND(1);
+	ROUND(2);
+	ROUND(3);
+	ROUND(4);
+	ROUND(5);
+	ROUND(6);
+	ROUND(7);
+	ROUND(8);
+	ROUND(9);
+	ROUND(10);
+	ROUND(11);
+
+	// Fold both halves of the working vector back into the state.
+	h[0] ^= v[0] ^ v[8];
+	h[1] ^= v[1] ^ v[9];
+	h[2] ^= v[2] ^ v[10];
+	h[3] ^= v[3] ^ v[11];
+	h[4] ^= v[4] ^ v[12];
+	h[5] ^= v[5] ^ v[13];
+	h[6] ^= v[6] ^ v[14];
+	//h[7] ^= v[7] ^ v[15];
+	//memcpy(hash, (uchar *)h, outlen);
+}
\ No newline at end of file
diff --git a/cuda_djezo/cuda_djezo.cpp b/cuda_djezo/cuda_djezo.cpp
new file mode 100644
index 000000000..30d672e89
--- /dev/null
+++ b/cuda_djezo/cuda_djezo.cpp
@@ -0,0 +1,128 @@
+#include <iostream>
+#include <functional>
+#include <vector>
+#include <stdint.h>
+#include <string>
+
+#include "cuda_djezo.hpp"
+
+struct proof;
+#include "eqcuda.hpp"
+
+
+// Build a solver handle for CUDA device `dev_id`.
+//
+// Queries the device name, SM count and compute capability through getinfo()
+// and selects combo_mode from the capability: 2 for SM 5.0, 1 otherwise.
+// `platf_id` is accepted but not used.
+//
+// Throws std::runtime_error when the device is below SM 5.0 or when the
+// version string contains no '.' separator.
+cuda_djezo::cuda_djezo(int platf_id, int dev_id)
+{
+	device_id = dev_id;
+	combo_mode = 1;
+
+	// No solver context exists until start() creates one. stop() tests this
+	// pointer, so it must never be left uninitialised.
+	context = nullptr;
+	threadsperblock = 0;
+	blocks = 0;
+
+	getinfo(0, dev_id, m_gpu_name, m_sm_count, m_version);
+
+	// m_version has the form "<major>.<minor>".
+	const std::string::size_type dot = m_version.find(".");
+	if (dot == std::string::npos)
+		throw std::runtime_error("Unknown Compute/SM version.");
+
+	const int major = atoi(m_version.substr(0, dot).c_str());
+	const int minor = atoi(m_version.substr(dot + 1).c_str());
+
+	if (major < 5)
+	{
+		throw std::runtime_error("Only CUDA devices with SM 5.0 and higher are supported.");
+	}
+
+	if (major == 5 && minor == 0)
+	{
+		combo_mode = 2;
+	}
+}
+
+
+// Human-readable device label: "<gpu name> (#<device id>) M=<combo mode>".
+std::string cuda_djezo::getdevinfo()
+{
+	std::string info = m_gpu_name;
+	info += " (#";
+	info += std::to_string(device_id);
+	info += ") M=";
+	info += std::to_string(combo_mode);
+	return info;
+}
+
+
+// Number of CUDA devices as reported by cudaGetDeviceCount().
+// The call's status is passed to checkCudaErrors().
+int cuda_djezo::getcount()
+{
+	int n_devices;
+	checkCudaErrors(cudaGetDeviceCount(&n_devices));
+	return n_devices;
+}
+
+// Fill in the name, multiprocessor count and "<major>.<minor>" compute
+// capability of CUDA device `d_id`. `platf_id` is not used.
+void cuda_djezo::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version)
+{
+	cudaDeviceProp props;
+	checkCudaErrors(cudaGetDeviceProperties(&props, d_id));
+
+	gpu_name = props.name;
+	sm_count = props.multiProcessorCount;
+
+	std::string capability = std::to_string(props.major);
+	capability += ".";
+	capability += std::to_string(props.minor);
+	version = capability;
+}
+
+
+// Create the solver context for `device_context`, choosing the
+// eq_cuda_context<> configuration from its combo_mode. Modes whose
+// CONFIG_MODE_n macro is not defined fall through to CONFIG_MODE_1.
+void cuda_djezo::start(cuda_djezo& device_context)
+{
+	const int dev = device_context.device_id;
+	eq_cuda_context_interface* created = nullptr;
+
+	switch (device_context.combo_mode)
+	{
+#ifdef CONFIG_MODE_2
+	case 2:
+		created = new eq_cuda_context<CONFIG_MODE_2>(dev);
+		break;
+#endif
+#ifdef CONFIG_MODE_3
+	case 3:
+		created = new eq_cuda_context<CONFIG_MODE_3>(dev);
+		break;
+#endif
+	default:
+		created = new eq_cuda_context<CONFIG_MODE_1>(dev);
+		break;
+	}
+
+	device_context.context = created;
+}
+
+// Destroy the solver context of `device_context`, if any, and clear the pointer.
+void cuda_djezo::stop(cuda_djezo& device_context)
+{
+	// Deleting a null pointer is a no-op, so no explicit null check is needed.
+	delete device_context.context;
+	device_context.context = nullptr;
+}
+
+// Forward one solve request to the context owned by `device_context`.
+// All arguments are passed through unchanged; the context pointer is
+// dereferenced without a null check.
+void cuda_djezo::solve(const char *tequihash_header,
+	unsigned int tequihash_header_len,
+	const char* nonce,
+	unsigned int nonce_len,
+	std::function<bool()> cancelf,
+	std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+	std::function<void(void)> hashdonef,
+	cuda_djezo& device_context)
+{
+	eq_cuda_context_interface* const solver = device_context.context;
+	solver->solve(tequihash_header, tequihash_header_len,
+		nonce, nonce_len,
+		cancelf, solutionf, hashdonef);
+}
+
+
+// Base-interface implementation of solve(): the body is empty, so none of the
+// arguments are used and no callback is invoked.
+// NOTE(review): presumably overridden by eq_cuda_context<...> - confirm in eqcuda.hpp.
+void eq_cuda_context_interface::solve(const char *tequihash_header,
+	unsigned int tequihash_header_len,
+	const char* nonce,
+	unsigned int nonce_len,
+	std::function<bool()> cancelf,
+	std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+	std::function<void(void)> hashdonef)
+{
+}
+
+
+// Out-of-line destructor definition with an empty body.
+eq_cuda_context_interface::~eq_cuda_context_interface() { }
\ No newline at end of file
diff --git a/ocl_xpm/ocl_xmp.hpp b/cuda_djezo/cuda_djezo.hpp
similarity index 51%
rename from ocl_xpm/ocl_xmp.hpp
rename to cuda_djezo/cuda_djezo.hpp
index 3e0d4054a..1843c462a 100644
--- a/ocl_xpm/ocl_xmp.hpp
+++ b/cuda_djezo/cuda_djezo.hpp
@@ -1,33 +1,22 @@
 #pragma once
+
 #ifdef _LIB
-#define DLL_OCL_XMP __declspec(dllexport)
+#define DLL_CUDA_DJEZO __declspec(dllexport)
 #else
-#define DLL_OCL_XMP
+#define DLL_CUDA_DJEZO
 #endif
 
-// remove after
-#include <string>
-#include <functional>
-#include <vector>
-#include <cstdint>
-
-struct MinerInstance;
+struct eq_cuda_context_interface;
 
-struct DLL_OCL_XMP ocl_xmp
+struct DLL_CUDA_DJEZO cuda_djezo
 {
-	//int threadsperblock;
+	int threadsperblock;
 	int blocks;
 	int device_id;
-	int platform_id;
-
-	MinerInstance* context;
-	// threads
-	unsigned threadsNum; // TMP
-	unsigned wokrsize;
-
-	bool is_init_success = false;
+	int combo_mode;
+	eq_cuda_context_interface* context;
 
-	ocl_xmp(int platf_id, int dev_id);
+	cuda_djezo(int platf_id, int dev_id);
 
 	std::string getdevinfo();
 
@@ -35,9 +24,9 @@ struct DLL_OCL_XMP ocl_xmp
 
 	static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version);
 
-	static void start(ocl_xmp& device_context);
+	static void start(cuda_djezo& device_context);
 
-	static void stop(ocl_xmp& device_context);
+	static void stop(cuda_djezo& device_context);
 
 	static void solve(const char *tequihash_header,
 		unsigned int tequihash_header_len,
@@ -46,11 +35,12 @@ struct DLL_OCL_XMP ocl_xmp
 		std::function<bool()> cancelf,
 		std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
 		std::function<void(void)> hashdonef,
-		ocl_xmp& device_context);
+		cuda_djezo& device_context);
 
-	std::string getname() { return "OCL_XMP"; }
+	std::string getname() { return "CUDA-DJEZO"; }
 
 private:
 	std::string m_gpu_name;
 	std::string m_version;
+	int m_sm_count;
 };
\ No newline at end of file
diff --git a/cuda_djezo/cuda_djezo.vcxproj b/cuda_djezo/cuda_djezo.vcxproj
new file mode 100644
index 000000000..759a7cf20
--- /dev/null
+++ b/cuda_djezo/cuda_djezo.vcxproj
@@ -0,0 +1,117 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="cuda_djezo.hpp" />
+    <ClInclude Include="eqcuda.hpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\cpu_tromp\blake2\blake2bx.cpp" />
+    <ClCompile Include="cuda_djezo.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CudaCompile Include="equi_miner.cu">
+    </CudaCompile>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{268B10AD-D845-498B-8663-AB8911CA2039}</ProjectGuid>
+    <RootNamespace>cuda_djezo</RootNamespace>
+    <CudaToolkitCustomDir>$(CUDA_PATH_V8_0)</CudaToolkitCustomDir>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;WIN64;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <DisableSpecificWarnings>4334;4316;4244;4996;4251;</DisableSpecificWarnings>
+      <AdditionalIncludeDirectories>..\3rdparty\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <IgnoreSpecificDefaultLibraries>
+      </IgnoreSpecificDefaultLibraries>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+      <CodeGeneration>compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;</CodeGeneration>
+      <PtxAsOptionV>true</PtxAsOptionV>
+      <GPUDebugInfo>false</GPUDebugInfo>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;WIN64;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+      <DisableSpecificWarnings>4334;4316;4244;4996;4251;</DisableSpecificWarnings>
+      <AdditionalIncludeDirectories>..\3rdparty\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <PostBuildEvent>
+      <Command>echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
+    </PostBuildEvent>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_61,sm_61;</CodeGeneration>
+      <PtxAsOptionV>true</PtxAsOptionV>
+      <AdditionalOptions>-Xptxas -dlcm=ca -Xptxas -dscm=cs %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/cuda_djezo/eqcuda.hpp b/cuda_djezo/eqcuda.hpp
new file mode 100644
index 000000000..48d663a45
--- /dev/null
+++ b/cuda_djezo/eqcuda.hpp
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include "device_functions_decls.h"
+#include "../cpu_tromp/blake2/blake2.h"
+#include "cuda_djezo.hpp"
+
+#ifdef WIN32
+#define _SNPRINTF _snprintf
+#else
+#include <stdio.h>
+#define _SNPRINTF snprintf
+#endif
+
+/*
+ * Evaluates a CUDA runtime API call exactly once; if it does not return
+ * cudaSuccess, throws std::runtime_error with a message containing the CUDA
+ * error string, the enclosing function name and the line number.
+ * The buffer is zero-initialised so the message stays NUL-terminated even when
+ * _snprintf (WIN32) truncates the output without writing a terminator.
+ */
+#define checkCudaErrors(call)								\
+do {														\
+	cudaError_t err = (call);								\
+	if (cudaSuccess != err) {								\
+		char errorBuff[512] = { 0 };						\
+		_SNPRINTF(errorBuff, sizeof(errorBuff) - 1,			\
+			"CUDA error '%s' in func '%s' line %d",			\
+			cudaGetErrorString(err), __FUNCTION__, __LINE__);	\
+		throw std::runtime_error(errorBuff);				\
+	}														\
+} while (0)
+
+/*
+ * Evaluates a CUDA driver API call exactly once; if it does not return
+ * CUDA_SUCCESS, throws std::runtime_error with a message containing the numeric
+ * CUresult, the enclosing function name and the line number.
+ * The buffer is zero-initialised so the message stays NUL-terminated even when
+ * _snprintf (WIN32) truncates the output without writing a terminator.
+ */
+#define checkCudaDriverErrors(call)								\
+do {														\
+	CUresult err = (call);									\
+	if (CUDA_SUCCESS != err) {								\
+		char errorBuff[512] = { 0 };						\
+		_SNPRINTF(errorBuff, sizeof(errorBuff) - 1,			\
+			"CUDA error DRIVER: '%d' in func '%s' line %d",			\
+			err, __FUNCTION__, __LINE__);	\
+		throw std::runtime_error(errorBuff);				\
+	}														\
+} while (0)
+
+// Short aliases for the fixed-width unsigned integer types used throughout the solver.
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+typedef unsigned char uchar;
+
+struct packer_default;
+struct packer_cantor;
+
+#define MAXREALSOLS 9
+
+// Fixed-capacity container for solver results: up to MAXREALSOLS rows of
+// 512 u32 values each, plus a counter.
+// NOTE(review): 512 presumably matches the proof size (1 << 9 indices) - confirm
+// against the code that fills and reads this container.
+struct scontainerreal
+{
+	u32 sols[MAXREALSOLS][512];
+	u32 nsols;	// presumably the number of rows of sols in use - TODO confirm
+};
+
+template <u32 RB, u32 SM>
+struct equi;
+
+// Type-erased interface for a solver context. It lets callers hold any
+// eq_cuda_context<...> instantiation through one pointer type and destroy it
+// through the virtual destructor.
+struct eq_cuda_context_interface
+{
+	virtual ~eq_cuda_context_interface();
+
+	// Virtual entry point for one solve request; overridden by eq_cuda_context.
+	// cancelf, solutionf and hashdonef are caller-supplied callbacks.
+	virtual void solve(const char *tequihash_header,
+		unsigned int tequihash_header_len,
+		const char* nonce,
+		unsigned int nonce_len,
+		std::function<bool()> cancelf,
+		std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+		std::function<void(void)> hashdonef);
+};
+
+
+// Solver context for one device, parameterised at compile time.
+// RB and SM are forwarded to equi<RB, SM>; SSM, THREADS and PACKER are further
+// tuning parameters (see the CONFIG_MODE_n macros for the combinations in use).
+// Only declarations live here; the member definitions are in another file.
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+struct eq_cuda_context : public eq_cuda_context_interface
+{
+	int threadsperblock;
+	int totalblocks;
+	int device_id;
+	equi<RB, SM>* device_eq;	// solver working state (equi is defined in equi_miner.cu)
+	scontainerreal* solutions;
+	CUcontext pctx;				// CUDA driver API context handle
+
+	// id is the device id this context is created for.
+	eq_cuda_context(int id);
+	~eq_cuda_context();
+
+	// Overrides eq_cuda_context_interface::solve.
+	void solve(const char *tequihash_header,
+		unsigned int tequihash_header_len,
+		const char* nonce,
+		unsigned int nonce_len,
+		std::function<bool()> cancelf,
+		std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+		std::function<void(void)> hashdonef);
+};
+
+// Template-argument lists for eq_cuda_context, in the order
+// <RB, SM, SSM, THREADS, PACKER>. Used as eq_cuda_context<CONFIG_MODE_n>.
+#define CONFIG_MODE_1	9, 1248, 12, 640, packer_cantor
+
+#define CONFIG_MODE_2	8, 640, 12, 512, packer_default
\ No newline at end of file
diff --git a/cuda_djezo/equi_miner.cu b/cuda_djezo/equi_miner.cu
new file mode 100644
index 000000000..6ef9f45f2
--- /dev/null
+++ b/cuda_djezo/equi_miner.cu
@@ -0,0 +1,2159 @@
+/*
+  Equihash solver created by djeZo (l33tsoftw@gmail.com) for NiceHash
+
+  Based on CUDA solver by John Tromp released under MIT license.
+
+  Some helper functions taken out of OpenCL solver by Marc Bevand
+  released under MIT license.
+
+  cuda_djezo solver is released by NiceHash (www.nicehash.com) under
+  GPL 3.0 license. If you don't have a copy, you can obtain one from
+  https://www.gnu.org/licenses/gpl-3.0.txt
+*/
+
+/*
+The MIT License (MIT)
+
+Copyright (c) 2016 John Tromp
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software, and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+/*
+The MIT License (MIT)
+
+Copyright (c) 2016 Marc Bevand
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software, and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifdef WIN32
+#include <Windows.h>
+#endif
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <assert.h>
+#include <functional>
+#include <vector>
+#include <iostream>
+#include <mutex>
+
+#include "eqcuda.hpp"
+#include "sm_32_intrinsics.h"
+
+// Equihash problem parameters (n = WN, k = WK).
+#define WN	200
+#define WK	9
+// The WN-bit hash is split into NDIGITS digits of DIGITBITS bits (200 / 10 = 20).
+#define NDIGITS		(WK+1)
+#define DIGITBITS	(WN/(NDIGITS))
+// 2^WK = 512 entries per proof.
+#define PROOFSIZE (1<<WK)
+#define BASE (1<<DIGITBITS)
+#define NHASHES (2*BASE)
+// One 512-bit BLAKE2b output yields 512 / 200 = 2 hashes, i.e. 50 bytes used.
+#define HASHESPERBLAKE (512/WN)
+#define HASHOUT (HASHESPERBLAKE*WN/8)
+#define NBLOCKS ((NHASHES + HASHESPERBLAKE - 1) / HASHESPERBLAKE)
+// NOTE: the macros below expand the identifiers RB and SM textually, so they
+// only work where names RB / SM are in scope (template or function parameters).
+// Each digit is split into BUCKBITS bucket bits and RB "rest" bits.
+#define BUCKBITS (DIGITBITS - RB)
+#define NBUCKETS (1 << BUCKBITS)
+#define BUCKMASK (NBUCKETS - 1)
+#define SLOTBITS (RB + 2)
+#define SLOTRANGE (1 << SLOTBITS)
+#define NSLOTS SM
+#define SLOTMASK (SLOTRANGE - 1)
+#define NRESTS (1 << RB)
+#define RESTMASK (NRESTS - 1)
+// Bit width / mask of the slot-pair field produced by packer_cantor.
+#define CANTORBITS (2 * SLOTBITS - 2)
+#define CANTORMASK ((1 << CANTORBITS) - 1)
+// Starting value of the integer square-root iteration in packer_cantor::get_slot1.
+#define CANTORMAXSQRT (2 * NSLOTS)
+// Fixed slot counts for the 4096-bucket arrays (round0trees / treestiny in equi).
+#define RB8_NSLOTS 640
+#define RB8_NSLOTS_LD 624
+#define FD_THREADS 128
+
+// reduce vstudio warnings (__byteperm, blockIdx...)
+// Declarations below exist only for the Visual Studio IntelliSense parser
+// (guarded by __INTELLISENSE__); they are never part of a real build.
+#ifdef __INTELLISENSE__
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#define __launch_bounds__(max_tpb, min_blocks)
+#define __CUDA_ARCH__ 520
+uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
+uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
+uint32_t atomicExch(uint32_t *x, uint32_t y);
+uint32_t atomicAdd(uint32_t *x, uint32_t y);
+void __syncthreads(void);
+void __threadfence(void);
+void __threadfence_block(void);
+uint32_t __ldg(const uint32_t* address);
+uint64_t __ldg(const uint64_t* address);
+uint4 __ldca(const uint4 *ptr);
+u32 __ldca(const u32 *ptr);
+u32 umin(const u32, const u32);
+u32 umax(const u32, const u32);
+#endif
+
+
+// A proof is an array of PROOFSIZE (1 << WK) u32 values.
+typedef u32 proof[PROOFSIZE];
+
+
+// Full-size slot: eight 32-bit words, 32-byte aligned so it can be accessed as
+// two uint4 values.
+struct __align__(32) slot
+{
+	u32 hash[8];
+};
+
+
+// Reduced slot: four 32-bit words, 16-byte aligned (one uint4).
+struct __align__(16) slotsmall
+{
+	u32 hash[4];
+};
+
+
+// Smallest slot: two 32-bit words, 8-byte aligned (one uint2).
+struct __align__(8) slottiny
+{
+	u32 hash[2];
+};
+
+
+// Working state of the solver, accessed by the kernels through a single pointer.
+// RB and SM feed the NBUCKETS / NSLOTS macros that size most arrays.
+// Field order and sizes define the memory layout - do not reorder.
+template <u32 RB, u32 SM>
+struct equi
+{
+	// Output of digit_first: 4096 buckets of up to RB8_NSLOTS slots each.
+	slot round0trees[4096][RB8_NSLOTS];
+	// trees[0] receives the output of digit_1.
+	slot trees[1][NBUCKETS][NSLOTS];
+	// Output of digit_2, split into a 16-byte part and an 8-byte part per slot.
+	struct
+	{
+		slotsmall treessmall[NSLOTS];
+		slottiny treestiny[NSLOTS];
+	} round2trees[NBUCKETS];
+	// NOTE(review): presumably the digit_3 output with the same split as
+	// round2trees - the writer is not visible here, confirm.
+	struct
+	{
+		slotsmall treessmall[NSLOTS];
+		slottiny treestiny[NSLOTS];
+	} round3trees[NBUCKETS];
+	// Storage for later rounds (users are not visible in this part of the file).
+	slotsmall treessmall[4][NBUCKETS][NSLOTS];
+	slottiny treestiny[1][4096][RB8_NSLOTS_LD];
+	u32 round4bidandsids[NBUCKETS][NSLOTS];
+	// Hash state read by digit_first, viewable as 8 x u64 or 16 x u32.
+	union
+	{
+		u64 blake_h[8];
+		u32 blake_h32[16];
+	};
+	struct
+	{
+		u32 nslots8[4096];
+		// Per-bucket fill counters for round0trees (atomically incremented in digit_first).
+		u32 nslots0[4096];
+		// Per-round, per-bucket fill counters; nslots[1] counts digit_1 output,
+		// nslots[2] counts digit_2 output.
+		u32 nslots[9][NBUCKETS];
+		scontainerreal srealcont;
+	} edata;
+};
+
+
+// BLAKE2b initialisation vector (eight 64-bit words), kept in constant memory.
+__device__ __constant__ const u64 blake_iv[] =
+{
+	0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
+	0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+	0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+	0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
+};
+
+// Component-wise XOR of two uint2 values.
+__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b)
+{
+	uint2 r;
+	r.x = a.x ^ b.x;
+	r.y = a.y ^ b.y;
+	return r;
+}
+
+// Component-wise XOR of two uint4 values.
+__device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b)
+{
+	uint4 r;
+	r.x = a.x ^ b.x;
+	r.y = a.y ^ b.y;
+	r.z = a.z ^ b.z;
+	r.w = a.w ^ b.w;
+	return r;
+}
+
+// Rotates the 64-bit value held in a (x = low word, y = high word) right by
+// offset bits using two funnel shifts.
+// NOTE(review): shf.r.wrap uses only the low 5 bits of the shift amount, and the
+// operand order here is already swapped, so this looks correct only for
+// 32 <= offset < 64 (the one caller in this file passes 63) - confirm before
+// reusing it with smaller offsets.
+__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) 
+{
+	uint2 rotated;
+	asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(rotated.x) : "r"(a.y), "r"(a.x), "r"(offset));
+	asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(rotated.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	return rotated;
+}
+
+// Swaps the two 32-bit halves of value (equivalent to a 64-bit rotate by 32).
+__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) 
+{
+	uint2 swapped;
+	swapped.x = value.y;
+	swapped.y = value.x;
+	return swapped;
+}
+
+// 64-bit rotate right by 24 bits (x = low word, y = high word), done as two
+// byte permutations instead of shifts.
+__device__ __forceinline__ uint2 ROR24(const uint2 a)
+{
+	return make_uint2(__byte_perm(a.y, a.x, 0x2107),
+	                  __byte_perm(a.y, a.x, 0x6543));
+}
+
+// 64-bit rotate right by 16 bits (x = low word, y = high word), done as two
+// byte permutations instead of shifts.
+__device__ __forceinline__ uint2 ROR16(const uint2 a)
+{
+	return make_uint2(__byte_perm(a.y, a.x, 0x1076),
+	                  __byte_perm(a.y, a.x, 0x5432));
+}
+
+// One BLAKE2b G mixing step on the state words a, b, c, d with message words
+// x and y. The rotations (32, 24, 16, 63) are performed on the uint2 view of
+// each 64-bit word via SWAPUINT2 / ROR24 / ROR16 / ROR2.
+__device__ __forceinline__ void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) 
+{
+	// 32-bit-pair views of the same storage, used by the rotate helpers
+	uint2& a2 = *(uint2*)&a;
+	uint2& b2 = *(uint2*)&b;
+	uint2& c2 = *(uint2*)&c;
+	uint2& d2 = *(uint2*)&d;
+
+	a = a + b + x;
+	d2 = SWAPUINT2(d2 ^ a2);
+	c = c + d;
+	b2 = ROR24(b2 ^ c2);
+
+	a = a + b + y;
+	d2 = ROR16(d2 ^ a2);
+	c = c + d;
+	b2 = ROR2(b2 ^ c2, 63U);
+}
+
+
+// Packs a bucket id and two slot indices into one u32 as three adjacent bit
+// fields: [bucketid | s0 | s1], the two slot fields being SLOTBITS wide.
+// The RB / SM parameters look unused but are consumed by the SLOTBITS,
+// SLOTMASK and BUCKMASK macros, which expand those identifiers textually.
+struct packer_default
+{
+	__device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM)
+	{
+		const u32 upper = (bucketid << SLOTBITS) | s0;
+		return (upper << SLOTBITS) | s1;
+	}
+
+	__device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM)
+	{
+		// BUCKMASK-ed to prevent illegal memory accesses in case of memory errors
+		const u32 raw = bid >> (2 * SLOTBITS);
+		return raw & BUCKMASK;
+	}
+
+	// Returns the lowest slot field (where set_bucketid_and_slots stored s1);
+	// the s1 argument is not used by this packer.
+	__device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM)
+	{
+		return SLOTMASK & bid;
+	}
+
+	// Returns the middle slot field (where set_bucketid_and_slots stored s0).
+	__device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM)
+	{
+		const u32 shifted = bid >> SLOTBITS;
+		return shifted & SLOTMASK;
+	}
+};
+
+
+// Packs a bucket id and an unordered pair of slot indices into one u32. The
+// pair is encoded as a single triangular-number index, max*(max+1)/2 + min,
+// stored in the low CANTORBITS bits, with the bucket id above it.
+// The RB / SM parameters are consumed by the CANTORBITS, CANTORMASK,
+// CANTORMAXSQRT, SLOTMASK and BUCKMASK macros, which expand those identifiers.
+struct packer_cantor
+{
+	__device__ __forceinline__ static u32 cantor(const u32 s0, const u32 s1)
+	{
+		const u32 larger = umax(s0, s1);
+		const u32 smaller = umin(s0, s1);
+		return larger * (larger + 1) / 2 + smaller;
+	}
+
+	__device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM)
+	{
+		const u32 pair_code = cantor(s0, s1);
+		return (bucketid << CANTORBITS) | pair_code;
+	}
+
+	__device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM)
+	{
+		const u32 raw = bid >> CANTORBITS;
+		return raw & BUCKMASK;
+	}
+
+	// Recovers the smaller index of the pair, given the larger one in s1
+	// (as returned by get_slot1).
+	__device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM)
+	{
+		const u32 pair_code = bid & CANTORMASK;
+		return (pair_code - cantor(0, s1)) & SLOTMASK;
+	}
+
+	// Recovers the larger index of the pair by inverting the triangular number
+	// with an integer square root.
+	__device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM)
+	{
+		const u32 sqr = 8 * (bid & CANTORMASK) + 1;
+		// this k=sqrt(sqr) computing loop averages 3.4 iterations out of maximum 9
+		u32 k = CANTORMAXSQRT;
+		u32 q = sqr / k;
+		while (q < k)
+		{
+			k = (k + q) / 2;
+			q = sqr / k;
+		}
+		return ((k - 1) / 2) & SLOTMASK;
+	}
+};
+
+
+/*
+  Round 0: every thread computes one BLAKE2b compression from the shared starting
+  state eq->blake_h and a message word built from (global thread index, nonce),
+  then splits the output into two hashes and appends each one to
+  eq->round0trees, in the bucket selected by 12 bits of that hash.
+  The compression is hand-unrolled into 12 rounds of 8 G2 calls; the single
+  non-zero message word m is passed in the one position per round where the
+  round's message schedule places it, all other message words are 0.
+  Each stored slot holds six words of hash data in hash[0..5], 0 in hash[6] and
+  the hash index (2*block or 2*block+1) in hash[7].
+  The PACKER template parameter is not referenced in this kernel.
+*/
+template <u32 RB, u32 SM, typename PACKER>
+__global__ void digit_first(equi<RB, SM>* eq, u32 nonce)
+{
+	const u32 block = blockIdx.x * blockDim.x + threadIdx.x;
+	__shared__ u64 hash_h[8];
+	u32* hash_h32 = (u32*)hash_h;
+
+	// the first 16 threads of the block stage the starting hash state in shared memory
+	if (threadIdx.x < 16)
+		hash_h32[threadIdx.x] = __ldca(&eq->blake_h32[threadIdx.x]);
+
+	__syncthreads();
+
+	// the only non-zero message word: thread index in the high half, nonce in the low half
+	u64 m = (u64)block << 32 | (u64)nonce;
+
+	// the working vector, viewable as 64-, 32- or 128-bit lanes
+	union
+	{
+		u64 v[16];
+		u32 v32[32];
+		uint4 v128[8];
+	};
+
+	v[0] = hash_h[0];
+	v[1] = hash_h[1];
+	v[2] = hash_h[2];
+	v[3] = hash_h[3];
+	v[4] = hash_h[4];
+	v[5] = hash_h[5];
+	v[6] = hash_h[6];
+	v[7] = hash_h[7];
+	v[8] = blake_iv[0];
+	v[9] = blake_iv[1];
+	v[10] = blake_iv[2];
+	v[11] = blake_iv[3];
+	// NOTE(review): 128 + 16 is presumably the BLAKE2b byte counter (144 input bytes) - confirm
+	v[12] = blake_iv[4] ^ (128 + 16);
+	v[13] = blake_iv[5];
+	// NOTE(review): all-ones XOR is presumably the BLAKE2b last-block flag - confirm
+	v[14] = blake_iv[6] ^ 0xffffffffffffffff;
+	v[15] = blake_iv[7];
+
+	// mix 1
+	G2(v[0], v[4], v[8], v[12], 0, m);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 2
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], m, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 3
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, m);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 4
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, m);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 5
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, m);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 6
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], m, 0);
+
+	// mix 7
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], m, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 8
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, m);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 9
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], m, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 10
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], m, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 11
+	G2(v[0], v[4], v[8], v[12], 0, m);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 12
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], m, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// finalisation: only the output words that are used below are computed
+	// (v[0..5] in full plus the low 32 bits of v[6])
+	v[0] ^= hash_h[0] ^ v[8];
+	v[1] ^= hash_h[1] ^ v[9];
+	v[2] ^= hash_h[2] ^ v[10];
+	v[3] ^= hash_h[3] ^ v[11];
+	v[4] ^= hash_h[4] ^ v[12];
+	v[5] ^= hash_h[5] ^ v[13];
+	v32[12] ^= hash_h32[12] ^ v32[28];
+
+	// first hash: bucket id is a 12-bit field of the byte-reordered leading word
+	u32 bexor = __byte_perm(v32[0], 0, 0x4012); // first 20 bits
+	u32 bucketid;
+	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
+	// reserve a slot in the bucket; entries beyond RB8_NSLOTS are dropped
+	u32 slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
+	if (slotp < RB8_NSLOTS)
+	{
+		slot* s = &eq->round0trees[bucketid][slotp];
+
+		uint4 tt;
+		tt.x = __byte_perm(v32[0], v32[1], 0x1234);
+		tt.y = __byte_perm(v32[1], v32[2], 0x1234);
+		tt.z = __byte_perm(v32[2], v32[3], 0x1234);
+		tt.w = __byte_perm(v32[3], v32[4], 0x1234);
+		*(uint4*)(&s->hash[0]) = tt;
+
+		tt.x = __byte_perm(v32[4], v32[5], 0x1234);
+		tt.y = __byte_perm(v32[5], v32[6], 0x1234);
+		tt.z = 0;
+		tt.w = block << 1;	// index of the first hash of this thread
+		*(uint4*)(&s->hash[4]) = tt;
+	}
+
+	// second hash: same procedure, starting at v32[6] with a different byte alignment
+	bexor = __byte_perm(v32[6], 0, 0x0123);
+	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
+	slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
+	if (slotp < RB8_NSLOTS)
+	{
+		slot* s = &eq->round0trees[bucketid][slotp];
+
+		uint4 tt;
+		tt.x = __byte_perm(v32[6], v32[7], 0x2345);
+		tt.y = __byte_perm(v32[7], v32[8], 0x2345);
+		tt.z = __byte_perm(v32[8], v32[9], 0x2345);
+		tt.w = __byte_perm(v32[9], v32[10], 0x2345);
+		*(uint4*)(&s->hash[0]) = tt;
+
+		tt.x = __byte_perm(v32[10], v32[11], 0x2345);
+		tt.y = __byte_perm(v32[11], v32[12], 0x2345);
+		tt.z = 0;
+		tt.w = (block << 1) + 1;	// index of the second hash of this thread
+		*(uint4*)(&s->hash[4]) = tt;
+	}
+}
+
+/*
+  Functions digit_1 to digit_8 all work on the same principle.
+  Each thread does 2-3 slot loads (loads are coalesced).
+  The xorwork of the slots is loaded into shared memory and is also kept in registers (except for digit_1).
+  At the same time, the restbits (8 or 9 bits) of the xorwork are used to find collisions.
+  The restbits determine the position in ht.
+  Next comes pair creation. The xorwork of the first one (or two) pairs is written to global memory
+  as soon as possible; the remaining pairs are saved in shared memory (one u32 per pair - two 16-bit indices).
+  In most cases every thread has one (or two) pairs, so this trick takes some memory writes off the last step.
+  In the last step the xorwork of the saved pairs is written to memory.
+*/
+/*
+  Round 1: one thread block per round-0 bucket (bucketid = blockIdx.x).
+  Phase 1: each thread loads up to two slots of eq->round0trees[bucketid], caches
+           their words in shared memory and inserts the slot index into the
+           shared hash table ht, keyed by an 8-bit field of the first word.
+  Phase 2: each slot that collided is XOR-ed with the first entry of its ht row
+           and the result is written to eq->trees[0]; pairs with the other
+           entries of the row are queued in shared memory.
+  Phase 3: the queued pairs are processed cooperatively by all threads.
+  Output slots carry the XOR-ed words plus the packed (bucketid, slot, slot)
+  reference produced by packer_default with RB = 8 and SM = RB8_NSLOTS.
+  The PACKER template parameter is not referenced in this kernel.
+*/
+template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
+__global__ void digit_1(equi<RB, SM>* eq)
+{
+	__shared__ u16 ht[256][SSM - 1];
+	__shared__ uint2 lastword1[RB8_NSLOTS];
+	__shared__ uint4 lastword2[RB8_NSLOTS];
+	// first holds the ht row lengths (entries 0..255); reused as the pair queue after phase 1
+	__shared__ int ht_len[MAXPAIRS];
+	__shared__ u32 pairs_len;
+	__shared__ u32 next_pair;
+
+	const u32 threadid = threadIdx.x;
+	const u32 bucketid = blockIdx.x;
+
+	// reset hashtable len
+	if (threadid < 256)
+		ht_len[threadid] = 0;
+	else if (threadid == (THREADS - 1))
+		pairs_len = 0;
+	else if (threadid == (THREADS - 33))
+		next_pair = 0;
+
+	// number of valid slots in this bucket (the counter may exceed the capacity)
+	u32 bsize = umin(eq->edata.nslots0[bucketid], RB8_NSLOTS);
+
+	u32 hr[2];
+	int pos[2];
+	// SSM marks "no slot loaded" for phase 2
+	pos[0] = pos[1] = SSM;
+
+	uint2 ta[2];
+	uint4 tb[2];
+
+	u32 si[2];
+
+	// enable this to make fully safe shared mem operations;
+	// disabled gains some speed, but can rarely cause a crash
+	//__syncthreads();
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		si[i] = i * THREADS + threadid;
+		if (si[i] >= bsize) break;
+
+		const slot* pslot1 = eq->round0trees[bucketid] + si[i];
+
+		// get xhash
+		uint4 a1 = *(uint4*)(&pslot1->hash[0]);
+		uint2 a2 = *(uint2*)(&pslot1->hash[4]);
+		ta[i].x = a1.x;
+		ta[i].y = a1.y;
+		lastword1[si[i]] = ta[i];
+		tb[i].x = a1.z;
+		tb[i].y = a1.w;
+		tb[i].z = a2.x;
+		tb[i].w = a2.y;
+		lastword2[si[i]] = tb[i];
+
+		// 8-bit collision key: bits 20..27 of the first word
+		asm("bfe.u32 %0, %1, 20, 8;" : "=r"(hr[i]) : "r"(ta[i].x));
+		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
+		// rows hold at most SSM - 1 entries; overflowing slots are not recorded
+		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
+	}
+
+	__syncthreads();
+	// from here on the ht_len storage is used as the pair queue
+	int* pairs = ht_len;
+
+	u32 xors[6];
+	u32 xorbucketid, xorslot;
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		if (pos[i] >= SSM) continue;
+
+		// pos[i] > 0 means earlier entries with the same key exist in the row
+		if (pos[i] > 0)
+		{
+			u16 p = ht[hr[i]][0];
+
+			*(uint2*)(&xors[0]) = ta[i] ^ lastword1[p];
+
+			// destination bucket: BUCKBITS bits of xors[0] starting at bit RB
+			asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS));
+			xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1);
+
+			if (xorslot < NSLOTS)
+			{
+				*(uint4*)(&xors[2]) = lastword2[si[i]] ^ lastword2[p];
+
+				// output layout: hash[0..3] = xors[1..4], hash[4] = xors[5],
+				// hash[5] = xors[0], hash[6] = packed reference, hash[7] = 0
+				slot &xs = eq->trees[0][xorbucketid][xorslot];
+				*(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]);
+				uint4 ttx;
+				ttx.x = xors[5];
+				ttx.y = xors[0];
+				ttx.z = packer_default::set_bucketid_and_slots(bucketid, si[i], p, 8, RB8_NSLOTS);
+				ttx.w = 0;
+				*(uint4*)(&xs.hash[4]) = ttx;
+			}
+
+			// queue the pairs with the remaining earlier entries of the row
+			for (int k = 1; k != pos[i]; ++k)
+			{
+				u32 pindex = atomicAdd(&pairs_len, 1);
+				if (pindex >= MAXPAIRS) break;
+				u16 prev = ht[hr[i]][k];
+				// both 16-bit slot indices packed into one 32-bit word
+				pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+
+	u32 i, k;
+	// threads claim queue entries one at a time through the shared next_pair counter
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		// unpack the low and high 16-bit slot indices
+		i = __byte_perm(pair, 0, 0x4510);
+		k = __byte_perm(pair, 0, 0x4532);
+
+		*(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k];
+
+		asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS));
+		xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1);
+
+		if (xorslot < NSLOTS)
+		{
+			*(uint4*)(&xors[2]) = lastword2[i] ^ lastword2[k];
+
+			slot &xs = eq->trees[0][xorbucketid][xorslot];
+			*(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]);
+			uint4 ttx;
+			ttx.x = xors[5];
+			ttx.y = xors[0];
+			ttx.z = packer_default::set_bucketid_and_slots(bucketid, i, k, 8, RB8_NSLOTS);
+			ttx.w = 0;
+			*(uint4*)(&xs.hash[4]) = ttx;
+		}
+	}
+}
+
+
+/*
+  Round 2: one thread block per bucket of eq->trees[0] (bucketid = blockIdx.x).
+  Same three phases as digit_1: load up to two slots per thread into shared
+  memory and hash them into ht by their RB rest bits, emit the XOR with the
+  first row entry immediately, then process the queued remaining pairs.
+  Results go to eq->round2trees: four words into treessmall, and the fifth
+  word plus the PACKER-encoded (bucketid, slot, slot) reference into treestiny.
+*/
+template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
+__global__ void digit_2(equi<RB, SM>* eq)
+{
+	__shared__ u16 ht[NRESTS][SSM - 1];
+	__shared__ u32 lastword1[NSLOTS];
+	__shared__ uint4 lastword2[NSLOTS];
+	__shared__ int ht_len[NRESTS];
+	__shared__ int pairs[MAXPAIRS];
+	__shared__ u32 pairs_len;
+	__shared__ u32 next_pair;
+
+	const u32 threadid = threadIdx.x;
+	const u32 bucketid = blockIdx.x;
+
+	// reset hashtable len
+	if (threadid < NRESTS)
+		ht_len[threadid] = 0;
+	else if (threadid == (THREADS - 1))
+		pairs_len = 0;
+	else if (threadid == (THREADS - 33))
+		next_pair = 0;
+
+	slot* buck = eq->trees[0][bucketid];
+	// number of valid slots in this bucket (the counter may exceed the capacity)
+	u32 bsize = umin(eq->edata.nslots[1][bucketid], NSLOTS);
+
+	u32 hr[2];
+	int pos[2];
+	// SSM marks "no slot loaded" for the pairing phase
+	pos[0] = pos[1] = SSM;
+
+	u32 ta[2];
+	uint4 tt[2];
+
+	u32 si[2];
+
+	// enable this to make fully safe shared mem operations;
+	// disabled gains some speed, but can rarely cause a crash
+	//__syncthreads();
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		si[i] = i * THREADS + threadid;
+		if (si[i] >= bsize) break;
+
+		// get slot
+		const slot* pslot1 = buck + si[i];
+
+		uint4 ttx = *(uint4*)(&pslot1->hash[0]);
+		lastword1[si[i]] = ta[i] = ttx.x;
+		uint2 tty = *(uint2*)(&pslot1->hash[4]);
+		tt[i].x = ttx.y;
+		tt[i].y = ttx.z;
+		tt[i].z = ttx.w;
+		tt[i].w = tty.x;
+		lastword2[si[i]] = tt[i];
+
+		// collision key: the low RB bits of hash[5]
+		hr[i] = tty.y & RESTMASK;
+		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
+		// rows hold at most SSM - 1 entries; overflowing slots are not recorded
+		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
+	}
+
+	__syncthreads();
+
+	u32 xors[5];
+	u32 xorbucketid, xorslot;
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		if (pos[i] >= SSM) continue;
+
+		// pos[i] > 0 means earlier entries with the same key exist in the row
+		if (pos[i] > 0)
+		{
+			u16 p = ht[hr[i]][0];
+
+			xors[0] = ta[i] ^ lastword1[p];
+
+			// destination bucket: the bits of xors[0] above bit 12 + RB
+			xorbucketid = xors[0] >> (12 + RB);
+			xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1);
+			if (xorslot < NSLOTS)
+			{
+				*(uint4*)(&xors[1]) = tt[i] ^ lastword2[p];
+				// xors[0..3] go to the small slot, xors[4] and the reference to the tiny slot
+				slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot];
+				*(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]);
+				slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot];
+				uint2 ttx;
+				ttx.x = xors[4];
+				ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+				*(uint2*)(&xst.hash[0]) = ttx;
+			}
+
+			// queue the pairs with the remaining earlier entries of the row
+			for (int k = 1; k != pos[i]; ++k)
+			{
+				u32 pindex = atomicAdd(&pairs_len, 1);
+				if (pindex >= MAXPAIRS) break;
+				u16 prev = ht[hr[i]][k];
+				// both 16-bit slot indices packed into one 32-bit word
+				pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+
+	u32 i, k;
+	// threads claim queue entries one at a time through the shared next_pair counter
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		// unpack the low and high 16-bit slot indices
+		i = __byte_perm(pair, 0, 0x4510);
+		k = __byte_perm(pair, 0, 0x4532);
+
+		xors[0] = lastword1[i] ^ lastword1[k];
+
+		xorbucketid = xors[0] >> (12 + RB);
+		xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1);
+		if (xorslot < NSLOTS)
+		{
+			*(uint4*)(&xors[1]) = lastword2[i] ^ lastword2[k];
+			slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot];
+			*(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]);
+			slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot];
+			uint2 ttx;
+			ttx.x = xors[4];
+			ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+			*(uint2*)(&xst.hash[0]) = ttx;
+		}
+	}
+}
+
+
+/*
+  Round 3 kernel. One thread block processes one bucket (blockIdx.x) of
+  eq->round2trees; the fill count is eq->edata.nslots[2][bucketid], clamped
+  to NSLOTS.
+
+  Each thread loads up to two slots (threadid and THREADS + threadid):
+  treessmall hash[0..3] goes to lastword1, treestiny hash[0] to lastword2.
+  The hash-table key is the RB-bit field starting at bit 12 of word 0.
+  Colliding slots are XORed; a result whose last word (xors[4]) is zero is
+  discarded. The first partner of a slot is handled inline, further partners
+  are queued in `pairs` as (slot << 16) | partner and drained by all threads.
+
+  For a kept result, bexor = (xors[0] << 8) | (xors[1] >> 24); the output
+  bucket is the BUCKBITS-bit field of bexor starting at bit RB. Words 1..4
+  are stored in round3trees[].treessmall, bexor and the packed
+  (bucketid, slot, slot) reference in round3trees[].treestiny. Slots are
+  reserved with atomicAdd on eq->edata.nslots[3][]; overflow is dropped.
+*/
+template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
+__global__ void digit_3(equi<RB, SM>* eq)
+{
+	__shared__ u16 ht[NRESTS][(SSM - 1)];
+	__shared__ uint4 lastword1[NSLOTS];
+	__shared__ u32 lastword2[NSLOTS];
+	__shared__ int ht_len[NRESTS];
+	__shared__ int pairs[MAXPAIRS];
+	__shared__ u32 pairs_len;
+	__shared__ u32 next_pair;
+
+	const u32 threadid = threadIdx.x;
+	const u32 bucketid = blockIdx.x;
+
+	// reset hashtable len
+	if (threadid < NRESTS)
+		ht_len[threadid] = 0;
+	else if (threadid == (THREADS - 1))
+		pairs_len = 0;
+	else if (threadid == (THREADS - 33))
+		next_pair = 0;
+
+	u32 bsize = umin(eq->edata.nslots[2][bucketid], NSLOTS);
+
+	u32 hr[2];
+	int pos[2];
+	// SSM marks "no slot loaded"; such entries are skipped below.
+	pos[0] = pos[1] = SSM;
+
+	u32 si[2];
+	uint4 tt[2];
+	u32 ta[2];
+
+	// ht_len[] must be fully zeroed before any thread does atomicAdd on it.
+	// Without this barrier a fast warp can read an uninitialised counter and
+	// then index ht/lastword with garbage (the crash the original comment
+	// warned about), so the barrier is kept enabled.
+	__syncthreads();
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		si[i] = i * THREADS + threadid;
+		if (si[i] >= bsize) break;
+
+		slotsmall &xs = eq->round2trees[bucketid].treessmall[si[i]];
+		slottiny &xst = eq->round2trees[bucketid].treestiny[si[i]];
+
+		tt[i] = *(uint4*)(&xs.hash[0]);
+		lastword1[si[i]] = tt[i];
+		ta[i] = xst.hash[0];
+		lastword2[si[i]] = ta[i];
+		// key = RB bits of word 0 starting at bit 12
+		asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB));
+		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
+		// Only the first SSM - 1 entries of a row are stored.
+		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
+	}
+
+	__syncthreads();
+
+	u32 xors[5];
+	u32 bexor, xorbucketid, xorslot;
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		if (pos[i] >= SSM) continue;
+
+		if (pos[i] > 0)
+		{
+			// Handle the first partner inline.
+			u16 p = ht[hr[i]][0];
+
+			xors[4] = ta[i] ^ lastword2[p];
+
+			if (xors[4] != 0)
+			{
+				*(uint4*)(&xors[0]) = tt[i] ^ lastword1[p];
+
+				// bexor = (xors[0] << 8) | (xors[1] >> 24)
+				bexor = __byte_perm(xors[0], xors[1], 0x2107);
+				asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS));
+				xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1);
+
+				if (xorslot < NSLOTS)
+				{
+					slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot];
+					*(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]);
+					slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot];
+					uint2 ttx;
+					ttx.x = bexor;
+					ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+					*(uint2*)(&xst.hash[0]) = ttx;
+				}
+			}
+
+			// Queue the remaining partners: (si << 16) | prev.
+			for (int k = 1; k != pos[i]; ++k)
+			{
+				u32 pindex = atomicAdd(&pairs_len, 1);
+				if (pindex >= MAXPAIRS) break;
+				u16 prev = ht[hr[i]][k];
+				pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+
+	u32 i, k;
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		// i = low 16 bits of the pair, k = high 16 bits
+		i = __byte_perm(pair, 0, 0x4510);
+		k = __byte_perm(pair, 0, 0x4532);
+
+		xors[4] = lastword2[i] ^ lastword2[k];
+
+		if (xors[4] != 0)
+		{
+			*(uint4*)(&xors[0]) = lastword1[i] ^ lastword1[k];
+
+			bexor = __byte_perm(xors[0], xors[1], 0x2107);
+			asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS));
+			xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1);
+
+			if (xorslot < NSLOTS)
+			{
+				slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot];
+				*(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]);
+				slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot];
+				uint2 ttx;
+				ttx.x = bexor;
+				ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+				*(uint2*)(&xst.hash[0]) = ttx;
+			}
+		}
+	}
+}
+
+
+/*
+  Round 4 kernel. One thread block processes one bucket (blockIdx.x) of
+  eq->round3trees; the fill count is eq->edata.nslots[3][bucketid], clamped
+  to NSLOTS.
+
+  Each thread loads up to two slots (threadid and THREADS + threadid):
+  treessmall hash[0..3] is cached in `lastword`, and the hash-table key is
+  treestiny hash[0] & RESTMASK. Colliding slots are XORed; a result whose
+  word 3 is zero is discarded. The first partner of a slot is handled inline,
+  further partners are queued in `pairs` as (slot << 16) | partner and
+  drained by all threads.
+
+  The output bucket is the BUCKBITS-bit field of xors[0] starting at bit
+  4 + RB. The four XOR words are stored in eq->treessmall[3] and the packed
+  (bucketid, slot, slot) reference in eq->round4bidandsids. Slots are
+  reserved with atomicAdd on eq->edata.nslots[4][]; overflow is dropped.
+*/
+template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
+__global__ void digit_4(equi<RB, SM>* eq)
+{
+	__shared__ u16 ht[NRESTS][(SSM - 1)];
+	__shared__ uint4 lastword[NSLOTS];
+	__shared__ int ht_len[NRESTS];
+	__shared__ int pairs[MAXPAIRS];
+	__shared__ u32 pairs_len;
+	__shared__ u32 next_pair;
+
+	const u32 threadid = threadIdx.x;
+	const u32 bucketid = blockIdx.x;
+
+	// reset hashtable len
+	if (threadid < NRESTS)
+		ht_len[threadid] = 0;
+	else if (threadid == (THREADS - 1))
+		pairs_len = 0;
+	else if (threadid == (THREADS - 33))
+		next_pair = 0;
+
+	u32 bsize = umin(eq->edata.nslots[3][bucketid], NSLOTS);
+
+	u32 hr[2];
+	int pos[2];
+	// SSM marks "no slot loaded"; such entries are skipped below.
+	pos[0] = pos[1] = SSM;
+
+	u32 si[2];
+	uint4 tt[2];
+
+	// ht_len[] must be fully zeroed before any thread does atomicAdd on it.
+	// Without this barrier a fast warp can read an uninitialised counter and
+	// then index ht/lastword with garbage (the crash the original comment
+	// warned about), so the barrier is kept enabled.
+	__syncthreads();
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		si[i] = i * THREADS + threadid;
+		if (si[i] >= bsize) break;
+
+		slotsmall &xs = eq->round3trees[bucketid].treessmall[si[i]];
+		slottiny &xst = eq->round3trees[bucketid].treestiny[si[i]];
+
+		// get xhash
+		tt[i] = *(uint4*)(&xs.hash[0]);
+		lastword[si[i]] = tt[i];
+		hr[i] = xst.hash[0] & RESTMASK;
+		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
+		// Only the first SSM - 1 entries of a row are stored.
+		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
+	}
+
+	__syncthreads();
+	u32 xors[4];
+	u32 xorbucketid, xorslot;
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		if (pos[i] >= SSM) continue;
+
+		if (pos[i] > 0)
+		{
+			// Handle the first partner inline.
+			u16 p = ht[hr[i]][0];
+
+			*(uint4*)(&xors[0]) = tt[i] ^ lastword[p];
+
+			if (xors[3] != 0)
+			{
+				asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS));
+				xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1);
+				if (xorslot < NSLOTS)
+				{
+					slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot];
+					*(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]);
+
+					eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+				}
+			}
+
+			// Queue the remaining partners: (si << 16) | prev.
+			for (int k = 1; k != pos[i]; ++k)
+			{
+				u32 pindex = atomicAdd(&pairs_len, 1);
+				if (pindex >= MAXPAIRS) break;
+				u16 prev = ht[hr[i]][k];
+				pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+	u32 i, k;
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		// i = low 16 bits of the pair, k = high 16 bits
+		i = __byte_perm(pair, 0, 0x4510);
+		k = __byte_perm(pair, 0, 0x4532);
+
+		*(uint4*)(&xors[0]) = lastword[i] ^ lastword[k];
+		if (xors[3] != 0)
+		{
+			asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS));
+			xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1);
+			if (xorslot < NSLOTS)
+			{
+				slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot];
+				*(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]);
+				eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+			}
+		}
+	}
+}
+
+
+/*
+  Round 5 kernel. One thread block processes one bucket (blockIdx.x) of
+  eq->treessmall[3]; the fill count is eq->edata.nslots[4][bucketid],
+  clamped to NSLOTS.
+
+  Each thread loads up to two slots (threadid and THREADS + threadid) and
+  caches hash[0..3] in `lastword`. The hash-table key is the RB-bit field
+  starting at bit 4 of word 0. Colliding slots are XORed; a result whose
+  word 3 is zero is discarded. The first partner of a slot is handled inline,
+  further partners are queued in `pairs` as (slot << 16) | partner and
+  drained by all threads.
+
+  For a kept result, bexor = (xors[0] << 16) | (xors[1] >> 16); the output
+  bucket is the BUCKBITS-bit field of bexor starting at bit RB. The stored
+  slot in eq->treessmall[2] holds xors[1], xors[2], xors[3] and the packed
+  (bucketid, slot, slot) reference. Slots are reserved with atomicAdd on
+  eq->edata.nslots[5][]; overflow is dropped.
+*/
+template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
+__global__ void digit_5(equi<RB, SM>* eq)
+{
+	__shared__ u16 ht[NRESTS][(SSM - 1)];
+	__shared__ uint4 lastword[NSLOTS];
+	__shared__ int ht_len[NRESTS];
+	__shared__ int pairs[MAXPAIRS];
+	__shared__ u32 pairs_len;
+	__shared__ u32 next_pair;
+
+	const u32 threadid = threadIdx.x;
+	const u32 bucketid = blockIdx.x;
+
+	// reset hashtable len
+	if (threadid < NRESTS)
+		ht_len[threadid] = 0;
+	else if (threadid == (THREADS - 1))
+		pairs_len = 0;
+	else if (threadid == (THREADS - 33))
+		next_pair = 0;
+
+	slotsmall* buck = eq->treessmall[3][bucketid];
+	u32 bsize = umin(eq->edata.nslots[4][bucketid], NSLOTS);
+
+	u32 hr[2];
+	int pos[2];
+	// SSM marks "no slot loaded"; such entries are skipped below.
+	pos[0] = pos[1] = SSM;
+
+	u32 si[2];
+	uint4 tt[2];
+
+	// ht_len[] must be fully zeroed before any thread does atomicAdd on it.
+	// Without this barrier a fast warp can read an uninitialised counter and
+	// then index ht/lastword with garbage (the crash the original comment
+	// warned about), so the barrier is kept enabled.
+	__syncthreads();
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		si[i] = i * THREADS + threadid;
+		if (si[i] >= bsize) break;
+
+		const slotsmall* pslot1 = buck + si[i];
+
+		tt[i] = *(uint4*)(&pslot1->hash[0]);
+		lastword[si[i]] = tt[i];
+		// key = RB bits of word 0 starting at bit 4
+		asm("bfe.u32 %0, %1, 4, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB));
+		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
+		// Only the first SSM - 1 entries of a row are stored.
+		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
+	}
+
+	__syncthreads();
+	u32 xors[4];
+	u32 bexor, xorbucketid, xorslot;
+
+#pragma unroll
+	for (u32 i = 0; i != 2; ++i)
+	{
+		if (pos[i] >= SSM) continue;
+
+		if (pos[i] > 0)
+		{
+			// Handle the first partner inline.
+			u16 p = ht[hr[i]][0];
+
+			*(uint4*)(&xors[0]) = tt[i] ^ lastword[p];
+
+			if (xors[3] != 0)
+			{
+				// bexor = (xors[0] << 16) | (xors[1] >> 16)
+				bexor = __byte_perm(xors[0], xors[1], 0x1076);
+				asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS));
+				xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1);
+				if (xorslot < NSLOTS)
+				{
+					slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot];
+					uint4 ttx;
+					ttx.x = xors[1];
+					ttx.y = xors[2];
+					ttx.z = xors[3];
+					ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+					*(uint4*)(&xs.hash[0]) = ttx;
+				}
+			}
+
+			// Queue the remaining partners: (si << 16) | prev.
+			for (int k = 1; k != pos[i]; ++k)
+			{
+				u32 pindex = atomicAdd(&pairs_len, 1);
+				if (pindex >= MAXPAIRS) break;
+				u16 prev = ht[hr[i]][k];
+				pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+	u32 i, k;
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		// i = low 16 bits of the pair, k = high 16 bits
+		i = __byte_perm(pair, 0, 0x4510);
+		k = __byte_perm(pair, 0, 0x4532);
+
+		*(uint4*)(&xors[0]) = lastword[i] ^ lastword[k];
+
+		if (xors[3] != 0)
+		{
+			bexor = __byte_perm(xors[0], xors[1], 0x1076);
+			asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS));
+			xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1);
+			if (xorslot < NSLOTS)
+			{
+				slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot];
+				uint4 tt;
+				tt.x = xors[1];
+				tt.y = xors[2];
+				tt.z = xors[3];
+				tt.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+				*(uint4*)(&xs.hash[0]) = tt;
+			}
+		}
+	}
+}
+
+
+/*
+  Round 6 kernel. One thread block processes one bucket (blockIdx.x) of
+  eq->treessmall[2]; the fill count is eq->edata.nslots[5][bucketid],
+  clamped to NSLOTS.
+
+  Each thread loads up to three slots (threadid, NRESTS + threadid,
+  2 * NRESTS + threadid). Words 0..1 are cached in lastword1 and word 2 in
+  lastword2; the hash-table key is the RB-bit field starting at bit 16 of
+  word 0. Colliding slots are XORed; a result whose word 2 is zero is
+  discarded. The first two partners of a slot are handled inline, further
+  partners are queued as (slot << 16) | partner and drained by all threads.
+
+  For a kept result, bexor = (xors[0] << 16) | (xors[1] >> 16) and the output
+  bucket is bexor >> (12 + RB). The stored slot in eq->treessmall[0] holds
+  xors[1], xors[2], bexor and the packed (bucketid, slot, slot) reference.
+  Slots are reserved with atomicAdd on eq->edata.nslots[6][]; overflow is
+  dropped.
+
+  NOTE(review): the indexing (ht_len[threadid], si = i * NRESTS + threadid,
+  thread NRESTS - 33 loading bsize_sh) suggests a launch with NRESTS threads
+  per block, and ht_len[MAXPAIRS] is indexed by keys < NRESTS -- confirm
+  MAXPAIRS >= NRESTS at the launch site.
+*/
+template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS>
+__global__ void digit_6(equi<RB, SM>* eq)
+{
+	__shared__ u16 ht[NRESTS][(SSM - 1)];
+	__shared__ uint2 lastword1[NSLOTS];
+	__shared__ u32 lastword2[NSLOTS];
+	// Sized MAXPAIRS because the same storage is reused as the pair queue below.
+	__shared__ int ht_len[MAXPAIRS];
+	__shared__ u32 pairs_len;
+	__shared__ u32 bsize_sh;
+	__shared__ u32 next_pair;
+
+	const u32 threadid = threadIdx.x;
+	const u32 bucketid = blockIdx.x;
+
+	// reset hashtable len
+	ht_len[threadid] = 0;
+	if (threadid == (NRESTS - 1))
+	{
+		pairs_len = 0;
+		next_pair = 0;
+	}
+	else if (threadid == (NRESTS - 33))
+		bsize_sh = umin(eq->edata.nslots[5][bucketid], NSLOTS);
+
+	slotsmall* buck = eq->treessmall[2][bucketid];
+
+	u32 hr[3];
+	int pos[3];
+	// SSM marks "no slot loaded"; such entries are skipped below.
+	pos[0] = pos[1] = pos[2] = SSM;
+
+	u32 si[3];
+	uint4 tt[3];
+
+	// Makes the zeroed ht_len[] and bsize_sh visible to every thread.
+	__syncthreads();
+
+	u32 bsize = bsize_sh;
+
+#pragma unroll
+	for (u32 i = 0; i != 3; ++i)
+	{
+		si[i] = i * NRESTS + threadid;
+		if (si[i] >= bsize) break;
+
+		const slotsmall* pslot1 = buck + si[i];
+
+		tt[i] = *(uint4*)(&pslot1->hash[0]);
+		lastword1[si[i]] = *(uint2*)(&tt[i].x);
+		lastword2[si[i]] = tt[i].z;
+		// key = RB bits of word 0 starting at bit 16
+		asm("bfe.u32 %0, %1, 16, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB));
+		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
+		// Only the first SSM - 1 entries of a row are stored.
+		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
+	}
+
+	// doing this to save shared memory
+	// (ht_len is not read again after the barrier; each thread keeps its pos[] in registers)
+	int* pairs = ht_len;
+	__syncthreads();
+
+	u32 xors[3];
+	u32 bexor, xorbucketid, xorslot;
+
+#pragma unroll
+	for (u32 i = 0; i != 3; ++i)
+	{
+		if (pos[i] >= SSM) continue;
+
+		if (pos[i] > 0)
+		{
+			// First partner, handled inline.
+			u16 p = ht[hr[i]][0];
+
+			xors[2] = tt[i].z ^ lastword2[p];
+
+			if (xors[2] != 0)
+			{
+				*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p];
+
+				// bexor = (xors[0] << 16) | (xors[1] >> 16)
+				bexor = __byte_perm(xors[0], xors[1], 0x1076);
+				xorbucketid = bexor >> (12 + RB);
+				xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1);
+				if (xorslot < NSLOTS)
+				{
+					slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot];
+					uint4 ttx;
+					ttx.x = xors[1];
+					ttx.y = xors[2];
+					ttx.z = bexor;
+					ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+					*(uint4*)(&xs.hash[0]) = ttx;
+				}
+			}
+
+			if (pos[i] > 1)
+			{
+				// Second partner, also handled inline.
+				p = ht[hr[i]][1];
+
+				xors[2] = tt[i].z ^ lastword2[p];
+
+				if (xors[2] != 0)
+				{
+					*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p];
+
+					bexor = __byte_perm(xors[0], xors[1], 0x1076);
+					xorbucketid = bexor >> (12 + RB);
+					xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1);
+					if (xorslot < NSLOTS)
+					{
+						slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot];
+						uint4 ttx;
+						ttx.x = xors[1];
+						ttx.y = xors[2];
+						ttx.z = bexor;
+						ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+						*(uint4*)(&xs.hash[0]) = ttx;
+					}
+				}
+
+				// Queue the remaining partners: (si << 16) | prev.
+				for (int k = 2; k != pos[i]; ++k)
+				{
+					u32 pindex = atomicAdd(&pairs_len, 1);
+					if (pindex >= MAXPAIRS) break;
+					u16 prev = ht[hr[i]][k];
+					pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+				}
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		u32 pair = pairs[s];
+		// i = low 16 bits of the pair, k = high 16 bits
+		u32 i = __byte_perm(pair, 0, 0x4510);
+		u32 k = __byte_perm(pair, 0, 0x4532);
+
+		xors[2] = lastword2[i] ^ lastword2[k];
+		if (xors[2] == 0)
+			continue;
+
+		*(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k];
+
+		bexor = __byte_perm(xors[0], xors[1], 0x1076);
+		xorbucketid = bexor >> (12 + RB);
+		xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1);
+		if (xorslot >= NSLOTS) continue;
+		slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot];
+		uint4 ttx;
+		ttx.x = xors[1];
+		ttx.y = xors[2];
+		ttx.z = bexor;
+		ttx.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+		*(uint4*)(&xs.hash[0]) = ttx;
+	}
+}
+
+
+/*
+  Round 7 kernel. One thread block processes one bucket (blockIdx.x) of
+  eq->treessmall[0]; the fill count is eq->edata.nslots[6][bucketid],
+  clamped to NSLOTS.
+
+  Each thread loads up to three slots (threadid, NRESTS + threadid,
+  2 * NRESTS + threadid). Words 0..1 are cached in `lastword`; the hash-table
+  key is the RB-bit field starting at bit 12 of word 2. Colliding slots are
+  XORed (two words); a result whose word 1 is zero is discarded. The first
+  two partners of a slot are handled inline, further partners are queued in
+  `pairs` as (slot << 16) | partner and drained by all threads.
+
+  The output bucket is the BUCKBITS-bit field of xors[0] starting at bit
+  8 + RB. The stored slot in eq->treessmall[1] holds xors[0], xors[1], the
+  packed (bucketid, slot, slot) reference and 0. Slots are reserved with
+  atomicAdd on eq->edata.nslots[7][]; overflow is dropped.
+
+  NOTE(review): ht_len[threadid] is written unconditionally and ht_len has
+  NRESTS entries, so this presumably runs with NRESTS threads per block --
+  confirm at the launch site.
+*/
+template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS>
+__global__ void digit_7(equi<RB, SM>* eq)
+{
+	__shared__ u16 ht[NRESTS][(SSM - 1)];
+	__shared__ u32 lastword[NSLOTS][2];
+	__shared__ int ht_len[NRESTS];
+	__shared__ int pairs[MAXPAIRS];
+	__shared__ u32 pairs_len;
+	__shared__ u32 bsize_sh;
+	__shared__ u32 next_pair;
+
+	const u32 threadid = threadIdx.x;
+	const u32 bucketid = blockIdx.x;
+
+	// reset hashtable len
+	ht_len[threadid] = 0;
+	if (threadid == (NRESTS - 1))
+	{
+		pairs_len = 0;
+		next_pair = 0;
+	}
+	else if (threadid == (NRESTS - 33))
+		bsize_sh = umin(eq->edata.nslots[6][bucketid], NSLOTS);
+
+	slotsmall* buck = eq->treessmall[0][bucketid];
+
+	u32 hr[3];
+	int pos[3];
+	// SSM marks "no slot loaded"; such entries are skipped below.
+	pos[0] = pos[1] = pos[2] = SSM;
+
+	u32 si[3];
+	uint4 tt[3];
+
+	// Makes the zeroed ht_len[] and bsize_sh visible to every thread.
+	__syncthreads();
+
+	u32 bsize = bsize_sh;
+
+#pragma unroll
+	for (u32 i = 0; i != 3; ++i)
+	{
+		si[i] = i * NRESTS + threadid;
+		if (si[i] >= bsize) break;
+
+		const slotsmall* pslot1 = buck + si[i];
+
+		// get xhash
+		tt[i] = *(uint4*)(&pslot1->hash[0]);
+		*(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x);
+		// key = RB bits of word 2 starting at bit 12
+		asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].z), "r"(RB));
+		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
+		// Only the first SSM - 1 entries of a row are stored.
+		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
+	}
+
+	__syncthreads();
+
+	u32 xors[2];
+	u32 xorbucketid, xorslot;
+
+#pragma unroll
+	for (u32 i = 0; i != 3; ++i)
+	{
+		if (pos[i] >= SSM) continue;
+
+		if (pos[i] > 0)
+		{
+			// First partner, handled inline.
+			u16 p = ht[hr[i]][0];
+
+			*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]);
+
+			if (xors[1] != 0)
+			{
+				asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS));
+				xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1);
+				if (xorslot < NSLOTS)
+				{
+					slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot];
+					uint4 ttx;
+					ttx.x = xors[0];
+					ttx.y = xors[1];
+					ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+					ttx.w = 0;
+					*(uint4*)(&xs.hash[0]) = ttx;
+				}
+			}
+
+			if (pos[i] > 1)
+			{
+				// Second partner, also handled inline.
+				p = ht[hr[i]][1];
+
+				*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]);
+
+				if (xors[1] != 0)
+				{
+					asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS));
+					xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1);
+					if (xorslot < NSLOTS)
+					{
+						slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot];
+						uint4 ttx;
+						ttx.x = xors[0];
+						ttx.y = xors[1];
+						ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+						ttx.w = 0;
+						*(uint4*)(&xs.hash[0]) = ttx;
+					}
+				}
+
+				// Queue the remaining partners: (si << 16) | prev.
+				for (int k = 2; k != pos[i]; ++k)
+				{
+					u32 pindex = atomicAdd(&pairs_len, 1);
+					if (pindex >= MAXPAIRS) break;
+					u16 prev = ht[hr[i]][k];
+					pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+				}
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		// i = low 16 bits of the pair, k = high 16 bits
+		u32 i = __byte_perm(pair, 0, 0x4510);
+		u32 k = __byte_perm(pair, 0, 0x4532);
+
+		*(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]);
+
+		if (xors[1] == 0)
+			continue;
+
+		asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS));
+		xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1);
+		if (xorslot >= NSLOTS) continue;
+		slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot];
+		uint4 tt;
+		tt.x = xors[0];
+		tt.y = xors[1];
+		tt.z = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+		tt.w = 0;
+		*(uint4*)(&xs.hash[0]) = tt;
+	}
+}
+
+
+/*
+  Round 8 kernel. One thread block processes one bucket (blockIdx.x) of
+  eq->treessmall[1]; the fill count is eq->edata.nslots[7][bucketid],
+  clamped to NSLOTS.
+
+  Each thread loads up to three slots (threadid, NRESTS + threadid,
+  2 * NRESTS + threadid). Words 0..1 are cached in `lastword`; the hash-table
+  key is the RB-bit field starting at bit 8 of word 0. Colliding slots are
+  XORed (two words); a result whose word 1 is zero is discarded. The first
+  two partners of a slot are handled inline, further partners are queued in
+  `pairs` as (slot << 16) | partner and drained by all threads.
+
+  For a kept result, bexor = (xors[0] << 24) | (xors[1] >> 8) and the output
+  bucket is bexor >> (12 + 8). The stored slot in eq->treestiny[0] holds
+  xors[1] and the packed (bucketid, slot, slot) reference. Slots are reserved
+  with atomicAdd on eq->edata.nslots8[]; slots >= RB8_NSLOTS_LD are dropped.
+
+  NOTE(review): ht_len[threadid] is written unconditionally and ht_len has
+  NRESTS entries, so this presumably runs with NRESTS threads per block --
+  confirm at the launch site.
+*/
+template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS>
+__global__ void digit_8(equi<RB, SM>* eq)
+{
+	__shared__ u16 ht[NRESTS][(SSM - 1)];
+	__shared__ u32 lastword[NSLOTS][2];
+	__shared__ int ht_len[NRESTS];
+	__shared__ int pairs[MAXPAIRS];
+	__shared__ u32 pairs_len;
+	__shared__ u32 bsize_sh;
+	__shared__ u32 next_pair;
+
+	const u32 threadid = threadIdx.x;
+	const u32 bucketid = blockIdx.x;
+
+	// reset hashtable len
+	ht_len[threadid] = 0;
+	if (threadid == (NRESTS - 1))
+	{
+		next_pair = 0;
+		pairs_len = 0;
+	}
+	else if (threadid == (NRESTS - 33))
+		bsize_sh = umin(eq->edata.nslots[7][bucketid], NSLOTS);
+
+	slotsmall* buck = eq->treessmall[1][bucketid];
+
+	u32 hr[3];
+	int pos[3];
+	// SSM marks "no slot loaded"; such entries are skipped below.
+	pos[0] = pos[1] = pos[2] = SSM;
+
+	u32 si[3];
+	uint2 tt[3];
+
+	// Makes the zeroed ht_len[] and bsize_sh visible to every thread.
+	__syncthreads();
+
+	u32 bsize = bsize_sh;
+
+#pragma unroll
+	for (u32 i = 0; i != 3; ++i)
+	{
+		si[i] = i * NRESTS + threadid;
+		if (si[i] >= bsize) break;
+
+		const slotsmall* pslot1 = buck + si[i];
+
+		// get xhash
+		tt[i] = *(uint2*)(&pslot1->hash[0]);
+		*(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x);
+		// key = RB bits of word 0 starting at bit 8
+		asm("bfe.u32 %0, %1, 8, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB));
+		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
+		// Only the first SSM - 1 entries of a row are stored.
+		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
+	}
+
+	__syncthreads();
+
+	u32 xors[2];
+	u32 bexor, xorbucketid, xorslot;
+
+#pragma unroll
+	for (u32 i = 0; i != 3; ++i)
+	{
+		if (pos[i] >= SSM) continue;
+
+		if (pos[i] > 0)
+		{
+			// First partner, handled inline.
+			u16 p = ht[hr[i]][0];
+
+			*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]);
+
+			if (xors[1] != 0)
+			{
+				// bexor = (xors[0] << 24) | (xors[1] >> 8)
+				bexor = __byte_perm(xors[0], xors[1], 0x0765);
+				xorbucketid = bexor >> (12 + 8);
+				xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1);
+				if (xorslot < RB8_NSLOTS_LD)
+				{
+					slottiny &xs = eq->treestiny[0][xorbucketid][xorslot];
+					// This local `tt` shadows the per-thread tt[] array inside this scope only.
+					uint2 tt;
+					tt.x = xors[1];
+					tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+					*(uint2*)(&xs.hash[0]) = tt;
+				}
+			}
+
+			if (pos[i] > 1)
+			{
+				// Second partner, also handled inline.
+				p = ht[hr[i]][1];
+
+				*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]);
+
+				if (xors[1] != 0)
+				{
+					bexor = __byte_perm(xors[0], xors[1], 0x0765);
+					xorbucketid = bexor >> (12 + 8);
+					xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1);
+					if (xorslot < RB8_NSLOTS_LD)
+					{
+						slottiny &xs = eq->treestiny[0][xorbucketid][xorslot];
+						uint2 tt;
+						tt.x = xors[1];
+						tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+						*(uint2*)(&xs.hash[0]) = tt;
+					}
+				}
+
+				// Queue the remaining partners: (si << 16) | prev.
+				for (int k = 2; k != pos[i]; ++k)
+				{
+					u32 pindex = atomicAdd(&pairs_len, 1);
+					if (pindex >= MAXPAIRS) break;
+					u16 prev = ht[hr[i]][k];
+					pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+				}
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		// i = low 16 bits of the pair, k = high 16 bits
+		u32 i = __byte_perm(pair, 0, 0x4510);
+		u32 k = __byte_perm(pair, 0, 0x4532);
+
+		*(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]);
+
+		if (xors[1] == 0)
+			continue;
+
+		bexor = __byte_perm(xors[0], xors[1], 0x0765);
+		xorbucketid = bexor >> (12 + 8);
+		xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1);
+		if (xorslot >= RB8_NSLOTS_LD) continue;
+		slottiny &xs = eq->treestiny[0][xorbucketid][xorslot];
+		uint2 tt;
+		tt.x = xors[1];
+		tt.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+		*(uint2*)(&xs.hash[0]) = tt;
+	}
+}
+
+/*
+  The last round function is similar to the previous ones but has a different ending.
+  We use warps to process final candidates. Each warp processes one candidate.
+  The first two bidandsids (u32 of stored bucketid and two slotids) are retrieved by
+  lane 0 and lane 16, the next four bidandsids by lanes 0, 8, 16 and 24, ... until
+  all lanes in the warp have bidandsids from round 4. Next, each thread retrieves
+  16 indices. While doing so, the indices are compared using atomicExch
+  to determine whether there are duplicates (tromp's method). At the end, if no
+  duplicates are found, the candidate solution is saved (all indices). Note that this
+  dup check method is not exact, so CPU dup checking is needed afterwards.
+*/
+// Final round kernel: finds slots of eq->treestiny[0][bucketid] whose stored
+// word (hash[0]) is identical, then lets each warp expand one such pair into
+// 512 leaf indices (32 lanes x 16 indices) while rejecting candidates with
+// duplicate indices. Accepted candidates are appended to
+// eq->edata.srealcont.sols, counted by eq->edata.srealcont.nsols.
+//
+// Template parameters as used below: FCT scales the per-thread work
+// (3 * FCT slots per thread, stride 256 / FCT), DUPBITS is the number of low
+// index bits that select an entry of the duplicate table, W is the stride by
+// which a warp advances through the pair list.
+// NOTE(review): the indexing suggests 256 / FCT threads per block and W warps
+// per block -- confirm at the launch site.
+template <u32 RB, u32 SM, int SSM, u32 FCT, typename PACKER, u32 MAXPAIRS, u32 DUPBITS, u32 W>
+__global__ void digit_last_wdc(equi<RB, SM>* eq)
+{
+	// One raw shared buffer, carved up by hand:
+	//   [0 ..)                          ht_len (256 ints), later reused as pairs
+	//   [256 * 4 ..)                    lastword (one u32 per slot)
+	//   [256 * 4 + RB8_NSLOTS_LD * 4 ..) ht (u16 rows of SSM - 1 entries)
+	//   [8188]                          pairs_len
+	__shared__ u8 shared_data[8192];
+	int* ht_len = (int*)(&shared_data[0]);
+	int* pairs = ht_len;
+	u32* lastword = (u32*)(&shared_data[256 * 4]);
+	u16* ht = (u16*)(&shared_data[256 * 4 + RB8_NSLOTS_LD * 4]);
+	u32* pairs_len = (u32*)(&shared_data[8188]);
+
+	const u32 threadid = threadIdx.x;
+	const u32 bucketid = blockIdx.x;
+
+	// reset hashtable len
+#pragma unroll
+	for (u32 i = 0; i != FCT; ++i)
+		ht_len[(i * (256 / FCT)) + threadid] = 0;
+
+	if (threadid == ((256 / FCT) - 1))
+		*pairs_len = 0;
+
+	slottiny* buck = eq->treestiny[0][bucketid];
+	u32 bsize = umin(eq->edata.nslots8[bucketid], RB8_NSLOTS_LD);
+
+	u32 si[3 * FCT];
+	u32 hr[3 * FCT];
+	int pos[3 * FCT];
+	u32 lw[3 * FCT];
+	// SSM marks "no slot loaded"; such entries are skipped below.
+#pragma unroll
+	for (u32 i = 0; i != (3 * FCT); ++i)
+		pos[i] = SSM;
+
+	__syncthreads();
+
+#pragma unroll
+	for (u32 i = 0; i != (3 * FCT); ++i)
+	{
+		si[i] = i * (256 / FCT) + threadid;
+		if (si[i] >= bsize) break;
+
+		const slottiny* pslot1 = buck + si[i];
+
+		// get xhash
+		uint2 tt = *(uint2*)(&pslot1->hash[0]);
+		lw[i] = tt.x;
+		lastword[si[i]] = lw[i];
+
+		// hash-table key = 8 bits of the word starting at bit 20
+		u32 a;
+		asm("bfe.u32 %0, %1, 20, 8;" : "=r"(a) : "r"(lw[i]));
+		hr[i] = a;
+
+		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
+		if (pos[i] < (SSM - 1))
+			ht[hr[i] * (SSM - 1) + pos[i]] = si[i];
+	}
+
+	__syncthreads();
+
+	// Queue every earlier row entry whose full word equals this slot's word.
+	// From here on the ht_len storage is used as the pair list.
+#pragma unroll
+	for (u32 i = 0; i != (3 * FCT); ++i)
+	{
+		if (pos[i] >= SSM) continue;
+
+		for (int k = 0; k != pos[i]; ++k)
+		{
+			u16 prev = ht[hr[i] * (SSM - 1) + k];
+			if (lw[i] != lastword[prev]) continue;
+			u32 pindex = atomicAdd(pairs_len, 1);
+			if (pindex >= MAXPAIRS) break;
+			pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+		}
+	}
+
+	__syncthreads();
+	// NOTE(review): the limit here is a literal 64 while the queue above is
+	// bounded by MAXPAIRS; the two agree only if MAXPAIRS == 64 -- confirm.
+	u32 plen = umin(*pairs_len, 64);
+
+	// CALC_LEVEL: replace the packed reference in levels[b] by the two
+	// references stored in eq->round4bidandsids for its slot1 / slot0
+	// (written to levels[b] and levels[c]). Arguments a and d are unused.
+#define CALC_LEVEL(a, b, c, d) { \
+	u32 plvl = levels[b]; \
+	u32* bucks = eq->round4bidandsids[PACKER::get_bucketid(plvl, RB, SM)]; \
+	u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \
+	u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \
+	levels[b] = bucks[slot1]; \
+	levels[c] = bucks[slot0]; \
+				}
+
+	// CALC_LEVEL_SMALL: same expansion, but the child references are read
+	// from word d of the slots in eq->treessmall[a].
+#define CALC_LEVEL_SMALL(a, b, c, d) { \
+	u32 plvl = levels[b]; \
+	slotsmall* bucks = eq->treessmall[a][PACKER::get_bucketid(plvl, RB, SM)]; \
+	u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \
+	u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \
+	levels[b] = bucks[slot1].hash[d]; \
+	levels[c] = bucks[slot0].hash[d]; \
+				}
+
+	u32 lane = threadIdx.x & 0x1f;
+	u32 par = threadIdx.x >> 5;
+
+	// Per-warp scratch area placed after the pair list; `levels` and `susp`
+	// are the same memory used at different stages.
+	u32* levels = (u32*)&pairs[MAXPAIRS + (par << DUPBITS)];
+	u32* susp = levels;
+
+	while (par < plen)
+	{
+		int pair = pairs[par];
+		par += W;
+
+		// Lanes 0 and 16 start from the two slots of the pair (low / high
+		// 16 bits) and fetch their children from eq->treessmall[1].
+		if (lane % 16 == 0)
+		{
+			u32 plvl;
+			if (lane == 0) plvl = buck[__byte_perm(pair, 0, 0x4510)].hash[1];
+			else plvl = buck[__byte_perm(pair, 0, 0x4532)].hash[1];
+			slotsmall* bucks = eq->treessmall[1][PACKER::get_bucketid(plvl, RB, SM)];
+			u32 slot1 = PACKER::get_slot1(plvl, RB, SM);
+			u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM);
+			levels[lane] = bucks[slot1].hash[2];
+			levels[lane + 8] = bucks[slot0].hash[2];
+		}
+
+		// Each step doubles the number of lanes holding a reference.
+		// NOTE(review): no barrier separates these steps, so the code appears
+		// to rely on the lanes of a warp running in lockstep -- confirm for
+		// the targeted GPU architectures.
+		if (lane % 8 == 0)
+			CALC_LEVEL_SMALL(0, lane, lane + 4, 3);
+
+		if (lane % 4 == 0)
+			CALC_LEVEL_SMALL(2, lane, lane + 2, 3);
+
+		if (lane % 2 == 0)
+			CALC_LEVEL(0, lane, lane + 1, 4);
+
+		// The 16 leaf indices gathered by this lane.
+		u32 ind[16];
+
+		// From here each lane walks its own subtree:
+		// round3trees -> round2trees -> trees[0] -> round0trees -> hash[7].
+		u32 f1 = levels[lane];
+		const slottiny* buck_v4 = &eq->round3trees[PACKER::get_bucketid(f1, RB, SM)].treestiny[0];
+		const u32 slot1_v4 = PACKER::get_slot1(f1, RB, SM);
+		const u32 slot0_v4 = PACKER::get_slot0(f1, slot1_v4, RB, SM);
+
+		// Reset the duplicate table; the eight pairs of stores interleaved
+		// with the loads below cover 256 entries (8 x 32 lanes).
+		// NOTE(review): that matches 1 << DUPBITS only when DUPBITS == 8.
+		susp[lane] = 0xffffffff;
+		susp[32 + lane] = 0xffffffff;
+
+		// CHECK_DUP(a): store the high bits of ind[a] in the table entry
+		// selected by its low DUPBITS bits; true for the whole warp if any
+		// lane found the same high bits already stored there.
+#define CHECK_DUP(a) \
+	__any(atomicExch(&susp[(ind[a] & ((1 << DUPBITS) - 1))], (ind[a] >> DUPBITS)) == (ind[a] >> DUPBITS))
+
+		u32 f2 = buck_v4[slot1_v4].hash[1];
+		const slottiny* buck_v3_1 = &eq->round2trees[PACKER::get_bucketid(f2, RB, SM)].treestiny[0];
+		const u32 slot1_v3_1 = PACKER::get_slot1(f2, RB, SM);
+		const u32 slot0_v3_1 = PACKER::get_slot0(f2, slot1_v3_1, RB, SM);
+
+		susp[64 + lane] = 0xffffffff;
+		susp[96 + lane] = 0xffffffff;
+
+		u32 f0 = buck_v3_1[slot1_v3_1].hash[1];
+		const slot* buck_v2_1 = eq->trees[0][PACKER::get_bucketid(f0, RB, SM)];
+		const u32 slot1_v2_1 = PACKER::get_slot1(f0, RB, SM);
+		const u32 slot0_v2_1 = PACKER::get_slot0(f0, slot1_v2_1, RB, SM);
+
+		susp[128 + lane] = 0xffffffff;
+		susp[160 + lane] = 0xffffffff;
+
+		u32 f3 = buck_v2_1[slot1_v2_1].hash[6];
+		const slot* buck_fin_1 = eq->round0trees[packer_default::get_bucketid(f3, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_1 = packer_default::get_slot1(f3, 8, RB8_NSLOTS);
+		const u32 slot0_fin_1 = packer_default::get_slot0(f3, slot1_fin_1, 8, RB8_NSLOTS);
+
+		susp[192 + lane] = 0xffffffff;
+		susp[224 + lane] = 0xffffffff;
+
+		// Any detected duplicate abandons this candidate for the whole warp.
+		ind[0] = buck_fin_1[slot1_fin_1].hash[7];
+		if (CHECK_DUP(0)) continue;
+		ind[1] = buck_fin_1[slot0_fin_1].hash[7];
+		if (CHECK_DUP(1)) continue;
+
+		u32 f4 = buck_v2_1[slot0_v2_1].hash[6];
+		const slot* buck_fin_2 = eq->round0trees[packer_default::get_bucketid(f4, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_2 = packer_default::get_slot1(f4, 8, RB8_NSLOTS);
+		const u32 slot0_fin_2 = packer_default::get_slot0(f4, slot1_fin_2, 8, RB8_NSLOTS);
+
+		ind[2] = buck_fin_2[slot1_fin_2].hash[7];
+		if (CHECK_DUP(2)) continue;
+		ind[3] = buck_fin_2[slot0_fin_2].hash[7];
+		if (CHECK_DUP(3)) continue;
+
+		u32 f5 = buck_v3_1[slot0_v3_1].hash[1];
+		const slot* buck_v2_2 = eq->trees[0][PACKER::get_bucketid(f5, RB, SM)];
+		const u32 slot1_v2_2 = PACKER::get_slot1(f5, RB, SM);
+		const u32 slot0_v2_2 = PACKER::get_slot0(f5, slot1_v2_2, RB, SM);
+
+		u32 f6 = buck_v2_2[slot1_v2_2].hash[6];
+		const slot* buck_fin_3 = eq->round0trees[packer_default::get_bucketid(f6, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_3 = packer_default::get_slot1(f6, 8, RB8_NSLOTS);
+		const u32 slot0_fin_3 = packer_default::get_slot0(f6, slot1_fin_3, 8, RB8_NSLOTS);
+
+		ind[4] = buck_fin_3[slot1_fin_3].hash[7];
+		if (CHECK_DUP(4)) continue;
+		ind[5] = buck_fin_3[slot0_fin_3].hash[7];
+		if (CHECK_DUP(5)) continue;
+
+		u32 f7 = buck_v2_2[slot0_v2_2].hash[6];
+		const slot* buck_fin_4 = eq->round0trees[packer_default::get_bucketid(f7, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_4 = packer_default::get_slot1(f7, 8, RB8_NSLOTS);
+		const u32 slot0_fin_4 = packer_default::get_slot0(f7, slot1_fin_4, 8, RB8_NSLOTS);
+
+		ind[6] = buck_fin_4[slot1_fin_4].hash[7];
+		if (CHECK_DUP(6)) continue;
+		ind[7] = buck_fin_4[slot0_fin_4].hash[7];
+		if (CHECK_DUP(7)) continue;
+
+		// Second half of this lane's subtree (slot0 side of the round-3 entry).
+		u32 f8 = buck_v4[slot0_v4].hash[1];
+		const slottiny* buck_v3_2 = &eq->round2trees[PACKER::get_bucketid(f8, RB, SM)].treestiny[0];
+		const u32 slot1_v3_2 = PACKER::get_slot1(f8, RB, SM);
+		const u32 slot0_v3_2 = PACKER::get_slot0(f8, slot1_v3_2, RB, SM);
+
+		u32 f9 = buck_v3_2[slot1_v3_2].hash[1];
+		const slot* buck_v2_3 = eq->trees[0][PACKER::get_bucketid(f9, RB, SM)];
+		const u32 slot1_v2_3 = PACKER::get_slot1(f9, RB, SM);
+		const u32 slot0_v2_3 = PACKER::get_slot0(f9, slot1_v2_3, RB, SM);
+
+		u32 f10 = buck_v2_3[slot1_v2_3].hash[6];
+		const slot* buck_fin_5 = eq->round0trees[packer_default::get_bucketid(f10, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_5 = packer_default::get_slot1(f10, 8, RB8_NSLOTS);
+		const u32 slot0_fin_5 = packer_default::get_slot0(f10, slot1_fin_5, 8, RB8_NSLOTS);
+
+		ind[8] = buck_fin_5[slot1_fin_5].hash[7];
+		if (CHECK_DUP(8)) continue;
+		ind[9] = buck_fin_5[slot0_fin_5].hash[7];
+		if (CHECK_DUP(9)) continue;
+
+		u32 f11 = buck_v2_3[slot0_v2_3].hash[6];
+		const slot* buck_fin_6 = eq->round0trees[packer_default::get_bucketid(f11, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_6 = packer_default::get_slot1(f11, 8, RB8_NSLOTS);
+		const u32 slot0_fin_6 = packer_default::get_slot0(f11, slot1_fin_6, 8, RB8_NSLOTS);
+
+		ind[10] = buck_fin_6[slot1_fin_6].hash[7];
+		if (CHECK_DUP(10)) continue;
+		ind[11] = buck_fin_6[slot0_fin_6].hash[7];
+		if (CHECK_DUP(11)) continue;
+
+		u32 f12 = buck_v3_2[slot0_v3_2].hash[1];
+		const slot* buck_v2_4 = eq->trees[0][PACKER::get_bucketid(f12, RB, SM)];
+		const u32 slot1_v2_4 = PACKER::get_slot1(f12, RB, SM);
+		const u32 slot0_v2_4 = PACKER::get_slot0(f12, slot1_v2_4, RB, SM);
+
+		u32 f13 = buck_v2_4[slot1_v2_4].hash[6];
+		const slot* buck_fin_7 = eq->round0trees[packer_default::get_bucketid(f13, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_7 = packer_default::get_slot1(f13, 8, RB8_NSLOTS);
+		const u32 slot0_fin_7 = packer_default::get_slot0(f13, slot1_fin_7, 8, RB8_NSLOTS);
+
+		ind[12] = buck_fin_7[slot1_fin_7].hash[7];
+		if (CHECK_DUP(12)) continue;
+		ind[13] = buck_fin_7[slot0_fin_7].hash[7];
+		if (CHECK_DUP(13)) continue;
+
+		u32 f14 = buck_v2_4[slot0_v2_4].hash[6];
+		const slot* buck_fin_8 = eq->round0trees[packer_default::get_bucketid(f14, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_8 = packer_default::get_slot1(f14, 8, RB8_NSLOTS);
+		const u32 slot0_fin_8 = packer_default::get_slot0(f14, slot1_fin_8, 8, RB8_NSLOTS);
+
+		ind[14] = buck_fin_8[slot1_fin_8].hash[7];
+		if (CHECK_DUP(14)) continue;
+		ind[15] = buck_fin_8[slot0_fin_8].hash[7];
+		if (CHECK_DUP(15)) continue;
+
+		// Lane 0 reserves the solution slot and broadcasts it to the warp.
+		u32 soli;
+		if (lane == 0)
+		{
+			soli = atomicAdd(&eq->edata.srealcont.nsols, 1);
+		}
+		soli = __shfl(soli, 0);
+
+		if (soli < MAXREALSOLS)
+		{
+			// Each lane writes its 16 indices at offset lane * 16.
+			u32 pos = lane << 4;
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos]) = *(uint4*)(&ind[0]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 4]) = *(uint4*)(&ind[4]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 8]) = *(uint4*)(&ind[8]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 12]) = *(uint4*)(&ind[12]);
+		}
+	}
+}
+
+
+// Serializes the per-device initialization done in the eq_cuda_context constructor.
+std::mutex dev_init;
+// Per-device counter indexed by device id (8 entries). The constructor treats
+// zero as "device not initialized yet" and increments the entry; the destructor
+// of the context that initialized the device resets it to zero.
+int dev_init_done[8] = { 0 };
+
+
+// qsort-style comparator: orders the two uint32_t values pointed to by pa and
+// pb ascending and returns -1, 0 or +1.
+__host__ int compu32(const void *pa, const void *pb)
+{
+	const uint32_t lhs = *(const uint32_t *)pa;
+	const uint32_t rhs = *(const uint32_t *)pb;
+
+	if (lhs < rhs)
+		return -1;
+	if (lhs > rhs)
+		return 1;
+	return 0;
+}
+
+
+// Reports whether the 512-entry index list `prf` contains any repeated value.
+// The check runs on a sorted private copy, so the caller's array is not modified.
+__host__ bool duped(uint32_t* prf)
+{
+	enum { NUM_INDICES = 512 };
+
+	uint32_t sorted[NUM_INDICES];
+	memcpy(sorted, prf, sizeof(sorted));
+	qsort(sorted, NUM_INDICES, sizeof(sorted[0]), &compu32);
+
+	// Once sorted, any repeated value sits next to its twin.
+	bool found = false;
+	for (uint32_t k = 0; !found && k + 1 < NUM_INDICES; ++k)
+		found = sorted[k + 1] <= sorted[k];
+	return found;
+}
+
+
+// Orders two adjacent runs of `len` values: a[0..len) and the run that
+// immediately follows it. The runs are compared element by element; if the
+// first run compares greater, the runs exchange their contents from the first
+// differing position onward (earlier positions hold equal values). Otherwise
+// nothing is changed.
+__host__ void sort_pair(uint32_t *a, uint32_t len)
+{
+	uint32_t *second = a + len;
+
+	// Find the first position at which the two runs differ.
+	uint32_t k = 0;
+	while (k < len && a[k] == second[k])
+		++k;
+
+	// Identical runs, or the first run is already the smaller one.
+	if (k == len || a[k] < second[k])
+		return;
+
+	for (; k < len; ++k)
+	{
+		const uint32_t held = a[k];
+		a[k] = second[k];
+		second[k] = held;
+	}
+}
+
+
+// Initializes `ctx` as a BLAKE2b state with digest length HASHOUT and a 16-byte
+// personalization made of "ZcashPoW" followed by WN and WK (4 bytes each, copied
+// in host byte order), then feeds `header` and the nonce `nce` into it.
+__host__ void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen)
+{
+	// The "01230123" tail is a placeholder that is overwritten with N and K.
+	uchar personal[] = "ZcashPoW01230123";
+	const uint32_t n_param = WN;
+	const uint32_t k_param = WK;
+	memcpy(&personal[8], &n_param, sizeof(n_param));
+	memcpy(&personal[12], &k_param, sizeof(k_param));
+
+	blake2b_param params;
+	params.digest_length = HASHOUT;
+	params.key_length = 0;
+	params.fanout = 1;
+	params.depth = 1;
+	params.leaf_length = 0;
+	params.node_offset = 0;
+	params.node_depth = 0;
+	params.inner_length = 0;
+	memset(params.reserved, 0, sizeof(params.reserved));
+	memset(params.salt, 0, sizeof(params.salt));
+	memcpy(params.personal, (const uint8_t *)personal, 16);
+
+	blake2b_init_param(ctx, &params);
+	blake2b_update(ctx, (const uchar *)header, headerLen);
+	blake2b_update(ctx, (const uchar *)nce, nonceLen);
+}
+
+
+#ifdef WIN32
+// Function-pointer types for the CUDA driver API entry points that the
+// eq_cuda_context constructor resolves from nvcuda.dll at run time.
+typedef CUresult(CUDAAPI *dec_cuDeviceGet)(CUdevice*, int);
+typedef CUresult(CUDAAPI *dec_cuCtxCreate)(CUcontext*, unsigned int, CUdevice);
+typedef CUresult(CUDAAPI *dec_cuCtxPushCurrent)(CUcontext);
+typedef CUresult(CUDAAPI *dec_cuCtxDestroy)(CUcontext);
+
+// Resolved entry points; nullptr until the constructor has looked them up.
+dec_cuDeviceGet _cuDeviceGet = nullptr;
+dec_cuCtxCreate _cuCtxCreate = nullptr;
+dec_cuCtxPushCurrent _cuCtxPushCurrent = nullptr;
+dec_cuCtxDestroy _cuCtxDestroy = nullptr;
+#endif
+
+
+// Creates a solver context bound to CUDA device `id`.
+//
+// The first context created for a device resets the device and leaves pctx as
+// nullptr. Every later context for the same device creates and pushes its own
+// driver-API context, stored in pctx. Afterwards the device-side solver state
+// and the host-side solution buffer are allocated.
+//
+// Throws std::runtime_error when the device id does not fit dev_init_done, when
+// a driver entry point cannot be resolved (WIN32), or when an allocation fails.
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__ eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::eq_cuda_context(int id)
+	: device_id(id)
+{
+	solutions = nullptr;
+
+	// dev_init_done is a fixed-size table; reject ids that would index past it.
+	const int max_devices = (int)(sizeof(dev_init_done) / sizeof(dev_init_done[0]));
+	if (device_id < 0 || device_id >= max_devices)
+		throw std::runtime_error("CUDA: device id out of range");
+
+	{
+		// Scoped guard: the mutex is released even when one of the throws
+		// below fires, so other threads constructing contexts cannot block forever.
+		std::lock_guard<std::mutex> guard(dev_init);
+
+		if (!dev_init_done[device_id])
+		{
+			// only first thread shall init device
+			checkCudaErrors(cudaSetDevice(device_id));
+			checkCudaErrors(cudaDeviceReset());
+			checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+
+			pctx = nullptr;
+		}
+		else
+		{
+			// create new context
+			CUdevice dev;
+
+#ifdef WIN32
+			if (_cuDeviceGet == nullptr)
+			{
+				HMODULE hmod = LoadLibraryA("nvcuda.dll");
+				if (hmod == NULL)
+					throw std::runtime_error("Failed to load nvcuda.dll");
+				_cuDeviceGet = (dec_cuDeviceGet)GetProcAddress(hmod, "cuDeviceGet");
+				if (_cuDeviceGet == nullptr)
+					throw std::runtime_error("Failed to get cuDeviceGet address");
+				_cuCtxCreate = (dec_cuCtxCreate)GetProcAddress(hmod, "cuCtxCreate_v2");
+				if (_cuCtxCreate == nullptr)
+					throw std::runtime_error("Failed to get cuCtxCreate address");
+				_cuCtxPushCurrent = (dec_cuCtxPushCurrent)GetProcAddress(hmod, "cuCtxPushCurrent_v2");
+				if (_cuCtxPushCurrent == nullptr)
+					throw std::runtime_error("Failed to get cuCtxPushCurrent address");
+				_cuCtxDestroy = (dec_cuCtxDestroy)GetProcAddress(hmod, "cuCtxDestroy_v2");
+				if (_cuCtxDestroy == nullptr)
+					throw std::runtime_error("Failed to get cuCtxDestroy address");
+			}
+
+			checkCudaDriverErrors(_cuDeviceGet(&dev, device_id));
+			checkCudaDriverErrors(_cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
+			checkCudaDriverErrors(_cuCtxPushCurrent(pctx));
+#else
+			checkCudaDriverErrors(cuDeviceGet(&dev, device_id));
+			checkCudaDriverErrors(cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
+			checkCudaDriverErrors(cuCtxPushCurrent(pctx));
+#endif
+		}
+		++dev_init_done[device_id];
+	}
+
+	if (cudaMalloc((void**)&device_eq, sizeof(equi<RB, SM>)) != cudaSuccess)
+		throw std::runtime_error("CUDA: failed to alloc memory");
+
+	solutions = (scontainerreal*)malloc(sizeof(scontainerreal));
+	if (solutions == nullptr)
+	{
+		// The destructor does not run for a failed constructor, so release
+		// the device buffer here before reporting the failure.
+		cudaFree(device_eq);
+		throw std::runtime_error("CUDA: failed to alloc host memory");
+	}
+}
+
+
+// Runs one solve for the given header and nonce on the device.
+//
+// The BLAKE2b state for header+nonce is computed on the host and its h[] words
+// are uploaded, the device work area is cleared, and the digit kernels are
+// launched in order. The solution container is then copied back; each solution
+// that passes the host-side duplicate check is pair-sorted and handed to
+// `solutionf`. `hashdonef` is called at the end. If `cancelf()` returns true
+// after the digit_3 launch, the function returns at once without calling
+// `solutionf` or `hashdonef`.
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__ void eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::solve(const char *tequihash_header,
+	unsigned int tequihash_header_len,
+	const char* nonce,
+	unsigned int nonce_len,
+	std::function<bool()> cancelf,
+	std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+	std::function<void(void)> hashdonef)
+{
+	blake2b_state blake_ctx;
+	setheader(&blake_ctx, tequihash_header, tequihash_header_len, nonce, nonce_len);
+
+	// todo: improve
+	// djezo solver allows last 4 bytes of nonce to be iterated
+	// this can be used to create internal loop - calc initial blake hash only once, then load 8*8 bytes on device (blake state h)
+	// then just iterate nn++
+	// less CPU load, 1 cudaMemcpy less -> faster
+	//u32 nn = *(u32*)&nonce[28];
+	u32 nn = 0;
+
+	checkCudaErrors(cudaMemcpy(&device_eq->blake_h, &blake_ctx.h, sizeof(u64) * 8, cudaMemcpyHostToDevice));
+	checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata)));
+
+	// Grid size shared by the per-bucket kernels below.
+	const int grid = NBUCKETS;
+
+	digit_first<RB, SM, PACKER> << <NBLOCKS / FD_THREADS, FD_THREADS >> >(device_eq, nn);
+	digit_1<RB, SM, SSM, PACKER, 4 * NRESTS, 512> << <4096, 512 >> >(device_eq);
+	digit_2<RB, SM, SSM, PACKER, 4 * NRESTS, THREADS> << <grid, THREADS >> >(device_eq);
+	digit_3<RB, SM, SSM, PACKER, 4 * NRESTS, THREADS> << <grid, THREADS >> >(device_eq);
+
+	if (cancelf())
+		return;
+
+	digit_4<RB, SM, SSM, PACKER, 4 * NRESTS, THREADS> << <grid, THREADS >> >(device_eq);
+	digit_5<RB, SM, SSM, PACKER, 4 * NRESTS, THREADS> << <grid, THREADS >> >(device_eq);
+	digit_6<RB, SM, SSM - 1, PACKER, 3 * NRESTS> << <grid, NRESTS >> >(device_eq);
+	digit_7<RB, SM, SSM - 1, PACKER, 3 * NRESTS> << <grid, NRESTS >> >(device_eq);
+	digit_8<RB, SM, SSM - 1, PACKER, 3 * NRESTS> << <grid, NRESTS >> >(device_eq);
+	digit_last_wdc<RB, SM, SSM - 3, 2, PACKER, 64, 8, 4> << <4096, 256 / 2 >> >(device_eq);
+
+	checkCudaErrors(cudaMemcpy(solutions, &device_eq->edata.srealcont, (MAXREALSOLS * (512 * 4)) + 4, cudaMemcpyDeviceToHost));
+
+	//printf("nsols: %u\n", solutions->nsols);
+	//if (solutions->nsols > 9)
+	//	printf("missing sol, total: %u\n", solutions->nsols);
+
+	// nsols may exceed the number of stored solutions, hence the second bound.
+	for (u32 s = 0; (s < MAXREALSOLS) && (s < solutions->nsols); ++s)
+	{
+		uint32_t *proof = solutions->sols[s];
+
+		// remove dups on CPU (dup removal on GPU is not fully exact and can pass on some invalid solutions)
+		if (duped(proof))
+			continue;
+
+		// perform sort of pairs: run length doubles at every level
+		for (uint32_t level = 0; level < 9; ++level)
+		{
+			const uint32_t run = 1u << level;
+			for (uint32_t i = 0; i < (1u << 9); i += 2 * run)
+				sort_pair(proof + i, run);
+		}
+
+		std::vector<uint32_t> index_vector(proof, proof + PROOFSIZE);
+		solutionf(index_vector, DIGITBITS, nullptr);
+	}
+
+	hashdonef();
+}
+
+
+// Releases the host solution buffer and the device-side solver state, then
+// tears down the CUDA context: a context that created its own driver context
+// (pctx != nullptr) destroys it; the context that initialized the device resets
+// the device and clears its dev_init_done entry.
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__ eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::~eq_cuda_context()
+{
+	// free(nullptr) is a no-op, so no guard is needed.
+	free(solutions);
+
+	cudaFree(device_eq);
+
+	if (pctx)
+	{
+		// non primary thread, destroy context
+#ifdef WIN32
+		checkCudaDriverErrors(_cuCtxDestroy(pctx));
+#else
+		checkCudaDriverErrors(cuCtxDestroy(pctx));
+#endif
+	}
+	else
+	{
+		checkCudaErrors(cudaDeviceReset());
+
+		// The constructor reads and updates dev_init_done under dev_init;
+		// take the same mutex here so the reset does not race with it.
+		std::lock_guard<std::mutex> guard(dev_init);
+		dev_init_done[device_id] = 0;
+	}
+}
+
+
+// Explicit instantiations: one eq_cuda_context specialization is emitted for
+// each CONFIG_MODE_n macro that is defined; each macro supplies the template
+// argument list.
+#ifdef CONFIG_MODE_1
+template class eq_cuda_context<CONFIG_MODE_1>;
+#endif
+
+#ifdef CONFIG_MODE_2
+template class eq_cuda_context<CONFIG_MODE_2>;
+#endif
+
+#ifdef CONFIG_MODE_3
+template class eq_cuda_context<CONFIG_MODE_3>;
+#endif
diff --git a/cuda_tromp/CMakeLists.txt b/cuda_tromp/CMakeLists.txt
new file mode 100644
index 000000000..12bdc8bf4
--- /dev/null
+++ b/cuda_tromp/CMakeLists.txt
@@ -0,0 +1,57 @@
+# Builds the cuda_tromp solver. Despite the variable name, the target created
+# below is a static library, not an executable.
+set(EXECUTABLE cuda_tromp)
+
+# NOTE(review): ENABLE_CUDA is declared here but not referenced again in this file.
+option(ENABLE_CUDA "Enable the cuda build" ON)
+
+# depending on gcc version
+# ;-std=c++11 => Ubuntu 14.04 check gcc versions
+#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11)
+
+# Sources and headers that make up the library.
+file(GLOB SRC_LIST
+    cuda_tromp.cpp
+    equi_miner.cu )
+file(GLOB HEADERS
+    cuda_tromp.hpp
+    eqcuda.hpp
+    )
+
+
+#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
+
+# Flags appended to the nvcc command line for all configurations.
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo)
+
+# Preprocessor switches for the solver sources; only HIST is enabled.
+add_definitions(-DHIST)
+#add_definitions(-DXINTREE)
+#add_definitions(-DUNROLL)
+
+# Extra nvcc flag used for release builds only.
+list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
+
+
+FIND_PACKAGE(CUDA REQUIRED)
+# When COMPUTE is set to a positive value, generate code only for that compute
+# capability; otherwise generate code for the fixed list of architectures below.
+if(COMPUTE AND (COMPUTE GREATER 0))
+        LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE})
+else(COMPUTE AND (COMPUTE GREATER 0))
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};; -gencode arch=compute_20,code=sm_21; -gencode arch=compute_30,code=sm_30; -gencode arch=compute_35,code=sm_35; -gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_61,code=sm_61 )
+endif(COMPUTE AND (COMPUTE GREATER 0))
+
+include_directories(${CUDA_INCLUDE_DIRS})
+
+# NOTE(review): Threads and Boost are located here but are not linked to the
+# target in this file; presumably the headers are what is needed - confirm.
+find_package(Threads REQUIRED COMPONENTS)
+find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread)
+
+# Informational only: FIND_PACKAGE(CUDA REQUIRED) above already fails the
+# configure step when CUDA is missing.
+if(CUDA_FOUND)
+message("CUDA FOUND")
+else()
+message("CUDA NOT FOUND")
+endif()
+
+
+# NOTE(review): CUDA_INCLUDE_DIRS is added a second time here (also added above).
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(..)
+CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS})
+TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES})
+
+message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+
+# Install the library and its public headers.
+install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib )
+install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} )
diff --git a/nheqminer/AvailableSolvers.h b/nheqminer/AvailableSolvers.h
new file mode 100644
index 000000000..a071ae3b5
--- /dev/null
+++ b/nheqminer/AvailableSolvers.h
@@ -0,0 +1,97 @@
+#pragma once
+
+#include "Solver.h"
+#include "SolverStub.h"
+
+
+#ifdef USE_CPU_TROMP
+#include "../cpu_tromp/cpu_tromp.hpp"
+#else
+CREATE_SOLVER_STUB(cpu_tromp, "cpu_tromp_STUB")
+#endif
+#ifdef USE_CPU_XENONCAT
+#include "../cpu_xenoncat/cpu_xenoncat.hpp"
+#else
+CREATE_SOLVER_STUB(cpu_xenoncat, "cpu_xenoncat_STUB")
+#endif
+#ifdef USE_CUDA_TROMP
+#include "../cuda_tromp/cuda_tromp.hpp"
+#else
+CREATE_SOLVER_STUB(cuda_tromp, "cuda_tromp_STUB")
+#endif
+#ifdef USE_CUDA_DJEZO
+#include "../cuda_djezo/cuda_djezo.hpp"
+#else
+CREATE_SOLVER_STUB(cuda_djezo, "cuda_djezo_STUB")
+#endif
+// OpenCL solvers are dropped; replace with new OS solvers
+#ifdef USE_OCL_XMP
+#include "../ocl_xpm/ocl_xmp.hpp"
+#else
+CREATE_SOLVER_STUB(ocl_xmp, "ocl_xmp_STUB")
+#endif
+#ifdef USE_OCL_SILENTARMY
+#include "../ocl_silentarmy/ocl_silentarmy.hpp"
+#else
+CREATE_SOLVER_STUB(ocl_silentarmy, "ocl_silentarmy_STUB")
+#endif
+
+//namespace AvailableSolvers
+//{
+//} // AvailableSolvers
+
+// CPU solvers
+// Adapter exposing the cpu_tromp backend through the Solver interface.
+class CPUSolverTromp : public Solver<cpu_tromp> {
+public:
+	// Creates the backend context and stores `use_opt` in its use_opt field.
+	CPUSolverTromp(int use_opt)
+		: Solver<cpu_tromp>(new cpu_tromp(), SolverType::CPU)
+	{
+		this->_context->use_opt = use_opt;
+	}
+
+	virtual ~CPUSolverTromp() {}
+};
+// Adapter exposing the cpu_xenoncat backend through the Solver interface.
+class CPUSolverXenoncat : public Solver<cpu_xenoncat> {
+public:
+	// Creates the backend context and stores `use_opt` in its use_opt field.
+	CPUSolverXenoncat(int use_opt)
+		: Solver<cpu_xenoncat>(new cpu_xenoncat(), SolverType::CPU)
+	{
+		this->_context->use_opt = use_opt;
+	}
+
+	virtual ~CPUSolverXenoncat() {}
+};
+// TODO remove platform id for cuda solvers
+// CUDA solvers
+// Adapter exposing the cuda_djezo backend through the Solver interface.
+class CUDASolverDjezo : public Solver<cuda_djezo> {
+public:
+	// Creates a context for device `dev_id` (the first constructor argument is
+	// fixed to 0). `blocks` and `threadsperblock` override the context's own
+	// values only when they are positive.
+	CUDASolverDjezo(int dev_id, int blocks, int threadsperblock)
+		: Solver<cuda_djezo>(new cuda_djezo(0, dev_id), SolverType::CUDA)
+	{
+		if (blocks > 0)
+			this->_context->blocks = blocks;
+		if (threadsperblock > 0)
+			this->_context->threadsperblock = threadsperblock;
+	}
+
+	virtual ~CUDASolverDjezo() {}
+};
+// Adapter exposing the cuda_tromp backend through the Solver interface.
+class CUDASolverTromp : public Solver<cuda_tromp> {
+public:
+	// Creates a context for device `dev_id` (the first constructor argument is
+	// fixed to 0). `blocks` and `threadsperblock` override the context's own
+	// values only when they are positive.
+	CUDASolverTromp(int dev_id, int blocks, int threadsperblock)
+		: Solver<cuda_tromp>(new cuda_tromp(0, dev_id), SolverType::CUDA)
+	{
+		if (blocks > 0)
+			this->_context->blocks = blocks;
+		if (threadsperblock > 0)
+			this->_context->threadsperblock = threadsperblock;
+	}
+
+	virtual ~CUDASolverTromp() {}
+};
+// OpenCL solvers
+// Adapter exposing the ocl_silentarmy backend through the Solver interface.
+class OPENCLSolverSilentarmy : public Solver<ocl_silentarmy> {
+public:
+	// Creates a context for the given platform and device ids.
+	OPENCLSolverSilentarmy(int platf_id, int dev_id)
+		: Solver<ocl_silentarmy>(new ocl_silentarmy(platf_id, dev_id), SolverType::OPENCL)
+	{
+	}
+
+	virtual ~OPENCLSolverSilentarmy() {}
+};
+// Adapter exposing the ocl_xmp backend through the Solver interface.
+class OPENCLSolverXMP : public Solver<ocl_xmp> {
+public:
+	// Creates a context for the given platform and device ids.
+	OPENCLSolverXMP(int platf_id, int dev_id)
+		: Solver<ocl_xmp>(new ocl_xmp(platf_id, dev_id), SolverType::OPENCL)
+	{
+	}
+
+	virtual ~OPENCLSolverXMP() {}
+};
+
diff --git a/nheqminer/ISolver.h b/nheqminer/ISolver.h
new file mode 100644
index 000000000..bad815f66
--- /dev/null
+++ b/nheqminer/ISolver.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+#include <functional>
+
+// Kind of device a solver runs on. Values are ordered CPU < CUDA < OPENCL.
+enum class SolverType {
+	CPU = 0,
+	CUDA,
+	OPENCL
+};
+
+// Abstract interface implemented by every solver backend.
+class ISolver
+{
+public:
+	// Virtual so that deleting a solver through an ISolver pointer runs the
+	// derived destructor; without it such a delete is undefined behaviour.
+	virtual ~ISolver() { }
+
+	// Prepare the solver for work / release what start() set up.
+	virtual void start() = 0;
+	virtual void stop() = 0;
+
+	// Runs one solve over the given header and nonce.
+	//   cancelf   - polled by the solver; returning true asks it to stop early
+	//   solutionf - receives each solution found
+	//   hashdonef - notified when the solve has finished
+	virtual void solve(const char *tequihash_header,
+		unsigned int tequihash_header_len,
+		const char* nonce,
+		unsigned int nonce_len,
+		std::function<bool()> cancelf,
+		std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+		std::function<void(void)> hashdonef) = 0;
+
+	// Human-readable device description and solver name.
+	virtual std::string getdevinfo() = 0;
+	virtual std::string getname() = 0;
+	// Device kind this solver runs on.
+	virtual SolverType GetType() const = 0;
+};
+
diff --git a/nheqminer/MinerFactory.cpp b/nheqminer/MinerFactory.cpp
new file mode 100644
index 000000000..b8a701956
--- /dev/null
+++ b/nheqminer/MinerFactory.cpp
@@ -0,0 +1,94 @@
+#include "MinerFactory.h"
+
+#include <thread>
+
+extern int use_avx;
+extern int use_avx2;
+
+
+
+// Destroys every solver this factory still owns.
+MinerFactory::~MinerFactory()
+{
+	this->ClearAllSolvers();
+}
+
+// Creates the requested solvers and returns pointers to them in creation order:
+// one per CUDA device, opencl_t[i] (at least one) per OpenCL device, then the
+// CPU solvers. A negative `cpu_threads` selects the hardware thread count,
+// reduced by one when any GPU solver was created. The factory keeps ownership
+// of the returned solvers. Entries of `opencl_t` below 1 are raised to 1 in
+// the caller's array.
+std::vector<ISolver *> MinerFactory::GenerateSolvers(int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
+	int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) {
+	std::vector<ISolver *> created;
+
+	for (int dev = 0; dev < cuda_count; ++dev)
+		created.push_back(GenCUDASolver(cuda_en[dev], cuda_b[dev], cuda_t[dev]));
+
+	for (int dev = 0; dev < opencl_count; ++dev) {
+		if (opencl_t[dev] < 1)
+			opencl_t[dev] = 1;
+
+		// add multiple threads if wanted
+		for (int k = 0; k < opencl_t[dev]; ++k) {
+			// todo: save local&global work size, new solvers
+			created.push_back(GenOPENCLSolver(opencl_platf, opencl_en[dev]));
+		}
+	}
+
+	if (cpu_threads < 0) {
+		// Everything created so far is a GPU solver.
+		const bool gpu_present = !created.empty();
+		cpu_threads = std::thread::hardware_concurrency();
+		if (cpu_threads < 1)
+			cpu_threads = 1;
+		else if (gpu_present)
+			--cpu_threads; // decrease number of threads if there are GPU workers
+	}
+
+	for (int remaining = cpu_threads; remaining > 0; --remaining)
+		created.push_back(GenCPUSolver(use_avx2));
+
+	return created;
+}
+
+// Deletes every solver owned by the factory and empties the list.
+void MinerFactory::ClearAllSolvers() {
+	for (auto it = _solvers.begin(); it != _solvers.end(); ++it)
+		delete *it; // delete on a null pointer is a no-op
+	_solvers.clear();
+}
+
+// Creates a CPU solver, records it in the factory's list and returns it. The
+// xenoncat backend is chosen only when it is compiled in (USE_CPU_XENONCAT) and
+// enabled by _use_xenoncat; otherwise the tromp backend is used.
+ISolver * MinerFactory::GenCPUSolver(int use_opt) {
+	// TODO fix dynamic linking on Linux
+	ISolver *solver = nullptr;
+#ifdef USE_CPU_XENONCAT
+	if (_use_xenoncat)
+		solver = new CPUSolverXenoncat(use_opt);
+	else
+		solver = new CPUSolverTromp(use_opt);
+#else
+	solver = new CPUSolverTromp(use_opt);
+#endif
+	_solvers.push_back(solver);
+	return solver;
+}
+
+// Creates a CUDA solver (djezo when _use_cuda_djezo is set, tromp otherwise),
+// records it in the factory's list and returns it.
+ISolver * MinerFactory::GenCUDASolver(int dev_id, int blocks, int threadsperblock) {
+	ISolver *solver = nullptr;
+	if (_use_cuda_djezo)
+		solver = new CUDASolverDjezo(dev_id, blocks, threadsperblock);
+	else
+		solver = new CUDASolverTromp(dev_id, blocks, threadsperblock);
+
+	_solvers.push_back(solver);
+	return solver;
+}
+// no OpenCL solvers at the moment keep for future reference
+// Creates an OpenCL solver (silentarmy when _use_silentarmy is set, XMP
+// otherwise), records it in the factory's list and returns it.
+ISolver * MinerFactory::GenOPENCLSolver(int platf_id, int dev_id) {
+	ISolver *solver = nullptr;
+	if (_use_silentarmy)
+		solver = new OPENCLSolverSilentarmy(platf_id, dev_id);
+	else
+		solver = new OPENCLSolverXMP(platf_id, dev_id);
+
+	_solvers.push_back(solver);
+	return solver;
+}
diff --git a/nheqminer/MinerFactory.h b/nheqminer/MinerFactory.h
new file mode 100644
index 000000000..94c63a0d7
--- /dev/null
+++ b/nheqminer/MinerFactory.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <AvailableSolvers.h>
+
+// Creates solver objects for the configured CPU/CUDA/OpenCL backends and owns
+// them: every solver it creates is deleted by ClearAllSolvers() or by the
+// destructor.
+class MinerFactory
+{
+public:
+	// Each flag selects which of two backends is used for that device kind.
+	MinerFactory(bool use_xenoncat, bool use_cuda_djezo, bool use_silentarmy)
+		: _use_xenoncat(use_xenoncat),
+		  _use_cuda_djezo(use_cuda_djezo),
+		  _use_silentarmy(use_silentarmy)
+	{
+	}
+
+	~MinerFactory();
+
+	// Creates all requested solvers; the returned pointers stay owned by the factory.
+	std::vector<ISolver *> GenerateSolvers(int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
+		int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
+
+	// Deletes every solver created so far.
+	void ClearAllSolvers();
+
+private:
+	// Solvers created by this factory.
+	std::vector<ISolver *> _solvers;
+
+	// Backend selection flags; the constructor always overrides these defaults.
+	bool _use_xenoncat = true;
+	bool _use_cuda_djezo = true;
+	bool _use_silentarmy = true;
+
+	// Each helper creates one solver, stores it in _solvers and returns it.
+	ISolver * GenCPUSolver(int use_opt);
+	ISolver * GenCUDASolver(int dev_id, int blocks, int threadsperblock);
+	ISolver * GenOPENCLSolver(int platf_id, int dev_id);
+};
+
diff --git a/nheqminer/Solver.h b/nheqminer/Solver.h
new file mode 100644
index 000000000..7ebb8c100
--- /dev/null
+++ b/nheqminer/Solver.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "ISolver.h"
+
+// Adapts a backend that exposes static start/stop/solve functions taking the
+// context object as an argument to the virtual ISolver interface. The Solver
+// owns the context passed to its constructor and deletes it on destruction.
+template<typename StaticInterface>
+class Solver : public ISolver
+{
+protected:
+	const SolverType _type;
+	StaticInterface * const _context = nullptr;
+
+public:
+	// Takes ownership of `contex`; `type` is what GetType() reports.
+	Solver(StaticInterface *contex, SolverType type)
+		: _type(type), _context(contex)
+	{
+	}
+
+	virtual ~Solver() {
+		// the solver owns the context; delete on a null pointer is a no-op
+		delete _context;
+	}
+
+	virtual void start() override {
+		StaticInterface::start(*_context);
+	}
+
+	virtual void stop() override {
+		StaticInterface::stop(*_context);
+	}
+
+	// Forwards all arguments to the backend's static solve, appending the context.
+	virtual void solve(const char *tequihash_header,
+		unsigned int tequihash_header_len,
+		const char* nonce,
+		unsigned int nonce_len,
+		std::function<bool()> cancelf,
+		std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+		std::function<void(void)> hashdonef) override {
+		StaticInterface::solve(tequihash_header, tequihash_header_len,
+			nonce, nonce_len,
+			cancelf, solutionf, hashdonef,
+			*_context);
+	}
+
+	virtual std::string getdevinfo() override {
+		return _context->getdevinfo();
+	}
+
+	virtual std::string getname() override {
+		return _context->getname();
+	}
+
+	virtual SolverType GetType() const override {
+		return _type;
+	}
+};
\ No newline at end of file
diff --git a/nheqminer/libstratum/StratumClient.cpp b/nheqminer/libstratum/StratumClient.cpp
index 50dbb6f1f..165f1a007 100644
--- a/nheqminer/libstratum/StratumClient.cpp
+++ b/nheqminer/libstratum/StratumClient.cpp
@@ -421,13 +421,5 @@ bool StratumClient<Miner, Job, Solution>::submit(const Solution* solution, const
 	return true;
 }
 
-// XMP
-template class StratumClient<ZMinerAVXCUDA80_XMP, ZcashJob, EquihashSolution>;
-template class StratumClient<ZMinerSSE2CUDA80_XMP, ZcashJob, EquihashSolution>;
-template class StratumClient<ZMinerAVXCUDA75_XMP, ZcashJob, EquihashSolution>;
-template class StratumClient<ZMinerSSE2CUDA75_XMP, ZcashJob, EquihashSolution>;
-// Silentarmy
-template class StratumClient<ZMinerAVXCUDA80_SA, ZcashJob, EquihashSolution>;
-template class StratumClient<ZMinerSSE2CUDA80_SA, ZcashJob, EquihashSolution>;
-template class StratumClient<ZMinerAVXCUDA75_SA, ZcashJob, EquihashSolution>;
-template class StratumClient<ZMinerSSE2CUDA75_SA, ZcashJob, EquihashSolution>;
\ No newline at end of file
+// create StratumClient class
+template class StratumClient<ZcashMiner, ZcashJob, EquihashSolution>;
\ No newline at end of file
diff --git a/nheqminer/libstratum/StratumClient.h b/nheqminer/libstratum/StratumClient.h
index 373baf183..eff327812 100644
--- a/nheqminer/libstratum/StratumClient.h
+++ b/nheqminer/libstratum/StratumClient.h
@@ -162,13 +162,5 @@ class StratumClient
 };
 
 
-// XMP
-typedef StratumClient<ZMinerAVXCUDA80_XMP, ZcashJob, EquihashSolution> ZcashStratumClientAVXCUDA80_XMP;
-typedef StratumClient<ZMinerSSE2CUDA80_XMP, ZcashJob, EquihashSolution> ZcashStratumClientSSE2CUDA80_XMP;
-typedef StratumClient<ZMinerAVXCUDA75_XMP, ZcashJob, EquihashSolution> ZcashStratumClientAVXCUDA75_XMP;
-typedef StratumClient<ZMinerSSE2CUDA75_XMP, ZcashJob, EquihashSolution> ZcashStratumClientSSE2CUDA75_XMP;
-// Silentarmy
-typedef StratumClient<ZMinerAVXCUDA80_SA, ZcashJob, EquihashSolution> ZcashStratumClientAVXCUDA80_SA;
-typedef StratumClient<ZMinerSSE2CUDA80_SA, ZcashJob, EquihashSolution> ZcashStratumClientSSE2CUDA80_SA;
-typedef StratumClient<ZMinerAVXCUDA75_SA, ZcashJob, EquihashSolution> ZcashStratumClientAVXCUDA75_SA;
-typedef StratumClient<ZMinerSSE2CUDA75_SA, ZcashJob, EquihashSolution> ZcashStratumClientSSE2CUDA75_SA;
\ No newline at end of file
+// ZcashStratumClient
+typedef StratumClient<ZcashMiner, ZcashJob, EquihashSolution> ZcashStratumClient;
\ No newline at end of file
diff --git a/nheqminer/libstratum/ZcashStratum.cpp b/nheqminer/libstratum/ZcashStratum.cpp
index 6c7758c20..7eac71999 100644
--- a/nheqminer/libstratum/ZcashStratum.cpp
+++ b/nheqminer/libstratum/ZcashStratum.cpp
@@ -6,7 +6,6 @@
 #include "ZcashStratum.h"
 
 #include "utilstrencodings.h"
-//#include "trompequihash/equi_miner.h"
 #include "streams.h"
 
 #include <iostream>
@@ -96,10 +95,10 @@ std::vector<unsigned char> GetMinimalFromIndices(std::vector<eh_index> indices,
 	return ret;
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver, typename Solver>
-void static ZcashMinerThread(ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>* miner, int size, int pos, Solver& extra)
+
+void static ZcashMinerThread(ZcashMiner* miner, int size, int pos, ISolver *solver)
 {
-	BOOST_LOG_CUSTOM(info, pos) << "Starting thread #" << pos << " (" << extra.getname() << ") " << extra.getdevinfo();
+	BOOST_LOG_CUSTOM(info, pos) << "Starting thread #" << pos << " (" << solver->getname() << ") " << solver->getdevinfo();
 
     std::shared_ptr<std::mutex> m_zmt(new std::mutex);
     CBlockHeader header;
@@ -141,7 +140,7 @@ void static ZcashMinerThread(ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>* mi
 
     try {
 
-		Solver::start(extra);
+		solver->start();
 
         while (true) {
             // Wait for work
@@ -236,14 +235,13 @@ void static ZcashMinerThread(ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>* mi
 					speed.AddHash();
 				};
 
-				Solver::solve(tequihash_header,
+				solver->solve(tequihash_header,
 					tequihash_header_len,
 					(const char*)bNonce.begin(),
 					bNonce.size(),
 					cancelFun,
 					solutionFound,
-					hashDone,
-					extra);
+					hashDone);
 				
                 // Check for stop
 				if (!miner->minerThreadActive[pos])
@@ -278,18 +276,19 @@ void static ZcashMinerThread(ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>* mi
     catch (const std::runtime_error &e)
     {
 		BOOST_LOG_CUSTOM(error, pos) << e.what();
+		exit(0);
     }
 
 	try
 	{
-		Solver::stop(extra);
+		solver->stop();
 	}
 	catch (const std::runtime_error &e)
 	{
 		BOOST_LOG_CUSTOM(error, pos) << e.what();
 	}
 
-	BOOST_LOG_CUSTOM(info, pos) << "Thread #" << pos << " ended (" << extra.getname() << ")";
+	BOOST_LOG_CUSTOM(info, pos) << "Thread #" << pos << " ended (" << solver->getname() << ")";
 }
 
 ZcashJob* ZcashJob::clone() const
@@ -333,82 +332,29 @@ std::string ZcashJob::getSubmission(const EquihashSolution* solution)
     return stream.str();
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::ZcashMiner(int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, 
-	int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t)
-    : minerThreads{nullptr}
+
+ZcashMiner::ZcashMiner(const std::vector<ISolver *> &i_solvers)
+	: minerThreads{ nullptr }
 {
 	m_isActive = false;
-    nThreads = 0;
-
-    for (int i = 0; i < cuda_count; ++i)
-    {
-        CUDASolver* context = new CUDASolver(0, cuda_en[i]);
-        if (cuda_b[i] > 0)
-            context->blocks = cuda_b[i];
-        if (cuda_t[i] > 0)
-            context->threadsperblock = cuda_t[i];
-
-        cuda_contexts.push_back(context);
-    }
-    nThreads += cuda_contexts.size();
-
-
-    for (int i = 0; i < opencl_count; ++i)
-    {
-		if (opencl_t[i] < 1) opencl_t[i] = 1;
-
-		// add multiple threads if wanted
-		for (int k = 0; k < opencl_t[i]; ++k)
-		{
-			OPENCLSolver* context = new OPENCLSolver(opencl_platf, opencl_en[i]);
-			// todo: save local&global work size
-			opencl_contexts.push_back(context);
-		}
-    }
-    nThreads += opencl_contexts.size();
-
-
-
-    if (cpu_threads < 0) {
-        cpu_threads = std::thread::hardware_concurrency();
-        if (cpu_threads < 1) cpu_threads = 1;
-        else if (cuda_contexts.size() + opencl_contexts.size() > 0) --cpu_threads; // decrease number of threads if there are GPU workers
-    }
-
-
-    for (int i = 0; i < cpu_threads; ++i)
-    {
-        CPUSolver* context = new CPUSolver();
-        context->use_opt = use_avx2;
-        cpu_contexts.push_back(context);
-    }
-    nThreads += cpu_contexts.size();
-
-
-//	nThreads = cpu_contexts.size() + cuda_contexts.size() + opencl_contexts.size();
+	solvers = i_solvers;
+	nThreads = solvers.size();
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::~ZcashMiner()
+
+ZcashMiner::~ZcashMiner()
 {
     stop();
-    for (auto it = cpu_contexts.begin(); it != cpu_contexts.end(); ++it)
-        delete (*it);
-    for (auto it = cuda_contexts.begin(); it != cuda_contexts.end(); ++it)
-        delete (*it);
-	cpu_contexts.clear();
-	cuda_contexts.clear();
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-std::string ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::userAgent()
+
+std::string ZcashMiner::userAgent()
 {
-	return "equihashminer/" STANDALONE_MINER_VERSION;
+	return "nheqminer/" STANDALONE_MINER_VERSION;
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::start()
+
+void ZcashMiner::start()
 {
     if (minerThreads) {
         stop();
@@ -419,60 +365,71 @@ void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::start()
 	minerThreads = new std::thread[nThreads];
 	minerThreadActive = new bool[nThreads];
 
-
-    // start cpu threads
-    int i = 0;
-    for ( ; i < cpu_contexts.size(); ++i)
-    {
-        minerThreadActive[i] = true;
-        minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread<CPUSolver, CUDASolver, OPENCLSolver, CPUSolver>,
-            this, nThreads, i, *cpu_contexts.at(i)));
+	// sort solvers CPU, CUDA, OPENCL
+	std::sort(solvers.begin(), solvers.end(), [](const ISolver* a, const ISolver* b) { return a->GetType() < b->GetType(); });
+
+	// start solvers
+	// #1 start cpu threads
+	// #2 start CUDA threads
+	// #3 start OPENCL threads
+	for (int i = 0; i < solvers.size(); ++i) {
+		minerThreadActive[i] = true;
+		minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread, this, nThreads, i, solvers[i]));
+		if (solvers[i]->GetType() == SolverType::CPU) {
 #ifdef WIN32
-        HANDLE hThread = minerThreads[i].native_handle();
-        if (!SetThreadPriority(hThread, THREAD_PRIORITY_LOWEST))
-        {
-            BOOST_LOG_CUSTOM(warning, i) << "Failed to set low priority";
-        }
-        else
-        {
-            BOOST_LOG_CUSTOM(debug, i) << "Priority set to " << GetThreadPriority(hThread);
-        }
+			HANDLE hThread = minerThreads[i].native_handle();
+			if (!SetThreadPriority(hThread, THREAD_PRIORITY_LOWEST))
+			{
+				BOOST_LOG_CUSTOM(warning, i) << "Failed to set low priority";
+			}
+			else
+			{
+				BOOST_LOG_CUSTOM(debug, i) << "Priority set to " << GetThreadPriority(hThread);
+			}
 #else
-        // todo: linux set low priority
+			// todo: linux set low priority
 #endif
-    }
+		}
+	}
 
+    
+    
+    //for ( ; )
+    //{
+    //    
+    //}
 
 
-    // start CUDA threads
-    for (; i < (cpu_contexts.size() + cuda_contexts.size()); ++i)
-    {
-        minerThreadActive[i] = true;
-        minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread<CPUSolver, CUDASolver, OPENCLSolver, CUDASolver>,
-            this, nThreads, i, *cuda_contexts.at(i - cpu_contexts.size())));
-    }
 
+    //
+    //for (; i < (cpu_contexts.size() + cuda_contexts.size()); ++i)
+    //{
+    //    minerThreadActive[i] = true;
+    //    minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread<CPUSolver, CUDASolver, OPENCLSolver, CUDASolver>,
+    //        this, nThreads, i, *cuda_contexts.at(i - cpu_contexts.size())));
+    //}
 
 
-    // start OPENCL threads
-    for (; i < (cpu_contexts.size() + cuda_contexts.size() + opencl_contexts.size()); ++i)
-    {
-        minerThreadActive[i] = true;
-        minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread<CPUSolver, CUDASolver, OPENCLSolver, OPENCLSolver>,
-            this, nThreads, i, *opencl_contexts.at(i - cpu_contexts.size() - cuda_contexts.size())));
-    }
 
+    //
+    //for (; i < (cpu_contexts.size() + cuda_contexts.size() + opencl_contexts.size()); ++i)
+    //{
+    //    minerThreadActive[i] = true;
+    //    minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread<CPUSolver, CUDASolver, OPENCLSolver, OPENCLSolver>,
+    //        this, nThreads, i, *opencl_contexts.at(i - cpu_contexts.size() - cuda_contexts.size())));
+    //}
 
-    /*minerThreads = new boost::thread_group();
-    for (int i = 0; i < nThreads; i++) {
-        minerThreads->create_thread(boost::bind(&ZcashMinerThread, this, nThreads, i));
-    }*/
+
+    ///*minerThreads = new boost::thread_group();
+    //for (int i = 0; i < nThreads; i++) {
+    //    minerThreads->create_thread(boost::bind(&ZcashMinerThread, this, nThreads, i));
+    //}*/
 
 	speed.Reset();
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::stop()
+
+void ZcashMiner::stop()
 {
 	m_isActive = false;
 	if (minerThreads)
@@ -492,8 +449,8 @@ void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::stop()
     }*/
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::setServerNonce(const std::string& n1str)
+
+void ZcashMiner::setServerNonce(const std::string& n1str)
 {
     //auto n1str = params[1].get_str();
 	BOOST_LOG_TRIVIAL(info) << "miner | Extranonce is " << n1str;
@@ -518,8 +475,8 @@ void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::setServerNonce(const std::
     nonce2Inc <<= nonce1Bits;
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-ZcashJob* ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::parseJob(const Array& params)
+
+ZcashJob* ZcashMiner::parseJob(const Array& params)
 {
     if (params.size() < 2) {
         throw std::logic_error("Invalid job params");
@@ -571,59 +528,47 @@ ZcashJob* ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::parseJob(const Array&
     return ret;
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::setJob(ZcashJob* job)
+
+void ZcashMiner::setJob(ZcashJob* job)
 {
     NewJob(job);
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::onSolutionFound(
+
+void ZcashMiner::onSolutionFound(
         const std::function<bool(const EquihashSolution&, const std::string&)> callback)
 {
     solutionFoundCallback = callback;
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::submitSolution(const EquihashSolution& solution, const std::string& jobid)
+
+void ZcashMiner::submitSolution(const EquihashSolution& solution, const std::string& jobid)
 {
     solutionFoundCallback(solution, jobid);
 	speed.AddShare();
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::acceptedSolution(bool stale)
+
+void ZcashMiner::acceptedSolution(bool stale)
 {
 	speed.AddShareOK();
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::rejectedSolution(bool stale)
+
+void ZcashMiner::rejectedSolution(bool stale)
 {
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::failedSolution()
+
+void ZcashMiner::failedSolution()
 {
 }
 
-// XMP
-template class ZcashMiner<cpu_xenoncat, cuda_tromp, ocl_xmp>;
-template class ZcashMiner<cpu_tromp, cuda_tromp, ocl_xmp>;
-template class ZcashMiner<cpu_xenoncat, cuda_tromp_75, ocl_xmp>;
-template class ZcashMiner<cpu_tromp, cuda_tromp_75, ocl_xmp>;
-// Silentarmy
-template class ZcashMiner<cpu_xenoncat, cuda_tromp, ocl_silentarmy>;
-template class ZcashMiner<cpu_tromp, cuda_tromp, ocl_silentarmy>;
-template class ZcashMiner<cpu_xenoncat, cuda_tromp_75, ocl_silentarmy>;
-template class ZcashMiner<cpu_tromp, cuda_tromp_75, ocl_silentarmy>;
-
 std::mutex benchmark_work;
 std::vector<uint256*> benchmark_nonces;
 std::atomic_int benchmark_solutions;
 
-template <typename Solver>
-bool benchmark_solve_equihash(const CBlock& pblock, const char *tequihash_header, unsigned int tequihash_header_len, Solver& extra)
+bool benchmark_solve_equihash(const CBlock& pblock, const char *tequihash_header, unsigned int tequihash_header_len, ISolver *solver)
 {
 	benchmark_work.lock();
 	if (benchmark_nonces.empty())
@@ -658,24 +603,23 @@ bool benchmark_solve_equihash(const CBlock& pblock, const char *tequihash_header
 		++benchmark_solutions;
 	};
 
-	Solver::solve(tequihash_header,
+	solver->solve(tequihash_header,
 		tequihash_header_len,
 		(const char*)nonce->begin(),
 		nonce->size(),
 		[]() { return false; },
 		solutionFound,
-		[]() {},
-		extra);
+		[]() {}
+	);
 
 	delete nonce;
 
 	return true;
 }
 
-template <typename Solver>
-int benchmark_thread(int tid, Solver& extra)
+int benchmark_thread(int tid, ISolver *solver)
 {
-	BOOST_LOG_TRIVIAL(debug) << "Thread #" << tid << " started (" << extra.getname() << ")";
+	BOOST_LOG_TRIVIAL(debug) << "Thread #" << tid << " started (" << solver->getname() << ")";
 
 	try
 	{
@@ -687,7 +631,11 @@ int benchmark_thread(int tid, Solver& extra)
 		const char *tequihash_header = (char *)&ss[0];
 		unsigned int tequihash_header_len = ss.size();
 
-		while (benchmark_solve_equihash<Solver>(pblock, tequihash_header, tequihash_header_len, extra)) {}
+		solver->start();
+
+		while (benchmark_solve_equihash(pblock, tequihash_header, tequihash_header_len, solver)) {}
+
+		solver->stop();
 	}
 	catch (const std::runtime_error &e)
 	{
@@ -696,15 +644,12 @@ int benchmark_thread(int tid, Solver& extra)
 		return 0;
 	}
 
-	BOOST_LOG_TRIVIAL(debug) << "Thread #" << tid << " ended (" << extra.getname() << ")";
+	BOOST_LOG_TRIVIAL(debug) << "Thread #" << tid << " ended (" << solver->getname() << ")";
 
 	return 0;
 }
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
-void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-	int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t)
-{
+void Solvers_doBenchmark(int hashes, const std::vector<ISolver *> &solvers) {
 	// generate array of various nonces
 	std::srand(std::time(0));
 	benchmark_nonces.push_back(new uint256());
@@ -719,77 +664,39 @@ void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::doBenchmark(int hashes, in
 
 	size_t total_hashes = benchmark_nonces.size();
 
-	std::vector<CPUSolver*> cpu_contexts;
-	std::vector<CUDASolver*> cuda_contexts;
-	std::vector<OPENCLSolver*> opencl_contexts;
-
-	for (int i = 0; i < cuda_count; ++i)
-	{
-		CUDASolver* context = new CUDASolver(0, cuda_en[i]);
-		if (cuda_b[i] > 0)
-			context->blocks = cuda_b[i];
-		if (cuda_t[i] > 0)
-			context->threadsperblock = cuda_t[i];
-
-		BOOST_LOG_TRIVIAL(info) << "Benchmarking CUDA worker (" << context->getname() << ") " << context->getdevinfo();
-
-		CUDASolver::start(*context); // init CUDA before to get more accurate benchmark
-
-		cuda_contexts.push_back(context);
-	}
-
-	for (int i = 0; i < opencl_count; ++i)
-	{
-		if (opencl_t[i] < 1) opencl_t[i] = 1;
-
-		for (int k = 0; k < opencl_t[i]; ++k)
-		{
-			OPENCLSolver* context = new OPENCLSolver(opencl_platf, opencl_en[i]);
-
-			// todo: save local&global work size
-
-			BOOST_LOG_TRIVIAL(info) << "Benchmarking OPENCL worker (" << context->getname() << ") " << context->getdevinfo();
-
-			OPENCLSolver::start(*context); // init OPENCL before to get more accurate benchmark
-
-			opencl_contexts.push_back(context);
+	// log what is benchmarking
+	for (ISolver* solver : solvers) {
+		if (solver->GetType() == SolverType::CPU) {
+			BOOST_LOG_TRIVIAL(info) << "Benchmarking CPU worker (" << solver->getname() << ") " << solver->getdevinfo();
+		}
+		else if (solver->GetType() == SolverType::CUDA) {
+			BOOST_LOG_TRIVIAL(info) << "Benchmarking CUDA worker (" << solver->getname() << ") " << solver->getdevinfo();
+		}
+		else if (solver->GetType() == SolverType::OPENCL) {
+			BOOST_LOG_TRIVIAL(info) << "Benchmarking OPENCL worker (" << solver->getname() << ") " << solver->getdevinfo();
 		}
 	}
 
-	if (cpu_threads < 0)
-	{
-		cpu_threads = std::thread::hardware_concurrency();
-		if (cpu_threads < 1) cpu_threads = 1;
-		else if (cuda_contexts.size() + opencl_contexts.size() > 0) --cpu_threads; // decrease number of threads if there are GPU workers
-	}
-
-	for (int i = 0; i < cpu_threads; ++i)
-	{
-		CPUSolver* context = new CPUSolver();
-		context->use_opt = use_avx2;
-		BOOST_LOG_TRIVIAL(info) << "Benchmarking CPU worker (" << context->getname() << ") " << context->getdevinfo();
-		CPUSolver::start(*context);
-		cpu_contexts.push_back(context);
-	}
-
-	int nThreads = cpu_contexts.size() + cuda_contexts.size() + opencl_contexts.size();
-
+	int nThreads = solvers.size();
 	std::thread* bthreads = new std::thread[nThreads];
 
+	benchmark_work.lock();
+	// bind benchmark threads
+	for (int i = 0; i < solvers.size(); ++i) {
+		bthreads[i] = std::thread(boost::bind(&benchmark_thread, i, solvers[i]));
+    }
+#ifdef WIN32
+    // TODO get back to this sleep
+    Sleep(1000);
+#else
+    sleep(1);
+#endif
+
 	BOOST_LOG_TRIVIAL(info) << "Benchmark starting... this may take several minutes, please wait...";
 
+	benchmark_work.unlock();
 	auto start = std::chrono::high_resolution_clock::now();
 
-	int i = 0;
-	for ( ; i < cpu_contexts.size(); ++i)
-		bthreads[i] = std::thread(boost::bind(&benchmark_thread<CPUSolver>, i, *cpu_contexts.at(i)));
-
-	for (; i < (cuda_contexts.size() + cpu_contexts.size()); ++i)
-		bthreads[i] = std::thread(boost::bind(&benchmark_thread<CUDASolver>, i, *cuda_contexts.at(i - cpu_contexts.size())));
-
-	for (; i < (opencl_contexts.size() + cuda_contexts.size() + cpu_contexts.size()); ++i)
-		bthreads[i] = std::thread(boost::bind(&benchmark_thread<OPENCLSolver>, i, *opencl_contexts.at(i - cpu_contexts.size() - cuda_contexts.size())));
-
 	for (int i = 0; i < nThreads; ++i)
 		bthreads[i].join();
 
@@ -799,25 +706,6 @@ void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::doBenchmark(int hashes, in
 
 	size_t hashes_done = total_hashes - benchmark_nonces.size();
 
-    for (auto it = cpu_contexts.begin(); it != cpu_contexts.end(); ++it)
-	{
-		CPUSolver::stop(**it);
-		delete (*it);
-	}
-    for (auto it = cuda_contexts.begin(); it != cuda_contexts.end(); ++it)
-	{
-		CUDASolver::stop(**it);
-		delete (*it);
-	}
-    for (auto it = opencl_contexts.begin(); it != opencl_contexts.end(); ++it)
-	{
-		OPENCLSolver::stop(**it);
-		delete (*it);
-	}
-	cpu_contexts.clear();
-	cuda_contexts.clear();
-	opencl_contexts.clear();
-
 	BOOST_LOG_TRIVIAL(info) << "Benchmark done!";
 	BOOST_LOG_TRIVIAL(info) << "Total time : " << msec << " ms";
 	BOOST_LOG_TRIVIAL(info) << "Total iterations: " << hashes_done;
@@ -825,53 +713,3 @@ void ZcashMiner<CPUSolver, CUDASolver, OPENCLSolver>::doBenchmark(int hashes, in
 	BOOST_LOG_TRIVIAL(info) << "Speed: " << ((double)hashes_done * 1000 / (double)msec) << " I/s";
 	BOOST_LOG_TRIVIAL(info) << "Speed: " << ((double)benchmark_solutions * 1000 / (double)msec) << " Sols/s";
 }
-
-
-//void ZMinerAVX_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-//                           int opencl_count, int opencl_platf, int* opencl_en) {
-//    ZMinerAVX::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en);
-//}
-//
-//void ZMinerSSE2_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-//                            int opencl_count, int opencl_platf, int* opencl_en) {
-//    ZMinerSSE2::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en);
-//}
-
-
-
-// ocl_xmp
-// gcc static undefined reference workaround
-void ZMinerAVXCUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) {
-    ZMinerAVXCUDA80_XMP::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t);
-}
-void ZMinerSSE2CUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) {
-    ZMinerSSE2CUDA80_XMP::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t);
-}
-void ZMinerAVXCUDA75_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) {
-    ZMinerAVXCUDA75_XMP::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t);
-}
-void ZMinerSSE2CUDA75_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) {
-    ZMinerSSE2CUDA75_XMP::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t);
-}
-// ocl_silentarmy
-void ZMinerAVXCUDA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) {
-    ZMinerAVXCUDA80_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t);
-}
-void ZMinerSSE2CUDA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) {
-    ZMinerSSE2CUDA80_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t);
-}
-void ZMinerAVXCUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) {
-    ZMinerAVXCUDA75_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t);
-}
-void ZMinerSSE2CUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) {
-    ZMinerSSE2CUDA75_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t);
-}
-
diff --git a/nheqminer/libstratum/ZcashStratum.h b/nheqminer/libstratum/ZcashStratum.h
index 7b8940fbb..ad659962a 100644
--- a/nheqminer/libstratum/ZcashStratum.h
+++ b/nheqminer/libstratum/ZcashStratum.h
@@ -16,40 +16,7 @@
 
 #include "json/json_spirit_value.h"
 
-#include "SolverStub.h"
-
-#ifdef USE_CPU_TROMP
-#include "../cpu_tromp/cpu_tromp.hpp"
-#else
-CREATE_SOLVER_STUB(cpu_tromp, "cpu_tromp_STUB")
-#endif
-#ifdef USE_CPU_XENONCAT
-#include "../cpu_xenoncat/cpu_xenoncat.hpp"
-#else
-CREATE_SOLVER_STUB(cpu_xenoncat, "cpu_xenoncat_STUB")
-#endif
-#ifdef USE_CUDA_TROMP
-#include "../cuda_tromp/cuda_tromp.hpp"
-
-// TODO fix this
-#ifndef WIN32
-CREATE_SOLVER_STUB(cuda_tromp_75, "cuda_tromp_75_STUB")
-#endif
-
-#else
-CREATE_SOLVER_STUB(cuda_tromp, "cuda_tromp_STUB")
-CREATE_SOLVER_STUB(cuda_tromp_75, "cuda_tromp_75_STUB")
-#endif
-#ifdef USE_OCL_XMP
-#include "../ocl_xpm/ocl_xmp.hpp"
-#else
-CREATE_SOLVER_STUB(ocl_xmp, "ocl_xmp_STUB")
-#endif
-#ifdef USE_OCL_SILENTARMY
-#include "../ocl_silentarmy/ocl_silentarmy.hpp"
-#else
-CREATE_SOLVER_STUB(ocl_silentarmy, "ocl_silentarmy_STUB")
-#endif
+#include "ISolver.h"
 
 using namespace json_spirit;
 
@@ -108,7 +75,7 @@ inline bool operator==(const ZcashJob& a, const ZcashJob& b)
 
 typedef boost::signals2::signal<void (const ZcashJob*)> NewJob_t;
 
-template <typename CPUSolver, typename CUDASolver, typename OPENCLSolver>
+
 class ZcashMiner
 {
     int nThreads;
@@ -121,18 +88,13 @@ class ZcashMiner
     std::function<bool(const EquihashSolution&, const std::string&)> solutionFoundCallback;
 	bool m_isActive;
 
-
-	std::vector<CPUSolver*> cpu_contexts;
-	std::vector<CUDASolver*> cuda_contexts;
-	std::vector<OPENCLSolver*> opencl_contexts;
-
+	std::vector<ISolver *> solvers;
 
 public:
     NewJob_t NewJob;
 	bool* minerThreadActive;
 
-	ZcashMiner(int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-		int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
+	ZcashMiner(const std::vector<ISolver *> &i_solvers);
 	~ZcashMiner();
 
     std::string userAgent();
@@ -147,39 +109,7 @@ class ZcashMiner
     void acceptedSolution(bool stale);
     void rejectedSolution(bool stale);
     void failedSolution();
-
-    static void doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-		int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
 };
 
-// 8 combos make sure not to go beyond this
-// ocl_xmp
-typedef ZcashMiner<cpu_xenoncat, cuda_tromp, ocl_xmp> ZMinerAVXCUDA80_XMP;
-typedef ZcashMiner<cpu_tromp, cuda_tromp, ocl_xmp> ZMinerSSE2CUDA80_XMP;
-typedef ZcashMiner<cpu_xenoncat, cuda_tromp_75, ocl_xmp> ZMinerAVXCUDA75_XMP;
-typedef ZcashMiner<cpu_tromp, cuda_tromp_75, ocl_xmp> ZMinerSSE2CUDA75_XMP;
-// ocl_silentarmy
-typedef ZcashMiner<cpu_xenoncat, cuda_tromp, ocl_silentarmy> ZMinerAVXCUDA80_SA;
-typedef ZcashMiner<cpu_tromp, cuda_tromp, ocl_silentarmy> ZMinerSSE2CUDA80_SA;
-typedef ZcashMiner<cpu_xenoncat, cuda_tromp_75, ocl_silentarmy> ZMinerAVXCUDA75_SA;
-typedef ZcashMiner<cpu_tromp, cuda_tromp_75, ocl_silentarmy> ZMinerSSE2CUDA75_SA;
-
-// ocl_xmp
-// gcc static undefined reference workaround
-void ZMinerAVXCUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
-void ZMinerSSE2CUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
-void ZMinerAVXCUDA75_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
-void ZMinerSSE2CUDA75_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
-// ocl_silentarmy
-void ZMinerAVXCUDA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
-void ZMinerSSE2CUDA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
-void ZMinerAVXCUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
-void ZMinerSSE2CUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t,
-    int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t);
+void Solvers_doBenchmark(int hashes, const std::vector<ISolver *> &solvers);
+
diff --git a/nheqminer/main.cpp b/nheqminer/main.cpp
index ce818dae9..6f7a230c5 100644
--- a/nheqminer/main.cpp
+++ b/nheqminer/main.cpp
@@ -5,6 +5,8 @@
 #include "primitives/block.h"
 #include "streams.h"
 
+#include "MinerFactory.h"
+
 #include "libstratum/StratumClient.h"
 
 #if defined(USE_OCL_XMP) || defined(USE_OCL_SILENTARMY)
@@ -43,36 +45,27 @@ namespace keywords = boost::log::keywords;
 #endif
 
 // TODO:
-// fix compiler issues with standard vs2013 compiler
-// file logging
-// mingw compilation for windows (faster?)
+// #1 file logging
+// #2 mingw compilation for windows (faster?)
+// #3 benchmark accuracy fix: first wait for solvers to init and then measure speed
+// #4 Linux fix cmake to generate all in one binary (just like Windows)
+// #5 after #4 is done add solver chooser for CPU and CUDA devices (general and per device), example: [-s 0 automatic, -s 1 solver1, -s 2 solver2, ...]
 
 int use_avx = 0;
 int use_avx2 = 0;
 int use_old_cuda = 0;
 int use_old_xmp = 0;
 
-// _XMP
-static ZcashStratumClientAVXCUDA80_XMP* scSigAVXC80_XMP = nullptr;
-static ZcashStratumClientSSE2CUDA80_XMP* scSigSSE2C80_XMP = nullptr;
-static ZcashStratumClientAVXCUDA75_XMP* scSigAVXC75_XMP = nullptr;
-static ZcashStratumClientSSE2CUDA75_XMP* scSigSSE2C75_XMP = nullptr;
-// _SA
-static ZcashStratumClientAVXCUDA80_SA* scSigAVXC80_SA = nullptr;
-static ZcashStratumClientSSE2CUDA80_SA* scSigSSE2C80_SA = nullptr;
-static ZcashStratumClientAVXCUDA75_SA* scSigAVXC75_SA = nullptr;
-static ZcashStratumClientSSE2CUDA75_SA* scSigSSE2C75_SA = nullptr;
+// TODO move somewhere else
+MinerFactory *_MinerFactory = nullptr;
+
+// stratum client sig
+static ZcashStratumClient* scSig = nullptr;
 
 extern "C" void stratum_sigint_handler(int signum) 
 { 
-	if (scSigAVXC80_XMP) scSigAVXC80_XMP->disconnect();
-	if (scSigSSE2C80_XMP) scSigSSE2C80_XMP->disconnect();
-	if (scSigAVXC75_XMP) scSigAVXC75_XMP->disconnect();
-	if (scSigSSE2C75_XMP) scSigSSE2C75_XMP->disconnect();
-	if (scSigAVXC80_SA) scSigAVXC80_SA->disconnect();
-	if (scSigSSE2C80_SA) scSigSSE2C80_SA->disconnect();
-	if (scSigAVXC75_SA) scSigAVXC75_SA->disconnect();
-	if (scSigSSE2C75_SA) scSigSSE2C75_SA->disconnect();
+	if (scSig) scSig->disconnect();
+	if (_MinerFactory) _MinerFactory->ClearAllSolvers();
 }
 
 void print_help()
@@ -96,28 +89,33 @@ void print_help()
 	std::cout << std::endl;
 	std::cout << "NVIDIA CUDA settings" << std::endl;
 	std::cout << "\t-ci\t\tCUDA info" << std::endl;
-	std::cout << "\t-cv [ver]\tSet CUDA version (0 = default 8.0, 1 = 7.5)" << std::endl;
+	std::cout << "\t-cv [ver]\tSet CUDA solver (0 = djeZo, 1 = tromp)" << std::endl;
 	std::cout << "\t-cd [devices]\tEnable CUDA mining on spec. devices" << std::endl;
 	std::cout << "\t-cb [blocks]\tNumber of blocks" << std::endl;
 	std::cout << "\t-ct [tpb]\tNumber of threads per block" << std::endl;
 	std::cout << "Example: -cd 0 2 -cb 12 16 -ct 64 128" << std::endl;
 	std::cout << std::endl;
-	std::cout << "OpenCL settings" << std::endl;
-	std::cout << "\t-oi\t\tOpenCL info" << std::endl;
-	std::cout << "\t-ov [ver]\tSet OpenCL solver (0 = silentarmy, 1 = xmp)" << std::endl;
-	std::cout << "\t-op [devices]\tSet OpenCL platform to selecd platform devices (-od)" << std::endl;
-	std::cout << "\t-od [devices]\tEnable OpenCL mining on spec. devices (specify plafrom number first -op)" << std::endl;
-	std::cout << "\t-ot [threads]\tSet number of threads per device" << std::endl;
-	//std::cout << "\t-cb [blocks]\tNumber of blocks" << std::endl;
-	//std::cout << "\t-ct [tpb]\tNumber of threads per block" << std::endl;
-	std::cout << "Example: -op 2 -od 0 2" << std::endl; //-cb 12 16 -ct 64 128" << std::endl;
+	//std::cout << "OpenCL settings" << std::endl;
+	//std::cout << "\t-oi\t\tOpenCL info" << std::endl;
+	//std::cout << "\t-ov [ver]\tSet OpenCL solver (0 = silentarmy, 1 = xmp)" << std::endl;
+	//std::cout << "\t-op [platf]\tSet OpenCL platform to selecd platform devices (-od)" << std::endl;
+	//std::cout << "\t-od [devices]\tEnable OpenCL mining on spec. devices (specify plafrom number first -op)" << std::endl;
+	//std::cout << "\t-ot [threads]\tSet number of threads per device" << std::endl;
+	////std::cout << "\t-cb [blocks]\tNumber of blocks" << std::endl;
+	////std::cout << "\t-ct [tpb]\tNumber of threads per block" << std::endl;
+	//std::cout << "Example: -op 2 -od 0 2" << std::endl; //-cb 12 16 -ct 64 128" << std::endl;
 	std::cout << std::endl;
 }
 
 
 void print_cuda_info()
 {
-	int num_devices = cuda_tromp::getcount();
+#if defined(USE_CUDA_DJEZO) || defined(USE_CUDA_TROMP)
+#ifdef USE_CUDA_DJEZO
+    int num_devices = cuda_djezo::getcount();
+#elif defined(USE_CUDA_TROMP) // test definedness, like the enclosing #if, not the macro's value
+    int num_devices = cuda_tromp::getcount();
+#endif
 
 	std::cout << "Number of CUDA devices found: " << num_devices << std::endl;
 
@@ -125,9 +123,14 @@ void print_cuda_info()
 	{
 		std::string gpuname, version;
 		int smcount;
-		cuda_tromp::getinfo(0, i, gpuname, smcount, version);
+#ifdef USE_CUDA_DJEZO
+        cuda_djezo::getinfo(0, i, gpuname, smcount, version);
+#elif defined(USE_CUDA_TROMP) // test definedness, not the macro's value
+        cuda_tromp::getinfo(0, i, gpuname, smcount, version);
+#endif
 		std::cout << "\t#" << i << " " << gpuname << " | SM version: " << version << " | SM count: " << smcount << std::endl;
 	}
+#endif
 }
 
 void print_opencl_info() {
@@ -136,13 +139,14 @@ void print_opencl_info() {
 #endif
 }
 
+#define MAX_INSTANCES (8 * 2) // capacity of the cuda_*/opencl_* option arrays; parenthesized so it expands safely inside larger expressions
 
-int cuda_enabled[8] = { 0 };
-int cuda_blocks[8] = { 0 };
-int cuda_tpb[8] = { 0 };
+int cuda_enabled[MAX_INSTANCES] = { 0 };
+int cuda_blocks[MAX_INSTANCES] = { 0 };
+int cuda_tpb[MAX_INSTANCES] = { 0 };
 
-int opencl_enabled[8] = { 0 };
-int opencl_threads[8] = { 0 };
+int opencl_enabled[MAX_INSTANCES] = { 0 };
+int opencl_threads[MAX_INSTANCES] = { 0 };
 // todo: opencl local and global worksize
 
 
@@ -180,10 +184,10 @@ void detect_AVX_and_AVX2()
 	}
 }
 
-template <typename MinerType, typename StratumType>
-void start_mining(int api_port, int cpu_threads, int cuda_device_count, int opencl_device_count, int opencl_platform,
-	const std::string& host, const std::string& port, const std::string& user, const std::string& password,
-	StratumType* handler)
+
+void start_mining(int api_port, const std::string& host, const std::string& port,
+	const std::string& user, const std::string& password,
+	ZcashStratumClient* handler, const std::vector<ISolver *> &i_solvers)
 {
 	std::shared_ptr<boost::asio::io_service> io_service(new boost::asio::io_service);
 
@@ -197,9 +201,9 @@ void start_mining(int api_port, int cpu_threads, int cuda_device_count, int open
 			api = nullptr;
 		}
 	}
-
-	MinerType miner(cpu_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads);
-	StratumType sc{
+	
+	ZcashMiner miner(i_solvers);
+	ZcashStratumClient sc{
 		io_service, &miner, host, port, user, password, 0, 0
 	};
 
@@ -241,16 +245,15 @@ int main(int argc, char* argv[])
 	std::cout << "\t==================== www.nicehash.com ====================" << std::endl;
 	std::cout << "\t\tEquihash CPU&GPU Miner for NiceHash v" STANDALONE_MINER_VERSION << std::endl;
 	std::cout << "\tThanks to Zcash developers for providing base of the code." << std::endl;
-	std::cout << "\t       Special thanks to tromp, xenoncat, mbevand "<< std::endl;
-	std::cout << "\t             and eXtremal-ik7 for providing " << std::endl;
-	std::cout << "\t      optimized CPU, CUDA and AMD equihash solvers." << std::endl;
+	std::cout << "\t    Special thanks to tromp, xenoncat and djeZo for providing "<< std::endl;
+	std::cout << "\t      optimized CPU and CUDA equihash solvers." << std::endl;
 	std::cout << "\t==================== www.nicehash.com ====================" << std::endl;
 	std::cout << std::endl;
 
 	std::string location = "equihash.eu.nicehash.com:3357";
-	std::string user = "";
+	std::string user = "34HKWdzLxWBduUfJE9JxaFhoXnfC6gmePG";
 	std::string password = "x";
-	int num_threads = -1;
+	int num_threads = 0;
 	bool benchmark = false;
 	int log_level = 2;
 	int num_hashes = 200;
@@ -280,7 +283,7 @@ int main(int argc, char* argv[])
 				use_old_cuda = atoi(argv[++i]);
 				break;
 			case 'd':
-				while (cuda_device_count < 8 && i + 1 < argc)
+				while (cuda_device_count < MAX_INSTANCES && i + 1 < argc)
 				{
 					try
 					{
@@ -295,7 +298,7 @@ int main(int argc, char* argv[])
 				}
 				break;
 			case 'b':
-				while (cuda_bc < 8 && i + 1 < argc)
+				while (cuda_bc < MAX_INSTANCES && i + 1 < argc)
 				{
 					try
 					{
@@ -310,7 +313,7 @@ int main(int argc, char* argv[])
 				}
 				break;
 			case 't':
-				while (cuda_tbpc < 8 && i + 1 < argc)
+				while (cuda_tbpc < MAX_INSTANCES && i + 1 < argc)
 				{
 					try
 					{
@@ -327,53 +330,53 @@ int main(int argc, char* argv[])
 			}
 			break;
 		}
-		case 'o':
-		{
-			switch (argv[i][2])
-			{
-			case 'i':
-				print_opencl_info();
-				return 0;
-			case 'v':
-				use_old_xmp = atoi(argv[++i]);
-				break;
-			case 'p':
-				opencl_platform = std::stol(argv[++i]);
-				break;
-			case 'd':
-				while (opencl_device_count < 8 && i + 1 < argc)
-				{
-					try
-					{
-						opencl_enabled[opencl_device_count] = std::stol(argv[++i]);
-						++opencl_device_count;
-					}
-					catch (...)
-					{
-						--i;
-						break;
-					}
-				}
-				break;
-			case 't':
-				while (opencl_t < 8 && i + 1 < argc)
-				{
-					try
-					{
-						opencl_threads[opencl_t] = std::stol(argv[++i]);
-						++opencl_t;
-					}
-					catch (...)
-					{
-						--i;
-						break;
-					}
-				}
-				break;
-				// TODO extra parameters for OpenCL
-			}
-			break;
-		}
+		//case 'o':
+		//{
+		//	switch (argv[i][2])
+		//	{
+		//	case 'i':
+		//		print_opencl_info();
+		//		return 0;
+		//	case 'v':
+		//		use_old_xmp = atoi(argv[++i]);
+		//		break;
+		//	case 'p':
+		//		opencl_platform = std::stol(argv[++i]);
+		//		break;
+		//	case 'd':
+		//		while (opencl_device_count < 8 && i + 1 < argc)
+		//		{
+		//			try
+		//			{
+		//				opencl_enabled[opencl_device_count] = std::stol(argv[++i]);
+		//				++opencl_device_count;
+		//			}
+		//			catch (...)
+		//			{
+		//				--i;
+		//				break;
+		//			}
+		//		}
+		//		break;
+		//	case 't':
+		//		while (opencl_t < 8 && i + 1 < argc)
+		//		{
+		//			try
+		//			{
+		//				opencl_threads[opencl_t] = std::stol(argv[++i]);
+		//				++opencl_t;
+		//			}
+		//			catch (...)
+		//			{
+		//				--i;
+		//				break;
+		//			}
+		//		}
+		//		break;
+		//		// TODO extra parameters for OpenCL
+		//	}
+		//	break;
+		//}
 		case 'l':
 			location = argv[++i];
 			break;
@@ -445,6 +448,7 @@ int main(int argc, char* argv[])
 
 	try
 	{
+		_MinerFactory = new MinerFactory(use_avx == 1, use_old_cuda == 0, use_old_xmp == 0);
 		if (!benchmark)
 		{
 			if (user.length() == 0)
@@ -457,97 +461,14 @@ int main(int argc, char* argv[])
 			std::string host = delim != std::string::npos ? location.substr(0, delim) : location;
 			std::string port = delim != std::string::npos ? location.substr(delim + 1) : "2142";
 
-			if (use_old_xmp) {
-				if (use_avx)
-				{
-					if (use_old_cuda)
-					{
-						start_mining<ZMinerAVXCUDA75_XMP, ZcashStratumClientAVXCUDA75_XMP>(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform,
-							host, port, user, password, scSigAVXC75_XMP);
-					}
-					else
-					{
-						start_mining<ZMinerAVXCUDA80_XMP, ZcashStratumClientAVXCUDA80_XMP>(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform,
-							host, port, user, password, scSigAVXC80_XMP);
-					}
-				}
-				else
-				{
-					if (use_old_cuda)
-					{
-						start_mining<ZMinerSSE2CUDA75_XMP, ZcashStratumClientSSE2CUDA75_XMP>(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform,
-							host, port, user, password, scSigSSE2C75_XMP);
-					}
-					else
-					{
-						start_mining<ZMinerSSE2CUDA80_XMP, ZcashStratumClientSSE2CUDA80_XMP>(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform,
-							host, port, user, password, scSigSSE2C80_XMP);
-					}
-				}
-			}
-			else { // sarmy
-				if (use_avx)
-				{
-					if (use_old_cuda)
-					{
-						start_mining<ZMinerAVXCUDA75_SA, ZcashStratumClientAVXCUDA75_SA>(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform,
-							host, port, user, password, scSigAVXC75_SA);
-					}
-					else
-					{
-						start_mining<ZMinerAVXCUDA80_SA, ZcashStratumClientAVXCUDA80_SA>(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform,
-							host, port, user, password, scSigAVXC80_SA);
-					}
-				}
-				else
-				{
-					if (use_old_cuda)
-					{
-						start_mining<ZMinerSSE2CUDA75_SA, ZcashStratumClientSSE2CUDA75_SA>(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform,
-							host, port, user, password, scSigSSE2C75_SA);
-					}
-					else
-					{
-						start_mining<ZMinerSSE2CUDA80_SA, ZcashStratumClientSSE2CUDA80_SA>(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform,
-							host, port, user, password, scSigSSE2C80_SA);
-					}
-				}
-			}
+			start_mining(api_port, host, port, user, password,
+				scSig,
+				_MinerFactory->GenerateSolvers(num_threads, cuda_device_count, cuda_enabled, cuda_blocks,
+				cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads));
 		}
 		else
 		{
-			if (use_old_xmp) {
-				if (use_avx)
-				{
-					if (use_old_cuda)
-                        ZMinerAVXCUDA75_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads);
-					else
-                        ZMinerAVXCUDA80_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads);
-				}
-				else
-				{
-					if (use_old_cuda)
-                        ZMinerSSE2CUDA75_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads);
-					else
-                        ZMinerSSE2CUDA80_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads);
-				}
-			}
-			else { // sarmy
-				if (use_avx)
-				{
-					if (use_old_cuda)
-                        ZMinerAVXCUDA75_SA_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads);
-					else
-                        ZMinerAVXCUDA80_SA_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads);
-				}
-				else
-				{
-					if (use_old_cuda)
-                        ZMinerSSE2CUDA75_SA_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads);
-					else
-                        ZMinerSSE2CUDA80_SA_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads);
-				}
-			}
+			Solvers_doBenchmark(num_hashes, _MinerFactory->GenerateSolvers(num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads));
 		}
 	}
 	catch (std::runtime_error& er)
diff --git a/nheqminer/nheqminer.sln b/nheqminer/nheqminer.sln
index 0359f7bac..42fcd0e06 100644
--- a/nheqminer/nheqminer.sln
+++ b/nheqminer/nheqminer.sln
@@ -8,7 +8,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nheqminer", "nheqminer.vcxp
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD} = {299E011B-5242-4EDA-B2F2-73C9B48F12FD}
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B} = {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6} = {33C2B469-F025-4223-B9B6-E69D42FEA7D6}
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71} = {5EC9EDEB-8E49-4126-9161-1560683CBC71}
 	EndProjectSection
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_tromp", "..\cuda_tromp\cuda_tromp.vcxproj", "{33C2B469-F025-4223-B9B6-E69D42FEA7D6}"
@@ -17,130 +16,63 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cpu_xenoncat", "..\cpu_xeno
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cpu_tromp", "..\cpu_tromp\cpu_tromp.vcxproj", "{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_xpm", "..\ocl_xpm\ocl_xpm.vcxproj", "{5EC9EDEB-8E49-4126-9161-1560683CBC71}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_device_utils", "..\ocl_device_utils\ocl_device_utils.vcxproj", "{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_silentarmy", "..\ocl_silentarmy\ocl_silentarmy.vcxproj", "{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_djezo", "..\cuda_djezo\cuda_djezo.vcxproj", "{268B10AD-D845-498B-8663-AB8911CA2039}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Mixed Platforms = Debug|Mixed Platforms
 		Debug|Win32 = Debug|Win32
 		Debug|x64 = Debug|x64
-		Release|Mixed Platforms = Release|Mixed Platforms
 		Release|Win32 = Release|Win32
 		Release|x64 = Release|x64
-		ReleaseSlow|Mixed Platforms = ReleaseSlow|Mixed Platforms
 		ReleaseSlow|Win32 = ReleaseSlow|Win32
 		ReleaseSlow|x64 = ReleaseSlow|x64
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{6FF7D209-05A3-4550-93CC-211D33503719}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
-		{6FF7D209-05A3-4550-93CC-211D33503719}.Debug|Mixed Platforms.Build.0 = Debug|x64
 		{6FF7D209-05A3-4550-93CC-211D33503719}.Debug|Win32.ActiveCfg = Debug|x64
 		{6FF7D209-05A3-4550-93CC-211D33503719}.Debug|x64.ActiveCfg = Debug|x64
 		{6FF7D209-05A3-4550-93CC-211D33503719}.Debug|x64.Build.0 = Debug|x64
-		{6FF7D209-05A3-4550-93CC-211D33503719}.Release|Mixed Platforms.ActiveCfg = Release|x64
-		{6FF7D209-05A3-4550-93CC-211D33503719}.Release|Mixed Platforms.Build.0 = Release|x64
 		{6FF7D209-05A3-4550-93CC-211D33503719}.Release|Win32.ActiveCfg = Release|x64
 		{6FF7D209-05A3-4550-93CC-211D33503719}.Release|x64.ActiveCfg = Release|x64
 		{6FF7D209-05A3-4550-93CC-211D33503719}.Release|x64.Build.0 = Release|x64
-		{6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|x64
-		{6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64
 		{6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|Win32.ActiveCfg = Release|x64
 		{6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|x64.ActiveCfg = Release|x64
 		{6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|x64.Build.0 = Release|x64
-		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
-		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|Mixed Platforms.Build.0 = Debug|x64
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|Win32.ActiveCfg = Debug|x64
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|x64.ActiveCfg = Debug|x64
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|x64.Build.0 = Debug|x64
-		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|Mixed Platforms.ActiveCfg = Release|x64
-		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|Mixed Platforms.Build.0 = Release|x64
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|Win32.ActiveCfg = Release|x64
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|x64.ActiveCfg = Release|x64
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|x64.Build.0 = Release|x64
-		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release7.5|x64
-		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|Mixed Platforms.Build.0 = Release7.5|x64
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|Win32.ActiveCfg = Release7.5|x64
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|x64.ActiveCfg = Release7.5|x64
 		{33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|x64.Build.0 = Release7.5|x64
-		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
-		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|Mixed Platforms.Build.0 = Debug|x64
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|Win32.ActiveCfg = Debug|x64
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|x64.ActiveCfg = Debug|x64
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|x64.Build.0 = Debug|x64
-		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|Mixed Platforms.ActiveCfg = Release|x64
-		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|Mixed Platforms.Build.0 = Release|x64
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|Win32.ActiveCfg = Release|x64
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|x64.ActiveCfg = Release|x64
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|x64.Build.0 = Release|x64
-		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|x64
-		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|Win32.ActiveCfg = Release|x64
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|x64.ActiveCfg = Release|x64
 		{299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|x64.Build.0 = Release|x64
-		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
-		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|Mixed Platforms.Build.0 = Debug|x64
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|Win32.ActiveCfg = Debug|x64
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|x64.ActiveCfg = Debug|x64
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|x64.Build.0 = Debug|x64
-		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|Mixed Platforms.ActiveCfg = Release|x64
-		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|Mixed Platforms.Build.0 = Release|x64
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|Win32.ActiveCfg = Release|x64
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|x64.ActiveCfg = Release|x64
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|x64.Build.0 = Release|x64
-		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|Mixed Platforms.ActiveCfg = ReleaseSSE2|x64
-		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|Mixed Platforms.Build.0 = ReleaseSSE2|x64
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|Win32.ActiveCfg = ReleaseSSE2|x64
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|x64.ActiveCfg = ReleaseSSE2|x64
 		{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|x64.Build.0 = ReleaseSSE2|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|Mixed Platforms.Build.0 = Debug|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|Win32.ActiveCfg = Debug|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|x64.ActiveCfg = Debug|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|x64.Build.0 = Debug|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|Mixed Platforms.ActiveCfg = Release|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|Mixed Platforms.Build.0 = Release|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|Win32.ActiveCfg = Release|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|x64.ActiveCfg = Release|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|x64.Build.0 = Release|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|Win32.ActiveCfg = Release|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|x64.ActiveCfg = Release|x64
-		{5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|x64.Build.0 = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|Mixed Platforms.Build.0 = Debug|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|Win32.ActiveCfg = Debug|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|x64.ActiveCfg = Debug|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|x64.Build.0 = Debug|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|Mixed Platforms.ActiveCfg = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|Mixed Platforms.Build.0 = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|Win32.ActiveCfg = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|x64.ActiveCfg = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|x64.Build.0 = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|Win32.ActiveCfg = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|x64.ActiveCfg = Release|x64
-		{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|x64.Build.0 = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|Mixed Platforms.Build.0 = Debug|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|Win32.ActiveCfg = Debug|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|x64.ActiveCfg = Debug|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|x64.Build.0 = Debug|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|Mixed Platforms.ActiveCfg = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|Mixed Platforms.Build.0 = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|Win32.ActiveCfg = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|x64.ActiveCfg = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|x64.Build.0 = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|Win32.ActiveCfg = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|x64.ActiveCfg = Release|x64
-		{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|x64.Build.0 = Release|x64
+		{268B10AD-D845-498B-8663-AB8911CA2039}.Debug|Win32.ActiveCfg = Debug|x64
+		{268B10AD-D845-498B-8663-AB8911CA2039}.Debug|x64.ActiveCfg = Debug|x64
+		{268B10AD-D845-498B-8663-AB8911CA2039}.Debug|x64.Build.0 = Debug|x64
+		{268B10AD-D845-498B-8663-AB8911CA2039}.Release|Win32.ActiveCfg = Release|x64
+		{268B10AD-D845-498B-8663-AB8911CA2039}.Release|x64.ActiveCfg = Release|x64
+		{268B10AD-D845-498B-8663-AB8911CA2039}.Release|x64.Build.0 = Release|x64
+		{268B10AD-D845-498B-8663-AB8911CA2039}.ReleaseSlow|Win32.ActiveCfg = Release|x64
+		{268B10AD-D845-498B-8663-AB8911CA2039}.ReleaseSlow|x64.ActiveCfg = Release|x64
+		{268B10AD-D845-498B-8663-AB8911CA2039}.ReleaseSlow|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/nheqminer/nheqminer.vcxproj b/nheqminer/nheqminer.vcxproj
index 032382760..f37bb361f 100644
--- a/nheqminer/nheqminer.vcxproj
+++ b/nheqminer/nheqminer.vcxproj
@@ -84,7 +84,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_OCL_XMP;USE_OCL_SILENTARMY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_CUDA_DJEZO;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
       <AdditionalOptions>-D_WIN32_WINNT=0x0601 %(AdditionalOptions)</AdditionalOptions>
       <DisableSpecificWarnings>4068;4996;4503;4267;4180;4290;4244;4800;4334;4251</DisableSpecificWarnings>
@@ -96,8 +96,8 @@
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>cuda_tromp.lib;cuda_tromp_75.lib;cpu_xenoncat.lib;cpu_tromp_SSE2.lib;cpu_tromp_AVX.lib;ocl_device_utils.lib;ocl_xpm.lib;ocl_silentarmy.lib;OpenCL.lib</AdditionalDependencies>
-      <AdditionalLibraryDirectories>.\trompequihash\pthreads\x64;..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>cuda_djezo.lib;cuda_tromp.lib;cuda_tromp_75.lib;cpu_xenoncat.lib;cpu_tromp_SSE2.lib;cpu_tromp_AVX.lib</AdditionalDependencies>
+      <AdditionalLibraryDirectories>.\trompequihash\pthreads\x64;..\3rdparty\libs\win64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
       <LargeAddressAware>
       </LargeAddressAware>
     </Link>
@@ -105,11 +105,13 @@
   <ItemGroup>
     <ClInclude Include="..\cpu_tromp\cpu_tromp.hpp" />
     <ClInclude Include="..\cpu_xenoncat\cpu_xenoncat.hpp" />
+    <ClInclude Include="..\cuda_djezo\cuda_djezo.hpp" />
     <ClInclude Include="..\cuda_tromp\cuda_tromp.hpp" />
     <ClInclude Include="..\ocl_xpm\ocl_xmp.hpp" />
     <ClInclude Include="amount.h" />
     <ClInclude Include="api.hpp" />
     <ClInclude Include="arith_uint256.h" />
+    <ClInclude Include="AvailableSolvers.h" />
     <ClInclude Include="compat\byteswap.h" />
     <ClInclude Include="compat\endian.h" />
     <ClInclude Include="crypto\common.h" />
@@ -126,10 +128,13 @@
     <ClInclude Include="json\json_spirit_writer_template.h" />
     <ClInclude Include="libstratum\StratumClient.h" />
     <ClInclude Include="libstratum\ZcashStratum.h" />
+    <ClInclude Include="MinerFactory.h" />
     <ClInclude Include="primitives\block.h" />
     <ClInclude Include="primitives\transaction.h" />
     <ClInclude Include="script\script.h" />
     <ClInclude Include="serialize.h" />
+    <ClInclude Include="Solver.h" />
+    <ClInclude Include="ISolver.h" />
     <ClInclude Include="SolverStub.h" />
     <ClInclude Include="speed.hpp" />
     <ClInclude Include="streams.h" />
@@ -158,6 +163,7 @@
     <ClCompile Include="libstratum\StratumClient.cpp" />
     <ClCompile Include="libstratum\ZcashStratum.cpp" />
     <ClCompile Include="main.cpp" />
+    <ClCompile Include="MinerFactory.cpp" />
     <ClCompile Include="primitives\block.cpp" />
     <ClCompile Include="speed.cpp" />
     <ClCompile Include="uint256.cpp" />
diff --git a/nheqminer/nheqminer.vcxproj.filters b/nheqminer/nheqminer.vcxproj.filters
index 80dca38c7..92d63fe46 100644
--- a/nheqminer/nheqminer.vcxproj.filters
+++ b/nheqminer/nheqminer.vcxproj.filters
@@ -167,6 +167,21 @@
     <ClInclude Include="SolverStub.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\cuda_djezo\cuda_djezo.hpp">
+      <Filter>Header Files\solvers</Filter>
+    </ClInclude>
+    <ClInclude Include="Solver.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ISolver.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="AvailableSolvers.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="MinerFactory.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="main.cpp">
@@ -211,5 +226,8 @@
     <ClCompile Include="utilstrencodings.cpp">
       <Filter>Source Files\stuff</Filter>
     </ClCompile>
+    <ClCompile Include="MinerFactory.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/nheqminer/speed.hpp b/nheqminer/speed.hpp
index b758a1a2e..beeea05fd 100644
--- a/nheqminer/speed.hpp
+++ b/nheqminer/speed.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#define INTERVAL_SECONDS 300 // 5 minutes
+#define INTERVAL_SECONDS 15 // 15 seconds
 
 class Speed
 {
diff --git a/nheqminer/version.h b/nheqminer/version.h
index 716eb2bae..af4c23e33 100644
--- a/nheqminer/version.h
+++ b/nheqminer/version.h
@@ -34,7 +34,7 @@ static const int BIP0031_VERSION = 60000;
 //! "mempool" command, enhanced "getdata" behavior starts with this version
 static const int MEMPOOL_GD_VERSION = 60002;
 
-#define STANDALONE_MINER_VERSION	"0.4b"
+#define STANDALONE_MINER_VERSION	"0.5c"
 
 // uncomment to use with ZCash address
 //#define ZCASH_POOL
diff --git a/ocl_device_utils/OpenCLDevice.h b/ocl_device_utils/OpenCLDevice.h
deleted file mode 100644
index ab0f5f437..000000000
--- a/ocl_device_utils/OpenCLDevice.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-#include <string>
-
-// This will list OpenCL devices, but AMD will only have aditional BusID
-struct OpenCLDevice {
-	unsigned int DeviceID;
-	std::string _CL_DEVICE_NAME;
-	std::string _CL_DEVICE_TYPE;
-	unsigned long long _CL_DEVICE_GLOBAL_MEM_SIZE;
-	std::string _CL_DEVICE_VENDOR;
-	std::string _CL_DEVICE_VERSION;
-	std::string _CL_DRIVER_VERSION;
-};
-
diff --git a/ocl_device_utils/cl_ext.hpp b/ocl_device_utils/cl_ext.hpp
deleted file mode 100644
index 507598171..000000000
--- a/ocl_device_utils/cl_ext.hpp
+++ /dev/null
@@ -1,12355 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2008-2013 The Khronos Group Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and/or associated documentation files (the
-* "Materials"), to deal in the Materials without restriction, including
-* without limitation the rights to use, copy, modify, merge, publish,
-* distribute, sublicense, and/or sell copies of the Materials, and to
-* permit persons to whom the Materials are furnished to do so, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Materials.
-*
-* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
-******************************************************************************/
-
-/*! \file
-*
-*   \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and
-*       OpenCL 1.2 (rev 15)
-*   \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
-*
-*   Additions and fixes from:
-*       Brian Cole, March 3rd 2010 and April 2012
-*       Matt Gruenke, April 2012.
-*       Bruce Merry, February 2013.
-*
-*   \version 1.2.5
-*   \date June 2013
-*
-*   Optional extension support
-*
-*         cl
-*         cl_ext_device_fission
-*				#define USE_CL_DEVICE_FISSION
-*/
-
-/*! \mainpage
-* \section intro Introduction
-* For many large applications C++ is the language of choice and so it seems
-* reasonable to define C++ bindings for OpenCL.
-*
-*
-* The interface is contained with a single C++ header file \em cl.hpp and all
-* definitions are contained within the namespace \em cl. There is no additional
-* requirement to include \em cl.h and to use either the C++ or original C
-* bindings it is enough to simply include \em cl.hpp.
-*
-* The bindings themselves are lightweight and correspond closely to the
-* underlying C API. Using the C++ bindings introduces no additional execution
-* overhead.
-*
-* For detail documentation on the bindings see:
-*
-* The OpenCL C++ Wrapper API 1.2 (revision 09)
-*  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
-*
-* \section example Example
-*
-* The following example shows a general use case for the C++
-* bindings, including support for the optional exception feature and
-* also the supplied vector and string classes, see following sections for
-* decriptions of these features.
-*
-* \code
-* #define __CL_ENABLE_EXCEPTIONS
-*
-* #if defined(__APPLE__) || defined(__MACOSX)
-* #include <OpenCL/cl.hpp>
-* #else
-* #include <CL/cl.hpp>
-* #endif
-* #include <cstdio>
-* #include <cstdlib>
-* #include <iostream>
-*
-*  const char * helloStr  = "__kernel void "
-*                           "hello(void) "
-*                           "{ "
-*                           "  "
-*                           "} ";
-*
-*  int
-*  main(void)
-*  {
-*     cl_int err = CL_SUCCESS;
-*     try {
-*
-*       std::vector<cl::Platform> platforms;
-*       cl::Platform::get(&platforms);
-*       if (platforms.size() == 0) {
-*           std::cout << "Platform size 0\n";
-*           return -1;
-*       }
-*
-*       cl_context_properties properties[] =
-*          { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
-*       cl::Context context(CL_DEVICE_TYPE_CPU, properties);
-*
-*       std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
-*
-*       cl::Program::Sources source(1,
-*           std::make_pair(helloStr,strlen(helloStr)));
-*       cl::Program program_ = cl::Program(context, source);
-*       program_.build(devices);
-*
-*       cl::Kernel kernel(program_, "hello", &err);
-*
-*       cl::Event event;
-*       cl::CommandQueue queue(context, devices[0], 0, &err);
-*       queue.enqueueNDRangeKernel(
-*           kernel,
-*           cl::NullRange,
-*           cl::NDRange(4,4),
-*           cl::NullRange,
-*           NULL,
-*           &event);
-*
-*       event.wait();
-*     }
-*     catch (cl::Error err) {
-*        std::cerr
-*           << "ERROR: "
-*           << err.what()
-*           << "("
-*           << err.err()
-*           << ")"
-*           << std::endl;
-*     }
-*
-*    return EXIT_SUCCESS;
-*  }
-*
-* \endcode
-*
-*/
-#ifndef CL_HPP_
-#define CL_HPP_
-
-#ifdef _WIN32
-
-#include <windows.h>
-#include <malloc.h>
-#include <iterator>
-#include <intrin.h>
-
-#if defined(__CL_ENABLE_EXCEPTIONS)
-#include <exception>
-#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
-
-#pragma push_macro("max")
-#undef max
-#if defined(USE_DX_INTEROP)
-#include <CL/cl_d3d10.h>
-#include <CL/cl_dx9_media_sharing.h>
-#endif
-#endif // _WIN32
-
-// 
-#if defined(USE_CL_DEVICE_FISSION)
-#include <CL/cl_ext.h> // AMD topology not needed here
-#endif
-
-#if defined(__APPLE__) || defined(__MACOSX)
-#include <OpenGL/OpenGL.h>
-#include <OpenCL/opencl.h>
-#include <libkern/OSAtomic.h>
-#else
-#include <GL/gl.h>
-#include <CL/opencl.h>
-#endif // !__APPLE__
-
-// To avoid accidentally taking ownership of core OpenCL types
-// such as cl_kernel constructors are made explicit
-// under OpenCL 1.2
-#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-#define __CL_EXPLICIT_CONSTRUCTORS explicit
-#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-#define __CL_EXPLICIT_CONSTRUCTORS 
-#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-
-// Define deprecated prefixes and suffixes to ensure compilation
-// in case they are not pre-defined
-#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED  
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-
-#if !defined(CL_CALLBACK)
-#define CL_CALLBACK
-#endif //CL_CALLBACK
-
-#include <utility>
-#include <limits>
-
-#if !defined(__NO_STD_VECTOR)
-#include <vector>
-#endif
-
-#if !defined(__NO_STD_STRING)
-#include <string>
-#endif 
-
-#if defined(__linux__) || defined(__APPLE__) || defined(__MACOSX)
-#include <alloca.h>
-
-#include <emmintrin.h>
-#include <xmmintrin.h>
-#endif // __linux__
-
-#include <cstring>
-
-
-/*! \namespace cl
-*
-* \brief The OpenCL C++ bindings are defined within this namespace.
-*
-*/
-namespace cl {
-
-	class Memory;
-
-	/**
-	* Deprecated APIs for 1.2
-	*/
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
-#define __INIT_CL_EXT_FCN_PTR(name) \
-    if(!pfn_##name) { \
-        pfn_##name = (PFN_##name) \
-            clGetExtensionFunctionAddress(#name); \
-        if(!pfn_##name) { \
-		        } \
-	    }
-#endif // #if defined(CL_VERSION_1_1)
-
-#if defined(CL_VERSION_1_2)
-#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
-    if(!pfn_##name) { \
-        pfn_##name = (PFN_##name) \
-            clGetExtensionFunctionAddressForPlatform(platform, #name); \
-        if(!pfn_##name) { \
-		        } \
-	    }
-#endif // #if defined(CL_VERSION_1_1)
-
-	class Program;
-	class Device;
-	class Context;
-	class CommandQueue;
-	class Memory;
-	class Buffer;
-
-#if defined(__CL_ENABLE_EXCEPTIONS)
-	/*! \brief Exception class
-	*
-	*  This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
-	*/
-	class Error : public std::exception
-	{
-	private:
-		cl_int err_;
-		const char * errStr_;
-	public:
-		/*! \brief Create a new CL error exception for a given error code
-		*  and corresponding message.
-		*
-		*  \param err error code value.
-		*
-		*  \param errStr a descriptive string that must remain in scope until
-		*                handling of the exception has concluded.  If set, it
-		*                will be returned by what().
-		*/
-		Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
-		{}
-
-		~Error() throw() {}
-
-		/*! \brief Get error string associated with exception
-		*
-		* \return A memory pointer to the error message string.
-		*/
-		virtual const char * what() const throw ()
-		{
-			if (errStr_ == NULL) {
-				return "empty";
-			}
-			else {
-				return errStr_;
-			}
-		}
-
-		/*! \brief Get error code associated with exception
-		*
-		*  \return The error code.
-		*/
-		cl_int err(void) const { return err_; }
-	};
-
-#define __ERR_STR(x) #x
-#else
-#define __ERR_STR(x) NULL
-#endif // __CL_ENABLE_EXCEPTIONS
-
-
-	namespace detail
-	{
-#if defined(__CL_ENABLE_EXCEPTIONS)
-		static inline cl_int errHandler(
-			cl_int err,
-			const char * errStr = NULL)
-		{
-			if (err != CL_SUCCESS) {
-				throw Error(err, errStr);
-			}
-			return err;
-		}
-#else
-		static inline cl_int errHandler(cl_int err, const char * errStr = NULL)
-		{
-			(void)errStr; // suppress unused variable warning
-			return err;
-		}
-#endif // __CL_ENABLE_EXCEPTIONS
-	}
-
-
-
-	//! \cond DOXYGEN_DETAIL
-#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
-#define __GET_DEVICE_INFO_ERR               __ERR_STR(clGetDeviceInfo)
-#define __GET_PLATFORM_INFO_ERR             __ERR_STR(clGetPlatformInfo)
-#define __GET_DEVICE_IDS_ERR                __ERR_STR(clGetDeviceIDs)
-#define __GET_PLATFORM_IDS_ERR              __ERR_STR(clGetPlatformIDs)
-#define __GET_CONTEXT_INFO_ERR              __ERR_STR(clGetContextInfo)
-#define __GET_EVENT_INFO_ERR                __ERR_STR(clGetEventInfo)
-#define __GET_EVENT_PROFILE_INFO_ERR        __ERR_STR(clGetEventProfileInfo)
-#define __GET_MEM_OBJECT_INFO_ERR           __ERR_STR(clGetMemObjectInfo)
-#define __GET_IMAGE_INFO_ERR                __ERR_STR(clGetImageInfo)
-#define __GET_SAMPLER_INFO_ERR              __ERR_STR(clGetSamplerInfo)
-#define __GET_KERNEL_INFO_ERR               __ERR_STR(clGetKernelInfo)
-#if defined(CL_VERSION_1_2)
-#define __GET_KERNEL_ARG_INFO_ERR               __ERR_STR(clGetKernelArgInfo)
-#endif // #if defined(CL_VERSION_1_2)
-#define __GET_KERNEL_WORK_GROUP_INFO_ERR    __ERR_STR(clGetKernelWorkGroupInfo)
-#define __GET_PROGRAM_INFO_ERR              __ERR_STR(clGetProgramInfo)
-#define __GET_PROGRAM_BUILD_INFO_ERR        __ERR_STR(clGetProgramBuildInfo)
-#define __GET_COMMAND_QUEUE_INFO_ERR        __ERR_STR(clGetCommandQueueInfo)
-
-#define __CREATE_CONTEXT_ERR                __ERR_STR(clCreateContext)
-#define __CREATE_CONTEXT_FROM_TYPE_ERR      __ERR_STR(clCreateContextFromType)
-#define __GET_SUPPORTED_IMAGE_FORMATS_ERR   __ERR_STR(clGetSupportedImageFormats)
-
-#define __CREATE_BUFFER_ERR                 __ERR_STR(clCreateBuffer)
-#define __COPY_ERR                          __ERR_STR(cl::copy)
-#define __CREATE_SUBBUFFER_ERR              __ERR_STR(clCreateSubBuffer)
-#define __CREATE_GL_BUFFER_ERR              __ERR_STR(clCreateFromGLBuffer)
-#define __CREATE_GL_RENDER_BUFFER_ERR       __ERR_STR(clCreateFromGLBuffer)
-#define __GET_GL_OBJECT_INFO_ERR            __ERR_STR(clGetGLObjectInfo)
-#if defined(CL_VERSION_1_2)
-#define __CREATE_IMAGE_ERR                  __ERR_STR(clCreateImage)
-#define __CREATE_GL_TEXTURE_ERR             __ERR_STR(clCreateFromGLTexture)
-#define __IMAGE_DIMENSION_ERR               __ERR_STR(Incorrect image dimensions)
-#endif // #if defined(CL_VERSION_1_2)
-#define __CREATE_SAMPLER_ERR                __ERR_STR(clCreateSampler)
-#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
-
-#define __CREATE_USER_EVENT_ERR             __ERR_STR(clCreateUserEvent)
-#define __SET_USER_EVENT_STATUS_ERR         __ERR_STR(clSetUserEventStatus)
-#define __SET_EVENT_CALLBACK_ERR            __ERR_STR(clSetEventCallback)
-#define __WAIT_FOR_EVENTS_ERR               __ERR_STR(clWaitForEvents)
-
-#define __CREATE_KERNEL_ERR                 __ERR_STR(clCreateKernel)
-#define __SET_KERNEL_ARGS_ERR               __ERR_STR(clSetKernelArg)
-#define __CREATE_PROGRAM_WITH_SOURCE_ERR    __ERR_STR(clCreateProgramWithSource)
-#define __CREATE_PROGRAM_WITH_BINARY_ERR    __ERR_STR(clCreateProgramWithBinary)
-#if defined(CL_VERSION_1_2)
-#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR    __ERR_STR(clCreateProgramWithBuiltInKernels)
-#endif // #if defined(CL_VERSION_1_2)
-#define __BUILD_PROGRAM_ERR                 __ERR_STR(clBuildProgram)
-#if defined(CL_VERSION_1_2)
-#define __COMPILE_PROGRAM_ERR                  __ERR_STR(clCompileProgram)
-
-#endif // #if defined(CL_VERSION_1_2)
-#define __CREATE_KERNELS_IN_PROGRAM_ERR     __ERR_STR(clCreateKernelsInProgram)
-
-#define __CREATE_COMMAND_QUEUE_ERR          __ERR_STR(clCreateCommandQueue)
-#define __SET_COMMAND_QUEUE_PROPERTY_ERR    __ERR_STR(clSetCommandQueueProperty)
-#define __ENQUEUE_READ_BUFFER_ERR           __ERR_STR(clEnqueueReadBuffer)
-#define __ENQUEUE_READ_BUFFER_RECT_ERR      __ERR_STR(clEnqueueReadBufferRect)
-#define __ENQUEUE_WRITE_BUFFER_ERR          __ERR_STR(clEnqueueWriteBuffer)
-#define __ENQUEUE_WRITE_BUFFER_RECT_ERR     __ERR_STR(clEnqueueWriteBufferRect)
-#define __ENQEUE_COPY_BUFFER_ERR            __ERR_STR(clEnqueueCopyBuffer)
-#define __ENQEUE_COPY_BUFFER_RECT_ERR       __ERR_STR(clEnqueueCopyBufferRect)
-#define __ENQUEUE_FILL_BUFFER_ERR           __ERR_STR(clEnqueueFillBuffer)
-#define __ENQUEUE_READ_IMAGE_ERR            __ERR_STR(clEnqueueReadImage)
-#define __ENQUEUE_WRITE_IMAGE_ERR           __ERR_STR(clEnqueueWriteImage)
-#define __ENQUEUE_COPY_IMAGE_ERR            __ERR_STR(clEnqueueCopyImage)
-#define __ENQUEUE_FILL_IMAGE_ERR           __ERR_STR(clEnqueueFillImage)
-#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  __ERR_STR(clEnqueueCopyImageToBuffer)
-#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  __ERR_STR(clEnqueueCopyBufferToImage)
-#define __ENQUEUE_MAP_BUFFER_ERR            __ERR_STR(clEnqueueMapBuffer)
-#define __ENQUEUE_MAP_IMAGE_ERR             __ERR_STR(clEnqueueMapImage)
-#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR      __ERR_STR(clEnqueueUnMapMemObject)
-#define __ENQUEUE_NDRANGE_KERNEL_ERR        __ERR_STR(clEnqueueNDRangeKernel)
-#define __ENQUEUE_TASK_ERR                  __ERR_STR(clEnqueueTask)
-#define __ENQUEUE_NATIVE_KERNEL             __ERR_STR(clEnqueueNativeKernel)
-#if defined(CL_VERSION_1_2)
-#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR   __ERR_STR(clEnqueueMigrateMemObjects)
-#endif // #if defined(CL_VERSION_1_2)
-
-#define __ENQUEUE_ACQUIRE_GL_ERR            __ERR_STR(clEnqueueAcquireGLObjects)
-#define __ENQUEUE_RELEASE_GL_ERR            __ERR_STR(clEnqueueReleaseGLObjects)
-
-
-#define __RETAIN_ERR                        __ERR_STR(Retain Object)
-#define __RELEASE_ERR                       __ERR_STR(Release Object)
-#define __FLUSH_ERR                         __ERR_STR(clFlush)
-#define __FINISH_ERR                        __ERR_STR(clFinish)
-#define __VECTOR_CAPACITY_ERR               __ERR_STR(Vector capacity error)
-
-	/**
-	* CL 1.2 version that uses device fission.
-	*/
-#if defined(CL_VERSION_1_2)
-#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevices)
-#else
-#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevicesEXT)
-#endif // #if defined(CL_VERSION_1_2)
-
-	/**
-	* Deprecated APIs for 1.2
-	*/
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
-#define __ENQUEUE_MARKER_ERR                __ERR_STR(clEnqueueMarker)
-#define __ENQUEUE_WAIT_FOR_EVENTS_ERR       __ERR_STR(clEnqueueWaitForEvents)
-#define __ENQUEUE_BARRIER_ERR               __ERR_STR(clEnqueueBarrier)
-#define __UNLOAD_COMPILER_ERR               __ERR_STR(clUnloadCompiler)
-#define __CREATE_GL_TEXTURE_2D_ERR          __ERR_STR(clCreateFromGLTexture2D)
-#define __CREATE_GL_TEXTURE_3D_ERR          __ERR_STR(clCreateFromGLTexture3D)
-#define __CREATE_IMAGE2D_ERR                __ERR_STR(clCreateImage2D)
-#define __CREATE_IMAGE3D_ERR                __ERR_STR(clCreateImage3D)
-#endif // #if defined(CL_VERSION_1_1)
-
-#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
-	//! \endcond
-
-	/**
-	* CL 1.2 marker and barrier commands
-	*/
-#if defined(CL_VERSION_1_2)
-#define __ENQUEUE_MARKER_WAIT_LIST_ERR                __ERR_STR(clEnqueueMarkerWithWaitList)
-#define __ENQUEUE_BARRIER_WAIT_LIST_ERR               __ERR_STR(clEnqueueBarrierWithWaitList)
-#endif // #if defined(CL_VERSION_1_2)
-
-#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
-	typedef std::string STRING_CLASS;
-#elif !defined(__USE_DEV_STRING) 
-
-	/*! \class string
-	* \brief Simple string class, that provides a limited subset of std::string
-	* functionality but avoids many of the issues that come with that class.
-
-	*  \note Deprecated. Please use std::string as default or
-	*  re-define the string class to match the std::string
-	*  interface by defining STRING_CLASS
-	*/
-	class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-	{
-	private:
-		::size_t size_;
-		char * str_;
-	public:
-		//! \brief Constructs an empty string, allocating no memory.
-		string(void) : size_(0), str_(NULL)
-		{
-		}
-
-		/*! \brief Constructs a string populated from an arbitrary value of
-		*  specified size.
-		*
-		*  An extra '\0' is added, in case none was contained in str.
-		*
-		*  \param str the initial value of the string instance.  Note that '\0'
-		*             characters receive no special treatment.  If NULL,
-		*             the string is left empty, with a size of 0.
-		*
-		*  \param size the number of characters to copy from str.
-		*/
-		string(const char * str, ::size_t size) :
-			size_(size),
-			str_(NULL)
-		{
-			if (size > 0) {
-				str_ = new char[size_ + 1];
-				if (str_ != NULL) {
-					memcpy(str_, str, size_  * sizeof(char));
-					str_[size_] = '\0';
-				}
-				else {
-					size_ = 0;
-				}
-			}
-		}
-
-		/*! \brief Constructs a string populated from a null-terminated value.
-		*
-		*  \param str the null-terminated initial value of the string instance.
-		*             If NULL, the string is left empty, with a size of 0.
-		*/
-		string(const char * str) :
-			size_(0),
-			str_(NULL)
-		{
-			if (str) {
-				size_ = ::strlen(str);
-			}
-			if (size_ > 0) {
-				str_ = new char[size_ + 1];
-				if (str_ != NULL) {
-					memcpy(str_, str, (size_ + 1) * sizeof(char));
-				}
-			}
-		}
-
-		void resize(::size_t n)
-		{
-			if (size_ == n) {
-				return;
-			}
-			if (n == 0) {
-				if (str_) {
-					delete[] str_;
-				}
-				str_ = NULL;
-				size_ = 0;
-			}
-			else {
-				char *newString = new char[n + 1];
-				int copySize = n;
-				if (size_ < n) {
-					copySize = size_;
-				}
-				size_ = n;
-
-				if (str_) {
-					memcpy(newString, str_, (copySize + 1) * sizeof(char));
-				}
-				if (copySize < size_) {
-					memset(newString + copySize, 0, size_ - copySize);
-				}
-				newString[size_] = '\0';
-
-				delete[] str_;
-				str_ = newString;
-			}
-		}
-
-		const char& operator[] (::size_t pos) const
-		{
-			return str_[pos];
-		}
-
-		char& operator[] (::size_t pos)
-		{
-			return str_[pos];
-		}
-
-		/*! \brief Copies the value of another string to this one.
-		*
-		*  \param rhs the string to copy.
-		*
-		*  \returns a reference to the modified instance.
-		*/
-		string& operator=(const string& rhs)
-		{
-			if (this == &rhs) {
-				return *this;
-			}
-
-			if (str_ != NULL) {
-				delete[] str_;
-				str_ = NULL;
-				size_ = 0;
-			}
-
-			if (rhs.size_ == 0 || rhs.str_ == NULL) {
-				str_ = NULL;
-				size_ = 0;
-			}
-			else {
-				str_ = new char[rhs.size_ + 1];
-				size_ = rhs.size_;
-
-				if (str_ != NULL) {
-					memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
-				}
-				else {
-					size_ = 0;
-				}
-			}
-
-			return *this;
-		}
-
-		/*! \brief Constructs a string by copying the value of another instance.
-		*
-		*  \param rhs the string to copy.
-		*/
-		string(const string& rhs) :
-			size_(0),
-			str_(NULL)
-		{
-			*this = rhs;
-		}
-
-		//! \brief Destructor - frees memory used to hold the current value.
-		~string()
-		{
-			delete[] str_;
-			str_ = NULL;
-		}
-
-		//! \brief Queries the length of the string, excluding any added '\0's.
-		::size_t size(void) const   { return size_; }
-
-		//! \brief Queries the length of the string, excluding any added '\0's.
-		::size_t length(void) const { return size(); }
-
-		/*! \brief Returns a pointer to the private copy held by this instance,
-		*  or "" if empty/unset.
-		*/
-		const char * c_str(void) const { return (str_) ? str_ : ""; }
-	};
-	typedef cl::string STRING_CLASS;
-#endif // #elif !defined(__USE_DEV_STRING) 
-
-#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
-#define VECTOR_CLASS std::vector
-#elif !defined(__USE_DEV_VECTOR) 
-#define VECTOR_CLASS cl::vector 
-
-#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
-#define __MAX_DEFAULT_VECTOR_SIZE 10
-#endif
-
-	/*! \class vector
-	* \brief Fixed sized vector implementation that mirroring
-	*
-	*  \note Deprecated. Please use std::vector as default or
-	*  re-define the vector class to match the std::vector
-	*  interface by defining VECTOR_CLASS
-
-	*  \note Not recommended for use with custom objects as
-	*  current implementation will construct N elements
-	*
-	* std::vector functionality.
-	*  \brief Fixed sized vector compatible with std::vector.
-	*
-	*  \note
-	*  This differs from std::vector<> not just in memory allocation,
-	*  but also in terms of when members are constructed, destroyed,
-	*  and assigned instead of being copy constructed.
-	*
-	*  \param T type of element contained in the vector.
-	*
-	*  \param N maximum size of the vector.
-	*/
-	template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
-	class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-	{
-	private:
-		T data_[N];
-		unsigned int size_;
-
-	public:
-		//! \brief Constructs an empty vector with no memory allocated.
-		vector() :
-			size_(static_cast<unsigned int>(0))
-		{}
-
-		//! \brief Deallocates the vector's memory and destroys all of its elements.
-		~vector()
-		{
-			clear();
-		}
-
-		//! \brief Returns the number of elements currently contained.
-		unsigned int size(void) const
-		{
-			return size_;
-		}
-
-		/*! \brief Empties the vector of all elements.
-		*  \note
-		*  This does not deallocate memory but will invoke destructors
-		*  on contained elements.
-		*/
-		void clear()
-		{
-			while (!empty()) {
-				pop_back();
-			}
-		}
-
-		/*! \brief Appends an element after the last valid element.
-		* Calling this on a vector that has reached capacity will throw an
-		* exception if exceptions are enabled.
-		*/
-		void push_back(const T& x)
-		{
-			if (size() < N) {
-				new (&data_[size_]) T(x);
-				size_++;
-			}
-			else {
-				detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
-			}
-		}
-
-		/*! \brief Removes the last valid element from the vector.
-		* Calling this on an empty vector will throw an exception
-		* if exceptions are enabled.
-		*/
-		void pop_back(void)
-		{
-			if (size_ != 0) {
-				--size_;
-				data_[size_].~T();
-			}
-			else {
-				detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
-			}
-		}
-
-		/*! \brief Constructs with a value copied from another.
-		*
-		*  \param vec the vector to copy.
-		*/
-		vector(const vector<T, N>& vec) :
-			size_(vec.size_)
-		{
-			if (size_ != 0) {
-				assign(vec.begin(), vec.end());
-			}
-		}
-
-		/*! \brief Constructs with a specified number of initial elements.
-		*
-		*  \param size number of initial elements.
-		*
-		*  \param val value of initial elements.
-		*/
-		vector(unsigned int size, const T& val = T()) :
-			size_(0)
-		{
-			for (unsigned int i = 0; i < size; i++) {
-				push_back(val);
-			}
-		}
-
-		/*! \brief Overwrites the current content with that copied from another
-		*         instance.
-		*
-		*  \param rhs vector to copy.
-		*
-		*  \returns a reference to this.
-		*/
-		vector<T, N>& operator=(const vector<T, N>& rhs)
-		{
-			if (this == &rhs) {
-				return *this;
-			}
-
-			if (rhs.size_ != 0) {
-				assign(rhs.begin(), rhs.end());
-			}
-			else {
-				clear();
-			}
-
-			return *this;
-		}
-
-		/*! \brief Tests equality against another instance.
-		*
-		*  \param vec the vector against which to compare.
-		*/
-		bool operator==(vector<T, N> &vec)
-		{
-			if (size() != vec.size()) {
-				return false;
-			}
-
-			for (unsigned int i = 0; i < size(); ++i) {
-				if (operator[](i) != vec[i]) {
-					return false;
-				}
-			}
-			return true;
-		}
-
-		//! \brief Conversion operator to T*.
-		operator T* ()             { return data_; }
-
-		//! \brief Conversion operator to const T*.
-		operator const T* () const { return data_; }
-
-		//! \brief Tests whether this instance has any elements.
-		bool empty(void) const
-		{
-			return size_ == 0;
-		}
-
-		//! \brief Returns the maximum number of elements this instance can hold.
-		unsigned int max_size(void) const
-		{
-			return N;
-		}
-
-		//! \brief Returns the maximum number of elements this instance can hold.
-		unsigned int capacity() const
-		{
-			return N;
-		}
-
-		/*! \brief Returns a reference to a given element.
-		*
-		*  \param index which element to access.     *
-		*  \note
-		*  The caller is responsible for ensuring index is >= 0 and < size().
-		*/
-		T& operator[](int index)
-		{
-			return data_[index];
-		}
-
-		/*! \brief Returns a const reference to a given element.
-		*
-		*  \param index which element to access.
-		*
-		*  \note
-		*  The caller is responsible for ensuring index is >= 0 and < size().
-		*/
-		const T& operator[](int index) const
-		{
-			return data_[index];
-		}
-
-		/*! \brief Assigns elements of the vector based on a source iterator range.
-		*
-		*  \param start Beginning iterator of source range
-		*  \param end Enditerator of source range
-		*
-		*  \note
-		*  Will throw an exception if exceptions are enabled and size exceeded.
-		*/
-		template<class I>
-		void assign(I start, I end)
-		{
-			clear();
-			while (start != end) {
-				push_back(*start);
-				start++;
-			}
-		}
-
-		/*! \class iterator
-		* \brief Const iterator class for vectors
-		*/
-		class iterator
-		{
-		private:
-			const vector<T, N> *vec_;
-			int index_;
-
-			/**
-			* Internal iterator constructor to capture reference
-			* to the vector it iterates over rather than taking
-			* the vector by copy.
-			*/
-			iterator(const vector<T, N> &vec, int index) :
-				vec_(&vec)
-			{
-				if (!vec.empty()) {
-					index_ = index;
-				}
-				else {
-					index_ = -1;
-				}
-			}
-
-		public:
-			iterator(void) :
-				index_(-1),
-				vec_(NULL)
-			{
-			}
-
-			iterator(const iterator& rhs) :
-				vec_(rhs.vec_),
-				index_(rhs.index_)
-			{
-			}
-
-			~iterator(void) {}
-
-			static iterator begin(const cl::vector<T, N> &vec)
-			{
-				iterator i(vec, 0);
-
-				return i;
-			}
-
-			static iterator end(const cl::vector<T, N> &vec)
-			{
-				iterator i(vec, vec.size());
-
-				return i;
-			}
-
-			bool operator==(iterator i)
-			{
-				return ((vec_ == i.vec_) &&
-					(index_ == i.index_));
-			}
-
-			bool operator!=(iterator i)
-			{
-				return (!(*this == i));
-			}
-
-			iterator& operator++()
-			{
-				++index_;
-				return *this;
-			}
-
-			iterator operator++(int)
-			{
-				iterator retVal(*this);
-				++index_;
-				return retVal;
-			}
-
-			iterator& operator--()
-			{
-				--index_;
-				return *this;
-			}
-
-			iterator operator--(int)
-			{
-				iterator retVal(*this);
-				--index_;
-				return retVal;
-			}
-
-			const T& operator *() const
-			{
-				return (*vec_)[index_];
-			}
-		};
-
-		iterator begin(void)
-		{
-			return iterator::begin(*this);
-		}
-
-		iterator begin(void) const
-		{
-			return iterator::begin(*this);
-		}
-
-		iterator end(void)
-		{
-			return iterator::end(*this);
-		}
-
-		iterator end(void) const
-		{
-			return iterator::end(*this);
-		}
-
-		T& front(void)
-		{
-			return data_[0];
-		}
-
-		T& back(void)
-		{
-			return data_[size_];
-		}
-
-		const T& front(void) const
-		{
-			return data_[0];
-		}
-
-		const T& back(void) const
-		{
-			return data_[size_ - 1];
-		}
-	};
-#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
-
-
-
-
-
-	namespace detail {
-#define __DEFAULT_NOT_INITIALIZED 1 
-#define __DEFAULT_BEING_INITIALIZED 2
-#define __DEFAULT_INITIALIZED 4
-
-		/*
-		* Compare and exchange primitives are needed for handling of defaults
-		*/
-		inline int compare_exchange(volatile int * dest, int exchange, int comparand)
-		{
-#ifdef _WIN32
-			return (int)(InterlockedCompareExchange(
-				(volatile long*)dest,
-				(long)exchange,
-				(long)comparand));
-#elif defined(__APPLE__) || defined(__MACOSX)
-			return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest);
-#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
-			return (__sync_val_compare_and_swap(
-				dest,
-				comparand,
-				exchange));
-#endif // !_WIN32
-		}
-
-		inline void fence() { _mm_mfence(); }
-	}; // namespace detail
-
-
-	/*! \brief class used to interface between C++ and
-	*  OpenCL C calls that require arrays of size_t values, whose
-	*  size is known statically.
-	*/
-	template <int N>
-	class size_t
-	{
-	private:
-		::size_t data_[N];
-
-	public:
-		//! \brief Initialize size_t to all 0s
-		size_t()
-		{
-			for (int i = 0; i < N; ++i) {
-				data_[i] = 0;
-			}
-		}
-
-		::size_t& operator[](int index)
-		{
-			return data_[index];
-		}
-
-		const ::size_t& operator[](int index) const
-		{
-			return data_[index];
-		}
-
-		//! \brief Conversion operator to T*.
-		operator ::size_t* ()             { return data_; }
-
-		//! \brief Conversion operator to const T*.
-		operator const ::size_t* () const { return data_; }
-	};
-
-	namespace detail {
-
-		// Generic getInfoHelper. The final parameter is used to guide overload
-		// resolution: the actual parameter passed is an int, which makes this
-		// a worse conversion sequence than a specialization that declares the
-		// parameter as an int.
-		template<typename Functor, typename T>
-		inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
-		{
-			return f(name, sizeof(T), param, NULL);
-		}
-
-		// Specialized getInfoHelper for VECTOR_CLASS params
-		template <typename Func, typename T>
-		inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
-		{
-			::size_t required;
-			cl_int err = f(name, 0, NULL, &required);
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-
-			T* value = (T*)alloca(required);
-			err = f(name, required, value, NULL);
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-
-			param->assign(&value[0], &value[required / sizeof(T)]);
-			return CL_SUCCESS;
-		}
-
-		/* Specialization for reference-counted types. This depends on the
-		* existence of Wrapper<T>::cl_type, and none of the other types having the
-		* cl_type member. Note that simplify specifying the parameter as Wrapper<T>
-		* does not work, because when using a derived type (e.g. Context) the generic
-		* template will provide a better match.
-		*/
-		template <typename Func, typename T>
-		inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
-		{
-			::size_t required;
-			cl_int err = f(name, 0, NULL, &required);
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-
-			typename T::cl_type * value = (typename T::cl_type *) alloca(required);
-			err = f(name, required, value, NULL);
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-
-			::size_t elements = required / sizeof(typename T::cl_type);
-			param->assign(&value[0], &value[elements]);
-			for (::size_t i = 0; i < elements; i++)
-			{
-				if (value[i] != NULL)
-				{
-					err = (*param)[i].retain();
-					if (err != CL_SUCCESS) {
-						return err;
-					}
-				}
-			}
-			return CL_SUCCESS;
-		}
-
-		// Specialized for getInfo<CL_PROGRAM_BINARIES>
-		template <typename Func>
-		inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
-		{
-			cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
-
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-
-			return CL_SUCCESS;
-		}
-
-		// Specialized GetInfoHelper for STRING_CLASS params
-		template <typename Func>
-		inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
-		{
-			::size_t required;
-			cl_int err = f(name, 0, NULL, &required);
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-
-			char* value = (char*)alloca(required);
-			err = f(name, required, value, NULL);
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-
-			*param = value;
-			return CL_SUCCESS;
-		}
-
-		// Specialized GetInfoHelper for cl::size_t params
-		template <typename Func, ::size_t N>
-		inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
-		{
-			::size_t required;
-			cl_int err = f(name, 0, NULL, &required);
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-
-			::size_t* value = (::size_t*) alloca(required);
-			err = f(name, required, value, NULL);
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-
-			for (int i = 0; i < N; ++i) {
-				(*param)[i] = value[i];
-			}
-
-			return CL_SUCCESS;
-		}
-
-		template<typename T> struct ReferenceHandler;
-
-		/* Specialization for reference-counted types. This depends on the
-		* existence of Wrapper<T>::cl_type, and none of the other types having the
-		* cl_type member. Note that simplify specifying the parameter as Wrapper<T>
-		* does not work, because when using a derived type (e.g. Context) the generic
-		* template will provide a better match.
-		*/
-		template<typename Func, typename T>
-		inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
-		{
-			typename T::cl_type value;
-			cl_int err = f(name, sizeof(value), &value, NULL);
-			if (err != CL_SUCCESS) {
-				return err;
-			}
-			*param = value;
-			if (value != NULL)
-			{
-				err = param->retain();
-				if (err != CL_SUCCESS) {
-					return err;
-				}
-			}
-			return CL_SUCCESS;
-		}
-
-#define __PARAM_NAME_INFO_1_0(F) \
-    F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
-    F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
-    F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
-    F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
-    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
-    \
-    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
-    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
-    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
-    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
-    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
-    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
-    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
-    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
-    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
-    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
-    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
-    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
-    F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
-    F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
-    F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
-    F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
-    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
-    F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
-    F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \
-    F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \
-    F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \
-    F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \
-    F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \
-    F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \
-    \
-    F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
-    F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \
-    F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \
-    \
-    F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
-    F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
-    F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
-    F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \
-    \
-    F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
-    F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
-    F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
-    F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
-    \
-    F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
-    F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
-    F(cl_mem_info, CL_MEM_SIZE, ::size_t) \
-    F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
-    F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
-    F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
-    F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
-    \
-    F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
-    F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \
-    F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \
-    F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \
-    F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \
-    F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \
-    F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \
-    \
-    F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
-    F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
-    F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \
-    F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \
-    F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \
-    \
-    F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
-    F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
-    F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
-    F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<Device>) \
-    F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
-    F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
-    F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
-    \
-    F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
-    F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \
-    F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \
-    \
-    F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \
-    F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
-    F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
-    F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
-    F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
-    \
-    F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \
-    F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \
-    F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
-    \
-    F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
-    F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
-    F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
-    F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
-
-#if defined(CL_VERSION_1_1)
-#define __PARAM_NAME_INFO_1_1(F) \
-    F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
-    F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
-    F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
-    F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
-    F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \
-    \
-    F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
-    F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
-    \
-    F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \
-    F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
-    \
-    F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
-#endif // CL_VERSION_1_1
-
-
-#if defined(CL_VERSION_1_2)
-#define __PARAM_NAME_INFO_1_2(F) \
-    F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \
-    \
-    F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \
-    F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \
-    \
-    F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \
-    \
-    F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \
-    \
-    F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \
-    F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \
-    F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \
-    F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \
-    \
-    F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \
-    F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS<cl_device_partition_property>) \
-    F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS<cl_device_partition_property>)  \
-    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \
-    F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
-	F(cl_device_info, CL_DEVICE_TOPOLOGY_AMD, cl_device_topology_amd) \
-    F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS)
-#endif // #if defined(CL_VERSION_1_2)
-
-#if defined(USE_CL_DEVICE_FISSION)
-#define __PARAM_NAME_DEVICE_FISSION(F) \
-    F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
-    F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
-    F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
-    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
-    F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
-#endif // USE_CL_DEVICE_FISSION
-
-		template <typename enum_type, cl_int Name>
-		struct param_traits {};
-
-#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \
-struct token;                                        \
-template<>                                           \
-struct param_traits<detail:: token,param_name>       \
-		{                                                    \
-    enum { value = param_name };                     \
-    typedef T param_type;                            \
-		};
-
-		__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS)
-#if defined(CL_VERSION_1_1)
-			__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS)
-#endif // CL_VERSION_1_1
-#if defined(CL_VERSION_1_2)
-			__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS)
-#endif // CL_VERSION_1_1
-
-#if defined(USE_CL_DEVICE_FISSION)
-			__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS);
-#endif // USE_CL_DEVICE_FISSION
-
-#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
-		__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS)
-#endif
-
-#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
-#endif
-
-#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>)
-#endif
-#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_SIMD_WIDTH_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
-#endif
-#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
-#endif
-
-#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
-#endif
-#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)
-#endif
-#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)
-#endif
-#ifdef CL_DEVICE_WARP_SIZE_NV
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)
-#endif
-#ifdef CL_DEVICE_GPU_OVERLAP_NV
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)
-#endif
-#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool)
-#endif
-#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV
-			__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
-#endif
-
-			// Convenience functions
-
-			template <typename Func, typename T>
-		inline cl_int
-			getInfo(Func f, cl_uint name, T* param)
-		{
-			return getInfoHelper(f, name, param, 0);
-		}
-
-		template <typename Func, typename Arg0>
-		struct GetInfoFunctor0
-		{
-			Func f_; const Arg0& arg0_;
-			cl_int operator ()(
-				cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
-			{
-				return f_(arg0_, param, size, value, size_ret);
-			}
-		};
-
-		template <typename Func, typename Arg0, typename Arg1>
-		struct GetInfoFunctor1
-		{
-			Func f_; const Arg0& arg0_; const Arg1& arg1_;
-			cl_int operator ()(
-				cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
-			{
-				return f_(arg0_, arg1_, param, size, value, size_ret);
-			}
-		};
-
-		template <typename Func, typename Arg0, typename T>
-		inline cl_int
-			getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
-		{
-			GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
-			return getInfoHelper(f0, name, param, 0);
-		}
-
-		template <typename Func, typename Arg0, typename Arg1, typename T>
-		inline cl_int
-			getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
-		{
-			GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
-			return getInfoHelper(f0, name, param, 0);
-		}
-
-		template<typename T>
-		struct ReferenceHandler
-		{ };
-
-#if defined(CL_VERSION_1_2)
-		/**
-		* OpenCL 1.2 devices do have retain/release.
-		*/
-		template <>
-		struct ReferenceHandler<cl_device_id>
-		{
-			/**
-			* Retain the device.
-			* \param device A valid device created using createSubDevices
-			* \return
-			*   CL_SUCCESS if the function executed successfully.
-			*   CL_INVALID_DEVICE if device was not a valid subdevice
-			*   CL_OUT_OF_RESOURCES
-			*   CL_OUT_OF_HOST_MEMORY
-			*/
-			static cl_int retain(cl_device_id device)
-			{
-				return ::clRetainDevice(device);
-			}
-			/**
-			* Retain the device.
-			* \param device A valid device created using createSubDevices
-			* \return
-			*   CL_SUCCESS if the function executed successfully.
-			*   CL_INVALID_DEVICE if device was not a valid subdevice
-			*   CL_OUT_OF_RESOURCES
-			*   CL_OUT_OF_HOST_MEMORY
-			*/
-			static cl_int release(cl_device_id device)
-			{
-				return ::clReleaseDevice(device);
-			}
-		};
-#else // #if defined(CL_VERSION_1_2)
-		/**
-		* OpenCL 1.1 devices do not have retain/release.
-		*/
-		template <>
-		struct ReferenceHandler<cl_device_id>
-		{
-			// cl_device_id does not have retain().
-			static cl_int retain(cl_device_id)
-			{
-				return CL_SUCCESS;
-			}
-			// cl_device_id does not have release().
-			static cl_int release(cl_device_id)
-			{
-				return CL_SUCCESS;
-			}
-		};
-#endif // #if defined(CL_VERSION_1_2)
-
-		template <>
-		struct ReferenceHandler<cl_platform_id>
-		{
-			// cl_platform_id does not have retain().
-			static cl_int retain(cl_platform_id)
-			{
-				return CL_SUCCESS;
-			}
-			// cl_platform_id does not have release().
-			static cl_int release(cl_platform_id)
-			{
-				return CL_SUCCESS;
-			}
-		};
-
-		template <>
-		struct ReferenceHandler<cl_context>
-		{
-			static cl_int retain(cl_context context)
-			{
-				return ::clRetainContext(context);
-			}
-			static cl_int release(cl_context context)
-			{
-				return ::clReleaseContext(context);
-			}
-		};
-
-		template <>
-		struct ReferenceHandler<cl_command_queue>
-		{
-			static cl_int retain(cl_command_queue queue)
-			{
-				return ::clRetainCommandQueue(queue);
-			}
-			static cl_int release(cl_command_queue queue)
-			{
-				return ::clReleaseCommandQueue(queue);
-			}
-		};
-
-		template <>
-		struct ReferenceHandler<cl_mem>
-		{
-			static cl_int retain(cl_mem memory)
-			{
-				return ::clRetainMemObject(memory);
-			}
-			static cl_int release(cl_mem memory)
-			{
-				return ::clReleaseMemObject(memory);
-			}
-		};
-
-		template <>
-		struct ReferenceHandler<cl_sampler>
-		{
-			static cl_int retain(cl_sampler sampler)
-			{
-				return ::clRetainSampler(sampler);
-			}
-			static cl_int release(cl_sampler sampler)
-			{
-				return ::clReleaseSampler(sampler);
-			}
-		};
-
-		template <>
-		struct ReferenceHandler<cl_program>
-		{
-			static cl_int retain(cl_program program)
-			{
-				return ::clRetainProgram(program);
-			}
-			static cl_int release(cl_program program)
-			{
-				return ::clReleaseProgram(program);
-			}
-		};
-
-		template <>
-		struct ReferenceHandler<cl_kernel>
-		{
-			static cl_int retain(cl_kernel kernel)
-			{
-				return ::clRetainKernel(kernel);
-			}
-			static cl_int release(cl_kernel kernel)
-			{
-				return ::clReleaseKernel(kernel);
-			}
-		};
-
-		template <>
-		struct ReferenceHandler<cl_event>
-		{
-			static cl_int retain(cl_event event)
-			{
-				return ::clRetainEvent(event);
-			}
-			static cl_int release(cl_event event)
-			{
-				return ::clReleaseEvent(event);
-			}
-		};
-
-
-		// Extracts version number with major in the upper 16 bits, minor in the lower 16
-		static cl_uint getVersion(const char *versionInfo)
-		{
-			int highVersion = 0;
-			int lowVersion = 0;
-			int index = 7;
-			while (versionInfo[index] != '.') {
-				highVersion *= 10;
-				highVersion += versionInfo[index] - '0';
-				++index;
-			}
-			++index;
-			while (versionInfo[index] != ' ') {
-				lowVersion *= 10;
-				lowVersion += versionInfo[index] - '0';
-				++index;
-			}
-			return (highVersion << 16) | lowVersion;
-		}
-
-		static cl_uint getPlatformVersion(cl_platform_id platform)
-		{
-			::size_t size = 0;
-			clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
-			char *versionInfo = (char *)alloca(size);
-			clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size);
-			return getVersion(versionInfo);
-		}
-
-		static cl_uint getDevicePlatformVersion(cl_device_id device)
-		{
-			cl_platform_id platform;
-			clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
-			return getPlatformVersion(platform);
-		}
-
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-		static cl_uint getContextPlatformVersion(cl_context context)
-		{
-			// The platform cannot be queried directly, so we first have to grab a
-			// device and obtain its context
-			::size_t size = 0;
-			clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
-			if (size == 0)
-				return 0;
-			cl_device_id *devices = (cl_device_id *)alloca(size);
-			clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL);
-			return getDevicePlatformVersion(devices[0]);
-		}
-#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-
-		template <typename T>
-		class Wrapper
-		{
-		public:
-			typedef T cl_type;
-
-		protected:
-			cl_type object_;
-
-		public:
-			Wrapper() : object_(NULL) { }
-
-			Wrapper(const cl_type &obj) : object_(obj) { }
-
-			~Wrapper()
-			{
-				if (object_ != NULL) { release(); }
-			}
-
-			Wrapper(const Wrapper<cl_type>& rhs)
-			{
-				object_ = rhs.object_;
-				if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
-			}
-
-			Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
-			{
-				if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
-				object_ = rhs.object_;
-				if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
-				return *this;
-			}
-
-			Wrapper<cl_type>& operator = (const cl_type &rhs)
-			{
-				if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
-				object_ = rhs;
-				return *this;
-			}
-
-			cl_type operator ()() const { return object_; }
-
-			cl_type& operator ()() { return object_; }
-
-		protected:
-			template<typename Func, typename U>
-			friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
-
-			cl_int retain() const
-			{
-				return ReferenceHandler<cl_type>::retain(object_);
-			}
-
-			cl_int release() const
-			{
-				return ReferenceHandler<cl_type>::release(object_);
-			}
-		};
-
-		template <>
-		class Wrapper<cl_device_id>
-		{
-		public:
-			typedef cl_device_id cl_type;
-
-		protected:
-			cl_type object_;
-			bool referenceCountable_;
-
-			static bool isReferenceCountable(cl_device_id device)
-			{
-				bool retVal = false;
-				if (device != NULL) {
-					int version = getDevicePlatformVersion(device);
-					if (version > ((1 << 16) + 1)) {
-						retVal = true;
-					}
-				}
-				return retVal;
-			}
-
-		public:
-			Wrapper() : object_(NULL), referenceCountable_(false)
-			{
-			}
-
-			Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false)
-			{
-				referenceCountable_ = isReferenceCountable(obj);
-			}
-
-			~Wrapper()
-			{
-				if (object_ != NULL) { release(); }
-			}
-
-			Wrapper(const Wrapper<cl_type>& rhs)
-			{
-				object_ = rhs.object_;
-				referenceCountable_ = isReferenceCountable(object_);
-				if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
-			}
-
-			Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
-			{
-				if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
-				object_ = rhs.object_;
-				referenceCountable_ = rhs.referenceCountable_;
-				if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
-				return *this;
-			}
-
-			Wrapper<cl_type>& operator = (const cl_type &rhs)
-			{
-				if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
-				object_ = rhs;
-				referenceCountable_ = isReferenceCountable(object_);
-				return *this;
-			}
-
-			cl_type operator ()() const { return object_; }
-
-			cl_type& operator ()() { return object_; }
-
-		protected:
-			template<typename Func, typename U>
-			friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
-
-			template<typename Func, typename U>
-			friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS<U>*, int, typename U::cl_type);
-
-			cl_int retain() const
-			{
-				if (referenceCountable_) {
-					return ReferenceHandler<cl_type>::retain(object_);
-				}
-				else {
-					return CL_SUCCESS;
-				}
-			}
-
-			cl_int release() const
-			{
-				if (referenceCountable_) {
-					return ReferenceHandler<cl_type>::release(object_);
-				}
-				else {
-					return CL_SUCCESS;
-				}
-			}
-		};
-
-	} // namespace detail
-	//! \endcond
-
-	/*! \stuct ImageFormat
-	*  \brief Adds constructors and member functions for cl_image_format.
-	*
-	*  \see cl_image_format
-	*/
-	struct ImageFormat : public cl_image_format
-	{
-		//! \brief Default constructor - performs no initialization.
-		ImageFormat(){}
-
-		//! \brief Initializing constructor.
-		ImageFormat(cl_channel_order order, cl_channel_type type)
-		{
-			image_channel_order = order;
-			image_channel_data_type = type;
-		}
-
-		//! \brief Assignment operator.
-		ImageFormat& operator = (const ImageFormat& rhs)
-		{
-			if (this != &rhs) {
-				this->image_channel_data_type = rhs.image_channel_data_type;
-				this->image_channel_order = rhs.image_channel_order;
-			}
-			return *this;
-		}
-	};
-
-	/*! \brief Class interface for cl_device_id.
-	*
-	*  \note Copies of these objects are inexpensive, since they don't 'own'
-	*        any underlying resources or data structures.
-	*
-	*  \see cl_device_id
-	*/
-	class Device : public detail::Wrapper<cl_device_id>
-	{
-	public:
-		//! \brief Default constructor - initializes to NULL.
-		Device() : detail::Wrapper<cl_type>() { }
-
-		/*! \brief Copy constructor.
-		*
-		*  This simply copies the device ID value, which is an inexpensive operation.
-		*/
-		Device(const Device& device) : detail::Wrapper<cl_type>(device) { }
-
-		/*! \brief Constructor from cl_device_id.
-		*
-		*  This simply copies the device ID value, which is an inexpensive operation.
-		*/
-		Device(const cl_device_id &device) : detail::Wrapper<cl_type>(device) { }
-
-		/*! \brief Returns the first device on the default context.
-		*
-		*  \see Context::getDefault()
-		*/
-		static Device getDefault(cl_int * err = NULL);
-
-		/*! \brief Assignment operator from Device.
-		*
-		*  This simply copies the device ID value, which is an inexpensive operation.
-		*/
-		Device& operator = (const Device& rhs)
-		{
-			if (this != &rhs) {
-				detail::Wrapper<cl_type>::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment operator from cl_device_id.
-		*
-		*  This simply copies the device ID value, which is an inexpensive operation.
-		*/
-		Device& operator = (const cl_device_id& rhs)
-		{
-			detail::Wrapper<cl_type>::operator=(rhs);
-			return *this;
-		}
-
-		//! \brief Wrapper for clGetDeviceInfo().
-		template <typename T>
-		cl_int getInfo(cl_device_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetDeviceInfo, object_, name, param),
-				__GET_DEVICE_INFO_ERR);
-		}
-
-		//! \brief Wrapper for clGetDeviceInfo() that returns by value.
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_device_info, name>::param_type
-			getInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_device_info, name>::param_type param;
-			cl_int result = getInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-		/**
-		* CL 1.2 version
-		*/
-#if defined(CL_VERSION_1_2)
-		//! \brief Wrapper for clCreateSubDevicesEXT().
-		cl_int createSubDevices(
-			const cl_device_partition_property * properties,
-			VECTOR_CLASS<Device>* devices)
-		{
-			cl_uint n = 0;
-			cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __CREATE_SUB_DEVICES);
-			}
-
-			cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id));
-			err = clCreateSubDevices(object_, properties, n, ids, NULL);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __CREATE_SUB_DEVICES);
-			}
-
-			devices->assign(&ids[0], &ids[n]);
-			return CL_SUCCESS;
-		}
-#endif // #if defined(CL_VERSION_1_2)
-
-		/**
-		* CL 1.1 version that uses device fission.
-		*/
-#if defined(CL_VERSION_1_1)
-#if defined(USE_CL_DEVICE_FISSION)
-		cl_int createSubDevices(
-			const cl_device_partition_property_ext * properties,
-			VECTOR_CLASS<Device>* devices)
-		{
-			typedef CL_API_ENTRY cl_int
-				(CL_API_CALL * PFN_clCreateSubDevicesEXT)(
-				cl_device_id /*in_device*/,
-				const cl_device_partition_property_ext * /* properties */,
-				cl_uint /*num_entries*/,
-				cl_device_id * /*out_devices*/,
-				cl_uint * /*num_devices*/) CL_EXT_SUFFIX__VERSION_1_1;
-
-			static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
-			__INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
-
-			cl_uint n = 0;
-			cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __CREATE_SUB_DEVICES);
-			}
-
-			cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id));
-			err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __CREATE_SUB_DEVICES);
-			}
-
-			devices->assign(&ids[0], &ids[n]);
-			return CL_SUCCESS;
-		}
-#endif // #if defined(USE_CL_DEVICE_FISSION)
-#endif // #if defined(CL_VERSION_1_1)
-	};
-
-	/*! \brief Class interface for cl_platform_id.
-	*
-	*  \note Copies of these objects are inexpensive, since they don't 'own'
-	*        any underlying resources or data structures.
-	*
-	*  \see cl_platform_id
-	*/
-	class Platform : public detail::Wrapper<cl_platform_id>
-	{
-	public:
-		//! \brief Default constructor - initializes to NULL.
-		Platform() : detail::Wrapper<cl_type>()  { }
-
-		/*! \brief Copy constructor.
-		*
-		*  This simply copies the platform ID value, which is an inexpensive operation.
-		*/
-		Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
-
-		/*! \brief Constructor from cl_platform_id.
-		*
-		*  This simply copies the platform ID value, which is an inexpensive operation.
-		*/
-		Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
-
-		/*! \brief Assignment operator from Platform.
-		*
-		*  This simply copies the platform ID value, which is an inexpensive operation.
-		*/
-		Platform& operator = (const Platform& rhs)
-		{
-			if (this != &rhs) {
-				detail::Wrapper<cl_type>::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment operator from cl_platform_id.
-		*
-		*  This simply copies the platform ID value, which is an inexpensive operation.
-		*/
-		Platform& operator = (const cl_platform_id& rhs)
-		{
-			detail::Wrapper<cl_type>::operator=(rhs);
-			return *this;
-		}
-
-		//! \brief Wrapper for clGetPlatformInfo().
-		cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetPlatformInfo, object_, name, param),
-				__GET_PLATFORM_INFO_ERR);
-		}
-
-		//! \brief Wrapper for clGetPlatformInfo() that returns by value.
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_platform_info, name>::param_type
-			getInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_platform_info, name>::param_type param;
-			cl_int result = getInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-		/*! \brief Gets a list of devices for this platform.
-		*
-		*  Wraps clGetDeviceIDs().
-		*/
-		cl_int getDevices(
-			cl_device_type type,
-			VECTOR_CLASS<Device>* devices) const
-		{
-			cl_uint n = 0;
-			if (devices == NULL) {
-				return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
-			}
-			cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
-			}
-
-			cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id));
-			err = ::clGetDeviceIDs(object_, type, n, ids, NULL);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
-			}
-
-			devices->assign(&ids[0], &ids[n]);
-			return CL_SUCCESS;
-		}
-
-#if defined(USE_DX_INTEROP)
-		/*! \brief Get the list of available D3D10 devices.
-		*
-		*  \param d3d_device_source.
-		*
-		*  \param d3d_object.
-		*
-		*  \param d3d_device_set.
-		*
-		*  \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
-		*  values returned in devices can be used to identify a specific OpenCL
-		*  device. If \a devices argument is NULL, this argument is ignored.
-		*
-		*  \return One of the following values:
-		*    - CL_SUCCESS if the function is executed successfully.
-		*
-		*  The application can query specific capabilities of the OpenCL device(s)
-		*  returned by cl::getDevices. This can be used by the application to
-		*  determine which device(s) to use.
-		*
-		* \note In the case that exceptions are enabled and a return value
-		* other than CL_SUCCESS is generated, then cl::Error exception is
-		* generated.
-		*/
-		cl_int getDevices(
-			cl_d3d10_device_source_khr d3d_device_source,
-			void *                     d3d_object,
-			cl_d3d10_device_set_khr    d3d_device_set,
-			VECTOR_CLASS<Device>* devices) const
-		{
-			typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
-				cl_platform_id platform,
-				cl_d3d10_device_source_khr d3d_device_source,
-				void * d3d_object,
-				cl_d3d10_device_set_khr d3d_device_set,
-				cl_uint num_entries,
-				cl_device_id * devices,
-				cl_uint* num_devices);
-
-			if (devices == NULL) {
-				return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
-			}
-
-			static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
-			__INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR);
-
-			cl_uint n = 0;
-			cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
-				object_,
-				d3d_device_source,
-				d3d_object,
-				d3d_device_set,
-				0,
-				NULL,
-				&n);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
-			}
-
-			cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id));
-			err = pfn_clGetDeviceIDsFromD3D10KHR(
-				object_,
-				d3d_device_source,
-				d3d_object,
-				d3d_device_set,
-				n,
-				ids,
-				NULL);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
-			}
-
-			devices->assign(&ids[0], &ids[n]);
-			return CL_SUCCESS;
-		}
-#endif
-
-		/*! \brief Gets a list of available platforms.
-		*
-		*  Wraps clGetPlatformIDs().
-		*/
-		static cl_int get(
-			VECTOR_CLASS<Platform>* platforms)
-		{
-			cl_uint n = 0;
-
-			if (platforms == NULL) {
-				return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
-			}
-
-			cl_int err = ::clGetPlatformIDs(0, NULL, &n);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
-			}
-
-			cl_platform_id* ids = (cl_platform_id*)alloca(
-				n * sizeof(cl_platform_id));
-			err = ::clGetPlatformIDs(n, ids, NULL);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
-			}
-
-			platforms->assign(&ids[0], &ids[n]);
-			return CL_SUCCESS;
-		}
-
-		/*! \brief Gets the first available platform.
-		*
-		*  Wraps clGetPlatformIDs(), returning the first result.
-		*/
-		static cl_int get(
-			Platform * platform)
-		{
-			cl_uint n = 0;
-
-			if (platform == NULL) {
-				return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
-			}
-
-			cl_int err = ::clGetPlatformIDs(0, NULL, &n);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
-			}
-
-			cl_platform_id* ids = (cl_platform_id*)alloca(
-				n * sizeof(cl_platform_id));
-			err = ::clGetPlatformIDs(n, ids, NULL);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
-			}
-
-			*platform = ids[0];
-			return CL_SUCCESS;
-		}
-
-		/*! \brief Gets the first available platform, returning it by value.
-		*
-		*  Wraps clGetPlatformIDs(), returning the first result.
-		*/
-		static Platform get(
-			cl_int * errResult = NULL)
-		{
-			Platform platform;
-			cl_uint n = 0;
-			cl_int err = ::clGetPlatformIDs(0, NULL, &n);
-			if (err != CL_SUCCESS) {
-				detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
-				if (errResult != NULL) {
-					*errResult = err;
-				}
-			}
-
-			cl_platform_id* ids = (cl_platform_id*)alloca(
-				n * sizeof(cl_platform_id));
-			err = ::clGetPlatformIDs(n, ids, NULL);
-
-			if (err != CL_SUCCESS) {
-				detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
-			}
-
-			if (errResult != NULL) {
-				*errResult = err;
-			}
-
-			return ids[0];
-		}
-
-		static Platform getDefault(
-			cl_int *errResult = NULL)
-		{
-			return get(errResult);
-		}
-
-
-#if defined(CL_VERSION_1_2)
-		//! \brief Wrapper for clUnloadCompiler().
-		cl_int
-			unloadCompiler()
-		{
-			return ::clUnloadPlatformCompiler(object_);
-		}
-#endif // #if defined(CL_VERSION_1_2)
-	}; // class Platform
-
-	/**
-	* Deprecated APIs for 1.2
-	*/
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
-	/**
-	* Unload the OpenCL compiler.
-	* \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
-	*/
-	inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
-		UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-	inline cl_int
-		UnloadCompiler()
-	{
-		return ::clUnloadCompiler();
-	}
-#endif // #if defined(CL_VERSION_1_1)
-
-	/*! \brief Class interface for cl_context.
-	*
-	*  \note Copies of these objects are shallow, meaning that the copy will refer
-	*        to the same underlying cl_context as the original.  For details, see
-	*        clRetainContext() and clReleaseContext().
-	*
-	*  \see cl_context
-	*/
-	class Context
-		: public detail::Wrapper<cl_context>
-	{
-	private:
-		static volatile int default_initialized_;
-		static Context default_;
-		static volatile cl_int default_error_;
-	public:
-		/*! \brief Destructor.
-		*
-		*  This calls clReleaseContext() on the value held by this instance.
-		*/
-		~Context() { }
-
-		/*! \brief Constructs a context including a list of specified devices.
-		*
-		*  Wraps clCreateContext().
-		*/
-		Context(
-			const VECTOR_CLASS<Device>& devices,
-			cl_context_properties* properties = NULL,
-			void (CL_CALLBACK * notifyFptr)(
-			const char *,
-			const void *,
-			::size_t,
-			void *) = NULL,
-			void* data = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-			::size_t numDevices = devices.size();
-			cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id));
-			for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
-				deviceIDs[deviceIndex] = (devices[deviceIndex])();
-			}
-
-			object_ = ::clCreateContext(
-				properties, (cl_uint)numDevices,
-				deviceIDs,
-				notifyFptr, data, &error);
-
-			detail::errHandler(error, __CREATE_CONTEXT_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		Context(
-			const Device& device,
-			cl_context_properties* properties = NULL,
-			void (CL_CALLBACK * notifyFptr)(
-			const char *,
-			const void *,
-			::size_t,
-			void *) = NULL,
-			void* data = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-			cl_device_id deviceID = device();
-
-			object_ = ::clCreateContext(
-				properties, 1,
-				&deviceID,
-				notifyFptr, data, &error);
-
-			detail::errHandler(error, __CREATE_CONTEXT_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		/*! \brief Constructs a context including all devices of a specified type.
-		*
-		*  Wraps clCreateContextFromType().
-		*/
-		Context(
-			cl_device_type type,
-			cl_context_properties* properties = NULL,
-			void (CL_CALLBACK * notifyFptr)(
-			const char *,
-			const void *,
-			::size_t,
-			void *) = NULL,
-			void* data = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-#if !defined(__APPLE__) || !defined(__MACOS)
-			cl_context_properties prop[4] = { CL_CONTEXT_PLATFORM, 0, 0, 0 };
-			if (properties == NULL) {
-				prop[1] = (cl_context_properties)Platform::get(&error)();
-				if (error != CL_SUCCESS) {
-					detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
-					if (err != NULL) {
-						*err = error;
-						return;
-					}
-				}
-
-				properties = &prop[0];
-			}
-#endif
-			object_ = ::clCreateContextFromType(
-				properties, type, notifyFptr, data, &error);
-
-			detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		/*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
-		*
-		*  \note All calls to this function return the same cl_context as the first.
-		*/
-		static Context getDefault(cl_int * err = NULL)
-		{
-			int state = detail::compare_exchange(
-				&default_initialized_,
-				__DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
-
-			if (state & __DEFAULT_INITIALIZED) {
-				if (err != NULL) {
-					*err = default_error_;
-				}
-				return default_;
-			}
-
-			if (state & __DEFAULT_BEING_INITIALIZED) {
-				// Assume writes will propagate eventually...
-				while (default_initialized_ != __DEFAULT_INITIALIZED) {
-					detail::fence();
-				}
-
-				if (err != NULL) {
-					*err = default_error_;
-				}
-				return default_;
-			}
-
-			cl_int error;
-			default_ = Context(
-				CL_DEVICE_TYPE_DEFAULT,
-				NULL,
-				NULL,
-				NULL,
-				&error);
-
-			detail::fence();
-
-			default_error_ = error;
-			// Assume writes will propagate eventually...
-			default_initialized_ = __DEFAULT_INITIALIZED;
-
-			detail::fence();
-
-			if (err != NULL) {
-				*err = default_error_;
-			}
-			return default_;
-
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		Context() : detail::Wrapper<cl_type>() { }
-
-		/*! \brief Copy constructor.
-		*
-		*  This calls clRetainContext() on the parameter's cl_context.
-		*/
-		Context(const Context& context) : detail::Wrapper<cl_type>(context) { }
-
-		/*! \brief Constructor from cl_context - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the cl_context
-		*  into the new Context object.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { }
-
-		/*! \brief Assignment operator from Context.
-		*
-		*  This calls clRetainContext() on the parameter and clReleaseContext() on
-		*  the previous value held by this instance.
-		*/
-		Context& operator = (const Context& rhs)
-		{
-			if (this != &rhs) {
-				detail::Wrapper<cl_type>::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment operator from cl_context - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the rhs and calls
-		*  clReleaseContext() on the value previously held by this instance.
-		*/
-		Context& operator = (const cl_context& rhs)
-		{
-			detail::Wrapper<cl_type>::operator=(rhs);
-			return *this;
-		}
-
-		//! \brief Wrapper for clGetContextInfo().
-		template <typename T>
-		cl_int getInfo(cl_context_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetContextInfo, object_, name, param),
-				__GET_CONTEXT_INFO_ERR);
-		}
-
-		//! \brief Wrapper for clGetContextInfo() that returns by value.
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_context_info, name>::param_type
-			getInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_context_info, name>::param_type param;
-			cl_int result = getInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-		/*! \brief Gets a list of supported image formats.
-		*
-		*  Wraps clGetSupportedImageFormats().
-		*/
-		cl_int getSupportedImageFormats(
-			cl_mem_flags flags,
-			cl_mem_object_type type,
-			VECTOR_CLASS<ImageFormat>* formats) const
-		{
-			cl_uint numEntries;
-			cl_int err = ::clGetSupportedImageFormats(
-				object_,
-				flags,
-				type,
-				0,
-				NULL,
-				&numEntries);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
-			}
-
-			ImageFormat* value = (ImageFormat*)
-				alloca(numEntries * sizeof(ImageFormat));
-			err = ::clGetSupportedImageFormats(
-				object_,
-				flags,
-				type,
-				numEntries,
-				(cl_image_format*)value,
-				NULL);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
-			}
-
-			formats->assign(&value[0], &value[numEntries]);
-			return CL_SUCCESS;
-		}
-	};
-
-	inline Device Device::getDefault(cl_int * err)
-	{
-		cl_int error;
-		Device device;
-
-		Context context = Context::getDefault(&error);
-		detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-
-		if (error != CL_SUCCESS) {
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-		else {
-			device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
-			if (err != NULL) {
-				*err = CL_SUCCESS;
-			}
-		}
-
-		return device;
-	}
-
-
-#ifdef _WIN32
-	__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-	__declspec(selectany) Context Context::default_;
-	__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS;
-#else
-	__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-	__attribute__((weak)) Context Context::default_;
-	__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS;
-#endif
-
-	/*! \brief Class interface for cl_event.
-	*
-	*  \note Copies of these objects are shallow, meaning that the copy will refer
-	*        to the same underlying cl_event as the original.  For details, see
-	*        clRetainEvent() and clReleaseEvent().
-	*
-	*  \see cl_event
-	*/
-	class Event : public detail::Wrapper<cl_event>
-	{
-	public:
-		/*! \brief Destructor.
-		*
-		*  This calls clReleaseEvent() on the value held by this instance.
-		*/
-		~Event() { }
-
-		//! \brief Default constructor - initializes to NULL.
-		Event() : detail::Wrapper<cl_type>() { }
-
-		/*! \brief Copy constructor.
-		*
-		*  This calls clRetainEvent() on the parameter's cl_event.
-		*/
-		Event(const Event& event) : detail::Wrapper<cl_type>(event) { }
-
-		/*! \brief Constructor from cl_event - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the cl_event
-		*  into the new Event object.
-		*/
-		Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { }
-
-		/*! \brief Assignment operator from cl_event - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the rhs and calls
-		*  clReleaseEvent() on the value previously held by this instance.
-		*/
-		Event& operator = (const Event& rhs)
-		{
-			if (this != &rhs) {
-				detail::Wrapper<cl_type>::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment operator from cl_event.
-		*
-		*  This calls clRetainEvent() on the parameter and clReleaseEvent() on
-		*  the previous value held by this instance.
-		*/
-		Event& operator = (const cl_event& rhs)
-		{
-			detail::Wrapper<cl_type>::operator=(rhs);
-			return *this;
-		}
-
-		//! \brief Wrapper for clGetEventInfo().
-		template <typename T>
-		cl_int getInfo(cl_event_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetEventInfo, object_, name, param),
-				__GET_EVENT_INFO_ERR);
-		}
-
-		//! \brief Wrapper for clGetEventInfo() that returns by value.
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_event_info, name>::param_type
-			getInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_event_info, name>::param_type param;
-			cl_int result = getInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-		//! \brief Wrapper for clGetEventProfilingInfo().
-		template <typename T>
-		cl_int getProfilingInfo(cl_profiling_info name, T* param) const
-		{
-			return detail::errHandler(detail::getInfo(
-				&::clGetEventProfilingInfo, object_, name, param),
-				__GET_EVENT_PROFILE_INFO_ERR);
-		}
-
-		//! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_profiling_info, name>::param_type
-			getProfilingInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_profiling_info, name>::param_type param;
-			cl_int result = getProfilingInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-		/*! \brief Blocks the calling thread until this event completes.
-		*
-		*  Wraps clWaitForEvents().
-		*/
-		cl_int wait() const
-		{
-			return detail::errHandler(
-				::clWaitForEvents(1, &object_),
-				__WAIT_FOR_EVENTS_ERR);
-		}
-
-#if defined(CL_VERSION_1_1)
-		/*! \brief Registers a user callback function for a specific command execution status.
-		*
-		*  Wraps clSetEventCallback().
-		*/
-		cl_int setCallback(
-			cl_int type,
-			void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
-			void * user_data = NULL)
-		{
-			return detail::errHandler(
-				::clSetEventCallback(
-				object_,
-				type,
-				pfn_notify,
-				user_data),
-				__SET_EVENT_CALLBACK_ERR);
-		}
-#endif
-
-		/*! \brief Blocks the calling thread until every event specified is complete.
-		*
-		*  Wraps clWaitForEvents().
-		*/
-		static cl_int
-			waitForEvents(const VECTOR_CLASS<Event>& events)
-		{
-			return detail::errHandler(
-				::clWaitForEvents(
-				(cl_uint)events.size(), (cl_event*)&events.front()),
-				__WAIT_FOR_EVENTS_ERR);
-		}
-	};
-
-#if defined(CL_VERSION_1_1)
-	/*! \brief Class interface for user events (a subset of cl_event's).
-	*
-	*  See Event for details about copy semantics, etc.
-	*/
-	class UserEvent : public Event
-	{
-	public:
-		/*! \brief Constructs a user event on a given context.
-		*
-		*  Wraps clCreateUserEvent().
-		*/
-		UserEvent(
-			const Context& context,
-			cl_int * err = NULL)
-		{
-			cl_int error;
-			object_ = ::clCreateUserEvent(
-				context(),
-				&error);
-
-			detail::errHandler(error, __CREATE_USER_EVENT_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		UserEvent() : Event() { }
-
-		//! \brief Copy constructor - performs shallow copy.
-		UserEvent(const UserEvent& event) : Event(event) { }
-
-		//! \brief Assignment Operator - performs shallow copy.
-		UserEvent& operator = (const UserEvent& rhs)
-		{
-			if (this != &rhs) {
-				Event::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Sets the execution status of a user event object.
-		*
-		*  Wraps clSetUserEventStatus().
-		*/
-		cl_int setStatus(cl_int status)
-		{
-			return detail::errHandler(
-				::clSetUserEventStatus(object_, status),
-				__SET_USER_EVENT_STATUS_ERR);
-		}
-	};
-#endif
-
-	/*! \brief Blocks the calling thread until every event specified is complete.
-	*
-	*  Wraps clWaitForEvents().
-	*/
-	inline static cl_int
-		WaitForEvents(const VECTOR_CLASS<Event>& events)
-	{
-		return detail::errHandler(
-			::clWaitForEvents(
-			(cl_uint)events.size(), (cl_event*)&events.front()),
-			__WAIT_FOR_EVENTS_ERR);
-	}
-
-	/*! \brief Class interface for cl_mem.
-	*
-	*  \note Copies of these objects are shallow, meaning that the copy will refer
-	*        to the same underlying cl_mem as the original.  For details, see
-	*        clRetainMemObject() and clReleaseMemObject().
-	*
-	*  \see cl_mem
-	*/
-	class Memory : public detail::Wrapper<cl_mem>
-	{
-	public:
-
-		/*! \brief Destructor.
-		*
-		*  This calls clReleaseMemObject() on the value held by this instance.
-		*/
-		~Memory() {}
-
-		//! \brief Default constructor - initializes to NULL.
-		Memory() : detail::Wrapper<cl_type>() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  This calls clRetainMemObject() on the parameter's cl_mem.
-		*/
-		Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the cl_mem
-		*  into the new Memory object.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { }
-
-		/*! \brief Assignment operator from Memory.
-		*
-		*  This calls clRetainMemObject() on the parameter and clReleaseMemObject()
-		*  on the previous value held by this instance.
-		*/
-		Memory& operator = (const Memory& rhs)
-		{
-			if (this != &rhs) {
-				detail::Wrapper<cl_type>::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment operator from cl_mem - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the rhs and calls
-		*  clReleaseMemObject() on the value previously held by this instance.
-		*/
-		Memory& operator = (const cl_mem& rhs)
-		{
-			detail::Wrapper<cl_type>::operator=(rhs);
-			return *this;
-		}
-
-		//! \brief Wrapper for clGetMemObjectInfo().
-		template <typename T>
-		cl_int getInfo(cl_mem_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
-				__GET_MEM_OBJECT_INFO_ERR);
-		}
-
-		//! \brief Wrapper for clGetMemObjectInfo() that returns by value.
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_mem_info, name>::param_type
-			getInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_mem_info, name>::param_type param;
-			cl_int result = getInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-#if defined(CL_VERSION_1_1)
-		/*! \brief Registers a callback function to be called when the memory object
-		*         is no longer needed.
-		*
-		*  Wraps clSetMemObjectDestructorCallback().
-		*
-		*  Repeated calls to this function, for a given cl_mem value, will append
-		*  to the list of functions called (in reverse order) when memory object's
-		*  resources are freed and the memory object is deleted.
-		*
-		*  \note
-		*  The registered callbacks are associated with the underlying cl_mem
-		*  value - not the Memory class instance.
-		*/
-		cl_int setDestructorCallback(
-			void (CL_CALLBACK * pfn_notify)(cl_mem, void *),
-			void * user_data = NULL)
-		{
-			return detail::errHandler(
-				::clSetMemObjectDestructorCallback(
-				object_,
-				pfn_notify,
-				user_data),
-				__SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);
-		}
-#endif
-
-	};
-
-	// Pre-declare copy functions
-	class Buffer;
-	template< typename IteratorType >
-	cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer);
-	template< typename IteratorType >
-	cl_int copy(const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator);
-
-	/*! \brief Class interface for Buffer Memory Objects.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*/
-	class Buffer : public Memory
-	{
-	public:
-
-		/*! \brief Constructs a Buffer in a specified context.
-		*
-		*  Wraps clCreateBuffer().
-		*
-		*  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
-		*                  specified.  Note alignment & exclusivity requirements.
-		*/
-		Buffer(
-			const Context& context,
-			cl_mem_flags flags,
-			::size_t size,
-			void* host_ptr = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-			object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
-
-			detail::errHandler(error, __CREATE_BUFFER_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		/*! \brief Constructs a Buffer in the default context.
-		*
-		*  Wraps clCreateBuffer().
-		*
-		*  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
-		*                  specified.  Note alignment & exclusivity requirements.
-		*
-		*  \see Context::getDefault()
-		*/
-		Buffer(
-			cl_mem_flags flags,
-			::size_t size,
-			void* host_ptr = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-			Context context = Context::getDefault(err);
-
-			object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
-
-			detail::errHandler(error, __CREATE_BUFFER_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		/*!
-		* \brief Construct a Buffer from a host container via iterators.
-		* If useHostPtr is specified iterators must be random access.
-		*/
-		template< typename IteratorType >
-		Buffer(
-			IteratorType startIterator,
-			IteratorType endIterator,
-			bool readOnly,
-			bool useHostPtr = false,
-			cl_int* err = NULL)
-		{
-			typedef typename std::iterator_traits<IteratorType>::value_type DataType;
-			cl_int error;
-
-			cl_mem_flags flags = 0;
-			if (readOnly) {
-				flags |= CL_MEM_READ_ONLY;
-			}
-			else {
-				flags |= CL_MEM_READ_WRITE;
-			}
-			if (useHostPtr) {
-				flags |= CL_MEM_USE_HOST_PTR;
-			}
-
-			::size_t size = sizeof(DataType)*(endIterator - startIterator);
-
-			Context context = Context::getDefault(err);
-
-			if (useHostPtr) {
-				object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
-			}
-			else {
-				object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
-			}
-
-			detail::errHandler(error, __CREATE_BUFFER_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-
-			if (!useHostPtr) {
-				error = cl::copy(startIterator, endIterator, *this);
-				detail::errHandler(error, __CREATE_BUFFER_ERR);
-				if (err != NULL) {
-					*err = error;
-				}
-			}
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		Buffer() : Memory() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Buffer(const Buffer& buffer) : Memory(buffer) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { }
-
-		/*! \brief Assignment from Buffer - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Buffer& operator = (const Buffer& rhs)
-		{
-			if (this != &rhs) {
-				Memory::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Buffer& operator = (const cl_mem& rhs)
-		{
-			Memory::operator=(rhs);
-			return *this;
-		}
-
-#if defined(CL_VERSION_1_1)
-		/*! \brief Creates a new buffer object from this.
-		*
-		*  Wraps clCreateSubBuffer().
-		*/
-		Buffer createSubBuffer(
-			cl_mem_flags flags,
-			cl_buffer_create_type buffer_create_type,
-			const void * buffer_create_info,
-			cl_int * err = NULL)
-		{
-			Buffer result;
-			cl_int error;
-			result.object_ = ::clCreateSubBuffer(
-				object_,
-				flags,
-				buffer_create_type,
-				buffer_create_info,
-				&error);
-
-			detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-
-			return result;
-		}
-#endif
-	};
-
-#if defined (USE_DX_INTEROP)
-	/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
-	*
-	*  This is provided to facilitate interoperability with Direct3D.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*/
-	class BufferD3D10 : public Buffer
-	{
-	public:
-		typedef CL_API_ENTRY cl_mem(CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
-			cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,
-			cl_int* errcode_ret);
-
-		/*! \brief Constructs a BufferD3D10, in a specified context, from a
-		*         given ID3D10Buffer.
-		*
-		*  Wraps clCreateFromD3D10BufferKHR().
-		*/
-		BufferD3D10(
-			const Context& context,
-			cl_mem_flags flags,
-			ID3D10Buffer* bufobj,
-			cl_int * err = NULL)
-		{
-			static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
-
-#if defined(CL_VERSION_1_2)
-			vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
-			cl_platform platform = -1;
-			for (int i = 0; i < props.size(); ++i) {
-				if (props[i] == CL_CONTEXT_PLATFORM) {
-					platform = props[i + 1];
-				}
-			}
-			__INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
-#endif
-#if defined(CL_VERSION_1_1)
-			__INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
-#endif
-
-			cl_int error;
-			object_ = pfn_clCreateFromD3D10BufferKHR(
-				context(),
-				flags,
-				bufobj,
-				&error);
-
-			detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		BufferD3D10() : Buffer() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
-
-		/*! \brief Assignment from BufferD3D10 - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		BufferD3D10& operator = (const BufferD3D10& rhs)
-		{
-			if (this != &rhs) {
-				Buffer::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		BufferD3D10& operator = (const cl_mem& rhs)
-		{
-			Buffer::operator=(rhs);
-			return *this;
-		}
-	};
-#endif
-
-	/*! \brief Class interface for GL Buffer Memory Objects.
-	*
-	*  This is provided to facilitate interoperability with OpenGL.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*/
-	class BufferGL : public Buffer
-	{
-	public:
-		/*! \brief Constructs a BufferGL in a specified context, from a given
-		*         GL buffer.
-		*
-		*  Wraps clCreateFromGLBuffer().
-		*/
-		BufferGL(
-			const Context& context,
-			cl_mem_flags flags,
-			GLuint bufobj,
-			cl_int * err = NULL)
-		{
-			cl_int error;
-			object_ = ::clCreateFromGLBuffer(
-				context(),
-				flags,
-				bufobj,
-				&error);
-
-			detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		BufferGL() : Buffer() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
-
-		/*! \brief Assignment from BufferGL - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		BufferGL& operator = (const BufferGL& rhs)
-		{
-			if (this != &rhs) {
-				Buffer::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		BufferGL& operator = (const cl_mem& rhs)
-		{
-			Buffer::operator=(rhs);
-			return *this;
-		}
-
-		//! \brief Wrapper for clGetGLObjectInfo().
-		cl_int getObjectInfo(
-			cl_gl_object_type *type,
-			GLuint * gl_object_name)
-		{
-			return detail::errHandler(
-				::clGetGLObjectInfo(object_, type, gl_object_name),
-				__GET_GL_OBJECT_INFO_ERR);
-		}
-	};
-
-	/*! \brief Class interface for GL Render Buffer Memory Objects.
-	*
-	*  This is provided to facilitate interoperability with OpenGL.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*/
-	class BufferRenderGL : public Buffer
-	{
-	public:
-		/*! \brief Constructs a BufferRenderGL in a specified context, from a given
-		*         GL Renderbuffer.
-		*
-		*  Wraps clCreateFromGLRenderbuffer().
-		*/
-		BufferRenderGL(
-			const Context& context,
-			cl_mem_flags flags,
-			GLuint bufobj,
-			cl_int * err = NULL)
-		{
-			cl_int error;
-			object_ = ::clCreateFromGLRenderbuffer(
-				context(),
-				flags,
-				bufobj,
-				&error);
-
-			detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		BufferRenderGL() : Buffer() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
-
-		/*! \brief Assignment from BufferGL - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		BufferRenderGL& operator = (const BufferRenderGL& rhs)
-		{
-			if (this != &rhs) {
-				Buffer::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		BufferRenderGL& operator = (const cl_mem& rhs)
-		{
-			Buffer::operator=(rhs);
-			return *this;
-		}
-
-		//! \brief Wrapper for clGetGLObjectInfo().
-		cl_int getObjectInfo(
-			cl_gl_object_type *type,
-			GLuint * gl_object_name)
-		{
-			return detail::errHandler(
-				::clGetGLObjectInfo(object_, type, gl_object_name),
-				__GET_GL_OBJECT_INFO_ERR);
-		}
-	};
-
-	/*! \brief C++ base class for Image Memory objects.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*/
-	class Image : public Memory
-	{
-	protected:
-		//! \brief Default constructor - initializes to NULL.
-		Image() : Memory() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image(const Image& image) : Memory(image) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { }
-
-		/*! \brief Assignment from Image - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image& operator = (const Image& rhs)
-		{
-			if (this != &rhs) {
-				Memory::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image& operator = (const cl_mem& rhs)
-		{
-			Memory::operator=(rhs);
-			return *this;
-		}
-
-	public:
-		//! \brief Wrapper for clGetImageInfo().
-		template <typename T>
-		cl_int getImageInfo(cl_image_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetImageInfo, object_, name, param),
-				__GET_IMAGE_INFO_ERR);
-		}
-
-		//! \brief Wrapper for clGetImageInfo() that returns by value.
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_image_info, name>::param_type
-			getImageInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_image_info, name>::param_type param;
-			cl_int result = getImageInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-	};
-
-#if defined(CL_VERSION_1_2)
-	/*! \brief Class interface for 1D Image Memory objects.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*/
-	class Image1D : public Image
-	{
-	public:
-		/*! \brief Constructs a 1D Image in a specified context.
-		*
-		*  Wraps clCreateImage().
-		*/
-		Image1D(
-			const Context& context,
-			cl_mem_flags flags,
-			ImageFormat format,
-			::size_t width,
-			void* host_ptr = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-			cl_image_desc desc;
-			desc.image_type = CL_MEM_OBJECT_IMAGE1D;
-			desc.image_width = width;
-			desc.image_row_pitch = 0;
-			desc.num_mip_levels = 0;
-			desc.num_samples = 0;
-			desc.buffer = 0;
-			object_ = ::clCreateImage(
-				context(),
-				flags,
-				&format,
-				&desc,
-				host_ptr,
-				&error);
-
-			detail::errHandler(error, __CREATE_IMAGE_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		Image1D() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image1D(const Image1D& image1D) : Image(image1D) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { }
-
-		/*! \brief Assignment from Image1D - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image1D& operator = (const Image1D& rhs)
-		{
-			if (this != &rhs) {
-				Image::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image1D& operator = (const cl_mem& rhs)
-		{
-			Image::operator=(rhs);
-			return *this;
-		}
-	};
-
-	/*! \class Image1DBuffer
-	* \brief Image interface for 1D buffer images.
-	*/
-	class Image1DBuffer : public Image
-	{
-	public:
-		Image1DBuffer(
-			const Context& context,
-			cl_mem_flags flags,
-			ImageFormat format,
-			::size_t width,
-			const Buffer &buffer,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-			cl_image_desc desc;
-			desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-			desc.image_width = width;
-			desc.image_row_pitch = 0;
-			desc.num_mip_levels = 0;
-			desc.num_samples = 0;
-			desc.buffer = buffer();
-			object_ = ::clCreateImage(
-				context(),
-				flags,
-				&format,
-				&desc,
-				NULL,
-				&error);
-
-			detail::errHandler(error, __CREATE_IMAGE_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		Image1DBuffer() { }
-
-		Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { }
-
-		__CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { }
-
-		Image1DBuffer& operator = (const Image1DBuffer& rhs)
-		{
-			if (this != &rhs) {
-				Image::operator=(rhs);
-			}
-			return *this;
-		}
-
-		Image1DBuffer& operator = (const cl_mem& rhs)
-		{
-			Image::operator=(rhs);
-			return *this;
-		}
-	};
-
-	/*! \class Image1DArray
-	* \brief Image interface for arrays of 1D images.
-	*/
-	class Image1DArray : public Image
-	{
-	public:
-		Image1DArray(
-			const Context& context,
-			cl_mem_flags flags,
-			ImageFormat format,
-			::size_t arraySize,
-			::size_t width,
-			::size_t rowPitch,
-			void* host_ptr = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-			cl_image_desc desc;
-			desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
-			desc.image_array_size = arraySize;
-			desc.image_width = width;
-			desc.image_row_pitch = rowPitch;
-			desc.num_mip_levels = 0;
-			desc.num_samples = 0;
-			desc.buffer = 0;
-			object_ = ::clCreateImage(
-				context(),
-				flags,
-				&format,
-				&desc,
-				host_ptr,
-				&error);
-
-			detail::errHandler(error, __CREATE_IMAGE_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		Image1DArray() { }
-
-		Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { }
-
-		__CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { }
-
-		Image1DArray& operator = (const Image1DArray& rhs)
-		{
-			if (this != &rhs) {
-				Image::operator=(rhs);
-			}
-			return *this;
-		}
-
-		Image1DArray& operator = (const cl_mem& rhs)
-		{
-			Image::operator=(rhs);
-			return *this;
-		}
-	};
-#endif // #if defined(CL_VERSION_1_2)
-
-
-	/*! \brief Class interface for 2D Image Memory objects.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*/
-	class Image2D : public Image
-	{
-	public:
-		/*! \brief Constructs a 1D Image in a specified context.
-		*
-		*  Wraps clCreateImage().
-		*/
-		Image2D(
-			const Context& context,
-			cl_mem_flags flags,
-			ImageFormat format,
-			::size_t width,
-			::size_t height,
-			::size_t row_pitch = 0,
-			void* host_ptr = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-			bool useCreateImage;
-
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-			// Run-time decision based on the actual platform
-			{
-				cl_uint version = detail::getContextPlatformVersion(context());
-				useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
-			}
-#elif defined(CL_VERSION_1_2)
-			useCreateImage = true;
-#else
-			useCreateImage = false;
-#endif
-
-#if defined(CL_VERSION_1_2)
-			if (useCreateImage)
-			{
-				cl_image_desc desc;
-				desc.image_type = CL_MEM_OBJECT_IMAGE2D;
-				desc.image_width = width;
-				desc.image_height = height;
-				desc.image_row_pitch = row_pitch;
-				desc.num_mip_levels = 0;
-				desc.num_samples = 0;
-				desc.buffer = 0;
-				object_ = ::clCreateImage(
-					context(),
-					flags,
-					&format,
-					&desc,
-					host_ptr,
-					&error);
-
-				detail::errHandler(error, __CREATE_IMAGE_ERR);
-				if (err != NULL) {
-					*err = error;
-				}
-			}
-#endif // #if defined(CL_VERSION_1_2)
-#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-			if (!useCreateImage)
-			{
-				object_ = ::clCreateImage2D(
-					context(), flags, &format, width, height, row_pitch, host_ptr, &error);
-
-				detail::errHandler(error, __CREATE_IMAGE2D_ERR);
-				if (err != NULL) {
-					*err = error;
-				}
-			}
-#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		Image2D() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image2D(const Image2D& image2D) : Image(image2D) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { }
-
-		/*! \brief Assignment from Image2D - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image2D& operator = (const Image2D& rhs)
-		{
-			if (this != &rhs) {
-				Image::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image2D& operator = (const cl_mem& rhs)
-		{
-			Image::operator=(rhs);
-			return *this;
-		}
-	};
-
-
-#if !defined(CL_VERSION_1_2)
-	/*! \brief Class interface for GL 2D Image Memory objects.
-	*
-	*  This is provided to facilitate interoperability with OpenGL.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*  \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
-	*/
-	class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D
-	{
-	public:
-		/*! \brief Constructs an Image2DGL in a specified context, from a given
-		*         GL Texture.
-		*
-		*  Wraps clCreateFromGLTexture2D().
-		*/
-		Image2DGL(
-			const Context& context,
-			cl_mem_flags flags,
-			GLenum target,
-			GLint  miplevel,
-			GLuint texobj,
-			cl_int * err = NULL)
-		{
-			cl_int error;
-			object_ = ::clCreateFromGLTexture2D(
-				context(),
-				flags,
-				target,
-				miplevel,
-				texobj,
-				&error);
-
-			detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		Image2DGL() : Image2D() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image2DGL(const Image2DGL& image) : Image2D(image) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }
-
-		/*! \brief Assignment from Image2DGL - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image2DGL& operator = (const Image2DGL& rhs)
-		{
-			if (this != &rhs) {
-				Image2D::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image2DGL& operator = (const cl_mem& rhs)
-		{
-			Image2D::operator=(rhs);
-			return *this;
-		}
-	};
-#endif // #if !defined(CL_VERSION_1_2)
-
-#if defined(CL_VERSION_1_2)
-	/*! \class Image2DArray
-	* \brief Image interface for arrays of 2D images.
-	*/
-	class Image2DArray : public Image
-	{
-	public:
-		Image2DArray(
-			const Context& context,
-			cl_mem_flags flags,
-			ImageFormat format,
-			::size_t arraySize,
-			::size_t width,
-			::size_t height,
-			::size_t rowPitch,
-			::size_t slicePitch,
-			void* host_ptr = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-			cl_image_desc desc;
-			desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
-			desc.image_array_size = arraySize;
-			desc.image_width = width;
-			desc.image_height = height;
-			desc.image_row_pitch = rowPitch;
-			desc.image_slice_pitch = slicePitch;
-			desc.num_mip_levels = 0;
-			desc.num_samples = 0;
-			desc.buffer = 0;
-			object_ = ::clCreateImage(
-				context(),
-				flags,
-				&format,
-				&desc,
-				host_ptr,
-				&error);
-
-			detail::errHandler(error, __CREATE_IMAGE_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		Image2DArray() { }
-
-		Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { }
-
-		__CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { }
-
-		Image2DArray& operator = (const Image2DArray& rhs)
-		{
-			if (this != &rhs) {
-				Image::operator=(rhs);
-			}
-			return *this;
-		}
-
-		Image2DArray& operator = (const cl_mem& rhs)
-		{
-			Image::operator=(rhs);
-			return *this;
-		}
-	};
-#endif // #if defined(CL_VERSION_1_2)
-
-	/*! \brief Class interface for 3D Image Memory objects.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*/
-	class Image3D : public Image
-	{
-	public:
-		/*! \brief Constructs a 3D Image in a specified context.
-		*
-		*  Wraps clCreateImage().
-		*/
-		Image3D(
-			const Context& context,
-			cl_mem_flags flags,
-			ImageFormat format,
-			::size_t width,
-			::size_t height,
-			::size_t depth,
-			::size_t row_pitch = 0,
-			::size_t slice_pitch = 0,
-			void* host_ptr = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-			bool useCreateImage;
-
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-			// Run-time decision based on the actual platform
-			{
-				cl_uint version = detail::getContextPlatformVersion(context());
-				useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
-			}
-#elif defined(CL_VERSION_1_2)
-			useCreateImage = true;
-#else
-			useCreateImage = false;
-#endif
-
-#if defined(CL_VERSION_1_2)
-			if (useCreateImage)
-			{
-				cl_image_desc desc;
-				desc.image_type = CL_MEM_OBJECT_IMAGE3D;
-				desc.image_width = width;
-				desc.image_height = height;
-				desc.image_depth = depth;
-				desc.image_row_pitch = row_pitch;
-				desc.image_slice_pitch = slice_pitch;
-				desc.num_mip_levels = 0;
-				desc.num_samples = 0;
-				desc.buffer = 0;
-				object_ = ::clCreateImage(
-					context(),
-					flags,
-					&format,
-					&desc,
-					host_ptr,
-					&error);
-
-				detail::errHandler(error, __CREATE_IMAGE_ERR);
-				if (err != NULL) {
-					*err = error;
-				}
-			}
-#endif  // #if defined(CL_VERSION_1_2)
-#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-			if (!useCreateImage)
-			{
-				object_ = ::clCreateImage3D(
-					context(), flags, &format, width, height, depth, row_pitch,
-					slice_pitch, host_ptr, &error);
-
-				detail::errHandler(error, __CREATE_IMAGE3D_ERR);
-				if (err != NULL) {
-					*err = error;
-				}
-			}
-#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		Image3D() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image3D(const Image3D& image3D) : Image(image3D) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { }
-
-		/*! \brief Assignment from Image3D - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image3D& operator = (const Image3D& rhs)
-		{
-			if (this != &rhs) {
-				Image::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image3D& operator = (const cl_mem& rhs)
-		{
-			Image::operator=(rhs);
-			return *this;
-		}
-	};
-
-#if !defined(CL_VERSION_1_2)
-	/*! \brief Class interface for GL 3D Image Memory objects.
-	*
-	*  This is provided to facilitate interoperability with OpenGL.
-	*
-	*  See Memory for details about copy semantics, etc.
-	*
-	*  \see Memory
-	*/
-	class Image3DGL : public Image3D
-	{
-	public:
-		/*! \brief Constructs an Image3DGL in a specified context, from a given
-		*         GL Texture.
-		*
-		*  Wraps clCreateFromGLTexture3D().
-		*/
-		Image3DGL(
-			const Context& context,
-			cl_mem_flags flags,
-			GLenum target,
-			GLint  miplevel,
-			GLuint texobj,
-			cl_int * err = NULL)
-		{
-			cl_int error;
-			object_ = ::clCreateFromGLTexture3D(
-				context(),
-				flags,
-				target,
-				miplevel,
-				texobj,
-				&error);
-
-			detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		//! \brief Default constructor - initializes to NULL.
-		Image3DGL() : Image3D() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image3DGL(const Image3DGL& image) : Image3D(image) { }
-
-		/*! \brief Constructor from cl_mem - takes ownership.
-		*
-		*  See Memory for further details.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { }
-
-		/*! \brief Assignment from Image3DGL - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image3DGL& operator = (const Image3DGL& rhs)
-		{
-			if (this != &rhs) {
-				Image3D::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment from cl_mem - performs shallow copy.
-		*
-		*  See Memory for further details.
-		*/
-		Image3DGL& operator = (const cl_mem& rhs)
-		{
-			Image3D::operator=(rhs);
-			return *this;
-		}
-	};
-#endif // #if !defined(CL_VERSION_1_2)
-
-#if defined(CL_VERSION_1_2)
-	/*! \class ImageGL
-	* \brief general image interface for GL interop.
-	* We abstract the 2D and 3D GL images into a single instance here
-	* that wraps all GL sourced images on the grounds that setup information
-	* was performed by OpenCL anyway.
-	*/
-	class ImageGL : public Image
-	{
-	public:
-		ImageGL(
-			const Context& context,
-			cl_mem_flags flags,
-			GLenum target,
-			GLint  miplevel,
-			GLuint texobj,
-			cl_int * err = NULL)
-		{
-			cl_int error;
-			object_ = ::clCreateFromGLTexture(
-				context(),
-				flags,
-				target,
-				miplevel,
-				texobj,
-				&error);
-
-			detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		ImageGL() : Image() { }
-
-		ImageGL(const ImageGL& image) : Image(image) { }
-
-		__CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { }
-
-		ImageGL& operator = (const ImageGL& rhs)
-		{
-			if (this != &rhs) {
-				Image::operator=(rhs);
-			}
-			return *this;
-		}
-
-		ImageGL& operator = (const cl_mem& rhs)
-		{
-			Image::operator=(rhs);
-			return *this;
-		}
-	};
-#endif // #if defined(CL_VERSION_1_2)
-
-	/*! \brief Class interface for cl_sampler.
-	*
-	*  \note Copies of these objects are shallow, meaning that the copy will refer
-	*        to the same underlying cl_sampler as the original.  For details, see
-	*        clRetainSampler() and clReleaseSampler().
-	*
-	*  \see cl_sampler
-	*/
-	class Sampler : public detail::Wrapper<cl_sampler>
-	{
-	public:
-		/*! \brief Destructor.
-		*
-		*  This calls clReleaseSampler() on the value held by this instance.
-		*/
-		~Sampler() { }
-
-		//! \brief Default constructor - initializes to NULL.
-		Sampler() { }
-
-		/*! \brief Constructs a Sampler in a specified context.
-		*
-		*  Wraps clCreateSampler().
-		*/
-		Sampler(
-			const Context& context,
-			cl_bool normalized_coords,
-			cl_addressing_mode addressing_mode,
-			cl_filter_mode filter_mode,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-			object_ = ::clCreateSampler(
-				context(),
-				normalized_coords,
-				addressing_mode,
-				filter_mode,
-				&error);
-
-			detail::errHandler(error, __CREATE_SAMPLER_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  This calls clRetainSampler() on the parameter's cl_sampler.
-		*/
-		Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
-
-		/*! \brief Constructor from cl_sampler - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the cl_sampler
-		*  into the new Sampler object.
-		*/
-		Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
-
-		/*! \brief Assignment operator from Sampler.
-		*
-		*  This calls clRetainSampler() on the parameter and clReleaseSampler()
-		*  on the previous value held by this instance.
-		*/
-		Sampler& operator = (const Sampler& rhs)
-		{
-			if (this != &rhs) {
-				detail::Wrapper<cl_type>::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment operator from cl_sampler - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the rhs and calls
-		*  clReleaseSampler() on the value previously held by this instance.
-		*/
-		Sampler& operator = (const cl_sampler& rhs)
-		{
-			detail::Wrapper<cl_type>::operator=(rhs);
-			return *this;
-		}
-
-		//! \brief Wrapper for clGetSamplerInfo().
-		template <typename T>
-		cl_int getInfo(cl_sampler_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetSamplerInfo, object_, name, param),
-				__GET_SAMPLER_INFO_ERR);
-		}
-
-		//! \brief Wrapper for clGetSamplerInfo() that returns by value.
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_sampler_info, name>::param_type
-			getInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_sampler_info, name>::param_type param;
-			cl_int result = getInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-	};
-
-	class Program;
-	class CommandQueue;
-	class Kernel;
-
-	//! \brief Class interface for specifying NDRange values.
-	class NDRange
-	{
-	private:
-		size_t<3> sizes_;
-		cl_uint dimensions_;
-
-	public:
-		//! \brief Default constructor - resulting range has zero dimensions.
-		NDRange()
-			: dimensions_(0)
-		{ }
-
-		//! \brief Constructs one-dimensional range.
-		NDRange(::size_t size0)
-			: dimensions_(1)
-		{
-			sizes_[0] = size0;
-		}
-
-		//! \brief Constructs two-dimensional range.
-		NDRange(::size_t size0, ::size_t size1)
-			: dimensions_(2)
-		{
-			sizes_[0] = size0;
-			sizes_[1] = size1;
-		}
-
-		//! \brief Constructs three-dimensional range.
-		NDRange(::size_t size0, ::size_t size1, ::size_t size2)
-			: dimensions_(3)
-		{
-			sizes_[0] = size0;
-			sizes_[1] = size1;
-			sizes_[2] = size2;
-		}
-
-		/*! \brief Conversion operator to const ::size_t *.
-		*
-		*  \returns a pointer to the size of the first dimension.
-		*/
-		operator const ::size_t*() const {
-			return (const ::size_t*) sizes_;
-		}
-
-		//! \brief Queries the number of dimensions in the range.
-		::size_t dimensions() const { return dimensions_; }
-	};
-
-	//! \brief A zero-dimensional range.
-	static const NDRange NullRange;
-
-	//! \brief Local address wrapper for use with Kernel::setArg
-	struct LocalSpaceArg
-	{
-		::size_t size_;
-	};
-
-	namespace detail {
-
-		template <typename T>
-		struct KernelArgumentHandler
-		{
-			static ::size_t size(const T&) { return sizeof(T); }
-			static T* ptr(T& value) { return &value; }
-		};
-
-		template <>
-		struct KernelArgumentHandler<LocalSpaceArg>
-		{
-			static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
-			static void* ptr(LocalSpaceArg&) { return NULL; }
-		};
-
-	}
-	//! \endcond
-
-	/*! __local
-	* \brief Helper function for generating LocalSpaceArg objects.
-	* Deprecated. Replaced with Local.
-	*/
-	inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg
-		__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-	inline LocalSpaceArg
-		__local(::size_t size)
-	{
-		LocalSpaceArg ret = { size };
-		return ret;
-	}
-
-	/*! Local
-	* \brief Helper function for generating LocalSpaceArg objects.
-	*/
-	inline LocalSpaceArg
-		Local(::size_t size)
-	{
-		LocalSpaceArg ret = { size };
-		return ret;
-	}
-
-	//class KernelFunctor;
-
-	/*! \brief Class interface for cl_kernel.
-	*
-	*  \note Copies of these objects are shallow, meaning that the copy will refer
-	*        to the same underlying cl_kernel as the original.  For details, see
-	*        clRetainKernel() and clReleaseKernel().
-	*
-	*  \see cl_kernel
-	*/
-	class Kernel : public detail::Wrapper<cl_kernel>
-	{
-	public:
-		inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
-
-		/*! \brief Destructor.
-		*
-		*  This calls clReleaseKernel() on the value held by this instance.
-		*/
-		~Kernel() { }
-
-		//! \brief Default constructor - initializes to NULL.
-		Kernel() { }
-
-		/*! \brief Copy constructor - performs shallow copy.
-		*
-		*  This calls clRetainKernel() on the parameter's cl_kernel.
-		*/
-		Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
-
-		/*! \brief Constructor from cl_kernel - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the cl_kernel
-		*  into the new Kernel object.
-		*/
-		__CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
-
-		/*! \brief Assignment operator from Kernel.
-		*
-		*  This calls clRetainKernel() on the parameter and clReleaseKernel()
-		*  on the previous value held by this instance.
-		*/
-		Kernel& operator = (const Kernel& rhs)
-		{
-			if (this != &rhs) {
-				detail::Wrapper<cl_type>::operator=(rhs);
-			}
-			return *this;
-		}
-
-		/*! \brief Assignment operator from cl_kernel - takes ownership.
-		*
-		*  This effectively transfers ownership of a refcount on the rhs and calls
-		*  clReleaseKernel() on the value previously held by this instance.
-		*/
-		Kernel& operator = (const cl_kernel& rhs)
-		{
-			detail::Wrapper<cl_type>::operator=(rhs);
-			return *this;
-		}
-
-		template <typename T>
-		cl_int getInfo(cl_kernel_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetKernelInfo, object_, name, param),
-				__GET_KERNEL_INFO_ERR);
-		}
-
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_kernel_info, name>::param_type
-			getInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_kernel_info, name>::param_type param;
-			cl_int result = getInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-#if defined(CL_VERSION_1_2)
-		template <typename T>
-		cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
-				__GET_KERNEL_ARG_INFO_ERR);
-		}
-
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
-			getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_kernel_arg_info, name>::param_type param;
-			cl_int result = getArgInfo(argIndex, name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-#endif // #if defined(CL_VERSION_1_2)
-
-		template <typename T>
-		cl_int getWorkGroupInfo(
-			const Device& device, cl_kernel_work_group_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(
-				&::clGetKernelWorkGroupInfo, object_, device(), name, param),
-				__GET_KERNEL_WORK_GROUP_INFO_ERR);
-		}
-
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
-			getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_kernel_work_group_info, name>::param_type param;
-			cl_int result = getWorkGroupInfo(device, name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-		template <typename T>
-		cl_int setArg(cl_uint index, T value)
-		{
-			return detail::errHandler(
-				::clSetKernelArg(
-				object_,
-				index,
-				detail::KernelArgumentHandler<T>::size(value),
-				detail::KernelArgumentHandler<T>::ptr(value)),
-				__SET_KERNEL_ARGS_ERR);
-		}
-
-		cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
-		{
-			return detail::errHandler(
-				::clSetKernelArg(object_, index, size, argPtr),
-				__SET_KERNEL_ARGS_ERR);
-		}
-	};
-
-	/*! \class Program
-	* \brief Program interface that implements cl_program.
-	*/
-	class Program : public detail::Wrapper<cl_program>
-	{
-	public:
-		typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
-		typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
-
-		Program(
-			const STRING_CLASS& source,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-			const char * strings = source.c_str();
-			const ::size_t length = source.size();
-
-			Context context = Context::getDefault(err);
-
-			object_ = ::clCreateProgramWithSource(
-				context(), (cl_uint)1, &strings, &length, &error);
-
-			detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-
-			if (error == CL_SUCCESS) {
-
-				error = ::clBuildProgram(
-					object_,
-					0,
-					NULL,
-					"",
-					NULL,
-					NULL);
-
-				detail::errHandler(error, __BUILD_PROGRAM_ERR);
-			}
-
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		Program(
-			const STRING_CLASS& source,
-			bool build,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-			const char * strings = source.c_str();
-			const ::size_t length = source.size();
-
-			Context context = Context::getDefault(err);
-
-			object_ = ::clCreateProgramWithSource(
-				context(), (cl_uint)1, &strings, &length, &error);
-
-			detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-
-			if (error == CL_SUCCESS && build) {
-
-				error = ::clBuildProgram(
-					object_,
-					0,
-					NULL,
-					"",
-					NULL,
-					NULL);
-
-				detail::errHandler(error, __BUILD_PROGRAM_ERR);
-			}
-
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		Program(
-			const Context& context,
-			const STRING_CLASS& source,
-			bool build = false,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-			const char * strings = source.c_str();
-			const ::size_t length = source.size();
-
-			object_ = ::clCreateProgramWithSource(
-				context(), (cl_uint)1, &strings, &length, &error);
-
-			detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-
-			if (error == CL_SUCCESS && build) {
-
-				error = ::clBuildProgram(
-					object_,
-					0,
-					NULL,
-					"",
-					NULL,
-					NULL);
-
-				detail::errHandler(error, __BUILD_PROGRAM_ERR);
-			}
-
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		Program(
-			const Context& context,
-			const Sources& sources,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-			const ::size_t n = (::size_t)sources.size();
-			::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
-			const char** strings = (const char**)alloca(n * sizeof(const char*));
-
-			for (::size_t i = 0; i < n; ++i) {
-				strings[i] = sources[(int)i].first;
-				lengths[i] = sources[(int)i].second;
-			}
-
-			object_ = ::clCreateProgramWithSource(
-				context(), (cl_uint)n, strings, lengths, &error);
-
-			detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		/**
-		* Construct a program object from a list of devices and a per-device list of binaries.
-		* \param context A valid OpenCL context in which to construct the program.
-		* \param devices A vector of OpenCL device objects for which the program will be created.
-		* \param binaries A vector of pairs of a pointer to a binary object and its length.
-		* \param binaryStatus An optional vector that on completion will be resized to
-		*   match the size of binaries and filled with values to specify if each binary
-		*   was successfully loaded.
-		*   Set to CL_SUCCESS if the binary was successfully loaded.
-		*   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
-		*   Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
-		* \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
-		*   CL_INVALID_CONTEXT if context is not a valid context.
-		*   CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices;
-		*     or if any entry in binaries is NULL or has length 0.
-		*   CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
-		*   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
-		*   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
-		*/
-		Program(
-			const Context& context,
-			const VECTOR_CLASS<Device>& devices,
-			const Binaries& binaries,
-			VECTOR_CLASS<cl_int>* binaryStatus = NULL,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-			const ::size_t numDevices = devices.size();
-
-			// Catch size mismatch early and return
-			if (binaries.size() != numDevices) {
-				error = CL_INVALID_VALUE;
-				detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
-				if (err != NULL) {
-					*err = error;
-				}
-				return;
-			}
-
-			::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
-			const unsigned char** images = (const unsigned char**)alloca(numDevices * sizeof(const unsigned char**));
-
-			for (::size_t i = 0; i < numDevices; ++i) {
-				images[i] = (const unsigned char*)binaries[i].first;
-				lengths[i] = binaries[(int)i].second;
-			}
-
-			cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id));
-			for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
-				deviceIDs[deviceIndex] = (devices[deviceIndex])();
-			}
-
-			if (binaryStatus) {
-				binaryStatus->resize(numDevices);
-			}
-
-			object_ = ::clCreateProgramWithBinary(
-				context(), (cl_uint)devices.size(),
-				deviceIDs,
-				lengths, images, binaryStatus != NULL
-				? &binaryStatus->front()
-				: NULL, &error);
-
-			detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-
-#if defined(CL_VERSION_1_2)
-		/**
-		* Create program using builtin kernels.
-		* \param kernelNames Semi-colon separated list of builtin kernel names
-		*/
-		Program(
-			const Context& context,
-			const VECTOR_CLASS<Device>& devices,
-			const STRING_CLASS& kernelNames,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-
-			::size_t numDevices = devices.size();
-			cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id));
-			for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
-				deviceIDs[deviceIndex] = (devices[deviceIndex])();
-			}
-
-			object_ = ::clCreateProgramWithBuiltInKernels(
-				context(),
-				(cl_uint)devices.size(),
-				deviceIDs,
-				kernelNames.c_str(),
-				&error);
-
-			detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-#endif // #if defined(CL_VERSION_1_2)
-
-		Program() { }
-
-		Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
-
-		__CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
-
-		Program& operator = (const Program& rhs)
-		{
-			if (this != &rhs) {
-				detail::Wrapper<cl_type>::operator=(rhs);
-			}
-			return *this;
-		}
-
-		Program& operator = (const cl_program& rhs)
-		{
-			detail::Wrapper<cl_type>::operator=(rhs);
-			return *this;
-		}
-
-		cl_int build(
-			const VECTOR_CLASS<Device>& devices,
-			const char* options = NULL,
-			void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-			void* data = NULL) const
-		{
-			::size_t numDevices = devices.size();
-			cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id));
-			for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
-				deviceIDs[deviceIndex] = (devices[deviceIndex])();
-			}
-
-			return detail::errHandler(
-				::clBuildProgram(
-				object_,
-				(cl_uint)
-				devices.size(),
-				deviceIDs,
-				options,
-				notifyFptr,
-				data),
-				__BUILD_PROGRAM_ERR);
-		}
-
-		cl_int build(
-			const char* options = NULL,
-			void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-			void* data = NULL) const
-		{
-			return detail::errHandler(
-				::clBuildProgram(
-				object_,
-				0,
-				NULL,
-				options,
-				notifyFptr,
-				data),
-				__BUILD_PROGRAM_ERR);
-		}
-
-#if defined(CL_VERSION_1_2)
-		cl_int compile(
-			const char* options = NULL,
-			void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-			void* data = NULL) const
-		{
-			return detail::errHandler(
-				::clCompileProgram(
-				object_,
-				0,
-				NULL,
-				options,
-				0,
-				NULL,
-				NULL,
-				notifyFptr,
-				data),
-				__COMPILE_PROGRAM_ERR);
-		}
-#endif
-
-		template <typename T>
-		cl_int getInfo(cl_program_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(&::clGetProgramInfo, object_, name, param),
-				__GET_PROGRAM_INFO_ERR);
-		}
-
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_program_info, name>::param_type
-			getInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_program_info, name>::param_type param;
-			cl_int result = getInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-		template <typename T>
-		cl_int getBuildInfo(
-			const Device& device, cl_program_build_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(
-				&::clGetProgramBuildInfo, object_, device(), name, param),
-				__GET_PROGRAM_BUILD_INFO_ERR);
-		}
-
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_program_build_info, name>::param_type
-			getBuildInfo(const Device& device, cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_program_build_info, name>::param_type param;
-			cl_int result = getBuildInfo(device, name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-		cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
-		{
-			cl_uint numKernels;
-			cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
-			}
-
-			Kernel* value = (Kernel*)alloca(numKernels * sizeof(Kernel));
-			err = ::clCreateKernelsInProgram(
-				object_, numKernels, (cl_kernel*)value, NULL);
-			if (err != CL_SUCCESS) {
-				return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
-			}
-
-			kernels->assign(&value[0], &value[numKernels]);
-			return CL_SUCCESS;
-		}
-	};
-
-#if defined(CL_VERSION_1_2)
-	inline Program linkProgram(
-		Program input1,
-		Program input2,
-		const char* options = NULL,
-		void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-		void* data = NULL,
-		cl_int* err = NULL)
-	{
-		cl_int err_local = CL_SUCCESS;
-
-		cl_program programs[2] = { input1(), input2() };
-
-		Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();
-
-		cl_program prog = ::clLinkProgram(
-			ctx(),
-			0,
-			NULL,
-			options,
-			2,
-			programs,
-			notifyFptr,
-			data,
-			&err_local);
-
-		detail::errHandler(err_local, __COMPILE_PROGRAM_ERR);
-		if (err != NULL) {
-			*err = err_local;
-		}
-
-		return Program(prog);
-	}
-
-	inline Program linkProgram(
-		VECTOR_CLASS<Program> inputPrograms,
-		const char* options = NULL,
-		void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-		void* data = NULL,
-		cl_int* err = NULL)
-	{
-		cl_int err_local = CL_SUCCESS;
-
-		cl_program * programs = (cl_program*)alloca(inputPrograms.size() * sizeof(cl_program));
-
-		if (programs != NULL) {
-			for (unsigned int i = 0; i < inputPrograms.size(); i++) {
-				programs[i] = inputPrograms[i]();
-			}
-		}
-
-		cl_program prog = ::clLinkProgram(
-			Context::getDefault()(),
-			0,
-			NULL,
-			options,
-			(cl_uint)inputPrograms.size(),
-			programs,
-			notifyFptr,
-			data,
-			&err_local);
-
-		detail::errHandler(err_local, __COMPILE_PROGRAM_ERR);
-		if (err != NULL) {
-			*err = err_local;
-		}
-
-		return Program(prog);
-	}
-#endif
-
-	template<>
-	inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
-	{
-		VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
-		VECTOR_CLASS<char *> binaries;
-		for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s)
-		{
-			char *ptr = NULL;
-			if (*s != 0)
-				ptr = new char[*s];
-			binaries.push_back(ptr);
-		}
-
-		cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
-		if (err != NULL) {
-			*err = result;
-		}
-		return binaries;
-	}
-
-	inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
-	{
-		cl_int error;
-
-		object_ = ::clCreateKernel(program(), name, &error);
-		detail::errHandler(error, __CREATE_KERNEL_ERR);
-
-		if (err != NULL) {
-			*err = error;
-		}
-
-	}
-
-	/*! \class CommandQueue
-	* \brief CommandQueue interface for cl_command_queue.
-	*/
-	class CommandQueue : public detail::Wrapper<cl_command_queue>
-	{
-	private:
-		static volatile int default_initialized_;
-		static CommandQueue default_;
-		static volatile cl_int default_error_;
-	public:
-		CommandQueue(
-			cl_command_queue_properties properties,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-
-			Context context = Context::getDefault(&error);
-			detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-
-			if (error != CL_SUCCESS) {
-				if (err != NULL) {
-					*err = error;
-				}
-			}
-			else {
-				Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
-
-				object_ = ::clCreateCommandQueue(
-					context(), device(), properties, &error);
-
-				detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-				if (err != NULL) {
-					*err = error;
-				}
-			}
-		}
-
-		CommandQueue(
-			const Context& context,
-			const Device& device,
-			cl_command_queue_properties properties = 0,
-			cl_int* err = NULL)
-		{
-			cl_int error;
-			object_ = ::clCreateCommandQueue(
-				context(), device(), properties, &error);
-
-			detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-		}
-
-		static CommandQueue getDefault(cl_int * err = NULL)
-		{
-			int state = detail::compare_exchange(
-				&default_initialized_,
-				__DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
-
-			if (state & __DEFAULT_INITIALIZED) {
-				if (err != NULL) {
-					*err = default_error_;
-				}
-				return default_;
-			}
-
-			if (state & __DEFAULT_BEING_INITIALIZED) {
-				// Assume writes will propagate eventually...
-				while (default_initialized_ != __DEFAULT_INITIALIZED) {
-					detail::fence();
-				}
-
-				if (err != NULL) {
-					*err = default_error_;
-				}
-				return default_;
-			}
-
-			cl_int error;
-
-			Context context = Context::getDefault(&error);
-			detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-
-			if (error != CL_SUCCESS) {
-				if (err != NULL) {
-					*err = error;
-				}
-			}
-			else {
-				Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
-
-				default_ = CommandQueue(context, device, 0, &error);
-
-				detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-				if (err != NULL) {
-					*err = error;
-				}
-			}
-
-			detail::fence();
-
-			default_error_ = error;
-			// Assume writes will propagate eventually...
-			default_initialized_ = __DEFAULT_INITIALIZED;
-
-			detail::fence();
-
-			if (err != NULL) {
-				*err = default_error_;
-			}
-			return default_;
-
-		}
-
-		CommandQueue() { }
-
-		CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
-
-		CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
-
-		CommandQueue& operator = (const CommandQueue& rhs)
-		{
-			if (this != &rhs) {
-				detail::Wrapper<cl_type>::operator=(rhs);
-			}
-			return *this;
-		}
-
-		CommandQueue& operator = (const cl_command_queue& rhs)
-		{
-			detail::Wrapper<cl_type>::operator=(rhs);
-			return *this;
-		}
-
-		template <typename T>
-		cl_int getInfo(cl_command_queue_info name, T* param) const
-		{
-			return detail::errHandler(
-				detail::getInfo(
-				&::clGetCommandQueueInfo, object_, name, param),
-				__GET_COMMAND_QUEUE_INFO_ERR);
-		}
-
-		template <cl_int name> typename
-			detail::param_traits<detail::cl_command_queue_info, name>::param_type
-			getInfo(cl_int* err = NULL) const
-		{
-			typename detail::param_traits<
-				detail::cl_command_queue_info, name>::param_type param;
-			cl_int result = getInfo(name, &param);
-			if (err != NULL) {
-				*err = result;
-			}
-			return param;
-		}
-
-		cl_int enqueueReadBuffer(
-			const Buffer& buffer,
-			cl_bool blocking,
-			::size_t offset,
-			::size_t size,
-			void* ptr,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueReadBuffer(
-				object_, buffer(), blocking, offset, size,
-				ptr,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_READ_BUFFER_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueWriteBuffer(
-			const Buffer& buffer,
-			cl_bool blocking,
-			::size_t offset,
-			::size_t size,
-			const void* ptr,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueWriteBuffer(
-				object_, buffer(), blocking, offset, size,
-				ptr,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_WRITE_BUFFER_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueCopyBuffer(
-			const Buffer& src,
-			const Buffer& dst,
-			::size_t src_offset,
-			::size_t dst_offset,
-			::size_t size,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueCopyBuffer(
-				object_, src(), dst(), src_offset, dst_offset, size,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQEUE_COPY_BUFFER_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueReadBufferRect(
-			const Buffer& buffer,
-			cl_bool blocking,
-			const size_t<3>& buffer_offset,
-			const size_t<3>& host_offset,
-			const size_t<3>& region,
-			::size_t buffer_row_pitch,
-			::size_t buffer_slice_pitch,
-			::size_t host_row_pitch,
-			::size_t host_slice_pitch,
-			void *ptr,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueReadBufferRect(
-				object_,
-				buffer(),
-				blocking,
-				(const ::size_t *)buffer_offset,
-				(const ::size_t *)host_offset,
-				(const ::size_t *)region,
-				buffer_row_pitch,
-				buffer_slice_pitch,
-				host_row_pitch,
-				host_slice_pitch,
-				ptr,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_READ_BUFFER_RECT_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueWriteBufferRect(
-			const Buffer& buffer,
-			cl_bool blocking,
-			const size_t<3>& buffer_offset,
-			const size_t<3>& host_offset,
-			const size_t<3>& region,
-			::size_t buffer_row_pitch,
-			::size_t buffer_slice_pitch,
-			::size_t host_row_pitch,
-			::size_t host_slice_pitch,
-			void *ptr,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueWriteBufferRect(
-				object_,
-				buffer(),
-				blocking,
-				(const ::size_t *)buffer_offset,
-				(const ::size_t *)host_offset,
-				(const ::size_t *)region,
-				buffer_row_pitch,
-				buffer_slice_pitch,
-				host_row_pitch,
-				host_slice_pitch,
-				ptr,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_WRITE_BUFFER_RECT_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueCopyBufferRect(
-			const Buffer& src,
-			const Buffer& dst,
-			const size_t<3>& src_origin,
-			const size_t<3>& dst_origin,
-			const size_t<3>& region,
-			::size_t src_row_pitch,
-			::size_t src_slice_pitch,
-			::size_t dst_row_pitch,
-			::size_t dst_slice_pitch,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueCopyBufferRect(
-				object_,
-				src(),
-				dst(),
-				(const ::size_t *)src_origin,
-				(const ::size_t *)dst_origin,
-				(const ::size_t *)region,
-				src_row_pitch,
-				src_slice_pitch,
-				dst_row_pitch,
-				dst_slice_pitch,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQEUE_COPY_BUFFER_RECT_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-#if defined(CL_VERSION_1_2)
-		/**
-		* Enqueue a command to fill a buffer object with a pattern
-		* of a given size. The pattern is specified a as vector.
-		* \tparam PatternType The datatype of the pattern field.
-		*     The pattern type must be an accepted OpenCL data type.
-		*/
-		template<typename PatternType>
-		cl_int enqueueFillBuffer(
-			const Buffer& buffer,
-			PatternType pattern,
-			::size_t offset,
-			::size_t size,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueFillBuffer(
-				object_,
-				buffer(),
-				static_cast<void*>(&pattern),
-				sizeof(PatternType),
-				offset,
-				size,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_FILL_BUFFER_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-#endif // #if defined(CL_VERSION_1_2)
-
-		cl_int enqueueReadImage(
-			const Image& image,
-			cl_bool blocking,
-			const size_t<3>& origin,
-			const size_t<3>& region,
-			::size_t row_pitch,
-			::size_t slice_pitch,
-			void* ptr,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueReadImage(
-				object_, image(), blocking, (const ::size_t *) origin,
-				(const ::size_t *) region, row_pitch, slice_pitch, ptr,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_READ_IMAGE_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueWriteImage(
-			const Image& image,
-			cl_bool blocking,
-			const size_t<3>& origin,
-			const size_t<3>& region,
-			::size_t row_pitch,
-			::size_t slice_pitch,
-			void* ptr,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueWriteImage(
-				object_, image(), blocking, (const ::size_t *) origin,
-				(const ::size_t *) region, row_pitch, slice_pitch, ptr,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_WRITE_IMAGE_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueCopyImage(
-			const Image& src,
-			const Image& dst,
-			const size_t<3>& src_origin,
-			const size_t<3>& dst_origin,
-			const size_t<3>& region,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueCopyImage(
-				object_, src(), dst(), (const ::size_t *) src_origin,
-				(const ::size_t *)dst_origin, (const ::size_t *) region,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_COPY_IMAGE_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-#if defined(CL_VERSION_1_2)
-		/**
-		* Enqueue a command to fill an image object with a specified color.
-		* \param fillColor is the color to use to fill the image.
-		*     This is a four component RGBA floating-point color value if
-		*     the image channel data type is not an unnormalized signed or
-		*     unsigned data type.
-		*/
-		cl_int enqueueFillImage(
-			const Image& image,
-			cl_float4 fillColor,
-			const size_t<3>& origin,
-			const size_t<3>& region,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueFillImage(
-				object_,
-				image(),
-				static_cast<void*>(&fillColor),
-				(const ::size_t *) origin,
-				(const ::size_t *) region,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_FILL_IMAGE_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		/**
-		* Enqueue a command to fill an image object with a specified color.
-		* \param fillColor is the color to use to fill the image.
-		*     This is a four component RGBA signed integer color value if
-		*     the image channel data type is an unnormalized signed integer
-		*     type.
-		*/
-		cl_int enqueueFillImage(
-			const Image& image,
-			cl_int4 fillColor,
-			const size_t<3>& origin,
-			const size_t<3>& region,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueFillImage(
-				object_,
-				image(),
-				static_cast<void*>(&fillColor),
-				(const ::size_t *) origin,
-				(const ::size_t *) region,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_FILL_IMAGE_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		/**
-		* Enqueue a command to fill an image object with a specified color.
-		* \param fillColor is the color to use to fill the image.
-		*     This is a four component RGBA unsigned integer color value if
-		*     the image channel data type is an unnormalized unsigned integer
-		*     type.
-		*/
-		cl_int enqueueFillImage(
-			const Image& image,
-			cl_uint4 fillColor,
-			const size_t<3>& origin,
-			const size_t<3>& region,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueFillImage(
-				object_,
-				image(),
-				static_cast<void*>(&fillColor),
-				(const ::size_t *) origin,
-				(const ::size_t *) region,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_FILL_IMAGE_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-#endif // #if defined(CL_VERSION_1_2)
-
-		cl_int enqueueCopyImageToBuffer(
-			const Image& src,
-			const Buffer& dst,
-			const size_t<3>& src_origin,
-			const size_t<3>& region,
-			::size_t dst_offset,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueCopyImageToBuffer(
-				object_, src(), dst(), (const ::size_t *) src_origin,
-				(const ::size_t *) region, dst_offset,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueCopyBufferToImage(
-			const Buffer& src,
-			const Image& dst,
-			::size_t src_offset,
-			const size_t<3>& dst_origin,
-			const size_t<3>& region,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueCopyBufferToImage(
-				object_, src(), dst(), src_offset,
-				(const ::size_t *) dst_origin, (const ::size_t *) region,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		void* enqueueMapBuffer(
-			const Buffer& buffer,
-			cl_bool blocking,
-			cl_map_flags flags,
-			::size_t offset,
-			::size_t size,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL,
-			cl_int* err = NULL) const
-		{
-			cl_int error;
-			void * result = ::clEnqueueMapBuffer(
-				object_, buffer(), blocking, flags, offset, size,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(cl_event*)event,
-				&error);
-
-			detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-			return result;
-		}
-
-		void* enqueueMapImage(
-			const Image& buffer,
-			cl_bool blocking,
-			cl_map_flags flags,
-			const size_t<3>& origin,
-			const size_t<3>& region,
-			::size_t * row_pitch,
-			::size_t * slice_pitch,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL,
-			cl_int* err = NULL) const
-		{
-			cl_int error;
-			void * result = ::clEnqueueMapImage(
-				object_, buffer(), blocking, flags,
-				(const ::size_t *) origin, (const ::size_t *) region,
-				row_pitch, slice_pitch,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(cl_event*)event,
-				&error);
-
-			detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
-			if (err != NULL) {
-				*err = error;
-			}
-			return result;
-		}
-
-		cl_int enqueueUnmapMemObject(
-			const Memory& memory,
-			void* mapped_ptr,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueUnmapMemObject(
-				object_, memory(), mapped_ptr,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_UNMAP_MEM_OBJECT_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-#if defined(CL_VERSION_1_2)
-		/**
-		* Enqueues a marker command which waits for either a list of events to complete,
-		* or all previously enqueued commands to complete.
-		*
-		* Enqueues a marker command which waits for either a list of events to complete,
-		* or if the list is empty it waits for all commands previously enqueued in command_queue
-		* to complete before it completes. This command returns an event which can be waited on,
-		* i.e. this event can be waited on to insure that all events either in the event_wait_list
-		* or all previously enqueued commands, queued before this command to command_queue,
-		* have completed.
-		*/
-		cl_int enqueueMarkerWithWaitList(
-			const VECTOR_CLASS<Event> *events = 0,
-			Event *event = 0)
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueMarkerWithWaitList(
-				object_,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_MARKER_WAIT_LIST_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		/**
-		* A synchronization point that enqueues a barrier operation.
-		*
-		* Enqueues a barrier command which waits for either a list of events to complete,
-		* or if the list is empty it waits for all commands previously enqueued in command_queue
-		* to complete before it completes. This command blocks command execution, that is, any
-		* following commands enqueued after it do not execute until it completes. This command
-		* returns an event which can be waited on, i.e. this event can be waited on to insure that
-		* all events either in the event_wait_list or all previously enqueued commands, queued
-		* before this command to command_queue, have completed.
-		*/
-		cl_int enqueueBarrierWithWaitList(
-			const VECTOR_CLASS<Event> *events = 0,
-			Event *event = 0)
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueBarrierWithWaitList(
-				object_,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_BARRIER_WAIT_LIST_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		/**
-		* Enqueues a command to indicate with which device a set of memory objects
-		* should be associated.
-		*/
-		cl_int enqueueMigrateMemObjects(
-			const VECTOR_CLASS<Memory> &memObjects,
-			cl_mem_migration_flags flags,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL
-			)
-		{
-			cl_event tmp;
-
-			cl_mem* localMemObjects = static_cast<cl_mem*>(alloca(memObjects.size() * sizeof(cl_mem)));
-			for (int i = 0; i < (int)memObjects.size(); ++i) {
-				localMemObjects[i] = memObjects[i]();
-			}
-
-
-			cl_int err = detail::errHandler(
-				::clEnqueueMigrateMemObjects(
-				object_,
-				(cl_uint)memObjects.size(),
-				static_cast<const cl_mem*>(localMemObjects),
-				flags,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_UNMAP_MEM_OBJECT_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-#endif // #if defined(CL_VERSION_1_2)
-
-		cl_int enqueueNDRangeKernel(
-			const Kernel& kernel,
-			const NDRange& offset,
-			const NDRange& global,
-			const NDRange& local = NullRange,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueNDRangeKernel(
-				object_, kernel(), (cl_uint)global.dimensions(),
-				offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
-				(const ::size_t*) global,
-				local.dimensions() != 0 ? (const ::size_t*) local : NULL,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_NDRANGE_KERNEL_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueTask(
-			const Kernel& kernel,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueTask(
-				object_, kernel(),
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_TASK_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueNativeKernel(
-			void (CL_CALLBACK *userFptr)(void *),
-			std::pair<void*, ::size_t> args,
-			const VECTOR_CLASS<Memory>* mem_objects = NULL,
-			const VECTOR_CLASS<const void*>* mem_locs = NULL,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0)
-				? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
-				: NULL;
-
-			if (mems != NULL) {
-				for (unsigned int i = 0; i < mem_objects->size(); i++) {
-					mems[i] = ((*mem_objects)[i])();
-				}
-			}
-
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueNativeKernel(
-				object_, userFptr, args.first, args.second,
-				(mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
-				mems,
-				(mem_locs != NULL) ? (const void **)&mem_locs->front() : NULL,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_NATIVE_KERNEL);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		/**
-		* Deprecated APIs for 1.2
-		*/
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
-		CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-			cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-		{
-			return detail::errHandler(
-				::clEnqueueMarker(object_, (cl_event*)event),
-				__ENQUEUE_MARKER_ERR);
-		}
-
-		CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-			cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-		{
-			return detail::errHandler(
-			::clEnqueueWaitForEvents(
-			object_,
-			(cl_uint)events.size(),
-			(const cl_event*)&events.front()),
-			__ENQUEUE_WAIT_FOR_EVENTS_ERR);
-		}
-#endif // #if defined(CL_VERSION_1_1)
-
-			cl_int enqueueAcquireGLObjects(
-			const VECTOR_CLASS<Memory>* mem_objects = NULL,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueAcquireGLObjects(
-				object_,
-				(mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
-				(mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_ACQUIRE_GL_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueReleaseGLObjects(
-			const VECTOR_CLASS<Memory>* mem_objects = NULL,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				::clEnqueueReleaseGLObjects(
-				object_,
-				(mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
-				(mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_RELEASE_GL_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-#if defined (USE_DX_INTEROP)
-		typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
-			cl_command_queue command_queue, cl_uint num_objects,
-			const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
-			const cl_event* event_wait_list, cl_event* event);
-		typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
-			cl_command_queue command_queue, cl_uint num_objects,
-			const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
-			const cl_event* event_wait_list, cl_event* event);
-
-		cl_int enqueueAcquireD3D10Objects(
-			const VECTOR_CLASS<Memory>* mem_objects = NULL,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
-#if defined(CL_VERSION_1_2)
-			cl_context context = getInfo<CL_QUEUE_CONTEXT>();
-			cl::Device device(getInfo<CL_QUEUE_DEVICE>());
-			cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
-			__INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
-#endif
-#if defined(CL_VERSION_1_1)
-			__INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
-#endif
-
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				pfn_clEnqueueAcquireD3D10ObjectsKHR(
-				object_,
-				(mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
-				(mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_ACQUIRE_GL_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-
-		cl_int enqueueReleaseD3D10Objects(
-			const VECTOR_CLASS<Memory>* mem_objects = NULL,
-			const VECTOR_CLASS<Event>* events = NULL,
-			Event* event = NULL) const
-		{
-			static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
-#if defined(CL_VERSION_1_2)
-			cl_context context = getInfo<CL_QUEUE_CONTEXT>();
-			cl::Device device(getInfo<CL_QUEUE_DEVICE>());
-			cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
-			__INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
-#endif // #if defined(CL_VERSION_1_2)
-#if defined(CL_VERSION_1_1)
-			__INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
-#endif // #if defined(CL_VERSION_1_1)
-
-			cl_event tmp;
-			cl_int err = detail::errHandler(
-				pfn_clEnqueueReleaseD3D10ObjectsKHR(
-				object_,
-				(mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
-				(mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
-				(events != NULL) ? (cl_uint)events->size() : 0,
-				(events != NULL) ? (cl_event*)&events->front() : NULL,
-				(event != NULL) ? &tmp : NULL),
-				__ENQUEUE_RELEASE_GL_ERR);
-
-			if (event != NULL && err == CL_SUCCESS)
-				*event = tmp;
-
-			return err;
-		}
-#endif
-
-		/**
-		* Deprecated APIs for 1.2
-		*/
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
-		CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-			cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-		{
-			return detail::errHandler(
-			::clEnqueueBarrier(object_),
-			__ENQUEUE_BARRIER_ERR);
-		}
-#endif // #if defined(CL_VERSION_1_1)
-
-			cl_int flush() const
-		{
-			return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
-		}
-
-		cl_int finish() const
-		{
-			return detail::errHandler(::clFinish(object_), __FINISH_ERR);
-		}
-	};
-
-#ifdef _WIN32
-	__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-	__declspec(selectany) CommandQueue CommandQueue::default_;
-	__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
-#else
-	__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-	__attribute__((weak)) CommandQueue CommandQueue::default_;
-	__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
-#endif
-
-	inline cl_int enqueueReadBuffer(
-		const Buffer& buffer,
-		cl_bool blocking,
-		::size_t offset,
-		::size_t size,
-		void* ptr,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
-	}
-
-	inline cl_int enqueueWriteBuffer(
-		const Buffer& buffer,
-		cl_bool blocking,
-		::size_t offset,
-		::size_t size,
-		const void* ptr,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
-	}
-
-	inline void* enqueueMapBuffer(
-		const Buffer& buffer,
-		cl_bool blocking,
-		cl_map_flags flags,
-		::size_t offset,
-		::size_t size,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL,
-		cl_int* err = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-		detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-		if (err != NULL) {
-			*err = error;
-		}
-
-		void * result = ::clEnqueueMapBuffer(
-			queue(), buffer(), blocking, flags, offset, size,
-			(events != NULL) ? (cl_uint)events->size() : 0,
-			(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-			(cl_event*)event,
-			&error);
-
-		detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-		if (err != NULL) {
-			*err = error;
-		}
-		return result;
-	}
-
-	inline cl_int enqueueUnmapMemObject(
-		const Memory& memory,
-		void* mapped_ptr,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-		detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		cl_event tmp;
-		cl_int err = detail::errHandler(
-			::clEnqueueUnmapMemObject(
-			queue(), memory(), mapped_ptr,
-			(events != NULL) ? (cl_uint)events->size() : 0,
-			(events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-			(event != NULL) ? &tmp : NULL),
-			__ENQUEUE_UNMAP_MEM_OBJECT_ERR);
-
-		if (event != NULL && err == CL_SUCCESS)
-			*event = tmp;
-
-		return err;
-	}
-
-	inline cl_int enqueueCopyBuffer(
-		const Buffer& src,
-		const Buffer& dst,
-		::size_t src_offset,
-		::size_t dst_offset,
-		::size_t size,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
-	}
-
-	/**
-	* Blocking copy operation between iterators and a buffer.
-	*/
-	template< typename IteratorType >
-	inline cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer)
-	{
-		typedef typename std::iterator_traits<IteratorType>::value_type DataType;
-		cl_int error;
-
-		::size_t length = endIterator - startIterator;
-		::size_t byteLength = length*sizeof(DataType);
-
-		DataType *pointer =
-			static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
-		// if exceptions enabled, enqueueMapBuffer will throw
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-#if defined(_MSC_VER)
-		std::copy(
-			startIterator,
-			endIterator,
-			stdext::checked_array_iterator<DataType*>(
-			pointer, length));
-#else
-		std::copy(startIterator, endIterator, pointer);
-#endif
-		Event endEvent;
-		error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
-		// if exceptions enabled, enqueueUnmapMemObject will throw
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-		endEvent.wait();
-		return CL_SUCCESS;
-	}
-
-	/**
-	* Blocking copy operation between iterators and a buffer.
-	*/
-	template< typename IteratorType >
-	inline cl_int copy(const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator)
-	{
-		typedef typename std::iterator_traits<IteratorType>::value_type DataType;
-		cl_int error;
-
-		::size_t length = endIterator - startIterator;
-		::size_t byteLength = length*sizeof(DataType);
-
-		DataType *pointer =
-			static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
-		// if exceptions enabled, enqueueMapBuffer will throw
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-		std::copy(pointer, pointer + length, startIterator);
-		Event endEvent;
-		error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
-		// if exceptions enabled, enqueueUnmapMemObject will throw
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-		endEvent.wait();
-		return CL_SUCCESS;
-	}
-
-#if defined(CL_VERSION_1_1)
-	inline cl_int enqueueReadBufferRect(
-		const Buffer& buffer,
-		cl_bool blocking,
-		const size_t<3>& buffer_offset,
-		const size_t<3>& host_offset,
-		const size_t<3>& region,
-		::size_t buffer_row_pitch,
-		::size_t buffer_slice_pitch,
-		::size_t host_row_pitch,
-		::size_t host_slice_pitch,
-		void *ptr,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueReadBufferRect(
-			buffer,
-			blocking,
-			buffer_offset,
-			host_offset,
-			region,
-			buffer_row_pitch,
-			buffer_slice_pitch,
-			host_row_pitch,
-			host_slice_pitch,
-			ptr,
-			events,
-			event);
-	}
-
-	inline cl_int enqueueWriteBufferRect(
-		const Buffer& buffer,
-		cl_bool blocking,
-		const size_t<3>& buffer_offset,
-		const size_t<3>& host_offset,
-		const size_t<3>& region,
-		::size_t buffer_row_pitch,
-		::size_t buffer_slice_pitch,
-		::size_t host_row_pitch,
-		::size_t host_slice_pitch,
-		void *ptr,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueWriteBufferRect(
-			buffer,
-			blocking,
-			buffer_offset,
-			host_offset,
-			region,
-			buffer_row_pitch,
-			buffer_slice_pitch,
-			host_row_pitch,
-			host_slice_pitch,
-			ptr,
-			events,
-			event);
-	}
-
-	inline cl_int enqueueCopyBufferRect(
-		const Buffer& src,
-		const Buffer& dst,
-		const size_t<3>& src_origin,
-		const size_t<3>& dst_origin,
-		const size_t<3>& region,
-		::size_t src_row_pitch,
-		::size_t src_slice_pitch,
-		::size_t dst_row_pitch,
-		::size_t dst_slice_pitch,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueCopyBufferRect(
-			src,
-			dst,
-			src_origin,
-			dst_origin,
-			region,
-			src_row_pitch,
-			src_slice_pitch,
-			dst_row_pitch,
-			dst_slice_pitch,
-			events,
-			event);
-	}
-#endif
-
-	inline cl_int enqueueReadImage(
-		const Image& image,
-		cl_bool blocking,
-		const size_t<3>& origin,
-		const size_t<3>& region,
-		::size_t row_pitch,
-		::size_t slice_pitch,
-		void* ptr,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueReadImage(
-			image,
-			blocking,
-			origin,
-			region,
-			row_pitch,
-			slice_pitch,
-			ptr,
-			events,
-			event);
-	}
-
-	inline cl_int enqueueWriteImage(
-		const Image& image,
-		cl_bool blocking,
-		const size_t<3>& origin,
-		const size_t<3>& region,
-		::size_t row_pitch,
-		::size_t slice_pitch,
-		void* ptr,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueWriteImage(
-			image,
-			blocking,
-			origin,
-			region,
-			row_pitch,
-			slice_pitch,
-			ptr,
-			events,
-			event);
-	}
-
-	inline cl_int enqueueCopyImage(
-		const Image& src,
-		const Image& dst,
-		const size_t<3>& src_origin,
-		const size_t<3>& dst_origin,
-		const size_t<3>& region,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueCopyImage(
-			src,
-			dst,
-			src_origin,
-			dst_origin,
-			region,
-			events,
-			event);
-	}
-
-	inline cl_int enqueueCopyImageToBuffer(
-		const Image& src,
-		const Buffer& dst,
-		const size_t<3>& src_origin,
-		const size_t<3>& region,
-		::size_t dst_offset,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueCopyImageToBuffer(
-			src,
-			dst,
-			src_origin,
-			region,
-			dst_offset,
-			events,
-			event);
-	}
-
-	inline cl_int enqueueCopyBufferToImage(
-		const Buffer& src,
-		const Image& dst,
-		::size_t src_offset,
-		const size_t<3>& dst_origin,
-		const size_t<3>& region,
-		const VECTOR_CLASS<Event>* events = NULL,
-		Event* event = NULL)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.enqueueCopyBufferToImage(
-			src,
-			dst,
-			src_offset,
-			dst_origin,
-			region,
-			events,
-			event);
-	}
-
-
-	inline cl_int flush(void)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-		return queue.flush();
-	}
-
-	inline cl_int finish(void)
-	{
-		cl_int error;
-		CommandQueue queue = CommandQueue::getDefault(&error);
-
-		if (error != CL_SUCCESS) {
-			return error;
-		}
-
-
-		return queue.finish();
-	}
-
-	// Kernel Functor support
-	// New interface as of September 2011
-	// Requires the C++11 std::tr1::function (note do not support TR1)
-	// Visual Studio 2010 and GCC 4.2
-
-	struct EnqueueArgs
-	{
-		CommandQueue queue_;
-		const NDRange offset_;
-		const NDRange global_;
-		const NDRange local_;
-		VECTOR_CLASS<Event> events_;
-
-		EnqueueArgs(NDRange global) :
-			queue_(CommandQueue::getDefault()),
-			offset_(NullRange),
-			global_(global),
-			local_(NullRange)
-		{
-
-		}
-
-		EnqueueArgs(NDRange global, NDRange local) :
-			queue_(CommandQueue::getDefault()),
-			offset_(NullRange),
-			global_(global),
-			local_(local)
-		{
-
-		}
-
-		EnqueueArgs(NDRange offset, NDRange global, NDRange local) :
-			queue_(CommandQueue::getDefault()),
-			offset_(offset),
-			global_(global),
-			local_(local)
-		{
-
-		}
-
-		EnqueueArgs(Event e, NDRange global) :
-			queue_(CommandQueue::getDefault()),
-			offset_(NullRange),
-			global_(global),
-			local_(NullRange)
-		{
-			events_.push_back(e);
-		}
-
-		EnqueueArgs(Event e, NDRange global, NDRange local) :
-			queue_(CommandQueue::getDefault()),
-			offset_(NullRange),
-			global_(global),
-			local_(local)
-		{
-			events_.push_back(e);
-		}
-
-		EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) :
-			queue_(CommandQueue::getDefault()),
-			offset_(offset),
-			global_(global),
-			local_(local)
-		{
-			events_.push_back(e);
-		}
-
-		EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) :
-			queue_(CommandQueue::getDefault()),
-			offset_(NullRange),
-			global_(global),
-			local_(NullRange),
-			events_(events)
-		{
-
-		}
-
-		EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
-			queue_(CommandQueue::getDefault()),
-			offset_(NullRange),
-			global_(global),
-			local_(local),
-			events_(events)
-		{
-
-		}
-
-		EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
-			queue_(CommandQueue::getDefault()),
-			offset_(offset),
-			global_(global),
-			local_(local),
-			events_(events)
-		{
-
-		}
-
-		EnqueueArgs(CommandQueue &queue, NDRange global) :
-			queue_(queue),
-			offset_(NullRange),
-			global_(global),
-			local_(NullRange)
-		{
-
-		}
-
-		EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) :
-			queue_(queue),
-			offset_(NullRange),
-			global_(global),
-			local_(local)
-		{
-
-		}
-
-		EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) :
-			queue_(queue),
-			offset_(offset),
-			global_(global),
-			local_(local)
-		{
-
-		}
-
-		EnqueueArgs(CommandQueue &queue, Event e, NDRange global) :
-			queue_(queue),
-			offset_(NullRange),
-			global_(global),
-			local_(NullRange)
-		{
-			events_.push_back(e);
-		}
-
-		EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) :
-			queue_(queue),
-			offset_(NullRange),
-			global_(global),
-			local_(local)
-		{
-			events_.push_back(e);
-		}
-
-		EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) :
-			queue_(queue),
-			offset_(offset),
-			global_(global),
-			local_(local)
-		{
-			events_.push_back(e);
-		}
-
-		EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) :
-			queue_(queue),
-			offset_(NullRange),
-			global_(global),
-			local_(NullRange),
-			events_(events)
-		{
-
-		}
-
-		EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
-			queue_(queue),
-			offset_(NullRange),
-			global_(global),
-			local_(local),
-			events_(events)
-		{
-
-		}
-
-		EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
-			queue_(queue),
-			offset_(offset),
-			global_(global),
-			local_(local),
-			events_(events)
-		{
-
-		}
-	};
-
-	namespace detail {
-
-		class NullType {};
-
-		template<int index, typename T0>
-		struct SetArg
-		{
-			static void set(Kernel kernel, T0 arg)
-			{
-				kernel.setArg(index, arg);
-			}
-		};
-
-		template<int index>
-		struct SetArg<index, NullType>
-		{
-			static void set(Kernel, NullType)
-			{
-			}
-		};
-
-		template <
-			typename T0, typename T1, typename T2, typename T3,
-			typename T4, typename T5, typename T6, typename T7,
-			typename T8, typename T9, typename T10, typename T11,
-			typename T12, typename T13, typename T14, typename T15,
-			typename T16, typename T17, typename T18, typename T19,
-			typename T20, typename T21, typename T22, typename T23,
-			typename T24, typename T25, typename T26, typename T27,
-			typename T28, typename T29, typename T30, typename T31
-		>
-		class KernelFunctorGlobal
-		{
-		private:
-			Kernel kernel_;
-
-		public:
-			KernelFunctorGlobal(
-				Kernel kernel) :
-				kernel_(kernel)
-			{}
-
-			KernelFunctorGlobal(
-				const Program& program,
-				const STRING_CLASS name,
-				cl_int * err = NULL) :
-				kernel_(program, name.c_str(), err)
-			{}
-
-			Event operator() (
-				const EnqueueArgs& args,
-				T0 t0,
-				T1 t1 = NullType(),
-				T2 t2 = NullType(),
-				T3 t3 = NullType(),
-				T4 t4 = NullType(),
-				T5 t5 = NullType(),
-				T6 t6 = NullType(),
-				T7 t7 = NullType(),
-				T8 t8 = NullType(),
-				T9 t9 = NullType(),
-				T10 t10 = NullType(),
-				T11 t11 = NullType(),
-				T12 t12 = NullType(),
-				T13 t13 = NullType(),
-				T14 t14 = NullType(),
-				T15 t15 = NullType(),
-				T16 t16 = NullType(),
-				T17 t17 = NullType(),
-				T18 t18 = NullType(),
-				T19 t19 = NullType(),
-				T20 t20 = NullType(),
-				T21 t21 = NullType(),
-				T22 t22 = NullType(),
-				T23 t23 = NullType(),
-				T24 t24 = NullType(),
-				T25 t25 = NullType(),
-				T26 t26 = NullType(),
-				T27 t27 = NullType(),
-				T28 t28 = NullType(),
-				T29 t29 = NullType(),
-				T30 t30 = NullType(),
-				T31 t31 = NullType()
-				)
-			{
-				Event event;
-				SetArg<0, T0>::set(kernel_, t0);
-				SetArg<1, T1>::set(kernel_, t1);
-				SetArg<2, T2>::set(kernel_, t2);
-				SetArg<3, T3>::set(kernel_, t3);
-				SetArg<4, T4>::set(kernel_, t4);
-				SetArg<5, T5>::set(kernel_, t5);
-				SetArg<6, T6>::set(kernel_, t6);
-				SetArg<7, T7>::set(kernel_, t7);
-				SetArg<8, T8>::set(kernel_, t8);
-				SetArg<9, T9>::set(kernel_, t9);
-				SetArg<10, T10>::set(kernel_, t10);
-				SetArg<11, T11>::set(kernel_, t11);
-				SetArg<12, T12>::set(kernel_, t12);
-				SetArg<13, T13>::set(kernel_, t13);
-				SetArg<14, T14>::set(kernel_, t14);
-				SetArg<15, T15>::set(kernel_, t15);
-				SetArg<16, T16>::set(kernel_, t16);
-				SetArg<17, T17>::set(kernel_, t17);
-				SetArg<18, T18>::set(kernel_, t18);
-				SetArg<19, T19>::set(kernel_, t19);
-				SetArg<20, T20>::set(kernel_, t20);
-				SetArg<21, T21>::set(kernel_, t21);
-				SetArg<22, T22>::set(kernel_, t22);
-				SetArg<23, T23>::set(kernel_, t23);
-				SetArg<24, T24>::set(kernel_, t24);
-				SetArg<25, T25>::set(kernel_, t25);
-				SetArg<26, T26>::set(kernel_, t26);
-				SetArg<27, T27>::set(kernel_, t27);
-				SetArg<28, T28>::set(kernel_, t28);
-				SetArg<29, T29>::set(kernel_, t29);
-				SetArg<30, T30>::set(kernel_, t30);
-				SetArg<31, T31>::set(kernel_, t31);
-
-				args.queue_.enqueueNDRangeKernel(
-					kernel_,
-					args.offset_,
-					args.global_,
-					args.local_,
-					&args.events_,
-					&event);
-
-				return event;
-			}
-
-		};
-
-		//------------------------------------------------------------------------------------------------------
-
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22,
-			typename T23,
-			typename T24,
-			typename T25,
-			typename T26,
-			typename T27,
-			typename T28,
-			typename T29,
-			typename T30,
-			typename T31>
-		struct functionImplementation_
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27,
-				T28,
-				T29,
-				T30,
-				T31> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27,
-				T28,
-				T29,
-				T30,
-				T31);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22,
-				T23 arg23,
-				T24 arg24,
-				T25 arg25,
-				T26 arg26,
-				T27 arg27,
-				T28 arg28,
-				T29 arg29,
-				T30 arg30,
-				T31 arg31)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22,
-					arg23,
-					arg24,
-					arg25,
-					arg26,
-					arg27,
-					arg28,
-					arg29,
-					arg30,
-					arg31);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22,
-			typename T23,
-			typename T24,
-			typename T25,
-			typename T26,
-			typename T27,
-			typename T28,
-			typename T29,
-			typename T30>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			T22,
-			T23,
-			T24,
-			T25,
-			T26,
-			T27,
-			T28,
-			T29,
-			T30,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27,
-				T28,
-				T29,
-				T30,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27,
-				T28,
-				T29,
-				T30);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22,
-				T23 arg23,
-				T24 arg24,
-				T25 arg25,
-				T26 arg26,
-				T27 arg27,
-				T28 arg28,
-				T29 arg29,
-				T30 arg30)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22,
-					arg23,
-					arg24,
-					arg25,
-					arg26,
-					arg27,
-					arg28,
-					arg29,
-					arg30);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22,
-			typename T23,
-			typename T24,
-			typename T25,
-			typename T26,
-			typename T27,
-			typename T28,
-			typename T29>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			T22,
-			T23,
-			T24,
-			T25,
-			T26,
-			T27,
-			T28,
-			T29,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27,
-				T28,
-				T29,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27,
-				T28,
-				T29);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22,
-				T23 arg23,
-				T24 arg24,
-				T25 arg25,
-				T26 arg26,
-				T27 arg27,
-				T28 arg28,
-				T29 arg29)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22,
-					arg23,
-					arg24,
-					arg25,
-					arg26,
-					arg27,
-					arg28,
-					arg29);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22,
-			typename T23,
-			typename T24,
-			typename T25,
-			typename T26,
-			typename T27,
-			typename T28>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			T22,
-			T23,
-			T24,
-			T25,
-			T26,
-			T27,
-			T28,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27,
-				T28,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27,
-				T28);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22,
-				T23 arg23,
-				T24 arg24,
-				T25 arg25,
-				T26 arg26,
-				T27 arg27,
-				T28 arg28)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22,
-					arg23,
-					arg24,
-					arg25,
-					arg26,
-					arg27,
-					arg28);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22,
-			typename T23,
-			typename T24,
-			typename T25,
-			typename T26,
-			typename T27>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			T22,
-			T23,
-			T24,
-			T25,
-			T26,
-			T27,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				T27);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22,
-				T23 arg23,
-				T24 arg24,
-				T25 arg25,
-				T26 arg26,
-				T27 arg27)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22,
-					arg23,
-					arg24,
-					arg25,
-					arg26,
-					arg27);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22,
-			typename T23,
-			typename T24,
-			typename T25,
-			typename T26>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			T22,
-			T23,
-			T24,
-			T25,
-			T26,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				T26);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22,
-				T23 arg23,
-				T24 arg24,
-				T25 arg25,
-				T26 arg26)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22,
-					arg23,
-					arg24,
-					arg25,
-					arg26);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22,
-			typename T23,
-			typename T24,
-			typename T25>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			T22,
-			T23,
-			T24,
-			T25,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				T25);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22,
-				T23 arg23,
-				T24 arg24,
-				T25 arg25)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22,
-					arg23,
-					arg24,
-					arg25);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22,
-			typename T23,
-			typename T24>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			T22,
-			T23,
-			T24,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				T24);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22,
-				T23 arg23,
-				T24 arg24)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22,
-					arg23,
-					arg24);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22,
-			typename T23>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			T22,
-			T23,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				T23);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22,
-				T23 arg23)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22,
-					arg23);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21,
-			typename T22>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			T22,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				T22);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21,
-				T22 arg22)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21,
-					arg22);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20,
-			typename T21>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			T21,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				T21);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20,
-				T21 arg21)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20,
-					arg21);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19,
-			typename T20>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			T20,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				T20);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19,
-				T20 arg20)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19,
-					arg20);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18,
-			typename T19>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			T19,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				T19);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18,
-				T19 arg19)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18,
-					arg19);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17,
-			typename T18>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			T18,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				T18);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17,
-				T18 arg18)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17,
-					arg18);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16,
-			typename T17>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			T17,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				T17);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16,
-				T17 arg17)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16,
-					arg17);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15,
-			typename T16>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			T16,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				T16);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15,
-				T16 arg16)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15,
-					arg16);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14,
-			typename T15>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			T15,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				T15);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14,
-				T15 arg15)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14,
-					arg15);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13,
-			typename T14>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			T14,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				T14);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13,
-				T14 arg14)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13,
-					arg14);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12,
-			typename T13>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			T13,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				T13);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12,
-				T13 arg13)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12,
-					arg13);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11,
-			typename T12>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			T12,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				T12);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11,
-				T12 arg12)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11,
-					arg12);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10,
-			typename T11>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			T11,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				T11);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10,
-				T11 arg11)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10,
-					arg11);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9,
-			typename T10>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			T10,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				T10);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9,
-				T10 arg10)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9,
-					arg10);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8,
-			typename T9>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			T9,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				T9);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8,
-				T9 arg9)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8,
-					arg9);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7,
-			typename T8>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			T8,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				T8);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7,
-				T8 arg8)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7,
-					arg8);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6,
-			typename T7>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			T7,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				T7);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6,
-				T7 arg7)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6,
-					arg7);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5,
-			typename T6>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			T6,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				T6);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5,
-				T6 arg6)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5,
-					arg6);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4,
-			typename T5>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			T5,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				T5);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4,
-				T5 arg5)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4,
-					arg5);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3,
-			typename T4>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			T4,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				T4,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3,
-				T4);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3,
-				T4 arg4)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3,
-					arg4);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2,
-			typename T3>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			T3,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				T3,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2,
-				T3);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2,
-				T3 arg3)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2,
-					arg3);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1,
-			typename T2>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			T2,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				T2,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1,
-				T2);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1,
-				T2 arg2)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1,
-					arg2);
-			}
-
-
-		};
-
-		template<
-			typename T0,
-			typename T1>
-		struct functionImplementation_
-			<	T0,
-			T1,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				T1,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0,
-				T1);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0,
-				T1 arg1)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0,
-					arg1);
-			}
-
-
-		};
-
-		template<
-			typename T0>
-		struct functionImplementation_
-			<	T0,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType,
-			NullType>
-		{
-			typedef detail::KernelFunctorGlobal<
-				T0,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType,
-				NullType> FunctorType;
-
-			FunctorType functor_;
-
-			functionImplementation_(const FunctorType &functor) :
-				functor_(functor)
-			{
-
-#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1))
-				// Fail variadic expansion for dev11
-				static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-#endif
-
-			}
-
-			//! \brief Return type of the functor
-			typedef Event result_type;
-
-			//! \brief Function signature of kernel functor with no event dependency.
-			typedef Event type_(
-				const EnqueueArgs&,
-				T0);
-
-			Event operator()(
-				const EnqueueArgs& enqueueArgs,
-				T0 arg0)
-			{
-				return functor_(
-					enqueueArgs,
-					arg0);
-			}
-
-
-		};
-
-
-
-
-
-	} // namespace detail
-
-	//----------------------------------------------------------------------------------------------
-
-	template <
-		typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType,
-		typename T3 = detail::NullType, typename T4 = detail::NullType,
-		typename T5 = detail::NullType, typename T6 = detail::NullType,
-		typename T7 = detail::NullType, typename T8 = detail::NullType,
-		typename T9 = detail::NullType, typename T10 = detail::NullType,
-		typename T11 = detail::NullType, typename T12 = detail::NullType,
-		typename T13 = detail::NullType, typename T14 = detail::NullType,
-		typename T15 = detail::NullType, typename T16 = detail::NullType,
-		typename T17 = detail::NullType, typename T18 = detail::NullType,
-		typename T19 = detail::NullType, typename T20 = detail::NullType,
-		typename T21 = detail::NullType, typename T22 = detail::NullType,
-		typename T23 = detail::NullType, typename T24 = detail::NullType,
-		typename T25 = detail::NullType, typename T26 = detail::NullType,
-		typename T27 = detail::NullType, typename T28 = detail::NullType,
-		typename T29 = detail::NullType, typename T30 = detail::NullType,
-		typename T31 = detail::NullType
-	>
-	struct make_kernel :
-		public detail::functionImplementation_<
-		T0, T1, T2, T3,
-		T4, T5, T6, T7,
-		T8, T9, T10, T11,
-		T12, T13, T14, T15,
-		T16, T17, T18, T19,
-		T20, T21, T22, T23,
-		T24, T25, T26, T27,
-		T28, T29, T30, T31
-		>
-	{
-	public:
-		typedef detail::KernelFunctorGlobal<
-			T0, T1, T2, T3,
-			T4, T5, T6, T7,
-			T8, T9, T10, T11,
-			T12, T13, T14, T15,
-			T16, T17, T18, T19,
-			T20, T21, T22, T23,
-			T24, T25, T26, T27,
-			T28, T29, T30, T31
-		> FunctorType;
-
-		make_kernel(
-			const Program& program,
-			const STRING_CLASS name,
-			cl_int * err = NULL) :
-			detail::functionImplementation_<
-			T0, T1, T2, T3,
-			T4, T5, T6, T7,
-			T8, T9, T10, T11,
-			T12, T13, T14, T15,
-			T16, T17, T18, T19,
-			T20, T21, T22, T23,
-			T24, T25, T26, T27,
-			T28, T29, T30, T31
-			>(
-			FunctorType(program, name, err))
-		{}
-
-		make_kernel(
-			const Kernel kernel) :
-			detail::functionImplementation_<
-			T0, T1, T2, T3,
-			T4, T5, T6, T7,
-			T8, T9, T10, T11,
-			T12, T13, T14, T15,
-			T16, T17, T18, T19,
-			T20, T21, T22, T23,
-			T24, T25, T26, T27,
-			T28, T29, T30, T31
-			>(
-			FunctorType(kernel))
-		{}
-	};
-
-
-	//----------------------------------------------------------------------------------------------------------------------
-
-#undef __ERR_STR
-#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
-#undef __GET_DEVICE_INFO_ERR
-#undef __GET_PLATFORM_INFO_ERR
-#undef __GET_DEVICE_IDS_ERR
-#undef __GET_CONTEXT_INFO_ERR
-#undef __GET_EVENT_INFO_ERR
-#undef __GET_EVENT_PROFILE_INFO_ERR
-#undef __GET_MEM_OBJECT_INFO_ERR
-#undef __GET_IMAGE_INFO_ERR
-#undef __GET_SAMPLER_INFO_ERR
-#undef __GET_KERNEL_INFO_ERR
-#undef __GET_KERNEL_ARG_INFO_ERR
-#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
-#undef __GET_PROGRAM_INFO_ERR
-#undef __GET_PROGRAM_BUILD_INFO_ERR
-#undef __GET_COMMAND_QUEUE_INFO_ERR
-
-#undef __CREATE_CONTEXT_ERR
-#undef __CREATE_CONTEXT_FROM_TYPE_ERR
-#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
-
-#undef __CREATE_BUFFER_ERR
-#undef __CREATE_SUBBUFFER_ERR
-#undef __CREATE_IMAGE2D_ERR
-#undef __CREATE_IMAGE3D_ERR
-#undef __CREATE_SAMPLER_ERR
-#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
-
-#undef __CREATE_USER_EVENT_ERR
-#undef __SET_USER_EVENT_STATUS_ERR
-#undef __SET_EVENT_CALLBACK_ERR
-#undef __SET_PRINTF_CALLBACK_ERR
-
-#undef __WAIT_FOR_EVENTS_ERR
-
-#undef __CREATE_KERNEL_ERR
-#undef __SET_KERNEL_ARGS_ERR
-#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
-#undef __CREATE_PROGRAM_WITH_BINARY_ERR
-#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
-#undef __BUILD_PROGRAM_ERR
-#undef __CREATE_KERNELS_IN_PROGRAM_ERR
-
-#undef __CREATE_COMMAND_QUEUE_ERR
-#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
-#undef __ENQUEUE_READ_BUFFER_ERR
-#undef __ENQUEUE_WRITE_BUFFER_ERR
-#undef __ENQUEUE_READ_BUFFER_RECT_ERR
-#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
-#undef __ENQEUE_COPY_BUFFER_ERR
-#undef __ENQEUE_COPY_BUFFER_RECT_ERR
-#undef __ENQUEUE_READ_IMAGE_ERR
-#undef __ENQUEUE_WRITE_IMAGE_ERR
-#undef __ENQUEUE_COPY_IMAGE_ERR
-#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
-#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
-#undef __ENQUEUE_MAP_BUFFER_ERR
-#undef __ENQUEUE_MAP_IMAGE_ERR
-#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
-#undef __ENQUEUE_NDRANGE_KERNEL_ERR
-#undef __ENQUEUE_TASK_ERR
-#undef __ENQUEUE_NATIVE_KERNEL
-
-#undef __CL_EXPLICIT_CONSTRUCTORS
-
-#undef __UNLOAD_COMPILER_ERR
-#endif //__CL_USER_OVERRIDE_ERROR_STRINGS
-
-#undef __CL_FUNCTION_TYPE
-
-	// Extensions
-	/**
-	* Deprecated APIs for 1.2
-	*/
-#if defined(CL_VERSION_1_1)
-#undef __INIT_CL_EXT_FCN_PTR
-#endif // #if defined(CL_VERSION_1_1)
-#undef __CREATE_SUB_DEVICES
-
-#if defined(USE_CL_DEVICE_FISSION)
-#undef __PARAM_NAME_DEVICE_FISSION
-#endif // USE_CL_DEVICE_FISSION
-
-#undef __DEFAULT_NOT_INITIALIZED 
-#undef __DEFAULT_BEING_INITIALIZED 
-#undef __DEFAULT_INITIALIZED
-
-} // namespace cl
-
-#ifdef _WIN32
-#pragma pop_macro("max")
-#endif // _WIN32
-
-#endif // CL_HPP_
diff --git a/ocl_device_utils/ocl_device_utils.cpp b/ocl_device_utils/ocl_device_utils.cpp
deleted file mode 100644
index f7a802d00..000000000
--- a/ocl_device_utils/ocl_device_utils.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-#include "ocl_device_utils.h"
-
-#include <iostream>
-#include <stdexcept>
-#include <utility>
-#include <algorithm>
-
-using namespace std;
-using namespace cl;
-
-
-bool ocl_device_utils::_hasQueried = false;
-std::vector<std::string> ocl_device_utils::_platformNames;
-std::vector<PrintInfo> ocl_device_utils::_devicesPlatformsDevices;
-
-vector<Platform> ocl_device_utils::getPlatforms() {
-	vector<Platform> platforms;
-	try {
-		Platform::get(&platforms);
-	}
-	catch (Error const& err) {
-#if defined(CL_PLATFORM_NOT_FOUND_KHR)
-		if (err.err() == CL_PLATFORM_NOT_FOUND_KHR)
-			cout << "No OpenCL platforms found" << endl;
-		else
-#endif
-			throw err;
-	}
-	return platforms;
-}
-
-void ocl_device_utils::print_opencl_devices() {
-	ocl_device_utils::QueryDevices();
-	ocl_device_utils::PrintDevices();
-}
-
-vector<Device> ocl_device_utils::getDevices(vector<Platform> const& _platforms, unsigned _platformId) {
-	vector<Device> devices;
-	try {
-		_platforms[_platformId].getDevices(/*CL_DEVICE_TYPE_CPU| */CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, &devices);
-	}
-	catch (Error const& err) {
-		// if simply no devices found return empty vector
-		if (err.err() != CL_DEVICE_NOT_FOUND)
-			throw err;
-	}
-	return devices;
-}
-
-string ocl_device_utils::StringnNullTerminatorFix(const string& str) {
-	return string(str.c_str(), strlen(str.c_str()));
-}
-
-bool ocl_device_utils::QueryDevices() {
-	if (!_hasQueried) {
-		_hasQueried = true;
-		try {
-			// get platforms
-			auto platforms = getPlatforms();
-			if (platforms.empty()) {
-				cout << "No OpenCL platforms found" << endl;
-				return false;
-			}
-			else {
-				for (auto i_pId = 0u; i_pId < platforms.size(); ++i_pId) {
-					string platformName = StringnNullTerminatorFix(platforms[i_pId].getInfo<CL_PLATFORM_NAME>());
-					if (std::find(_platformNames.begin(), _platformNames.end(), platformName) == _platformNames.end()) {
-						PrintInfo current;
-						_platformNames.push_back(platformName);
-						// new
-						current.PlatformName = platformName;
-						current.PlatformNum = i_pId;
-
-						auto clDevs = getDevices(platforms, i_pId);
-						for (auto i_devId = 0u; i_devId < clDevs.size(); ++i_devId) {
-							OpenCLDevice curDevice;
-							curDevice.DeviceID = i_devId;
-							curDevice._CL_DEVICE_NAME = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_NAME>());
-							switch (clDevs[i_devId].getInfo<CL_DEVICE_TYPE>()) {
-							case CL_DEVICE_TYPE_CPU:
-								curDevice._CL_DEVICE_TYPE = "CPU";
-								break;
-							case CL_DEVICE_TYPE_GPU:
-								curDevice._CL_DEVICE_TYPE = "GPU";
-								break;
-							case CL_DEVICE_TYPE_ACCELERATOR:
-								curDevice._CL_DEVICE_TYPE = "ACCELERATOR";
-								break;
-							default:
-								curDevice._CL_DEVICE_TYPE = "DEFAULT";
-								break;
-							}
-
-
-							curDevice._CL_DEVICE_GLOBAL_MEM_SIZE = clDevs[i_devId].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
-							curDevice._CL_DEVICE_VENDOR = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_VENDOR>());
-							curDevice._CL_DEVICE_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_VERSION>());
-							curDevice._CL_DRIVER_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DRIVER_VERSION>());
-
-							current.Devices.push_back(curDevice);
-						}
-						_devicesPlatformsDevices.push_back(current);
-					}
-				}
-			}
-		}
-		catch (exception &ex) {
-			// TODO
-			cout << "ocl_device_utils::QueryDevices() exception: " << ex.what() << endl;
-			return false;
-		}
-		return true;
-	}
-	
-	return false;
-}
-
-int ocl_device_utils::GetCountForPlatform(int platformID) {
-	for (const auto &platInfo : _devicesPlatformsDevices)
-	{
-		if (platformID == platInfo.PlatformNum) {
-			return platInfo.Devices.size();
-		}
-	}
-	return 0;
-}
-
-void ocl_device_utils::PrintDevices() {
-	int allDevsCount = 0;
-	for (const auto &platInfo : _devicesPlatformsDevices) {
-		allDevsCount += platInfo.Devices.size();
-	}
-	cout << "Number of OpenCL devices found: " << allDevsCount << endl;
-	{
-		int devPlatformsComma = _devicesPlatformsDevices.size();
-		for (const auto &platInfo : _devicesPlatformsDevices) {
-			cout << "\tPlatform: " << platInfo.PlatformName << " | " << "PlatformNum: " << platInfo.PlatformNum << endl;
-			cout << "\t\tDevices: " << endl;
-			// device print
-			int devComma = platInfo.Devices.size();
-			for (const auto &dev : platInfo.Devices) {
-				cout << "\t\t\t#" << dev.DeviceID << " " << dev._CL_DEVICE_NAME << " | " << dev._CL_DEVICE_TYPE << endl;
-			}
-		}
-	}
-}
\ No newline at end of file
diff --git a/ocl_device_utils/ocl_device_utils.h b/ocl_device_utils/ocl_device_utils.h
deleted file mode 100644
index cf74aaf2b..000000000
--- a/ocl_device_utils/ocl_device_utils.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#pragma once
-
-#define __CL_ENABLE_EXCEPTIONS
-#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
-
-#include "cl_ext.hpp"
-#include <map>
-#include <vector>
-#include "OpenCLDevice.h"
-
-
-struct PrintInfo {
-	std::string PlatformName;
-	int PlatformNum;
-	std::vector<OpenCLDevice> Devices;
-};
-
-class ocl_device_utils {
-public:
-	static bool QueryDevices();
-	static void PrintDevices();
-	static int GetCountForPlatform(int platformID);
-	static void print_opencl_devices();
-
-private:
-	static std::vector<cl::Device> getDevices(std::vector<cl::Platform> const& _platforms, unsigned _platformId);
-	static std::vector<cl::Platform> getPlatforms();
-
-	static bool _hasQueried;
-	static std::vector<std::string> _platformNames;
-	static std::vector<PrintInfo> _devicesPlatformsDevices;
-
-	static std::string StringnNullTerminatorFix(const std::string& str);
-};
\ No newline at end of file
diff --git a/ocl_device_utils/ocl_device_utils.vcxproj b/ocl_device_utils/ocl_device_utils.vcxproj
deleted file mode 100644
index 4830defa1..000000000
--- a/ocl_device_utils/ocl_device_utils.vcxproj
+++ /dev/null
@@ -1,95 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="cl_ext.hpp" />
-    <ClInclude Include="ocl_device_utils.h" />
-    <ClInclude Include="opencl.h" />
-    <ClInclude Include="OpenCLDevice.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="ocl_device_utils.cpp" />
-    <ClCompile Include="opencl.cpp" />
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}</ProjectGuid>
-    <Keyword>Win32Proj</Keyword>
-    <RootNamespace>ocl_device_utils</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
-    <IntDir>$(Platform)\$(Configuration)\</IntDir>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
-    <IntDir>$(Platform)\$(Configuration)\</IntDir>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(AMDAPPSDKROOT)\include\</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/ocl_device_utils/ocl_device_utils.vcxproj.filters b/ocl_device_utils/ocl_device_utils.vcxproj.filters
deleted file mode 100644
index 1c4a6cd21..000000000
--- a/ocl_device_utils/ocl_device_utils.vcxproj.filters
+++ /dev/null
@@ -1,13 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup>
-    <ClInclude Include="cl_ext.hpp" />
-    <ClInclude Include="ocl_device_utils.h" />
-    <ClInclude Include="OpenCLDevice.h" />
-    <ClInclude Include="opencl.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="ocl_device_utils.cpp" />
-    <ClCompile Include="opencl.cpp" />
-  </ItemGroup>
-</Project>
\ No newline at end of file
diff --git a/ocl_device_utils/opencl.cpp b/ocl_device_utils/opencl.cpp
deleted file mode 100644
index cea4c9082..000000000
--- a/ocl_device_utils/opencl.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-#include "opencl.h"
-#include <fstream>
-#include <vector>
-#include <memory>
-#include <stdio.h>
-
-extern cl_platform_id gPlatform;
-// extern cl_program gProgram;
-
-bool clInitialize(int requiredPlatform, std::vector<cl_device_id> &gpus)
-{
-  cl_platform_id platforms[64];
-  cl_uint numPlatforms;
-  OCLR(clGetPlatformIDs(sizeof(platforms)/sizeof(cl_platform_id), platforms, &numPlatforms), false);
-  if (!numPlatforms) {
-    printf("<error> no OpenCL platforms found\n");
-    return false;
-  }
-  
-  /*int platformIdx = -1;
-  if (requiredPlatform) {
-    for (decltype(numPlatforms) i = 0; i < numPlatforms; i++) {
-      char name[1024] = {0};
-      OCLR(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, 0), false);
-      printf("found platform[%i] name = '%s'\n", (int)i, name);
-      if (strcmp(name, requiredPlatform) == 0) {
-        platformIdx = i;
-        break;
-      }
-    }
-  } else {
-    platformIdx = 0;
-  }*/
-
-  int platformIdx = requiredPlatform;
-  
-  
-  if (platformIdx == -1) {
-    printf("<error> platform %s not exists\n", requiredPlatform);
-    return false;
-  }
-  
-  gPlatform = platforms[platformIdx];
-  
-  cl_uint numDevices = 0;
-  cl_device_id devices[64];
-  clGetDeviceIDs(gPlatform, CL_DEVICE_TYPE_GPU, sizeof(devices)/sizeof(cl_device_id), devices, &numDevices);
-  if (numDevices) {
-    printf("<info> found %d devices\n", numDevices);
-  } else {
-    printf("<error> no OpenCL GPU devices found.\n");
-    return false;
-  }
-
-  for (decltype(numDevices) i = 0; i < numDevices; i++) {
-    gpus.push_back(devices[i]);
-  }
-  
-  return true;
-}
-
-bool clCompileKernel(cl_context gContext,
-                     cl_device_id gpu,
-                     const char *binaryName,
-                     const std::vector<const char*> &sources,
-                     const char *arguments,
-                     cl_int *binstatus,
-                     cl_program *gProgram)
-{
-  std::ifstream testfile(binaryName);
-  
-//   size_t binsizes[64];
-
-//   const unsigned char *binaries[64];
-  
-  if(!testfile) {
-    
-    
-    printf("<info> compiling ...\n");
-    
-    std::string sourceFile;
-    for (auto &i: sources) {
-      std::ifstream stream;
-      stream.exceptions(std::ifstream::failbit | std::ifstream::badbit);
-      try {
-        stream.open(i);
-      } catch (std::system_error& e) {
-		fprintf(stderr, "<error> %s\n", e.code().message().c_str());
-        return false;
-      }
-      std::string str((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
-      sourceFile.append(str);
-    }
-    
-    printf("<info> source: %u bytes\n", (unsigned)sourceFile.size());
-    if(sourceFile.size() < 1){
-      fprintf(stderr, "<error> source files not found or empty\n");
-      return false;
-    }
-    
-    cl_int error;
-    const char *sources[] = { sourceFile.c_str(), 0 };
-    *gProgram = clCreateProgramWithSource(gContext, 1, sources, 0, &error);
-    OCLR(error, false);
-    
-    if (clBuildProgram(*gProgram, 1, &gpu, arguments, 0, 0) != CL_SUCCESS) {    
-      size_t logSize;
-      clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);
-      
-      std::unique_ptr<char[]> log(new char[logSize]);
-      clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, logSize, log.get(), 0);
-      printf("%s\n", log.get());
-
-      return false;
-    }
-    
-    size_t binsize;
-    OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binsize, 0), false);
-//     for (size_t i = 0; i < 1; i++) {
-      if(!binsize) {
-        printf("<error> no binary available!\n");
-        return false;
-      }
-//     }
-    
-    printf("<info> binsize = %u bytes\n", (unsigned)binsize);
-//     std::unique_ptr<unsigned char[]> binary(new unsigned char[binsize+1]);
-    
-//     for (size_t i = 0; i < gpus.size(); i++)
-    std::unique_ptr<unsigned char[]> binary(new unsigned char[binsize+1]);
-//       binaries[i] = new unsigned char[binsizes[i]];
-    
-//     for (auto &b: binaries)
-//       b = binary.get();
-    OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARIES, sizeof(void*), &binary, 0), false);
-    
-    {
-      std::ofstream bin(binaryName, std::ofstream::binary | std::ofstream::trunc);
-      bin.write((const char*)binary.get(), binsize);
-      bin.close();      
-    }
-   
-    OCLR(clReleaseProgram(*gProgram), false);
-  }
-  
-  std::ifstream bfile(binaryName, std::ifstream::binary);
-  if(!bfile) {
-    printf("<error> %s not found\n", binaryName);
-    return false;
-  }  
-  
-  bfile.seekg(0, bfile.end);
-  size_t binsize = bfile.tellg();
-  bfile.seekg(0, bfile.beg);
-  if(!binsize){
-    printf("<error> %s empty\n", binaryName);
-    return false;
-  }
-  
-  std::vector<char> binary(binsize+1);
-  bfile.read(&binary[0], binsize);
-  bfile.close();
-  
-  cl_int error;
-//   binstatus.resize(gpus.size(), 0);
-//   std::vector<size_t> binsizes(gpus.size(), binsize);
-//   std::vector<const unsigned char*> binaries(gpus.size(), (const unsigned char*)&binary[0]);
-  const unsigned char *binaryPtr = (const unsigned char*)&binary[0];
-  
-  *gProgram = clCreateProgramWithBinary(gContext, 1, &gpu, &binsize, &binaryPtr, binstatus, &error);
-  OCLR(error, false);
-  OCLR(clBuildProgram(*gProgram, 1, &gpu, 0, 0, 0), false);  
-  return true;
-}
diff --git a/ocl_device_utils/opencl.h b/ocl_device_utils/opencl.h
deleted file mode 100644
index 566d9ffa2..000000000
--- a/ocl_device_utils/opencl.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * opencl.h
- *
- *  Created on: 01.05.2014
- *      Author: mad
- */
-
-#ifndef OPENCL_H_
-#define OPENCL_H_
-
-#pragma warning(disable: 4996)
-#include <CL/cl.h>
-#include <stdio.h>
-#include <string.h>
-#include <vector>
-
-// extern cl_context gContext;
-
-
-
-#define OCL(error) \
-  if(cl_int err = error){ \
-    printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
-    return; \
-  }
-
-#define OCLR(error, ret) \
-  if(cl_int err = error){ \
-    printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
-    return ret; \
-  }
-
-#define OCLE(error) \
-  if(cl_int err = error){ \
-    printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
-    exit(err); \
-  }
-
-
-
-
-
-template<typename T>
-class clBuffer {
-public:
-  
-  clBuffer() {
-    
-    Size = 0;
-    HostData = 0;
-    DeviceData = 0;
-    
-  }
-  
-  ~clBuffer() {
-    
-    if(HostData)
-      delete [] HostData;
-    
-    if(DeviceData)
-      clReleaseMemObject(DeviceData);
-    
-  }
-  
-  void init(cl_context gContext, int size, cl_mem_flags flags = 0) {
-    
-    Size = size;
-    
-    if(!(flags & CL_MEM_HOST_NO_ACCESS)){
-      HostData = new T[Size];
-      memset(HostData, 0, Size*sizeof(T));
-    }else
-      HostData = 0;
-    
-    //printf("clCreateBuffer: size = %d, %d bytes\n", Size, Size*sizeof(T));
-    
-    cl_int error;
-    if (flags & CL_MEM_HOST_NO_ACCESS)
-      flags = CL_MEM_READ_WRITE;
-    DeviceData = clCreateBuffer(gContext, flags, Size*sizeof(T), 0, &error);
-    OCL(error);
-    
-  }
-  
-  void copyToDevice(cl_command_queue cq, bool blocking = true) {
-    
-    OCL(clEnqueueWriteBuffer(cq, DeviceData, blocking, 0, Size*sizeof(T), HostData, 0, 0, 0));
-    
-  }
-  
-  void copyToHost(cl_command_queue cq, bool blocking = true, unsigned size = 0) {
-    
-    if(size == 0)
-      size = Size;
-    
-    OCL(clEnqueueReadBuffer(cq, DeviceData, blocking, 0, size*sizeof(T), HostData, 0, 0, 0));
-    
-  }
-  
-  T& get(int index) {
-    return HostData[index];
-  }
-  
-  T& operator[](int index) {
-    return HostData[index];
-  }
-  
-public:
-  
-  int Size;
-  T* HostData;
-  cl_mem DeviceData;
-  
-  
-};
-
-
-bool clInitialize(int requiredPlatform, std::vector<cl_device_id> &gpus);
-bool clCompileKernel(cl_context gContext,
-                     cl_device_id gpu,
-                     const char *binaryName,
-                     const std::vector<const char*> &sources,
-                     const char *arguments,
-                     cl_int *binstatus,
-                     cl_program *gProgram);
-
-
-
-
-
-#endif /* OPENCL_H_ */
diff --git a/ocl_silentarmy/ocl_silentarmy.cpp b/ocl_silentarmy/ocl_silentarmy.cpp
deleted file mode 100644
index d67a3fa76..000000000
--- a/ocl_silentarmy/ocl_silentarmy.cpp
+++ /dev/null
@@ -1,536 +0,0 @@
-#include "ocl_silentarmy.hpp"
-
-//#define _CRT_SECURE_NO_WARNINGS
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#include <stdint.h>
-#include <assert.h>
-#include <sys/types.h>
-//#include <sys/time.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-//#include <unistd.h>
-//#include <getopt.h>
-#include <errno.h>
-
-
-#include "opencl.h"
-
-#include <fstream>
-
-#include "sa_blake.h"
-
-typedef uint8_t		uchar;
-typedef uint32_t	uint;
-typedef uint64_t	ulong;
-#include "param.h"
-
-#define MIN(A, B)	(((A) < (B)) ? (A) : (B))
-#define MAX(A, B)	(((A) > (B)) ? (A) : (B))
-
-#define WN PARAM_N
-#define WK PARAM_K
-
-#define COLLISION_BIT_LENGTH (WN / (WK+1))
-#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
-#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
-
-#define NDIGITS   (WK+1)
-#define DIGITBITS (WN/(NDIGITS))
-#define PROOFSIZE (1u<<WK)
-#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
-
-typedef struct  debug_s
-{
-	uint32_t    dropped_coll;
-	uint32_t    dropped_stor;
-}               debug_t;
-
-struct OclContext {
-	cl_context _context;
-	cl_program _program;
-	cl_device_id _dev_id;
-
-	cl_platform_id platform_id = 0;
-
-	cl_command_queue queue;
-
-	cl_kernel k_init_ht;
-	cl_kernel k_rounds[PARAM_K];
-	cl_kernel k_sols;
-
-	cl_mem buf_ht[2], buf_sols, buf_dbg;
-	size_t global_ws;
-	size_t local_work_size = 64;
-
-	sols_t	*sols;
-
-	bool init(cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock);
-	
-	~OclContext() {
-		clReleaseMemObject(buf_dbg);
-		clReleaseMemObject(buf_ht[0]);
-		clReleaseMemObject(buf_ht[1]);
-		free(sols);
-	}
-};
-
-cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size,
-	void *host_ptr);
-
-bool OclContext::init(
-	cl_device_id dev,
-	unsigned int threadsNum,
-	unsigned int threadsPerBlock)
-{
-	cl_int error;
-
-	queue = clCreateCommandQueue(_context, dev, 0, &error);
-
-#ifdef ENABLE_DEBUG
-	size_t              dbg_size = NR_ROWS;
-#else
-	size_t              dbg_size = 1;
-#endif
-
-	buf_dbg = check_clCreateBuffer(_context, CL_MEM_READ_WRITE |
-		CL_MEM_HOST_NO_ACCESS, dbg_size, NULL);
-	buf_ht[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, HT_SIZE, NULL);
-	buf_ht[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, HT_SIZE, NULL);
-	buf_sols = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, sizeof(sols_t),
-		NULL);
-
-
-	fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * HT_SIZE / 1e6);
-
-	k_init_ht = clCreateKernel(_program, "kernel_init_ht", &error);
-	for (unsigned i = 0; i < WK; i++) {
-		char kernelName[128];
-		sprintf(kernelName, "kernel_round%d", i);
-		k_rounds[i] = clCreateKernel(_program, kernelName, &error);
-	}
-
-	sols = (sols_t *)malloc(sizeof(*sols));
-
-	k_sols = clCreateKernel(_program, "kernel_sols", &error);
-	return true;
-}
-
-///
-int             verbose = 0;
-uint32_t	show_encoded = 0;
-
-cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size,
-	void *host_ptr)
-{
-	cl_int	status;
-	cl_mem	ret;
-	ret = clCreateBuffer(ctx, flags, size, host_ptr, &status);
-	if (status != CL_SUCCESS || !ret)
-		printf("clCreateBuffer (%d)\n", status);
-	return ret;
-}
-
-void check_clSetKernelArg(cl_kernel k, cl_uint a_pos, cl_mem *a)
-{
-	cl_int	status;
-	status = clSetKernelArg(k, a_pos, sizeof(*a), a);
-	if (status != CL_SUCCESS)
-		printf("clSetKernelArg (%d)\n", status);
-}
-
-void check_clEnqueueNDRangeKernel(cl_command_queue queue, cl_kernel k, cl_uint
-	work_dim, const size_t *global_work_offset, const size_t
-	*global_work_size, const size_t *local_work_size, cl_uint
-	num_events_in_wait_list, const cl_event *event_wait_list, cl_event
-	*event)
-{
-	cl_uint	status;
-	status = clEnqueueNDRangeKernel(queue, k, work_dim, global_work_offset,
-		global_work_size, local_work_size, num_events_in_wait_list,
-		event_wait_list, event);
-	if (status != CL_SUCCESS)
-		printf("clEnqueueNDRangeKernel (%d)\n", status);
-}
-
-void check_clEnqueueReadBuffer(cl_command_queue queue, cl_mem buffer, cl_bool
-	blocking_read, size_t offset, size_t size, void *ptr, cl_uint
-	num_events_in_wait_list, const cl_event *event_wait_list, cl_event
-	*event)
-{
-	cl_int	status;
-	status = clEnqueueReadBuffer(queue, buffer, blocking_read, offset,
-		size, ptr, num_events_in_wait_list, event_wait_list, event);
-	if (status != CL_SUCCESS)
-		printf("clEnqueueReadBuffer (%d)\n", status);
-}
-
-void hexdump(uint8_t *a, uint32_t a_len)
-{
-	for (uint32_t i = 0; i < a_len; i++)
-		fprintf(stderr, "%02x", a[i]);
-}
-
-char *s_hexdump(const void *_a, uint32_t a_len)
-{
-	const uint8_t	*a = (uint8_t	*)_a;
-	static char		buf[1024];
-	uint32_t		i;
-	for (i = 0; i < a_len && i + 2 < sizeof(buf); i++)
-		sprintf(buf + i * 2, "%02x", a[i]);
-	buf[i * 2] = 0;
-	return buf;
-}
-
-uint8_t hex2val(const char *base, size_t off)
-{
-	const char          c = base[off];
-	if (c >= '0' && c <= '9')           return c - '0';
-	else if (c >= 'a' && c <= 'f')      return 10 + c - 'a';
-	else if (c >= 'A' && c <= 'F')      return 10 + c - 'A';
-	printf("Invalid hex char at offset %zd: ...%c...\n", off, c);
-	return 0;
-}
-
-unsigned nr_compute_units(const char *gpu)
-{
-	if (!strcmp(gpu, "rx480")) return 36;
-	fprintf(stderr, "Unknown GPU: %s\n", gpu);
-	return 0;
-}
-
-static void compress(uint8_t *out, uint32_t *inputs, uint32_t n)
-{
-	uint32_t byte_pos = 0;
-	int32_t bits_left = PREFIX + 1;
-	uint8_t x = 0;
-	uint8_t x_bits_used = 0;
-	uint8_t *pOut = out;
-	while (byte_pos < n)
-	{
-		if (bits_left >= 8 - x_bits_used)
-		{
-			x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
-			bits_left -= 8 - x_bits_used;
-			x_bits_used = 8;
-		}
-		else if (bits_left > 0)
-		{
-			uint32_t mask = ~(-1 << (8 - x_bits_used));
-			mask = ((~mask) >> bits_left) & mask;
-			x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
-			x_bits_used += bits_left;
-			bits_left = 0;
-		}
-		else if (bits_left <= 0)
-		{
-			assert(!bits_left);
-			byte_pos++;
-			bits_left = PREFIX + 1;
-		}
-		if (x_bits_used == 8)
-		{
-			*pOut++ = x;
-			x = x_bits_used = 0;
-		}
-	}
-}
-
-void get_program_build_log(cl_program program, cl_device_id device)
-{
-	cl_int		status;
-	char	        val[2 * 1024 * 1024];
-	size_t		ret = 0;
-	status = clGetProgramBuildInfo(program, device,
-		CL_PROGRAM_BUILD_LOG,
-		sizeof(val),	// size_t param_value_size
-		&val,		// void *param_value
-		&ret);		// size_t *param_value_size_ret
-	if (status != CL_SUCCESS)
-		printf("clGetProgramBuildInfo (%d)\n", status);
-	fprintf(stderr, "%s\n", val);
-}
-
-size_t select_work_size_blake(void)
-{
-	size_t              work_size =
-		64 * /* thread per wavefront */
-		BLAKE_WPS * /* wavefront per simd */
-		4 * /* simd per compute unit */
-		nr_compute_units("rx480");
-	// Make the work group size a multiple of the nr of wavefronts, while
-	// dividing the number of inputs. This results in the worksize being a
-	// power of 2.
-	while (NR_INPUTS % work_size)
-		work_size += 64;
-	//debug("Blake: work size %zd\n", work_size);
-	return work_size;
-}
-
-static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht)
-{
-	size_t      global_ws = NR_ROWS;
-	size_t      local_ws = 64;
-	cl_int      status;
-#if 0
-	uint32_t    pat = -1;
-	status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0,
-		NR_ROWS * NR_SLOTS * SLOT_LEN,
-		0,		// cl_uint	num_events_in_wait_list
-		NULL,	// cl_event	*event_wait_list
-		NULL);	// cl_event	*event
-	if (status != CL_SUCCESS)
-		fatal("clEnqueueFillBuffer (%d)\n", status);
-#endif
-	status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht);
-	if (status != CL_SUCCESS)
-		printf("clSetKernelArg (%d)\n", status);
-	check_clEnqueueNDRangeKernel(queue, k_init_ht,
-		1,		// cl_uint	work_dim
-		NULL,	// size_t	*global_work_offset
-		&global_ws,	// size_t	*global_work_size
-		&local_ws,	// size_t	*local_work_size
-		0,		// cl_uint	num_events_in_wait_list
-		NULL,	// cl_event	*event_wait_list
-		NULL);	// cl_event	*event
-}
-
-
-/*
-** Sort a pair of binary blobs (a, b) which are consecutive in memory and
-** occupy a total of 2*len 32-bit words.
-**
-** a            points to the pair
-** len          number of 32-bit words in each pair
-*/
-void sort_pair(uint32_t *a, uint32_t len)
-{
-	uint32_t    *b = a + len;
-	uint32_t     tmp, need_sorting = 0;
-	for (uint32_t i = 0; i < len; i++)
-		if (need_sorting || a[i] > b[i])
-		{
-			need_sorting = 1;
-			tmp = a[i];
-			a[i] = b[i];
-			b[i] = tmp;
-		}
-		else if (a[i] < b[i])
-			return;
-}
-static uint32_t verify_sol(sols_t *sols, unsigned sol_i)
-{
-	uint32_t  *inputs = sols->values[sol_i];
-	uint32_t  seen_len = (1 << (PREFIX + 1)) / 8;
-	uint8_t seen[(1 << (PREFIX + 1)) / 8];
-	uint32_t  i;
-	uint8_t tmp;
-	// look for duplicate inputs
-	memset(seen, 0, seen_len);
-	for (i = 0; i < (1 << PARAM_K); i++)
-	{
-		tmp = seen[inputs[i] / 8];
-		seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
-		if (tmp == seen[inputs[i] / 8])
-		{
-			// at least one input value is a duplicate
-			sols->valid[sol_i] = 0;
-			return 0;
-		}
-	}
-	// the valid flag is already set by the GPU, but set it again because
-	// I plan to change the GPU code to not set it
-	sols->valid[sol_i] = 1;
-	// sort the pairs in place
-	for (uint32_t level = 0; level < PARAM_K; level++)
-		for (i = 0; i < (1 << PARAM_K); i += (2 << level))
-			sort_pair(&inputs[i], 1 << level);
-	return 1;
-}
-
-
-
-ocl_silentarmy::ocl_silentarmy(int platf_id, int dev_id) {
-	platform_id = platf_id;
-	device_id = dev_id;
-	// TODO 
-	threadsNum = 8192;
-	wokrsize = 128; // 256;
-}
-
-std::string ocl_silentarmy::getdevinfo() {
-	/*TODO get name*/
-	return "GPU_ID(" + std::to_string(device_id)+ ")";
-}
-
-// STATICS START
-int ocl_silentarmy::getcount() { /*TODO*/
-	return 0;
-}
-
-void ocl_silentarmy::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) { /*TODO*/ }
-
-void ocl_silentarmy::start(ocl_silentarmy& device_context) {
-	/*TODO*/
-	device_context.is_init_success = false;
-	device_context.oclc = new OclContext();
-
-	std::vector<cl_device_id> allGpus;
-	if (!clInitialize(device_context.platform_id, allGpus)) {
-		return;
-	}
-
-	// this is kinda stupid but it works
-	std::vector<cl_device_id> gpus;
-	for (unsigned i = 0; i < allGpus.size(); ++i) {
-		if (i == device_context.device_id) {
-			printf("Using device %d as GPU %d\n", i, (int)gpus.size());
-			device_context.oclc->_dev_id = allGpus[i];
-			gpus.push_back(allGpus[i]);
-		}
-	}
-
-	if (!gpus.size()){
-		printf("Device id %d not found\n", device_context.device_id);
-		return;
-	}
-
-	// context create
-	for (unsigned i = 0; i < gpus.size(); i++) {
-		cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)device_context.oclc->platform_id, 0 };
-		cl_int error;
-		device_context.oclc->_context = clCreateContext(NULL, 1, &gpus[i], 0, 0, &error);
-		//OCLR(error, false);
-		if (cl_int err = error) {
-			printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
-			return;
-		}
-	}
-
-	std::vector<cl_int> binstatus;
-	binstatus.resize(gpus.size());
-
-	for (size_t i = 0; i < gpus.size(); i++) {
-		char kernelName[64];
-		sprintf(kernelName, "silentarmy_gpu%u.bin", (unsigned)i);
-		if (!clCompileKernel(device_context.oclc->_context,
-			gpus[i],
-			kernelName,
-			{ "zcash/gpu/kernel.cl" },
-			"",
-			&binstatus[i],
-			&device_context.oclc->_program)) {
-			return;
-		}
-	}
-
-	for (unsigned i = 0; i < gpus.size(); ++i) {
-		if (binstatus[i] == CL_SUCCESS) {
-			if (!device_context.oclc->init(gpus[i], device_context.threadsNum, device_context.wokrsize)) {
-				printf("Init failed");
-				return;
-			}
-		}
-		else {
-			printf("GPU %d: failed to load kernel\n", i);
-			return;
-		}
-	}
-
-	device_context.is_init_success = true;
-}
-
-void ocl_silentarmy::stop(ocl_silentarmy& device_context) {
-	if (device_context.oclc != nullptr) delete device_context.oclc;
-}
-
-void ocl_silentarmy::solve(const char *tequihash_header,
-	unsigned int tequihash_header_len,
-	const char* nonce,
-	unsigned int nonce_len,
-	std::function<bool()> cancelf,
-	std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
-	std::function<void(void)> hashdonef,
-	ocl_silentarmy& device_context) {
-
-	unsigned char context[140];
-	memset(context, 0, 140);
-	memcpy(context, tequihash_header, tequihash_header_len);
-	memcpy(context + tequihash_header_len, nonce, nonce_len);
-
-	OclContext *miner = device_context.oclc;
-	clFlush(miner->queue);
-
-	blake2b_state_t initialCtx;
-	zcash_blake2b_init(&initialCtx, ZCASH_HASH_LEN, PARAM_N, PARAM_K);
-	zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0);
-
-	cl_mem buf_blake_st;
-	buf_blake_st = check_clCreateBuffer(miner->_context, CL_MEM_READ_ONLY |
-		CL_MEM_COPY_HOST_PTR, sizeof(blake2b_state_s), &initialCtx);
-
-
-	for (unsigned round = 0; round < PARAM_K; round++)
-	{
-		if (round < 2)
-			init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round % 2]);
-		if (!round)
-		{
-			check_clSetKernelArg(miner->k_rounds[round], 0, &buf_blake_st);
-			check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round % 2]);
-			miner->global_ws = select_work_size_blake();
-		}
-		else
-		{
-			check_clSetKernelArg(miner->k_rounds[round], 0, &miner->buf_ht[(round - 1) % 2]);
-			check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round % 2]);
-			miner->global_ws = NR_ROWS;
-		}
-		check_clSetKernelArg(miner->k_rounds[round], 2, &miner->buf_dbg);
-		if (round == PARAM_K - 1)
-			check_clSetKernelArg(miner->k_rounds[round], 3, &miner->buf_sols);
-		check_clEnqueueNDRangeKernel(miner->queue, miner->k_rounds[round], 1, NULL,
-			&miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
-		// cancel function
-		if (cancelf()) return;
-	}
-	check_clSetKernelArg(miner->k_sols, 0, &miner->buf_ht[0]);
-	check_clSetKernelArg(miner->k_sols, 1, &miner->buf_ht[1]);
-	check_clSetKernelArg(miner->k_sols, 2, &miner->buf_sols);
-	miner->global_ws = NR_ROWS;
-	check_clEnqueueNDRangeKernel(miner->queue, miner->k_sols, 1, NULL,
-		&miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
-
-	check_clEnqueueReadBuffer(miner->queue, miner->buf_sols,
-		CL_TRUE,	// cl_bool	blocking_read
-		0,		// size_t	offset
-		sizeof(*miner->sols),	// size_t	size
-		miner->sols,	// void		*ptr
-		0,		// cl_uint	num_events_in_wait_list
-		NULL,	// cl_event	*event_wait_list
-		NULL);	// cl_event	*event
-
-	if (miner->sols->nr > MAX_SOLS)
-		miner->sols->nr = MAX_SOLS;
-
-	clReleaseMemObject(buf_blake_st);
-
-	for (unsigned sol_i = 0; sol_i < miner->sols->nr; sol_i++) {
-		verify_sol(miner->sols, sol_i);
-	}
-
-	uint8_t proof[COMPRESSED_PROOFSIZE * 2];
-	for (uint32_t i = 0; i < miner->sols->nr; i++) {
-		if (miner->sols->valid[i]) {
-			compress(proof, (uint32_t *)(miner->sols->values[i]), 1 << PARAM_K);
-			solutionf(std::vector<uint32_t>(0), 1344, proof);
-		}
-	}
-	hashdonef();
-}
-
-// STATICS END
-
diff --git a/ocl_silentarmy/ocl_silentarmy.hpp b/ocl_silentarmy/ocl_silentarmy.hpp
deleted file mode 100644
index 4740ac8b5..000000000
--- a/ocl_silentarmy/ocl_silentarmy.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-#pragma once
-#ifdef _LIB
-#define DLL_OCL_SILENTARMY __declspec(dllexport)
-#else
-#define DLL_OCL_SILENTARMY
-#endif
-
-// remove after
-#include <string>
-#include <functional>
-#include <vector>
-#include <cstdint>
-
-struct OclContext;
-
-
-
-struct DLL_OCL_SILENTARMY ocl_silentarmy
-{
-	//int threadsperblock;
-	int blocks;
-	int device_id;
-	int platform_id;
-
-	OclContext* oclc;
-	// threads
-	unsigned threadsNum; // TMP
-	unsigned wokrsize;
-
-	bool is_init_success = false;
-
-	ocl_silentarmy(int platf_id, int dev_id);
-
-	std::string getdevinfo();
-
-	static int getcount();
-
-	static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version);
-
-	static void start(ocl_silentarmy& device_context);
-
-	static void stop(ocl_silentarmy& device_context);
-
-	static void solve(const char *tequihash_header,
-		unsigned int tequihash_header_len,
-		const char* nonce,
-		unsigned int nonce_len,
-		std::function<bool()> cancelf,
-		std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
-		std::function<void(void)> hashdonef,
-		ocl_silentarmy& device_context);
-
-	std::string getname() { return "OCL_SILENTARMY"; }
-
-private:
-	std::string m_gpu_name;
-	std::string m_version;
-};
\ No newline at end of file
diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj b/ocl_silentarmy/ocl_silentarmy.vcxproj
deleted file mode 100644
index 1aae0f6ca..000000000
--- a/ocl_silentarmy/ocl_silentarmy.vcxproj
+++ /dev/null
@@ -1,98 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="ocl_silentarmy.cpp" />
-    <ClCompile Include="sa_blake.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="ocl_silentarmy.hpp" />
-    <ClInclude Include="param.h" />
-    <ClInclude Include="sa_blake.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="zcash\gpu\input.cl" />
-    <None Include="zcash\gpu\kernel.cl" />
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}</ProjectGuid>
-    <Keyword>Win32Proj</Keyword>
-    <RootNamespace>ocl_silentarmy</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
-    <IntDir>$(Platform)\$(Configuration)\</IntDir>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <IntDir>$(Platform)\$(Configuration)\</IntDir>
-    <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>..\ocl_device_utils;$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>..\ocl_device_utils;$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj.filters b/ocl_silentarmy/ocl_silentarmy.vcxproj.filters
deleted file mode 100644
index 9659f2c07..000000000
--- a/ocl_silentarmy/ocl_silentarmy.vcxproj.filters
+++ /dev/null
@@ -1,28 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup>
-    <ClCompile Include="ocl_silentarmy.cpp" />
-    <ClCompile Include="sa_blake.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="ocl_silentarmy.hpp" />
-    <ClInclude Include="param.h" />
-    <ClInclude Include="sa_blake.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <Filter Include="zcash">
-      <UniqueIdentifier>{34381c66-ca5c-4daa-aa30-58dcf33e2d66}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="zcash\gpu">
-      <UniqueIdentifier>{c7687099-e206-4d36-8836-f7032bffc7da}</UniqueIdentifier>
-    </Filter>
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="zcash\gpu\input.cl">
-      <Filter>zcash\gpu</Filter>
-    </None>
-    <None Include="zcash\gpu\kernel.cl">
-      <Filter>zcash\gpu</Filter>
-    </None>
-  </ItemGroup>
-</Project>
\ No newline at end of file
diff --git a/ocl_silentarmy/param.h b/ocl_silentarmy/param.h
deleted file mode 100644
index 51ef42ea9..000000000
--- a/ocl_silentarmy/param.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#define PARAM_N				200
-#define PARAM_K				9
-#define PREFIX                          (PARAM_N / (PARAM_K + 1))
-#define NR_INPUTS                       (1 << PREFIX)
-// Approximate log base 2 of number of elements in hash tables
-#define APX_NR_ELMS_LOG                 (PREFIX + 1)
-// Number of rows and slots is affected by this. 20 offers the best performance
-// but occasionally misses ~1% of solutions.
-#define NR_ROWS_LOG                     20
-
-// Make hash tables OVERHEAD times larger than necessary to store the average
-// number of elements per row. The ideal value is as small as possible to
-// reduce memory usage, but not too small or else elements are dropped from the
-// hash tables.
-//
-// The actual number of elements per row is closer to the theoretical average
-// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be
-// smaller.
-//
-// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
-// performance as they cause VRAM channel conflicts.
-#if NR_ROWS_LOG == 16
-#define OVERHEAD                        3
-#elif NR_ROWS_LOG == 18
-#define OVERHEAD                        5
-#elif NR_ROWS_LOG == 19
-#define OVERHEAD                        9
-#elif NR_ROWS_LOG == 20
-#define OVERHEAD                        13
-#endif
-
-#define NR_ROWS                         (1 << NR_ROWS_LOG)
-#define NR_SLOTS            ((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)
-// Length of 1 element (slot) in bytes
-#define SLOT_LEN                        32
-// Total size of hash table
-#define HT_SIZE				(NR_ROWS * NR_SLOTS * SLOT_LEN)
-// Length of Zcash block header and nonce
-#define ZCASH_BLOCK_HEADER_LEN		140
-#define ZCASH_NONCE_LEN			32
-// Number of bytes Zcash needs out of Blake
-#define ZCASH_HASH_LEN                  50
-// Number of wavefronts per SIMD for the Blake kernel.
-// Blake is ALU-bound (beside the atomic counter being incremented) so we need
-// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
-// instructions. 10 is the max supported by the hw.
-#define BLAKE_WPS               	10
-#define MAX_SOLS			2000
-
-// Optional features
-#undef ENABLE_DEBUG
-
-/*
-** Return the offset of Xi in bytes from the beginning of the slot.
-*/
-#define xi_offset_for_round(round)	(8 + ((round) / 2) * 4)
-
-// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
-#define SOL_SIZE			((1 << PARAM_K) * 4)
-typedef struct	sols_s
-{
-    uint	nr;
-    uint	likely_invalids;
-    uchar	valid[MAX_SOLS];
-    uint	values[MAX_SOLS][(1 << PARAM_K)];
-}		sols_t;
diff --git a/ocl_silentarmy/sa_blake.cpp b/ocl_silentarmy/sa_blake.cpp
deleted file mode 100644
index c10800de8..000000000
--- a/ocl_silentarmy/sa_blake.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-#include <stdint.h>
-#include <string.h>
-#include <assert.h>
-#include "sa_blake.h"
-
-static const uint32_t   blake2b_block_len = 128;
-static const uint32_t   blake2b_rounds = 12;
-static const uint64_t   blake2b_iv[8] =
-{
-    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
-    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
-    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
-    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
-};
-static const uint8_t    blake2b_sigma[12][16] =
-{
-      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
-      { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
-      { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
-      {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
-      {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
-      {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
-      { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
-      { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
-      {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
-      { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
-      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
-      { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
-};
-
-/*
-** Init the state according to Zcash parameters.
-*/
-void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
-	uint32_t n, uint32_t k)
-{
-    assert(n > k);
-    assert(hash_len <= 64);
-    st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
-    for (uint32_t i = 1; i <= 5; i++)
-        st->h[i] = blake2b_iv[i];
-    st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
-    st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
-    st->bytes = 0;
-}
-
-static uint64_t rotr64(uint64_t a, uint8_t bits)
-{
-    return (a >> bits) | (a << (64 - bits));
-}
-
-static void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd,
-        uint64_t x, uint64_t y)
-{
-    *va = (*va + *vb + x);
-    *vd = rotr64(*vd ^ *va, 32);
-    *vc = (*vc + *vd);
-    *vb = rotr64(*vb ^ *vc, 24);
-    *va = (*va + *vb + y);
-    *vd = rotr64(*vd ^ *va, 16);
-    *vc = (*vc + *vd);
-    *vb = rotr64(*vb ^ *vc, 63);
-}
-
-/*
-** Process either a full message block or the final partial block.
-** Note that v[13] is not XOR'd because st->bytes is assumed to never overflow.
-**
-** _msg         pointer to message (must be zero-padded to 128 bytes if final block)
-** msg_len      must be 128 (<= 128 allowed only for final partial block)
-** is_final     indicate if this is the final block
-*/
-void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg,
-        uint32_t msg_len, uint32_t is_final)
-{
-    const uint64_t      *m = (const uint64_t *)_msg;
-    uint64_t            v[16];
-    assert(msg_len <= 128);
-    assert(st->bytes <= UINT64_MAX - msg_len);
-    memcpy(v + 0, st->h, 8 * sizeof (*v));
-    memcpy(v + 8, blake2b_iv, 8 * sizeof (*v));
-    v[12] ^= (st->bytes += msg_len);
-    v[14] ^= is_final ? -1 : 0;
-    for (uint32_t round = 0; round < blake2b_rounds; round++)
-      {
-        const uint8_t   *s = blake2b_sigma[round];
-        mix(v + 0, v + 4, v + 8,  v + 12, m[s[0]],  m[s[1]]);
-        mix(v + 1, v + 5, v + 9,  v + 13, m[s[2]],  m[s[3]]);
-        mix(v + 2, v + 6, v + 10, v + 14, m[s[4]],  m[s[5]]);
-        mix(v + 3, v + 7, v + 11, v + 15, m[s[6]],  m[s[7]]);
-        mix(v + 0, v + 5, v + 10, v + 15, m[s[8]],  m[s[9]]);
-        mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]);
-        mix(v + 2, v + 7, v + 8,  v + 13, m[s[12]], m[s[13]]);
-        mix(v + 3, v + 4, v + 9,  v + 14, m[s[14]], m[s[15]]);
-      }
-    for (uint32_t i = 0; i < 8; i++)
-        st->h[i] ^= v[i] ^ v[i + 8];
-}
-
-void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
-{
-    assert(outlen <= 64);
-    memcpy(out, st->h, outlen);
-}
diff --git a/ocl_silentarmy/sa_blake.h b/ocl_silentarmy/sa_blake.h
deleted file mode 100644
index 40270a95e..000000000
--- a/ocl_silentarmy/sa_blake.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#pragma once
-typedef struct  blake2b_state_s
-{
-    uint64_t    h[8];
-    uint64_t    bytes;
-}               blake2b_state_t;
-void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
-	uint32_t n, uint32_t k);
-void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg,
-        uint32_t msg_len, uint32_t is_final);
-void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen);
diff --git a/ocl_silentarmy/zcash/gpu/input.cl b/ocl_silentarmy/zcash/gpu/input.cl
deleted file mode 100644
index f5112c816..000000000
--- a/ocl_silentarmy/zcash/gpu/input.cl
+++ /dev/null
@@ -1,704 +0,0 @@
-#include "param.h"
-
-/*
-** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in
-** bytes in parens):
-**
-** round 0, table 0: cnt(4) i(4)                     pad(0)   Xi(23.0) pad(1)
-** round 1, table 1: cnt(4) i(4)                     pad(0.5) Xi(20.5) pad(3)
-** round 2, table 0: cnt(4) i(4) i(4)                pad(0)   Xi(18.0) pad(2)
-** round 3, table 1: cnt(4) i(4) i(4)                pad(0.5) Xi(15.5) pad(4)
-** round 4, table 0: cnt(4) i(4) i(4) i(4)           pad(0)   Xi(13.0) pad(3)
-** round 5, table 1: cnt(4) i(4) i(4) i(4)           pad(0.5) Xi(10.5) pad(5)
-** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4)      pad(0)   Xi( 8.0) pad(4)
-** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4)      pad(0.5) Xi( 5.5) pad(6)
-** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0)   Xi( 3.0) pad(5)
-**
-** If the first byte of Xi is 0xAB then:
-** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi
-** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but
-**   'A' is considered redundant padding as it was used to compute the row #
-**
-** - cnt is an atomic counter keeping track of the number of used slots.
-**   it is used in the first slot only; subsequent slots replace it with
-**   4 padding bytes
-** - i encodes either the 21-bit input value (round 0) or a reference to two
-**   inputs from the previous round
-**
-** Formula for Xi length and pad length above:
-** > for i in range(9):
-** >   xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi
-**
-** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds
-** is the 4 most significant bits of the last byte of Xi.
-*/
-
-__constant ulong blake_iv[] =
-{
-    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
-    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-};
-
-/*
-** Reset counters in hash table.
-*/
-__kernel
-void kernel_init_ht(__global char *ht)
-{
-    uint        tid = get_global_id(0);
-    *(__global uint *)(ht + tid * NR_SLOTS * SLOT_LEN) = 0;
-}
-
-/*
-** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they
-** represent (hex notation, group of 5 hex digits are a group of PREFIX bits):
-**   aa aa ab bb bb cc cc cd dd...  [round 0]
-**         --------------------
-**      ...ab bb bb cc cc cd dd...  [odd round]
-**               --------------
-**               ...cc cc cd dd...  [next even round]
-**                        -----
-** Bytes underlined are going to be stored in the slot. Preceding bytes
-** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are
-** used to compute the row number.
-**
-** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter)
-** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble)
-** TODO: update lines below with padding nibbles
-** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter)
-** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter)
-** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter)
-** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter)
-** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter)
-** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter)
-** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter)
-**
-** Return 0 if successfully stored, or 1 if the row overflowed.
-*/
-uint ht_store(uint round, __global char *ht, uint i,
-        ulong xi0, ulong xi1, ulong xi2, ulong xi3)
-{
-    uint		row;
-    __global char       *p;
-    uint                cnt;
-#if NR_ROWS_LOG == 16
-    if (!(round % 2))
-	row = (xi0 & 0xffff);
-    else
-	// if we have in hex: "ab cd ef..." (little endian xi0) then this
-	// formula computes the row as 0xdebc. it skips the 'a' nibble as it
-	// is part of the PREFIX. The Xi will be stored starting with "ef...";
-	// 'e' will be considered padding and 'f' is part of the current PREFIX
-	row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
-	    ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#elif NR_ROWS_LOG == 18
-    if (!(round % 2))
-	row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6);
-    else
-	row = ((xi0 & 0xc0000) >> 2) |
-	    ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
-	    ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#elif NR_ROWS_LOG == 19
-    if (!(round % 2))
-	row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5);
-    else
-	row = ((xi0 & 0xe0000) >> 1) |
-	    ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
-	    ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#elif NR_ROWS_LOG == 20
-    if (!(round % 2))
-	row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);
-    else
-	row = ((xi0 & 0xf0000) >> 0) |
-	    ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
-	    ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-    xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
-    xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
-    xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
-    p = ht + row * NR_SLOTS * SLOT_LEN;
-    cnt = atomic_inc((__global uint *)p);
-    if (cnt >= NR_SLOTS)
-        return 1;
-    p += cnt * SLOT_LEN + xi_offset_for_round(round);
-    // store "i" (always 4 bytes before Xi)
-    *(__global uint *)(p - 4) = i;
-    if (round == 0 || round == 1)
-      {
-	// store 24 bytes
-	*(__global ulong *)(p + 0) = xi0;
-	*(__global ulong *)(p + 8) = xi1;
-	*(__global ulong *)(p + 16) = xi2;
-      }
-    else if (round == 2)
-      {
-	// store 20 bytes
-	*(__global ulong *)(p + 0) = xi0;
-	*(__global ulong *)(p + 8) = xi1;
-	*(__global uint *)(p + 16) = xi2;
-      }
-    else if (round == 3 || round == 4)
-      {
-	// store 16 bytes
-	*(__global ulong *)(p + 0) = xi0;
-	*(__global ulong *)(p + 8) = xi1;
-
-      }
-    else if (round == 5)
-      {
-	// store 12 bytes
-	*(__global ulong *)(p + 0) = xi0;
-	*(__global uint *)(p + 8) = xi1;
-      }
-    else if (round == 6 || round == 7)
-      {
-	// store 8 bytes
-	*(__global ulong *)(p + 0) = xi0;
-      }
-    else if (round == 8)
-      {
-	// store 4 bytes
-	*(__global uint *)(p + 0) = xi0;
-      }
-    return 0;
-}
-
-#define mix(va, vb, vc, vd, x, y) \
-    va = (va + vb + x); \
-    vd = rotate((vd ^ va), (ulong)64 - 32); \
-    vc = (vc + vd); \
-    vb = rotate((vb ^ vc), (ulong)64 - 24); \
-    va = (va + vb + y); \
-    vd = rotate((vd ^ va), (ulong)64 - 16); \
-    vc = (vc + vd); \
-    vb = rotate((vb ^ vc), (ulong)64 - 63);
-
-/*
-** Execute round 0 (blake).
-**
-** Note: making the work group size less than or equal to the wavefront size
-** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local
-** Memory (LDS) Optimization 2-10" in:
-** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/
-*/
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-void kernel_round0(__global ulong *blake_state, __global char *ht,
-        __global uint *debug)
-{
-    uint                tid = get_global_id(0);
-    ulong               v[16];
-    uint                inputs_per_thread = NR_INPUTS / get_global_size(0);
-    uint                input = tid * inputs_per_thread;
-    uint                input_end = (tid + 1) * inputs_per_thread;
-    uint                dropped = 0;
-    while (input < input_end)
-      {
-        // shift "i" to occupy the high 32 bits of the second ulong word in the
-        // message block
-        ulong word1 = (ulong)input << 32;
-        // init vector v
-        v[0] = blake_state[0];
-        v[1] = blake_state[1];
-        v[2] = blake_state[2];
-        v[3] = blake_state[3];
-        v[4] = blake_state[4];
-        v[5] = blake_state[5];
-        v[6] = blake_state[6];
-        v[7] = blake_state[7];
-        v[8] =  blake_iv[0];
-        v[9] =  blake_iv[1];
-        v[10] = blake_iv[2];
-        v[11] = blake_iv[3];
-        v[12] = blake_iv[4];
-        v[13] = blake_iv[5];
-        v[14] = blake_iv[6];
-        v[15] = blake_iv[7];
-        // mix in length of data
-        v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */;
-        // last block
-        v[14] ^= -1;
-
-        // round 1
-        mix(v[0], v[4], v[8],  v[12], 0, word1);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], 0, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 2
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], word1, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 3
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], 0, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, word1);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 4
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], 0, word1);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], 0, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 5
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], 0, word1);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 6
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], 0, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], word1, 0);
-        // round 7
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], word1, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], 0, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 8
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, word1);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], 0, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 9
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], 0, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], word1, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 10
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], word1, 0);
-        mix(v[0], v[5], v[10], v[15], 0, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 11
-        mix(v[0], v[4], v[8],  v[12], 0, word1);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], 0, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-        // round 12
-        mix(v[0], v[4], v[8],  v[12], 0, 0);
-        mix(v[1], v[5], v[9],  v[13], 0, 0);
-        mix(v[2], v[6], v[10], v[14], 0, 0);
-        mix(v[3], v[7], v[11], v[15], 0, 0);
-        mix(v[0], v[5], v[10], v[15], word1, 0);
-        mix(v[1], v[6], v[11], v[12], 0, 0);
-        mix(v[2], v[7], v[8],  v[13], 0, 0);
-        mix(v[3], v[4], v[9],  v[14], 0, 0);
-
-        // compress v into the blake state; this produces the 50-byte hash
-        // (two Xi values)
-        ulong h[7];
-        h[0] = blake_state[0] ^ v[0] ^ v[8];
-        h[1] = blake_state[1] ^ v[1] ^ v[9];
-        h[2] = blake_state[2] ^ v[2] ^ v[10];
-        h[3] = blake_state[3] ^ v[3] ^ v[11];
-        h[4] = blake_state[4] ^ v[4] ^ v[12];
-        h[5] = blake_state[5] ^ v[5] ^ v[13];
-        h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
-
-        // store the two Xi values in the hash table
-#if ZCASH_HASH_LEN == 50
-        dropped += ht_store(0, ht, input * 2,
-                h[0],
-                h[1],
-                h[2],
-                h[3]);
-        dropped += ht_store(0, ht, input * 2 + 1,
-                (h[3] >> 8) | (h[4] << (64 - 8)),
-                (h[4] >> 8) | (h[5] << (64 - 8)),
-                (h[5] >> 8) | (h[6] << (64 - 8)),
-                (h[6] >> 8));
-#else
-#error "unsupported ZCASH_HASH_LEN"
-#endif
-
-        input++;
-      }
-#ifdef ENABLE_DEBUG
-    debug[tid * 2] = 0;
-    debug[tid * 2 + 1] = dropped;
-#endif
-}
-
-#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
-    ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
-#define DECODE_ROW(REF)		(REF >> 16)
-#define DECODE_SLOT1(REF)	((REF >> 8) & 0xff)
-#define DECODE_SLOT0(REF)	(REF & 0xff)
-
-#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
-    ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
-#define DECODE_ROW(REF)		(REF >> 14)
-#define DECODE_SLOT1(REF)	((REF >> 7) & 0x7f)
-#define DECODE_SLOT0(REF)	(REF & 0x7f)
-
-#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
-    ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
-#define DECODE_ROW(REF)		(REF >> 13)
-#define DECODE_SLOT1(REF)	((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF)	(REF & 0x3f)
-
-#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
-    ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
-#define DECODE_ROW(REF)		(REF >> 12)
-#define DECODE_SLOT1(REF)	((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF)	(REF & 0x3f)
-
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-
-/*
-** XOR a pair of Xi values computed at "round - 1" and store the result in the
-** hash table being built for "round". Note that when building the table for
-** even rounds we need to skip 1 padding byte present in the "round - 1" table
-** (the "0xAB" byte mentioned in the description at the top of this file.) But
-** also note we can't load data directly past this byte because this would
-** cause an unaligned memory access which is undefined per the OpenCL spec.
-**
-** Return 0 if successfully stored, or 1 if the row overflowed.
-*/
-uint xor_and_store(uint round, __global char *ht_dst, uint row,
-	uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
-{
-    ulong	xi0, xi1, xi2;
-#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
-    // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not
-    // storing the byte containing bits from the previous PREFIX block for
-    if (round == 1 || round == 2)
-      {
-	// xor 24 bytes
-	xi0 = *(a++) ^ *(b++);
-	xi1 = *(a++) ^ *(b++);
-	xi2 = *a ^ *b;
-	if (round == 2)
-	  {
-	    // skip padding byte
-	    xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
-	    xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
-	    xi2 = (xi2 >> 8);
-	  }
-      }
-    else if (round == 3)
-      {
-	// xor 20 bytes
-	xi0 = *a++ ^ *b++;
-	xi1 = *a++ ^ *b++;
-	xi2 = *(__global uint *)a ^ *(__global uint *)b;
-      }
-    else if (round == 4 || round == 5)
-      {
-	// xor 16 bytes
-	xi0 = *a++ ^ *b++;
-	xi1 = *a ^ *b;
-	xi2 = 0;
-	if (round == 4)
-	  {
-	    // skip padding byte
-	    xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
-	    xi1 = (xi1 >> 8);
-	  }
-      }
-    else if (round == 6)
-      {
-	// xor 12 bytes
-	xi0 = *a++ ^ *b++;
-	xi1 = *(__global uint *)a ^ *(__global uint *)b;
-	xi2 = 0;
-	if (round == 6)
-	  {
-	    // skip padding byte
-	    xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
-	    xi1 = (xi1 >> 8);
-	  }
-      }
-    else if (round == 7 || round == 8)
-      {
-	// xor 8 bytes
-	xi0 = *a ^ *b;
-	xi1 = 0;
-	xi2 = 0;
-	if (round == 8)
-	  {
-	    // skip padding byte
-	    xi0 = (xi0 >> 8);
-	  }
-      }
-    // invalid solutions (which start happenning in round 5) have duplicate
-    // inputs and xor to zero, so discard them
-    if (!xi0 && !xi1)
-	return 0;
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-    return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b),
-	    xi0, xi1, xi2, 0);
-}
-
-/*
-** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi,
-** store them in ht_dst.
-*/
-void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
-	__global uint *debug)
-{
-    uint                tid = get_global_id(0);
-    uint		tlid = get_local_id(0);
-    __global char       *p;
-    uint                cnt;
-    uchar		first_words[NR_SLOTS];
-    uchar		mask;
-    uint                i, j;
-    // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to
-    // make it even larger
-    ushort		collisions[NR_SLOTS * 3];
-    uint                nr_coll = 0;
-    uint                n;
-    uint                dropped_coll, dropped_stor;
-    __global ulong      *a, *b;
-    uint		xi_offset;
-    // read first words of Xi from the previous (round - 1) hash table
-    xi_offset = xi_offset_for_round(round - 1);
-    // the mask is also computed to read data from the previous round
-#if NR_ROWS_LOG == 16
-    mask = ((!(round % 2)) ? 0x0f : 0xf0);
-#elif NR_ROWS_LOG == 18
-    mask = ((!(round % 2)) ? 0x03 : 0x30);
-#elif NR_ROWS_LOG == 19
-    mask = ((!(round % 2)) ? 0x01 : 0x10);
-#elif NR_ROWS_LOG == 20
-    mask = 0; /* we can vastly simplify the code below */
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-    p = (ht_src + tid * NR_SLOTS * SLOT_LEN);
-    cnt = *(__global uint *)p;
-    cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round
-    p += xi_offset;
-    for (i = 0; i < cnt; i++, p += SLOT_LEN)
-        first_words[i] = *(__global uchar *)p;
-    // find collisions
-    nr_coll = 0;
-    dropped_coll = 0;
-    for (i = 0; i < cnt; i++)
-        for (j = i + 1; j < cnt; j++)
-            if ((first_words[i] & mask) ==
-		    (first_words[j] & mask))
-              {
-                // collision!
-                if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
-                    dropped_coll++;
-                else
-#if NR_SLOTS <= (1 << 8)
-                    // note: this assumes slots can be encoded in 8 bits
-                    collisions[nr_coll++] =
-			((ushort)j << 8) | ((ushort)i & 0xff);
-#else
-#error "unsupported NR_SLOTS"
-#endif
-              }
-    // XOR colliding pairs of Xi
-    dropped_stor = 0;
-    for (n = 0; n < nr_coll; n++)
-      {
-        i = collisions[n] & 0xff;
-        j = collisions[n] >> 8;
-        a = (__global ulong *)
-            (ht_src + tid * NR_SLOTS * SLOT_LEN + i * SLOT_LEN + xi_offset);
-        b = (__global ulong *)
-            (ht_src + tid * NR_SLOTS * SLOT_LEN + j * SLOT_LEN + xi_offset);
-	dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
-      }
-    if (round < 8)
-	// reset the counter in preparation of the next round
-	*(__global uint *)(ht_src + tid * NR_SLOTS * SLOT_LEN) = 0;
-#ifdef ENABLE_DEBUG
-    debug[tid * 2] = dropped_coll;
-    debug[tid * 2 + 1] = dropped_stor;
-#endif
-}
-
-/*
-** This defines kernel_round1, kernel_round2, ..., kernel_round7.
-*/
-#define KERNEL_ROUND(N) \
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) \
-void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \
-	__global uint *debug) \
-{ \
-    equihash_round(N, ht_src, ht_dst, debug); \
-}
-KERNEL_ROUND(1)
-KERNEL_ROUND(2)
-KERNEL_ROUND(3)
-KERNEL_ROUND(4)
-KERNEL_ROUND(5)
-KERNEL_ROUND(6)
-KERNEL_ROUND(7)
-
-// kernel_round8 takes an extra argument, "sols"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-void kernel_round8(__global char *ht_src, __global char *ht_dst,
-	__global uint *debug, __global sols_t *sols)
-{
-    uint                tid = get_global_id(0);
-    equihash_round(8, ht_src, ht_dst, debug);
-    if (!tid)
-	sols->nr = sols->likely_invalids = 0;
-}
-
-uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
-{
-    return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN +
-	    slot * SLOT_LEN + xi_offset - 4);
-}
-
-void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
-	uint round)
-{
-    __global char	*ht = htabs[round % 2];
-    uint		i = nr_inputs - 1;
-    uint		j = nr_inputs * 2 - 1;
-    uint		xi_offset = xi_offset_for_round(round);
-    do
-      {
-	ins[j] = expand_ref(ht, xi_offset,
-		DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i]));
-	ins[j - 1] = expand_ref(ht, xi_offset,
-		DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i]));
-	if (!i)
-	    break ;
-	i--;
-	j -= 2;
-      }
-    while (1);
-}
-
-/*
-** Verify if a potential solution is in fact valid.
-*/
-void potential_sol(__global char **htabs, __global sols_t *sols,
-	uint ref0, uint ref1)
-{
-    uint	sol_i;
-    uint	nr_values;
-    sol_i = atomic_inc(&sols->nr);
-    if (sol_i >= MAX_SOLS)
-	return ;
-    sols->valid[sol_i] = 0;
-    nr_values = 0;
-    sols->values[sol_i][nr_values++] = ref0;
-    sols->values[sol_i][nr_values++] = ref1;
-    uint round = PARAM_K - 1;
-    do
-      {
-	round--;
-	expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
-	nr_values *= 2;
-      }
-    while (round > 0);
-    sols->valid[sol_i] = 1;
-}
-
-/*
-** Scan the hash tables to find Equihash solutions.
-*/
-__kernel
-void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
-{
-    uint		tid = get_global_id(0);
-    __global char	*htabs[2] = { ht0, ht1 };
-    uint		ht_i = (PARAM_K - 1) % 2; // table filled at last round
-    uint		cnt;
-    uint		xi_offset = xi_offset_for_round(PARAM_K - 1);
-    uint		i, j;
-    __global char	*a, *b;
-    uint		ref_i, ref_j;
-    // it's ok for the collisions array to be so small, as if it fills up
-    // the potential solutions are likely invalid (many duplicate inputs)
-    ulong		collisions[5];
-    uint		coll;
-#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
-    // in the final hash table, we are looking for a match on both the bits
-    // part of the previous PREFIX colliding bits, and the last PREFIX bits.
-    uint		mask = 0xffffff;
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-    a = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN;
-    cnt = *(__global uint *)a;
-    cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round
-    coll = 0;
-    a += xi_offset;
-    for (i = 0; i < cnt; i++, a += SLOT_LEN)
-	for (j = i + 1, b = a + SLOT_LEN; j < cnt; j++, b += SLOT_LEN)
-	    if (((*(__global uint *)a) & mask) ==
-		    ((*(__global uint *)b) & mask))
-	      {
-		ref_i = *(__global uint *)(a - 4);
-		ref_j = *(__global uint *)(b - 4);
-		if (coll < sizeof (collisions) / sizeof (*collisions))
-		    collisions[coll++] = ((ulong)ref_i << 32) | ref_j;
-		else
-		    atomic_inc(&sols->likely_invalids);
-	      }
-    if (!coll)
-	return ;
-    for (i = 0; i < coll; i++)
-	potential_sol(htabs, sols, collisions[i] >> 32,
-		collisions[i] & 0xffffffff);
-}
diff --git a/ocl_silentarmy/zcash/gpu/kernel.cl b/ocl_silentarmy/zcash/gpu/kernel.cl
deleted file mode 100644
index 0fdc74d83..000000000
--- a/ocl_silentarmy/zcash/gpu/kernel.cl
+++ /dev/null
@@ -1,555 +0,0 @@
-# 1 "input.cl"
-# 1 "<built-in>"
-# 1 "<command-line>"
-# 1 "/usr/include/stdc-predef.h" 1 3 4
-# 1 "<command-line>" 2
-# 1 "input.cl"
-# 1 "param.h" 1
-# 60 "param.h"
-typedef struct sols_s // solution buffer type; NOTE(review): no code in this part of the file shows how it is filled
-{
-    uint nr; // presumably the number of solution entries in use - TODO confirm against the writer
-    uint likely_invalids; // presumably a counter of discarded candidate pairs - TODO confirm
-    uchar valid[2000]; // one flag per entry; presumably non-zero marks a usable solution - TODO confirm
-    uint values[2000][(1 << 9)]; // 2000 entries of 512 uints each
-} sols_t;
-# 2 "input.cl" 2
-# 36 "input.cl"
-__constant ulong blake_iv[] = // the eight 64-bit BLAKE2b initialization constants; kernel_round0 loads them into v[8..15]
-{
-    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
-    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-};
-
-
-
-
-__kernel
-void kernel_init_ht(__global char *ht) // each work-item zeroes the leading uint of one hash-table row
-{
-    uint tid = get_global_id(0); // row index handled by this work-item
-    *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0; // row stride is (1 << 1) * 9 * 32 = 576 bytes; ht_store() uses this uint as the row's slot counter
-}
-# 80 "input.cl"
-uint ht_store(uint round, __global char *ht, uint i, // stores the value xi0..xi3 plus reference i in table ht
-        ulong xi0, ulong xi1, ulong xi2, ulong xi3) // returns 1 when the target row is already full, 0 after a successful store
-{
-    uint row;
-    __global char *p;
-    uint cnt;
-# 111 "input.cl"
-    if (!(round % 2))
- row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); // even rounds: bits 0-15 of xi0 plus bits 20-23 moved down to 16-19
-    else
- row = ((xi0 & 0xf0000) >> 0) | // odd rounds: 20-bit row built from nibbles 4, 2, 5, 0, 3 of xi0 (most- to least-significant)
-     ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
-     ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-
-
-
-    xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); // shift the xi0..xi3 sequence right by 16 bits; only xi0..xi2 are used below
-    xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
-    xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
-    p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32; // start of the row: 18 slots of 32 bytes = 576 bytes per row
-    cnt = atomic_inc((__global uint *)p); // claim the next slot index; the counter is the row's first uint
-    if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)) // 18 slots per row; the counter stays incremented even when full
-        return 1;
-    p += cnt * 32 + (8 + ((round) / 2) * 4); // data offset inside the slot: 8 bytes, plus 4 more for every two rounds
-
-    *(__global uint *)(p - 4) = i; // reference goes in the 4 bytes just before the data
-    if (round == 0 || round == 1)
-      {
-
- *(__global ulong *)(p + 0) = xi0; // rounds 0-1: 24 bytes stored
- *(__global ulong *)(p + 8) = xi1;
- *(__global ulong *)(p + 16) = xi2;
-      }
-    else if (round == 2)
-      {
-
- *(__global ulong *)(p + 0) = xi0; // round 2: 20 bytes stored
- *(__global ulong *)(p + 8) = xi1;
- *(__global uint *)(p + 16) = xi2;
-      }
-    else if (round == 3 || round == 4)
-      {
-
- *(__global ulong *)(p + 0) = xi0; // rounds 3-4: 16 bytes stored
- *(__global ulong *)(p + 8) = xi1;
-
-      }
-    else if (round == 5)
-      {
-
- *(__global ulong *)(p + 0) = xi0; // round 5: 12 bytes stored
- *(__global uint *)(p + 8) = xi1;
-      }
-    else if (round == 6 || round == 7)
-      {
-
- *(__global ulong *)(p + 0) = xi0; // rounds 6-7: 8 bytes stored
-      }
-    else if (round == 8)
-      {
-
- *(__global uint *)(p + 0) = xi0; // round 8: 4 bytes stored
-      }
-    return 0;
-}
-# 188 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-void kernel_round0(__global ulong *blake_state, __global char *ht, // round 0: hash each input index and store two values per hash into ht via ht_store()
-        __global uint *debug) // debug is not referenced anywhere in this body
-{
-    uint tid = get_global_id(0);
-    ulong v[16];
-    uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0); // (1 << 20) inputs divided evenly among the work-items
-    uint input = tid * inputs_per_thread;
-    uint input_end = (tid + 1) * inputs_per_thread;
-    uint dropped = 0; // sums ht_store() return values; never read in this body
-    while (input < input_end)
-      {
-
-
-        ulong word1 = (ulong)input << 32; // input index in the high 32 bits; the only non-zero message word used below
-
-        v[0] = blake_state[0]; // v[0..7]: state supplied by the caller
-        v[1] = blake_state[1];
-        v[2] = blake_state[2];
-        v[3] = blake_state[3];
-        v[4] = blake_state[4];
-        v[5] = blake_state[5];
-        v[6] = blake_state[6];
-        v[7] = blake_state[7];
-        v[8] = blake_iv[0]; // v[8..15]: the blake_iv constants
-        v[9] = blake_iv[1];
-        v[10] = blake_iv[2];
-        v[11] = blake_iv[3];
-        v[12] = blake_iv[4];
-        v[13] = blake_iv[5];
-        v[14] = blake_iv[6];
-        v[15] = blake_iv[7];
-
-        v[12] ^= 140 + 4 ; // NOTE(review): presumably the BLAKE2b byte counter (140-byte header + 4-byte index) - confirm
-
-        v[14] ^= -1; // -1 converts to an all-ones ulong, so every bit of v[14] is flipped
-
-        // 12 unrolled rounds of 8 mixing lines each follow; word1 is added exactly once per round, all other message words are the literal 0
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-        v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);;
-        v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);;
-        v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);;
-        v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);;
-        v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);;
-
-
-        // fold the working vector back into the caller's state to obtain the output words
-        ulong h[7]; // h[0..5] in full plus the low 16 bits of h[6]: 400 output bits
-        h[0] = blake_state[0] ^ v[0] ^ v[8];
-        h[1] = blake_state[1] ^ v[1] ^ v[9];
-        h[2] = blake_state[2] ^ v[2] ^ v[10];
-        h[3] = blake_state[3] ^ v[3] ^ v[11];
-        h[4] = blake_state[4] ^ v[4] ^ v[12];
-        h[5] = blake_state[5] ^ v[5] ^ v[13];
-        h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; // only the low 16 bits are kept
-
-
-
-        dropped += ht_store(0, ht, input * 2, // first value: output starting at bit 0, stored with reference input * 2
-                h[0],
-                h[1],
-                h[2],
-                h[3]);
-        dropped += ht_store(0, ht, input * 2 + 1, // second value: output starting at bit 200 (h[3] >> 8), reference input * 2 + 1
-                (h[3] >> 8) | (h[4] << (64 - 8)),
-                (h[4] >> 8) | (h[5] << (64 - 8)),
-                (h[5] >> 8) | (h[6] << (64 - 8)),
-                (h[6] >> 8));
-
-
-
-
-        input++;
-      }
-
-
-
-
-}
-# 415 "input.cl"
-uint xor_and_store(uint round, __global char *ht_dst, uint row,
- uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
-{
-    ulong xi0, xi1, xi2;
-
-
-
-    if (round == 1 || round == 2)
-      {
-
- xi0 = *(a++) ^ *(b++);
- xi1 = *(a++) ^ *(b++);
- xi2 = *a ^ *b;
- if (round == 2)
-   {
-
-     xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
-     xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
-     xi2 = (xi2 >> 8);
-   }
-      }
-    else if (round == 3)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *a++ ^ *b++;
- xi2 = *(__global uint *)a ^ *(__global uint *)b;
-      }
-    else if (round == 4 || round == 5)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *a ^ *b;
- xi2 = 0;
- if (round == 4)
-   {
-
-     xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
-     xi1 = (xi1 >> 8);
-   }
-      }
-    else if (round == 6)
-      {
-
- xi0 = *a++ ^ *b++;
- xi1 = *(__global uint *)a ^ *(__global uint *)b;
- xi2 = 0;
- if (round == 6)
-   {
-
-     xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
-     xi1 = (xi1 >> 8);
-   }
-      }
-    else if (round == 7 || round == 8)
-      {
-
- xi0 = *a ^ *b;
- xi1 = 0;
- xi2 = 0;
- if (round == 8)
-   {
-
-     xi0 = (xi0 >> 8);
-   }
-      }
-
-
-    if (!xi0 && !xi1)
- return 0;
-
-
-
-    return ht_store(round, ht_dst, ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)),
-     xi0, xi1, xi2, 0);
-}
-
-
-
-
-
-void equihash_round(uint round, __global char *ht_src, __global char *ht_dst,
- __global uint *debug)
-{
-    uint tid = get_global_id(0);
-    uint tlid = get_local_id(0);
-    __global char *p;
-    uint cnt;
-    uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)];
-    uchar mask;
-    uint i, j;
-
-
-    ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 3];
-    uint nr_coll = 0;
-    uint n;
-    uint dropped_coll, dropped_stor;
-    __global ulong *a, *b;
-    uint xi_offset;
-
-    xi_offset = (8 + ((round - 1) / 2) * 4);
-# 524 "input.cl"
-    mask = 0;
-
-
-
-    p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32);
-    cnt = *(__global uint *)p;
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9));
-    p += xi_offset;
-    for (i = 0; i < cnt; i++, p += 32)
-        first_words[i] = *(__global uchar *)p;
-
-    nr_coll = 0;
-    dropped_coll = 0;
-    for (i = 0; i < cnt; i++)
-        for (j = i + 1; j < cnt; j++)
-            if ((first_words[i] & mask) ==
-      (first_words[j] & mask))
-              {
-
-                if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
-                    dropped_coll++;
-                else
-
-
-                    collisions[nr_coll++] =
-   ((ushort)j << 8) | ((ushort)i & 0xff);
-
-
-
-              }
-
-    dropped_stor = 0;
-    for (n = 0; n < nr_coll; n++)
-      {
-        i = collisions[n] & 0xff;
-        j = collisions[n] >> 8;
-        a = (__global ulong *)
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + i * 32 + xi_offset);
-        b = (__global ulong *)
-            (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + j * 32 + xi_offset);
- dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
-      }
-    if (round < 8)
-
- *(__global uint *)(ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0;
-
-
-
-
-}
-# 585 "input.cl"
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); }
-__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); }
-
-
-__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
-void kernel_round8(__global char *ht_src, __global char *ht_dst,
- __global uint *debug, __global sols_t *sols)
-{
-    uint tid = get_global_id(0);
-    equihash_round(8, ht_src, ht_dst, debug);
-    if (!tid)
- sols->nr = sols->likely_invalids = 0;
-}
-
-uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
-{
-    return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 +
-     slot * 32 + xi_offset - 4);
-}
-
-void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
- uint round)
-{
-    __global char *ht = htabs[round % 2];
-    uint i = nr_inputs - 1;
-    uint j = nr_inputs * 2 - 1;
-    uint xi_offset = (8 + ((round) / 2) * 4);
-    do
-      {
- ins[j] = expand_ref(ht, xi_offset,
-  (ins[i] >> 12), ((ins[i] >> 6) & 0x3f));
- ins[j - 1] = expand_ref(ht, xi_offset,
-  (ins[i] >> 12), (ins[i] & 0x3f));
- if (!i)
-     break ;
- i--;
- j -= 2;
-      }
-    while (1);
-}
-
-
-
-
-void potential_sol(__global char **htabs, __global sols_t *sols,
- uint ref0, uint ref1)
-{
-    uint sol_i;
-    uint nr_values;
-    sol_i = atomic_inc(&sols->nr);
-    if (sol_i >= 2000)
- return ;
-    sols->valid[sol_i] = 0;
-    nr_values = 0;
-    sols->values[sol_i][nr_values++] = ref0;
-    sols->values[sol_i][nr_values++] = ref1;
-    uint round = 9 - 1;
-    do
-      {
- round--;
- expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
- nr_values *= 2;
-      }
-    while (round > 0);
-    sols->valid[sol_i] = 1;
-}
-
-
-
-
-__kernel
-void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols)
-{
-    uint tid = get_global_id(0);
-    __global char *htabs[2] = { ht0, ht1 };
-    uint ht_i = (9 - 1) % 2;
-    uint cnt;
-    uint xi_offset = (8 + ((9 - 1) / 2) * 4);
-    uint i, j;
-    __global char *a, *b;
-    uint ref_i, ref_j;
-
-
-    ulong collisions[5];
-    uint coll;
-
-
-
-    uint mask = 0xffffff;
-
-
-
-    a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32;
-    cnt = *(__global uint *)a;
-    cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9));
-    coll = 0;
-    a += xi_offset;
-    for (i = 0; i < cnt; i++, a += 32)
- for (j = i + 1, b = a + 32; j < cnt; j++, b += 32)
-     if (((*(__global uint *)a) & mask) ==
-      ((*(__global uint *)b) & mask))
-       {
-  ref_i = *(__global uint *)(a - 4);
-  ref_j = *(__global uint *)(b - 4);
-  if (coll < sizeof (collisions) / sizeof (*collisions))
-      collisions[coll++] = ((ulong)ref_i << 32) | ref_j;
-  else
-      atomic_inc(&sols->likely_invalids);
-       }
-    if (!coll)
- return ;
-    for (i = 0; i < coll; i++)
- potential_sol(htabs, sols, collisions[i] >> 32,
-  collisions[i] & 0xffffffff);
-}
diff --git a/ocl_xpm/ocl_xmp.cpp b/ocl_xpm/ocl_xmp.cpp
deleted file mode 100644
index d0a96a2a8..000000000
--- a/ocl_xpm/ocl_xmp.cpp
+++ /dev/null
@@ -1,305 +0,0 @@
-#include "ocl_xmp.hpp"
-
-
-
-// miner instance
-#include "opencl.h"
-#include <cstdint>
-
-#include <boost/filesystem.hpp>
-
-// is this really needed?
-//#include "uint256.h"
-
-// hardcoded defines, looks like not working
-// hardcoded defines fix this
-#define RESTBITS 4
-#define XINTREE
-#define UNROLL
-#define __OPENCL_HOST__
-#include "zcash/gpu/common.h"
-
-struct MinerInstance {
-	cl_context _context;
-	cl_program _program;
-
-	cl_command_queue queue;
-	clBuffer<blake2b_state> blake2bState;
-	clBuffer<uint32_t> heap0;
-	clBuffer<uint32_t> heap1;
-	clBuffer<bsizes> nslots;
-	clBuffer<proof> sols;
-	clBuffer<uint32_t> numSols;
-	cl_kernel _digitHKernel;
-	cl_kernel _digitOKernel;
-	cl_kernel _digitEKernel;
-	cl_kernel _digitKKernel;
-	cl_kernel _digitKernels[9];
-
-	//hide_xmp_hack::uint256 nonce; // TODO IS THIS NEEDED????
-
-	bool init(cl_context context, cl_program program, cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock);
-};
-
-cl_context gContext = 0;
-cl_program gProgram = 0;
-cl_platform_id gPlatform = 0;
-
-
-bool MinerInstance::init(cl_context context,
-	cl_program program,
-	cl_device_id dev,
-	unsigned int threadsNum,
-	unsigned int threadsPerBlock)
-{
-	cl_int error;
-
-	_context = context;
-	_program = program;
-	queue = clCreateCommandQueue(context, dev, 0, &error);
-
-	blake2bState.init(context, 1, CL_MEM_READ_WRITE);
-	heap0.init(context, sizeof(digit0) / sizeof(uint32_t), CL_MEM_HOST_NO_ACCESS);
-	heap1.init(context, sizeof(digit1) / sizeof(uint32_t), CL_MEM_HOST_NO_ACCESS);
-	nslots.init(context, 2, CL_MEM_READ_WRITE);
-	sols.init(context, MAXSOLS, CL_MEM_READ_WRITE);
-	numSols.init(context, 1, CL_MEM_READ_WRITE);
-
-	_digitHKernel = clCreateKernel(program, "digitH", &error);
-	_digitOKernel = clCreateKernel(program, "digitOdd", &error);
-	_digitEKernel = clCreateKernel(program, "digitEven", &error);
-	_digitKKernel = clCreateKernel(program, "digitK", &error);
-	OCLR(clSetKernelArg(_digitHKernel, 0, sizeof(cl_mem), &blake2bState.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitHKernel, 1, sizeof(cl_mem), &heap0.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitHKernel, 2, sizeof(cl_mem), &nslots.DeviceData), 1);
-
-	OCLR(clSetKernelArg(_digitOKernel, 1, sizeof(cl_mem), &heap0.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitOKernel, 2, sizeof(cl_mem), &heap1.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitOKernel, 3, sizeof(cl_mem), &nslots.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitEKernel, 1, sizeof(cl_mem), &heap0.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitEKernel, 2, sizeof(cl_mem), &heap1.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitEKernel, 3, sizeof(cl_mem), &nslots.DeviceData), 1);
-
-	for (unsigned i = 1; i <= 8; i++) {
-		char kernelName[32];
-		sprintf(kernelName, "digit_%u", i);
-		_digitKernels[i] = clCreateKernel(program, kernelName, &error);
-		OCLR(clSetKernelArg(_digitKernels[i], 0, sizeof(cl_mem), &heap0.DeviceData), 1);
-		OCLR(clSetKernelArg(_digitKernels[i], 1, sizeof(cl_mem), &heap1.DeviceData), 1);
-		OCLR(clSetKernelArg(_digitKernels[i], 2, sizeof(cl_mem), &nslots.DeviceData), 1);
-	}
-
-	OCLR(clSetKernelArg(_digitKKernel, 0, sizeof(cl_mem), &heap0.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitKKernel, 1, sizeof(cl_mem), &heap1.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitKKernel, 2, sizeof(cl_mem), &nslots.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitKKernel, 3, sizeof(cl_mem), &sols.DeviceData), 1);
-	OCLR(clSetKernelArg(_digitKKernel, 4, sizeof(cl_mem), &numSols.DeviceData), 1);
-
-	return true;
-}
-
-////////////////////////////
-////statics non class START
-
-static void setheader(blake2b_state *ctx, const char *header, const uint32_t headerlen)
-{
-	uint32_t le_N = WN;
-	uint32_t le_K = WK;
-	char personal[] = "ZcashPoW01230123";
-	memcpy(personal + 8, &le_N, 4);
-	memcpy(personal + 12, &le_K, 4);
-	blake2b_param P[1];
-	P->digest_length = HASHOUT;
-	P->key_length = 0;
-	P->fanout = 1;
-	P->depth = 1;
-	P->leaf_length = 0;
-	P->node_offset = 0;
-	P->node_depth = 0;
-	P->inner_length = 0;
-	memset(P->reserved, 0, sizeof(P->reserved));
-	memset(P->salt, 0, sizeof(P->salt));
-	memcpy(P->personal, (const uint8_t *)personal, 16);
-	blake2b_init_param(ctx, P);
-	blake2b_update(ctx, (const uint8_t*)header, headerlen);
-}
-
-static void setnonce(blake2b_state *ctx, const uint8_t *nonce)
-{
-	blake2b_update(ctx, nonce, 32);
-}
-
-static int inline digit(cl_command_queue clQueue, cl_kernel kernel, size_t nthreads, size_t threadsPerBlock)
-{
-	size_t globalSize[] = { nthreads, 1, 1 };
-	size_t localSize[] = { threadsPerBlock, 1 };
-	OCLR(clEnqueueNDRangeKernel(clQueue, kernel, 1, 0, globalSize, localSize, 0, 0, 0), 1);
-	return 0;
-}
-
-
-////statics non class END
-////////////////////////////
-
-ocl_xmp::ocl_xmp(int platf_id, int dev_id) { /*TODO*/
-	platform_id = platf_id;
-	device_id = dev_id;
-	// TODO 
-	threadsNum = 8192;
-	wokrsize = 128; // 256;
-	//threadsperblock = 128;
-}
-
-std::string ocl_xmp::getdevinfo() { /*TODO*/
-	return "GPU_ID(" + std::to_string(device_id) + ")";
-}
-
-// STATICS START
-int ocl_xmp::getcount() { /*TODO*/
-	return 0;
-}
-
-void ocl_xmp::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) { /*TODO*/ }
-
-void ocl_xmp::start(ocl_xmp& device_context) {
-	/*TODO*/
-	device_context.is_init_success = false;
-	cl_context gContext[64] = { 0 };
-	cl_program gProgram[64] = { 0 };
-
-	
-	std::vector<cl_device_id> allGpus;
-	if (!clInitialize(device_context.platform_id, allGpus)) {
-		return;
-	}
-	
-	// this is kinda stupid but it works
-	std::vector<cl_device_id> gpus;
-	for (unsigned i = 0; i < allGpus.size(); ++i) {
-		if (i == device_context.device_id) {
-			printf("Using device %d as GPU %d\n", i, (int)gpus.size());
-			gpus.push_back(allGpus[i]);
-		}
-	}
-
-	if (!gpus.size()){
-		printf("Device id %d not found\n", device_context.device_id);
-		return;
-	}
-
-	// context create
-	for (unsigned i = 0; i < gpus.size(); i++) {
-		cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)gPlatform, 0 };
-		cl_int error;
-		gContext[i] = clCreateContext(props, 1, &gpus[i], 0, 0, &error);
-		//OCLR(error, false);
-		if (cl_int err = error) {
-			printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
-			return;
-		}
-	}
-
-	std::vector<cl_int> binstatus;
-	binstatus.resize(gpus.size());
-
-	for (size_t i = 0; i < gpus.size(); i++) {
-		char kernelName[64];
-		sprintf(kernelName, "equiw200k9_gpu%u.bin", (unsigned)i);
-		if (!clCompileKernel(gContext[i],
-			gpus[i],
-			kernelName,
-			{ "zcash/gpu/equihash.cl" },
-			"-I./zcash/gpu -DXINTREE -DWN=200 -DWK=9 -DRESTBITS=4 -DUNROLL",
-			&binstatus[i],
-			&gProgram[i])) {
-			return;
-		}
-	}
-
-	for (unsigned i = 0; i < gpus.size(); ++i) {
-		if (binstatus[i] == CL_SUCCESS) {
-			device_context.context = new MinerInstance();
-			if (!device_context.context->init(gContext[i], gProgram[i], gpus[i], device_context.threadsNum, device_context.wokrsize)) {
-				printf("Init failed");
-				return;
-			}
-		}
-		else {
-			printf("GPU %d: failed to load kernel\n", i);
-			return;
-		}
-	}
-
-	device_context.is_init_success = true;
-}
-
-void ocl_xmp::stop(ocl_xmp& device_context) { /*TODO*/ }
-
-void ocl_xmp::solve(const char *tequihash_header,
-	unsigned int tequihash_header_len,
-	const char* nonce,
-	unsigned int nonce_len,
-	std::function<bool()> cancelf,
-	std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
-	std::function<void(void)> hashdonef,
-	ocl_xmp& device_context) {
-	if (device_context.is_init_success == false) {
-		printf("fail OCL\n");
-		//cancelf();
-		return;
-	}
-
-	// move to context or somewhere or leave?
-	blake2b_state initialCtx;
-	setheader(&initialCtx, tequihash_header, tequihash_header_len);
-
-	MinerInstance *miner = device_context.context;
-	clFlush(miner->queue);
-
-	/*hide_xmp_hack::uint256 nNonce = hide_xmp_hack::uint256(nonce);
-	miner->nonce = nNonce;*/
-	*miner->blake2bState.HostData = initialCtx;
-	setnonce(miner->blake2bState.HostData, (const uint8_t*)nonce);
-	memset(miner->nslots.HostData, 0, 2 * sizeof(bsizes));
-	*miner->numSols.HostData = 0;
-	miner->blake2bState.copyToDevice(miner->queue, false);
-	miner->nslots.copyToDevice(miner->queue, false);
-	miner->numSols.copyToDevice(miner->queue, false);
-
-	digit(miner->queue, miner->_digitHKernel, device_context.threadsNum, device_context.wokrsize);
-#if BUCKBITS == 16 && RESTBITS == 4 && defined XINTREE && defined(UNROLL)
-	for (unsigned i = 1; i <= 8; i++)
-		digit(miner->queue, miner->_digitKernels[i], device_context.threadsNum, device_context.wokrsize);
-#else    
-	size_t globalSize[] = { _threadsNum, 1, 1 };
-	size_t localSize[] = { _threadsPerBlocksNum, 1 };
-	for (unsigned r = 1; r < WK; r++) {
-		if (r & 1) {
-			OCL(clSetKernelArg(miner->_digitOKernel, 0, sizeof(cl_uint), &r));
-			OCL(clEnqueueNDRangeKernel(miner->queue, miner->_digitOKernel, 1, 0, globalSize, localSize, 0, 0, 0));
-		}
-		else {
-			OCL(clSetKernelArg(miner->_digitEKernel, 0, sizeof(cl_uint), &r));
-			OCL(clEnqueueNDRangeKernel(miner->queue, miner->_digitEKernel, 1, 0, globalSize, localSize, 0, 0, 0));
-		}
-	}
-#endif
-	digit(miner->queue, miner->_digitKKernel, device_context.threadsNum, device_context.wokrsize);
-
-	// get solutions
-	miner->sols.copyToHost(miner->queue, true);
-	miner->numSols.copyToHost(miner->queue, true);
-	for (unsigned s = 0; s < miner->numSols.HostData[0]; s++)
-	{
-		std::vector<uint32_t> index_vector(PROOFSIZE);
-		for (u32 i = 0; i < PROOFSIZE; i++) {
-			index_vector[i] = miner->sols[s][i];
-		}
-
-		solutionf(index_vector, DIGITBITS, nullptr);
-		if (cancelf()) return;
-	}
-	hashdonef();
-}
-
-// STATICS END
\ No newline at end of file
diff --git a/ocl_xpm/ocl_xpm.vcxproj b/ocl_xpm/ocl_xpm.vcxproj
deleted file mode 100644
index 7c2c299c5..000000000
--- a/ocl_xpm/ocl_xpm.vcxproj
+++ /dev/null
@@ -1,100 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="ocl_xmp.hpp" />
-    <ClInclude Include="zcash\gpu\common.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="..\cpu_tromp\blake2\blake2bx.cpp" />
-    <ClCompile Include="ocl_xmp.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="zcash\gpu\equihash.cl" />
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{5EC9EDEB-8E49-4126-9161-1560683CBC71}</ProjectGuid>
-    <Keyword>Win32Proj</Keyword>
-    <RootNamespace>ocl_xpm</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <LinkIncremental>true</LinkIncremental>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <LinkIncremental>false</LinkIncremental>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;OCL_XPM_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <SDLCheck>
-      </SDLCheck>
-      <AdditionalIncludeDirectories>..\ocl_device_utils;..\cpu_tromp;..\3rdparty\include;$(AMDAPPSDKROOT)\include\</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <AdditionalLibraryDirectories>..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\</AdditionalLibraryDirectories>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;OCL_XPM_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <SDLCheck>
-      </SDLCheck>
-      <AdditionalIncludeDirectories>..\ocl_device_utils;..\3rdparty\include;$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <AdditionalLibraryDirectories>..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\</AdditionalLibraryDirectories>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/ocl_xpm/ocl_xpm.vcxproj.filters b/ocl_xpm/ocl_xpm.vcxproj.filters
deleted file mode 100644
index ae440bef8..000000000
--- a/ocl_xpm/ocl_xpm.vcxproj.filters
+++ /dev/null
@@ -1,26 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup>
-    <Filter Include="zcash">
-      <UniqueIdentifier>{69f1aa4c-1be3-4265-a93c-b58266bad10b}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="zcash\gpu">
-      <UniqueIdentifier>{a95c2e64-90c0-48d9-9287-46723392025d}</UniqueIdentifier>
-    </Filter>
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="ocl_xmp.hpp" />
-    <ClInclude Include="zcash\gpu\common.h">
-      <Filter>zcash\gpu</Filter>
-    </ClInclude>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="ocl_xmp.cpp" />
-    <ClCompile Include="..\cpu_tromp\blake2\blake2bx.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="zcash\gpu\equihash.cl">
-      <Filter>zcash\gpu</Filter>
-    </None>
-  </ItemGroup>
-</Project>
\ No newline at end of file
diff --git a/ocl_xpm/zcash/gpu/blake2bcl.h b/ocl_xpm/zcash/gpu/blake2bcl.h
deleted file mode 100644
index 13cad965c..000000000
--- a/ocl_xpm/zcash/gpu/blake2bcl.h
+++ /dev/null
@@ -1,150 +0,0 @@
-// Blake2-B CUDA Implementation
-// tpruvot@github July 2016
-// permission granted to use under MIT license
-// modified for use in Zcash by John Tromp September 2016
-
-/**
- * uint2 direct ops by c++ operator definitions
- */
-
-// static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) {
-//   return make_uint2(a.x ^ b.x, a.y ^ b.y);
-// }
-
-// uint2 ROR/ROL methods
-uint2 ROR2(const uint2 a, const int offset) {
-  uint2 result;
-  if (!offset)
-          result = a;
-  else if (offset < 32) {
-          result.y = ((a.y >> offset) | (a.x << (32 - offset)));
-          result.x = ((a.x >> offset) | (a.y << (32 - offset)));
-  } else if (offset == 32) {
-          result.y = a.x;
-          result.x = a.y;
-  } else {
-          result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
-          result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
-  }
-  return result;
-}
-
-uint2 SWAPUINT2(uint2 value) {
-  uint2 result;
-  result.x = value.y;
-  result.y = value.x;
-  return result;
-//   return make_uint2(value.y, value.x);
-}
-
-#define ROR24(u) ROR2(u,24)
-#define ROR16(u) ROR2(u,16)
-
-__constant int8_t blake2b_sigma[12][16] = {
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
-  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  } ,
-  { 11, 8,  12, 0,  5,  2,  15, 13, 10, 14, 3,  6,  7,  1,  9,  4  } ,
-  { 7,  9,  3,  1,  13, 12, 11, 14, 2,  6,  5,  10, 4,  0,  15, 8  } ,
-  { 9,  0,  5,  7,  2,  4,  10, 15, 14, 1,  11, 12, 6,  8,  3,  13 } ,
-  { 2,  12, 6,  10, 0,  11, 8,  3,  4,  13, 7,  5,  15, 14, 1,  9  } ,
-  { 12, 5,  1,  15, 14, 13, 4,  10, 0,  7,  6,  3,  9,  2,  8,  11 } ,
-  { 13, 11, 7,  14, 12, 1,  3,  9,  5,  0,  15, 4,  8,  6,  2,  10 } ,
-  { 6,  15, 14, 9,  11, 3,  0,  8,  12, 2,  13, 7,  1,  4,  10, 5  } ,
-  { 10, 2,  8,  4,  7,  6,  1,  5,  15, 11, 9,  14, 3,  12, 13, 0  } ,
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 } ,
-  { 14, 10, 4,  8,  9,  15, 13, 6,  1,  12, 0,  2,  11, 7,  5,  3  }
-};
-
-void G(const int32_t r, const int32_t i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, uint64_t const m[16]) {
-  *a += *b + m[ blake2b_sigma[r][2*i] ];
-  ((uint2*)d)[0] = SWAPUINT2( ((uint2*)d)[0] ^ ((uint2*)a)[0] );
-  *c += *d;
-  ((uint2*)b)[0] = ROR24( ((uint2*)b)[0] ^ ((uint2*)c)[0] );
-  *a += *b + m[ blake2b_sigma[r][2*i+1] ];
-  ((uint2*)d)[0] = ROR16( ((uint2*)d)[0] ^ ((uint2*)a)[0] );
-  *c += *d;
-  ((uint2*)b)[0] = ROR2( ((uint2*)b)[0] ^ ((uint2*)c)[0], 63U);
-}
-
-#define ROUND(r) \
-  G(r, 0, &v[0], &v[4], &v[ 8], &v[12], m); \
-  G(r, 1, &v[1], &v[5], &v[ 9], &v[13], m); \
-  G(r, 2, &v[2], &v[6], &v[10], &v[14], m); \
-  G(r, 3, &v[3], &v[7], &v[11], &v[15], m); \
-  G(r, 4, &v[0], &v[5], &v[10], &v[15], m); \
-  G(r, 5, &v[1], &v[6], &v[11], &v[12], m); \
-  G(r, 6, &v[2], &v[7], &v[ 8], &v[13], m); \
-  G(r, 7, &v[3], &v[4], &v[ 9], &v[14], m);
-
-void blake2b_gpu_hash(blake2b_state *state, uint32_t idx, uint8_t *hash, uint32_t outlen) {
-  const uint32_t leb = idx;
-  *(uint32_t*)(state->buf + state->buflen) = leb;
-  state->buflen += 4;
-  state->counter += state->buflen;
-  for (unsigned i = 0; i < BLAKE2B_BLOCKBYTES - state->buflen; i++)
-    state->buf[i+state->buflen] = 0;  
-
-  uint64_t *d_data = (uint64_t *)state->buf;
-  uint64_t m[16];
-
-  m[0] = d_data[0];
-  m[1] = d_data[1];
-  m[2] = d_data[2];
-  m[3] = d_data[3];
-  m[4] = d_data[4];
-  m[5] = d_data[5];
-  m[6] = d_data[6];
-  m[7] = d_data[7];
-  m[8] = d_data[8];
-  m[9] = d_data[9];
-  m[10] = d_data[10];
-  m[11] = d_data[11];
-  m[12] = d_data[12];
-  m[13] = d_data[13];
-  m[14] = d_data[14];
-  m[15] = d_data[15];
-
-  uint64_t v[16];
-
-  v[0] = state->h[0];
-  v[1] = state->h[1];
-  v[2] = state->h[2];
-  v[3] = state->h[3];
-  v[4] = state->h[4];
-  v[5] = state->h[5];
-  v[6] = state->h[6];
-  v[7] = state->h[7];
-  v[8] = 0x6a09e667f3bcc908;
-  v[9] = 0xbb67ae8584caa73b;
-  v[10] =  0x3c6ef372fe94f82b;
-  v[11] = 0xa54ff53a5f1d36f1;
-  v[12] = 0x510e527fade682d1 ^ state->counter;
-  v[13] = 0x9b05688c2b3e6c1f;
-  v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff;
-  v[15] = 0x5be0cd19137e2179;
-
-  ROUND( 0 );
-  ROUND( 1 );
-  ROUND( 2 );
-  ROUND( 3 );
-  ROUND( 4 );
-  ROUND( 5 );
-  ROUND( 6 );
-  ROUND( 7 );
-  ROUND( 8 );
-  ROUND( 9 );
-  ROUND( 10 );
-  ROUND( 11 );
-  
-  state->h[0] ^= v[0] ^ v[ 8];
-  state->h[1] ^= v[1] ^ v[ 9];
-  state->h[2] ^= v[2] ^ v[10];
-  state->h[3] ^= v[3] ^ v[11];
-  state->h[4] ^= v[4] ^ v[12];
-  state->h[5] ^= v[5] ^ v[13];
-  state->h[6] ^= v[6] ^ v[14];
-  state->h[7] ^= v[7] ^ v[15];
-
-  for (unsigned i = 0; i < outlen; i++)
-    hash[i] = ((uint8_t*)state->h)[i];
-}
diff --git a/ocl_xpm/zcash/gpu/common.h b/ocl_xpm/zcash/gpu/common.h
deleted file mode 100644
index 8c7727406..000000000
--- a/ocl_xpm/zcash/gpu/common.h
+++ /dev/null
@@ -1,159 +0,0 @@
-#if defined(__OPENCL_HOST__)
-#define __global
-//#include "blake2/blake2.h"
-//#include "equi.h"
-#include "../cpu_tromp/equi.h"
-
-#else
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef short int16_t;
-typedef ushort uint16_t;
-typedef int int32_t;
-typedef uint uint32_t;
-typedef long int64_t;
-typedef ulong uint64_t;
-
-#if defined(_MSC_VER)
-#define ALIGN(x) __declspec(align(x))
-#else
-#define ALIGN(x) __attribute__ ((__aligned__(x)))
-#endif
-
-enum blake2b_constant
-{
-  BLAKE2B_BLOCKBYTES = 128,
-  BLAKE2B_OUTBYTES   = 64,
-  BLAKE2B_KEYBYTES   = 64,
-  BLAKE2B_SALTBYTES  = 16,
-  BLAKE2B_PERSONALBYTES = 16
-};
-
-#pragma pack(push, 1)
-ALIGN( 64 ) typedef struct __blake2b_state {
-  uint64_t h[8];
-  uint8_t  buf[BLAKE2B_BLOCKBYTES];
-  uint16_t counter;
-  uint8_t  buflen;
-  uint8_t  lastblock;
-} blake2b_state;
-#pragma pack(pop)
-#endif
-
-#define COLLISION_BIT_LENGTH (WN / (WK+1))
-#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
-#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
-
-
-#define NDIGITS   (WK+1)
-#define DIGITBITS (WN/(NDIGITS))
-//#define PROOFSIZE (1u<<WK)
-#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
-//#define BASE (1u<<DIGITBITS)
-//#define NHASHES (2u*BASE)
-//#define HASHESPERBLAKE (512/WN)
-//#define HASHOUT (HASHESPERBLAKE*WN/8)
-
-// 2_log of number of buckets
-#define BUCKBITS  (DIGITBITS-RESTBITS)
-
-// number of buckets
-#define NBUCKETS (1<<BUCKBITS)
-// 2_log of number of slots per bucket
-#define SLOTBITS (RESTBITS+1+1)
-// number of slots per bucket
-#define NSLOTS (1u<<SLOTBITS)
-// number of per-xhash slots
-#define XFULL 16
-// SLOTBITS mask
-#define SLOTMASK (NSLOTS-1)
-// number of possible values of xhash (rest of n) bits
-#define NRESTS (1u<<RESTBITS)
-// number of blocks of hashes extracted from single 512 bit blake2b output
-#define NBLOCKS ((NHASHES+HASHESPERBLAKE-1)/HASHESPERBLAKE)
-// nothing larger found in 100000 runs
-#define MAXSOLS 8
-
-#define WORDS(bits)     ((bits + 31) / 32)
-#define HASHWORDS0 WORDS(WN - DIGITBITS + RESTBITS)
-#define HASHWORDS1 WORDS(WN - 2*DIGITBITS + RESTBITS)
-
-typedef uint32_t proof[PROOFSIZE];
-
-// tree  = | xhash(RESTBITS)    | slotid1(SLOTBITS) | slotid0(SLOTBITS) | bucketid(BUCKBITS) |
-// index = | bucketid(BUCKBITS) | slotid0(SLOTBITS) |
-typedef uint32_t tree;
-
-typedef union hashunit {
-  uint32_t word;
-  uint8_t bytes[4];
-} hashunit;
-
-typedef struct slot0 {
-  tree attr;
-  hashunit hash[HASHWORDS0];
-} slot0;
-
-typedef struct slot1 {
-  tree attr;
-  hashunit hash[HASHWORDS1];
-} slot1;
-
-// a bucket is NSLOTS treenodes
-typedef slot0 bucket0[NSLOTS];
-typedef slot1 bucket1[NSLOTS];
-// the N-bit hash consists of K+1 n-bit "digits"
-// each of which corresponds to a layer of NBUCKETS buckets
-typedef bucket0 digit0[NBUCKETS];
-typedef bucket1 digit1[NBUCKETS];
-
-// manages hash and tree data
-typedef struct htalloc {
-  __global bucket0 *trees0[(WK+1)/2];
-  __global bucket1 *trees1[WK/2];
-} htalloc;
-
-typedef uint32_t bsizes[NBUCKETS];
-
-
-typedef struct htlayout {
-  htalloc hta;
-  uint32_t prevhashunits;
-  uint32_t nexthashunits;
-  uint32_t dunits;
-  uint32_t prevbo;
-  uint32_t nextbo;
-} htlayout;
-
-#if RESTBITS <= 6
-  typedef uint8_t xslot;
-#else
-  typedef uint16_t xslot;
-#endif
-
-typedef struct collisiondata {
-#ifdef XBITMAP
-#if NSLOTS > 64
-#error cant use XBITMAP with more than 64 slots
-#endif
-  uint64_t xhashmap[NRESTS];
-  uint64_t xmap;
-#else
-  xslot nxhashslots[NRESTS];
-  xslot xhashslots[NRESTS][XFULL];
-  xslot *xx;
-  uint32_t n0;
-  uint32_t n1;
-#endif
-  uint32_t s0;
-} collisiondata;
-
-
-typedef struct equi {
-  blake2b_state blake_ctx;
-  htalloc hta;
-  __global bsizes *nslots;
-  __global proof *sols;
-  uint32_t nsols;
-  uint32_t nthreads;
-} equi;
diff --git a/ocl_xpm/zcash/gpu/equihash.cl b/ocl_xpm/zcash/gpu/equihash.cl
deleted file mode 100644
index 213a8e4d6..000000000
--- a/ocl_xpm/zcash/gpu/equihash.cl
+++ /dev/null
@@ -1,1038 +0,0 @@
-#include "common.h"
-
-#include "blake2bcl.h"
-
-#define tree0_ptr(heap, r) ((__global bucket0 *)(heap + r))
-#define tree1_ptr(heap, r) ((__global bucket1 *)(heap + r))
-
-uint32_t tree_bucket(tree t)
-{
-  const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
-  return t & bucketMask;
-}
-
-uint32_t tree_slotid0(tree t)
-{
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-  return (t >> BUCKBITS) & SLOTMASK;
-}
-
-uint32_t tree_slotid1(tree t)
-{
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-  return (t >> (BUCKBITS+SLOTBITS)) & SLOTMASK;
-}
-
-uint32_t tree_xhash(tree t)
-{
-  return t >> (2*SLOTBITS + BUCKBITS);
-}
-
-uint32_t tree_getindex(const tree t)
-{
-  const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-  return ((t & bucketMask) << SLOTBITS) | ((t & (slotMask << BUCKBITS)) >> BUCKBITS);  
-}
-
-void tree_setindex(tree *t, uint32_t idx)
-{
-  const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
-  const uint32_t slotMask =  ((1u<<SLOTBITS)-1);
-
-  (*t) &= ~(bucketMask | (slotMask << BUCKBITS));
-  (*t) |= (idx >> SLOTBITS);
-  (*t) |= ((idx & slotMask) << BUCKBITS);
-}
-
-void tree_setxhash(tree *t, uint32_t xhash)
-{
-  const uint32_t xhashMask = ((1u << RESTBITS)-1);
-  (*t) &= ~(xhashMask << (2*SLOTBITS + BUCKBITS));
-  (*t) |= (xhash << (2*SLOTBITS + BUCKBITS));
-}
-
-tree tree_create3(uint32_t bucketId, uint32_t s0, uint32_t s1)
-{
-  return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS));
-}
-
-tree tree_create4(uint32_t bucketId, uint32_t s0, uint32_t s1, uint32_t xhash)
-{
-  return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)) | (xhash << (2*SLOTBITS+BUCKBITS));;
-}
-
-// size (in bytes) of hash in round 0 <= r < WK
-uint32_t hashsize(const uint32_t r)
-{
-#ifdef XINTREE
-  const uint32_t hashbits = WN - (r+1) * DIGITBITS;
-#else
-  const uint32_t hashbits = WN - (r+1) * DIGITBITS + RESTBITS;
-#endif
-  return (hashbits + 7) / 8;
-}
-
-uint32_t hashwords(uint32_t bytes)
-{
-  return (bytes + 3) / 4;
-}
-
-htlayout htlayout_create_2(uint32_t r)
-{
-  htlayout R;
-  R.prevhashunits = 0;
-  R.dunits = 0;
-  
-  uint32_t nexthashbytes = hashsize(r);
-  R.nexthashunits = hashwords(nexthashbytes);
-  
-  R.prevbo = 0;
-  R.nextbo = R.nexthashunits * sizeof(hashunit) - nexthashbytes; // 0-3
-  if (r) {
-    uint32_t prevhashbytes = hashsize(r-1);
-    R.prevhashunits = hashwords(prevhashbytes);
-    R.prevbo = R.prevhashunits * sizeof(hashunit) - prevhashbytes; // 0-3
-    R.dunits = R.prevhashunits - R.nexthashunits;
-  }
-  
-  return R;
-}
-
-uint32_t htlayout_getxhash0(uint32_t prevbo, __global const slot0 *pslot)
-{
-#ifdef XINTREE
-  return tree_xhash(pslot->attr);
-#elif WN == 200 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] >> 4;
-#elif WN == 200 && RESTBITS == 8
-  return (pslot->hash->bytes[prevbo] & 0xf) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
-#elif WN == 144 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] & 0xf;
-#elif WN == 200 && RESTBITS == 6
-  return (pslot->hash->bytes[prevbo] & 0x3) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
-#else
-#error non implemented
-#endif
-}
-
-uint32_t htlayout_getxhash1(uint32_t prevbo, __global const slot1 *pslot)
-{
-#ifdef XINTREE
-  return tree_xhash(pslot->attr);
-#elif WN == 200 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] & 0xf;
-#elif WN == 200 && RESTBITS == 8
-  return pslot->hash->bytes[prevbo];
-#elif WN == 144 && RESTBITS == 4
-  return pslot->hash->bytes[prevbo] & 0xf;
-#elif WN == 200 && RESTBITS == 6
-  return pslot->hash->bytes[prevbo] & 0x3f;
-#else
-#error non implemented
-#endif  
-}
-
-bool htlayout_equal(uint32_t prevhashunits, __global const hashunit *hash0, __global const hashunit *hash1)
-{
-  return hash0[prevhashunits-1].word == hash1[prevhashunits-1].word;
-}
-
-void collisiondata_clear(collisiondata *data) 
-{
-#ifdef XBITMAP
-  // memset(xhashmap, 0, NRESTS * sizeof(u64));
-  for (unsigned i = 0; i < NRESTS; i++)
-    data->xhashmap[i] = 0;
-#else
-  // memset(nxhashslots, 0, NRESTS * sizeof(xslot));
-  for (unsigned i = 0; i < NRESTS; i++)
-    data->nxhashslots[i] = 0;
-#endif
-}
-
-bool collisiondata_addslot(collisiondata *data, uint32_t s1, uint32_t xh)
-{
-#ifdef XBITMAP
-  data->xmap = data->xhashmap[xh];
-  data->xhashmap[xh] |= (uint64_t)1 << s1;
-  data->s0 = ~0;
-  return true;
-#else
-  data->n1 = (uint32_t)data->nxhashslots[xh]++;
-  if (data->n1 >= XFULL)
-    return false;
-  data->xx = data->xhashslots[xh];
-  data->xx[data->n1] = s1;
-  data->n0 = 0;
-  return true;
-#endif
-}
-
-bool collisiondata_nextcollision(collisiondata *data)
-{
-#ifdef XBITMAP
-  return data->xmap != 0;
-#else
-  return data->n0 < data->n1;
-#endif
-}
-
-uint64_t __ffsll(uint64_t x)
-{
-  return x ? (64 - clz(x & -x)) : 0;
-}
-
-uint32_t collisiondata_slot(collisiondata *data) {
-#ifdef XBITMAP
-  const uint32_t ffs = __ffsll(xmap);
-  data->s0 += ffs;
-  data->xmap >>= ffs;
-  return data->s0;
-#else
-  return (uint32_t)data->xx[data->n0++];
-#endif
-}
-
-uint32_t equi_getnslots(__global bsizes *nslots, const uint32_t r, const uint32_t bid)
-{
-  __global uint32_t *nslot = &nslots[r&1][bid];
-  const uint32_t n = min(*nslot, NSLOTS);
-  *nslot = 0;
-  return n;
-}
-
-void equi_orderindices(__global uint32_t *indices, uint32_t size)
-{
-  if (indices[0] > indices[size]) {
-    for (uint32_t i = 0; i < size; i++) {
-      const uint32_t tmp = indices[i];
-      indices[i] = indices[size+i];
-      indices[size+i] = tmp;
-    }
-  }
-}
-
-void local_orderindices(uint32_t *indices, uint32_t size)
-{
-  if (indices[0] > indices[size]) {
-    for (uint32_t i = 0; i < size; i++) {
-      const uint32_t tmp = indices[i];
-      indices[i] = indices[size+i];
-      indices[size+i] = tmp;
-    }
-  }
-}
-
-
-void equi_listindices1(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 0;
-  indices[0]    = tree_getindex((*buck)[tree_slotid0(t)].attr);
-  indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr);
-  equi_orderindices(indices, size);
-}
-
-void equi_listindices2(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 1;
-  equi_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
-
-void equi_listindices3(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 2;
-  equi_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
-
-void equi_listindices4(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 3;
-  equi_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
- 
-void equi_listindices5(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 4;
-  equi_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-  
-void equi_listindices6(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 5;
-  equi_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-  
-void equi_listindices7(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 6;
-  equi_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-
-void equi_listindices8(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 7;
-  equi_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}  
-
-void equi_listindices9(__global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       const tree t,
-                       __global uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)];
-  const uint32_t size = 1 << 8;
-  equi_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  equi_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  equi_orderindices(indices, size);
-}
-
-void local_listindices1(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 0;
-  indices[0]    = tree_getindex((*buck)[tree_slotid0(t)].attr);
-  indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr);
-  local_orderindices(indices, size);
-}
-
-void local_listindices2(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)];
-  const uint32_t size = 1 << 1;
-  local_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
-
-void local_listindices3(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 2;
-  local_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
-
-void local_listindices4(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)];
-  const uint32_t size = 1 << 3;
-  local_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
- 
-void local_listindices5(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 4;
-  local_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-  
-void local_listindices6(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)];
-  const uint32_t size = 1 << 5;
-  local_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-  
-void local_listindices7(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 6;
-  local_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-
-void local_listindices8(__global uint32_t *heap0,
-                        __global uint32_t *heap1,                        
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)];
-  const uint32_t size = 1 << 7;
-  local_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}  
-
-void local_listindices9(__global uint32_t *heap0,
-                        __global uint32_t *heap1,
-                        const tree t,
-                        uint32_t *indices)
-{
-  const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)];
-  const uint32_t size = 1 << 8;
-  local_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
-  local_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
-  local_orderindices(indices, size);
-}
-
-// proper dupe test is a little costly on GPU, so allow false negatives
-bool equi_probdupe(uint32_t *prf) {
-  unsigned short susp[PROOFSIZE];
-  for (unsigned i = 0; i < PROOFSIZE; i++)
-    susp[i] = 0xFFFF;
-    
-  for (unsigned i = 0; i < PROOFSIZE; i++) {
-    uint32_t bin = prf[i] & (PROOFSIZE-1);
-    unsigned short msb = prf[i] >> WK;
-    if (msb == susp[bin])
-      return true;
-    susp[bin] = msb;
-  }
-  
-  return false;
-}
-
-void equi_candidate(__global uint32_t *heap0,
-                    __global uint32_t *heap1,
-                    __global proof *sols,
-                    __global uint32_t *nsols,
-                    const tree t)
-{
-  proof prf;
-#if WK==9
-  local_listindices9(heap0, heap1, t, (uint32_t*)&prf);
-#elif WK==5
-  local_listindices5(heap0, heap1, t, (uint32_t*)&prf);
-#else
-#error not implemented
-#endif
-  if (equi_probdupe(prf))
-    return;
-  uint32_t soli = atomic_inc(nsols);
-  if (soli < MAXSOLS)
-#if WK==9
-    equi_listindices9(heap0, heap1, t, sols[soli]);
-#elif WK==5
-    equi_listindices5(heap0, heap1, t, sols[soli]);
-#else
-#error not implemented
-#endif
-}
-
-
-__kernel void digitH(__global blake2b_state *blake2bState,
-                     __global const uint32_t *heap0,
-                     __global bsizes *nslots)
-{
-  uint8_t hash[HASHOUT];
-  blake2b_state state;
-  // equi::htlayout htl(eq, 0);
-  htlayout htl = htlayout_create_2(0);
-  const uint32_t hashbytes = hashsize(0);
-  // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t block = id; block < NBLOCKS; block += get_global_size(0)) {
-    state = *blake2bState;
-    blake2b_gpu_hash(&state, block, hash, HASHOUT);
-    for (uint32_t i = 0; i < HASHESPERBLAKE; i++) {
-      const uint8_t *ph = hash + i * WN/8;
-#if BUCKBITS == 16 && RESTBITS == 4
-      const uint32_t bucketid = ((uint32_t)ph[0] << 8) | ph[1];
-#ifdef XINTREE
-      const uint32_t xhash = ph[2] >> 4;
-#endif
-#elif BUCKBITS == 14 && RESTBITS == 6
-      const uint32_t bucketid = ((uint32_t)ph[0] << 6) | ph[1] >> 2;
-#elif BUCKBITS == 12 && RESTBITS == 8
-      const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4;
-#elif BUCKBITS == 20 && RESTBITS == 4
-      const uint32_t bucketid = ((((uint32_t)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
-#ifdef XINTREE
-      const uint32_t xhash = ph[2] & 0xf;
-#endif
-#elif BUCKBITS == 12 && RESTBITS == 4
-      const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4;
-      const uint32_t xhash = ph[1] & 0xf;
-#else
-#error not implemented
-#endif
-      const uint32_t slot = atomic_inc(&nslots[0][bucketid]);
-      if (slot >= NSLOTS)
-        continue;
-      tree leaf;
-      tree_setindex(&leaf, block*HASHESPERBLAKE+i);
-#ifdef XINTREE
-      tree_setxhash(&leaf, xhash);
-#endif
-      __global slot0 *s = &tree0_ptr(heap0, 0)[bucketid][slot];
-      s->attr = leaf;
-      
-      // memcpy(s.hash->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes);
-      for (unsigned i = 0; i < hashbytes; i++)
-        ((__global uint8_t*)s->hash->bytes+htl.nextbo)[i] = ((uint8_t*)(ph+WN/8-hashbytes))[i];
-    }
-  }
-}
-
-__kernel void digitOdd(const uint32_t r,
-                       __global uint32_t *heap0,
-                       __global uint32_t *heap1,
-                       __global bsizes *nslots)
-{
-  // equi::htlayout htl(eq, r);
-//   htlayout htl = htlayout_create(eq, r);
-  htlayout htl = htlayout_create_2(r);  
-  collisiondata cd;
-  
-  // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t id = get_global_id(0);
-  
-  for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    // cd.clear();
-    collisiondata_clear(&cd);
-//     __global slot0 *buck = htl.hta.trees0[(r-1)/2][bucketid]; // optimize by updating previous buck?!
-    __global slot0 *buck = tree0_ptr(heap0, (r-1)/2)[bucketid]; // optimize by updating previous buck?!    
-    uint32_t bsize = equi_getnslots(nslots, r-1, bucketid);       // optimize by putting bucketsize with block?!
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;          // optimize by updating previous pslot1?!
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE)
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
-                            | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
-                    | (xhash = bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
-        xhash &= 0xf;
-#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8)
-                           | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 2
-                           | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 6;
-#else
-#error not implemented
-#endif
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-#ifdef XINTREE
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-#else
-        tree xort = tree_create3(bucketid, s0, s1);
-#endif
-//         __global slot1 *xs = &htl.hta.trees1[r/2][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, r/2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        for (uint32_t i = htl.dunits; i < htl.prevhashunits; i++)
-          xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
-      }
-    }
-  }
-}
-
-
-__kernel void digitEven(const uint32_t r,
-                        __global uint32_t *heap0,
-                        __global uint32_t *heap1,
-                        __global bsizes *nslots)
-{
-  // equi::htlayout htl(eq, r);
-//   htlayout htl = htlayout_create(eq, r);
-  htlayout htl = htlayout_create_2(r);
-  collisiondata cd;
-  
-  // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t id = get_global_id(0);
-  
-  for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    // cd.clear();
-    collisiondata_clear(&cd);    
-//     __global slot1 *buck = htl.hta.trees1[(r-1)/2][bucketid]; // OPTIMIZE BY UPDATING PREVIOUS
-     __global slot1 *buck = tree1_ptr(heap1, (r-1)/2)[bucketid]; // OPTIMIZE BY UPDATING PREVIOUS
-    uint32_t bsize = equi_getnslots(nslots, r-1, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;          // OPTIMIZE BY UPDATING PREVIOUS
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE)
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
-                            | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
-                            | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
-#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
-                          | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 6)
-                          | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 2;
-#else
-#error not implemented
-#endif
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-#ifdef XINTREE
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-#else
-        tree xort = tree_create3(bucketid, s0, s1);
-#endif
-//         __global slot0 *xs = &htl.hta.trees0[r/2][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, r/2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        for (uint32_t i=htl.dunits; i < htl.prevhashunits; i++)
-          xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
-      }
-    }
-  }
-}
-
-
-#ifdef UNROLL
-
-__kernel void digit_1(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots)
-{
-  htlayout htl = htlayout_create_2(1);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-    __global slot0 *buck = tree0_ptr(heap0, 0)[bucketid];
-    uint32_t bsize = equi_getnslots(nslots, 0, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[0][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 0)[xorbucketid][xorslot];
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-        xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
-        xs->hash[4].word = pslot0->hash[5].word ^ pslot1->hash[5].word;
-      }
-    }
-  }
-}
-__kernel void digit_2(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(2);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[0][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 0)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 1, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))  
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-        // __global slot0 *xs = &htl.hta.trees0[1][xorbucketid][xorslot];
-         __global slot0 *xs = &tree0_ptr(heap0, 1)[xorbucketid][xorslot];
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
-        xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-        xs->hash[4].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
-      }
-    }
-  }
-}
-__kernel void digit_3(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(3);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot0 *buck = htl.hta.trees0[1][bucketid];
-    __global slot0 *buck = tree0_ptr(heap0, 1)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 2, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))  
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[1][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 1)[xorbucketid][xorslot];
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-        xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
-      }
-    }
-  }
-}
-__kernel void digit_4(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(4);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[1][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 1)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 3, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot0 *xs = &htl.hta.trees0[2][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, 2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
-        xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-      }
-    }
-  }
-}
-__kernel void digit_5(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(5);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot0 *buck = htl.hta.trees0[2][bucketid];
-    __global slot0 *buck = tree0_ptr(heap0, 2)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 4, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[2][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 2)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-        xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
-      }
-    }
-  }
-}
-__kernel void digit_6(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(6);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[2][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 2)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 5, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot0 *xs = &htl.hta.trees0[3][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, 3)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-        xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
-      }
-    }
-  }
-}
-__kernel void digit_7(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(7);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot0 *buck = htl.hta.trees0[3][bucketid];
-    __global slot0 *buck = tree0_ptr(heap0, 3)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 6, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
-                          | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
-                  | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        xhash &= 0xf;
-        const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot1 *xs = &htl.hta.trees1[3][xorbucketid][xorslot];
-        __global slot1 *xs = &tree1_ptr(heap1, 3)[xorbucketid][xorslot];        
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
-        xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-      }
-    }
-  }
-}
-__kernel void digit_8(__global uint32_t *heap0,
-                      __global uint32_t *heap1,
-                      __global bsizes *nslots) {
-  htlayout htl = htlayout_create_2(8);
-  collisiondata cd;
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-//     __global slot1 *buck = htl.hta.trees1[3][bucketid];
-    __global slot1 *buck = tree1_ptr(heap1, 3)[bucketid];    
-    uint32_t bsize = equi_getnslots(nslots, 7, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot1 *pslot1 = buck + s1;          // OPTIMIZE BY UPDATING PREVIOUS
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot1 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
-          continue;
-        uint32_t xorbucketid;
-        uint32_t xhash;
-        __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
-        xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
-                        | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
-                  xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
-        const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
-        if (xorslot >= NSLOTS)
-          continue;
-        tree xort = tree_create4(bucketid, s0, s1, xhash);
-//         __global slot0 *xs = &htl.hta.trees0[4][xorbucketid][xorslot];
-        __global slot0 *xs = &tree0_ptr(heap0, 4)[xorbucketid][xorslot];     
-        xs->attr = xort;
-        xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
-      }
-    }
-  }
-}
-#endif //UNROLL
-
-__kernel void digitK(__global uint32_t *heap0,
-                     __global uint32_t *heap1,
-                     __global bsizes *nslots,
-                     __global proof *sols,
-                     __global uint32_t *nsols) {
-  collisiondata cd;
-  htlayout htl = htlayout_create_2(WK);
-  const uint32_t id = get_global_id(0);
-  for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
-    collisiondata_clear(&cd); 
-    __global slot0 *buck = tree0_ptr(heap0, (WK-1)/2)[bucketid];
-    uint32_t bsize = equi_getnslots(nslots, WK-1, bucketid);
-    for (uint32_t s1 = 0; s1 < bsize; s1++) {
-      __global const slot0 *pslot1 = buck + s1;
-      if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) // assume WK odd
-        continue;
-      for (; collisiondata_nextcollision(&cd); ) {
-        const uint32_t s0 = collisiondata_slot(&cd);
-        __global const slot0 *pslot0 = buck + s0;
-        if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) {
-          tree xort = tree_create3(bucketid, s0, s1);
-          equi_candidate(heap0, heap1, sols, nsols, xort);
-        }
-      }
-    }
-  }
-}