From 1c57e7b5efbe84e1c252544b1da970bb8b298a3b Mon Sep 17 00:00:00 2001
From: maztheman
Date: Tue, 20 Dec 2016 08:19:45 -0700
Subject: [PATCH] AMD/NVIDIA GATELESS GATE v0.0.1

---
 contrib/blake/blake.hpp                            |   112 +
 contrib/ocl/algorithm/compress.hpp                 |    46 +
 .../algorithm/detail/gatelessgate_context.hpp      |    15 +
 .../algorithm/detail/silentarmy_context.hpp        |    82 +
 .../algorithm/detail/silentarmy_detail.hpp         |   114 +
 contrib/ocl/algorithm/gatelessgate.hpp             |   114 +
 contrib/ocl/algorithm/silentarmy.hpp               |   254 +
 contrib/ocl/cl_ext.hpp                             | 12355 ++++++++++++++++
 contrib/ocl/crypto/blake.hpp                       |    62 +
 contrib/ocl/crypto/detail/blake.hpp                |    54 +
 contrib/ocl/hex.hpp                                |    34 +
 contrib/ocl/include/blake.hpp                      |   103 +
 contrib/ocl/include/ocl_gatelessgate.hpp           |    73 +
 contrib/ocl/include/ocl_gg_context.hpp             |    34 +
 contrib/ocl/include/param.h                        |   198 +
 contrib/ocl/kernels/gatelessgate.cl                |   993 ++
 contrib/ocl/kernels/silentarmy.cl                  |   946 ++
 contrib/ocl/opencl.hpp                             |    72 +
 contrib/ocl/sols.hpp                               |    20 +
 contrib/ocl/utility/device_utils.hpp               |   181 +
 contrib/sha256/sha256.hpp                          |   228 +
 cpu_tromp/equi.h                                   |     2 +-
 cuda_silentarmy/cuda_silentarmy.vcxproj            |     6 +-
 cuda_silentarmy/kernel.cu                          |    38 +-
 .../cuda_silentarmy_sm30.vcxproj                   |     8 +-
 cuda_silentarmy_sm30/kernel.cu                     |   157 +-
 nheqminer/libstratum/StratumClient.cpp             |     9 +-
 nheqminer/libstratum/ZcashStratum.cpp              |    36 +-
 nheqminer/libstratum/ZcashStratum.h                |    29 +-
 nheqminer/main.cpp                                 |    75 +-
 nheqminer/nheqminer.sln                            |    20 +
 nheqminer/nheqminer.vcxproj                        |     4 +-
 ocl_gatelessgate/gatelessgate.cl                   |  1245 ++
 ocl_gatelessgate/gettimeofday.h                    |    43 +
 ocl_gatelessgate/ocl_gatelessgate.cpp              |   912 ++
 ocl_gatelessgate/ocl_gatelessgate.hpp              |    58 +
 ocl_gatelessgate/ocl_gatelessgate.vcxproj          |   133 +
 .../ocl_gatelessgate.vcxproj.filters               |    14 +
 ocl_gatelessgate/param.h                           |   373 +
 ocl_gatelessgate/param_nr15.h                      |   198 +
 ocl_silentarmy/ocl_silentarmy.cpp                  |     4 +-
 ocl_silentarmy/ocl_silentarmy.vcxproj              |     4 +-
 ocl_silentarmy/ocl_silentarmy.vcxproj.filters      |     2 -
 ocl_xpm/ocl_xmp.cpp                                |     2 +-
 44 files changed, 19292 insertions(+), 170 deletions(-)
 create mode 100644 contrib/blake/blake.hpp
 create mode 100644 contrib/ocl/algorithm/compress.hpp
 create mode 100644 contrib/ocl/algorithm/detail/gatelessgate_context.hpp
 create mode 100644 contrib/ocl/algorithm/detail/silentarmy_context.hpp
 create mode 100644 contrib/ocl/algorithm/detail/silentarmy_detail.hpp
 create mode 100644 contrib/ocl/algorithm/gatelessgate.hpp
 create mode 100644 contrib/ocl/algorithm/silentarmy.hpp
 create mode 100644 contrib/ocl/cl_ext.hpp
 create mode 100644 contrib/ocl/crypto/blake.hpp
 create mode 100644 contrib/ocl/crypto/detail/blake.hpp
 create mode 100644 contrib/ocl/hex.hpp
 create mode 100644 contrib/ocl/include/blake.hpp
 create mode 100644 contrib/ocl/include/ocl_gatelessgate.hpp
 create mode 100644 contrib/ocl/include/ocl_gg_context.hpp
 create mode 100644 contrib/ocl/include/param.h
 create mode 100644 contrib/ocl/kernels/gatelessgate.cl
 create mode 100644 contrib/ocl/kernels/silentarmy.cl
 create mode 100644 contrib/ocl/opencl.hpp
 create mode 100644 contrib/ocl/sols.hpp
 create mode 100644 contrib/ocl/utility/device_utils.hpp
 create mode 100644 contrib/sha256/sha256.hpp
 create mode 100644 ocl_gatelessgate/gatelessgate.cl
 create mode 100644 ocl_gatelessgate/gettimeofday.h
 create mode 100644 ocl_gatelessgate/ocl_gatelessgate.cpp
 create mode 100644 ocl_gatelessgate/ocl_gatelessgate.hpp
 create mode 100644 ocl_gatelessgate/ocl_gatelessgate.vcxproj
 create mode 100644 ocl_gatelessgate/ocl_gatelessgate.vcxproj.filters
 create mode 100644 ocl_gatelessgate/param.h
 create mode 100644 ocl_gatelessgate/param_nr15.h
diff --git a/contrib/blake/blake.hpp b/contrib/blake/blake.hpp
new file mode 100644
index 000000000..976dc6dbc
--- /dev/null
+++ b/contrib/blake/blake.hpp
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <cstdint>
+
+namespace blake {
+
+
+namespace impl {
+
+static const uint32_t blake2b_block_len = 128;
+static const uint32_t blake2b_rounds = 12;
+static const uint64_t blake2b_iv[8] =
+{
+    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
+};
+
+static const uint8_t blake2b_sigma[12][16] =
+{
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+};
+
+inline uint64_t rotr64(uint64_t a, uint8_t bits)
+{
+    return (a >> bits) | (a << (64 - bits));
+}
+
+inline void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd,
+    uint64_t x, uint64_t y)
+{
+    *va = (*va + *vb + x);
+    *vd = rotr64(*vd ^ *va, 32);
+    *vc = (*vc + *vd);
+    *vb = rotr64(*vb ^ *vc, 24);
+    *va = (*va + *vb + y);
+    *vd = rotr64(*vd ^ *va, 16);
+    *vc = (*vc + *vd);
+    *vb = rotr64(*vb ^ *vc, 63);
+}
+
+}
+
+
+typedef struct blake2b_state_s
+{
+    uint64_t h[8];
+    uint64_t bytes;
+} blake2b_state_t;
+
+inline void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k)
+{
+    using namespace blake::impl;
+
+    assert(n > k);
+    assert(hash_len <= 64);
+    st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
+    for (uint32_t i = 1; i <= 5; i++)
+        st->h[i] = blake2b_iv[i];
+    st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
+    st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
+    st->bytes = 0;
+
+}
+
+inline void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, uint32_t msg_len, uint32_t is_final)
+{
+    using namespace blake::impl;
+
+    const uint64_t *m = (const uint64_t *)_msg;
+    uint64_t v[16];
+    assert(msg_len <= 128);
+    assert(st->bytes <= UINT64_MAX - msg_len);
+    memcpy(v + 0, st->h, 8 * sizeof (*v));
+    memcpy(v + 8, blake2b_iv, 8 * sizeof (*v));
+    v[12] ^= (st->bytes += msg_len);
+    v[14] ^= is_final ? -1 : 0;
+    for (uint32_t round = 0; round < blake2b_rounds; round++)
+    {
+        const uint8_t *s = blake2b_sigma[round];
+        mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]);
+        mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]);
+        mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]);
+        mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]);
+        mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]);
+        mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]);
+        mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]);
+        mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]);
+    }
+    for (uint32_t i = 0; i < 8; i++)
+        st->h[i] ^= v[i] ^ v[i + 8];
+}
+
+inline void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
+{
+    assert(outlen <= 64);
+    memcpy(out, st->h, outlen);
+}
+
+}
\ No newline at end of file
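[reviewer note] The header above is driven exactly the way the solvers later in this patch drive it: init with the Equihash parameters, one 128-byte update, then read out the digest. A minimal host-side sketch of my own follows; the 50-byte digest length and N=200, K=9 match SILENTARMY's usage below but are not constants from this header:

    #include <cstdint>
    #include <cstring>   // blake.hpp uses memcpy
    #include <cassert>   // blake.hpp uses assert
    // #include "contrib/blake/blake.hpp"  // as added by this patch

    int main()
    {
        uint8_t header[140] = {0};  // block header || nonce, zeroed for the sketch
        blake::blake2b_state_t st;
        blake::zcash_blake2b_init(&st, 50, 200, 9);        // 50-byte digest, Equihash(200,9)
        blake::zcash_blake2b_update(&st, header, 128, 0);  // first 128 bytes; GPU kernels hash the tail
        uint8_t digest[50];
        blake::zcash_blake2b_final(&st, digest, 50);
        return 0;
    }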
diff --git a/contrib/ocl/algorithm/compress.hpp b/contrib/ocl/algorithm/compress.hpp
new file mode 100644
index 000000000..36c8e14c0
--- /dev/null
+++ b/contrib/ocl/algorithm/compress.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+namespace ocl {
+namespace algorithm {
+
+template <int _PREFIX>
+inline void compress(uint8_t *out, uint32_t *inputs, uint32_t n)
+{
+    uint32_t byte_pos = 0;
+    int32_t bits_left = _PREFIX + 1;
+    uint8_t x = 0;
+    uint8_t x_bits_used = 0;
+    uint8_t *pOut = out;
+    while (byte_pos < n)
+    {
+        if (bits_left >= 8 - x_bits_used)
+        {
+            x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
+            bits_left -= 8 - x_bits_used;
+            x_bits_used = 8;
+        }
+        else if (bits_left > 0)
+        {
+            uint32_t mask = ~(-1 << (8 - x_bits_used));
+            mask = ((~mask) >> bits_left) & mask;
+            x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
+            x_bits_used += bits_left;
+            bits_left = 0;
+        }
+        else if (bits_left <= 0)
+        {
+            assert(!bits_left);
+            byte_pos++;
+            bits_left = _PREFIX + 1;
+        }
+        if (x_bits_used == 8)
+        {
+            *pOut++ = x;
+            x = x_bits_used = 0;
+        }
+    }
+}
+
+
+}
+}
\ No newline at end of file
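[reviewer note] This packing is what turns the 2^K solution indices into the byte string a stratum server expects. For Equihash(200,9), _PREFIX = 200/(9+1) = 20, so each of the 1 << 9 = 512 indices occupies 21 bits and the packed proof is 512 * 21 / 8 = 1344 bytes, the length handed to solutionf() later in this patch. A usage sketch of mine:

    uint32_t indices[512];  // sorted solution indices, as produced by verify_sol()
    uint8_t  proof[1344];
    ocl::algorithm::compress<20>(proof, indices, 512);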
diff --git a/contrib/ocl/algorithm/detail/gatelessgate_context.hpp b/contrib/ocl/algorithm/detail/gatelessgate_context.hpp
new file mode 100644
index 000000000..47c36c21a
--- /dev/null
+++ b/contrib/ocl/algorithm/detail/gatelessgate_context.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+namespace ocl {
+namespace algorithm {
+namespace algorithm_detail {
+
+struct gatelessgate_context {
+
+
+
+};
+
+}
+}
+}
\ No newline at end of file
diff --git a/contrib/ocl/algorithm/detail/silentarmy_context.hpp b/contrib/ocl/algorithm/detail/silentarmy_context.hpp
new file mode 100644
index 000000000..a72fac862
--- /dev/null
+++ b/contrib/ocl/algorithm/detail/silentarmy_context.hpp
@@ -0,0 +1,82 @@
+#pragma once
+#include
+#include
+#include
+
+namespace ocl {
+namespace algorithm {
+namespace algorithm_detail {
+
+
+
+struct silentarmy_context {
+    cl_context _context;
+    cl_program _program;
+    cl_device_id _dev_id;
+    cl_platform_id platform_id = 0;
+    cl_command_queue queue;
+
+
+    cl_kernel k_init_ht;
+    cl_kernel k_rounds[SA_PARAM_K];
+    cl_kernel k_sols;
+
+    cl_mem buf_ht[2], buf_sols, buf_dbg, rowCounters[2];
+    size_t global_ws;
+    size_t local_work_size = 64;
+
+    sols_t *sols;
+
+    bool init(cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock) {
+        cl_int error;
+
+        queue = clCreateCommandQueue(_context, dev, 0, &error);
+
+#ifdef SA_ENABLE_DEBUG
+        size_t dbg_size = SA_NR_ROWS;
+#else
+        size_t dbg_size = 1;
+#endif
+
+        buf_dbg = check_clCreateBuffer(_context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, dbg_size, NULL);
+        buf_ht[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL);
+        buf_ht[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL);
+        buf_sols = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, sizeof(sols_t), NULL);
+
+        rowCounters[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL);
+        rowCounters[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL);
+
+
+
+        fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * SA_HT_SIZE / 1e6);
+
+        k_init_ht = clCreateKernel(_program, "kernel_init_ht", &error);
+        for (unsigned i = 0; i < SA_PARAM_K; i++) {
+            char kernelName[128];
+            sprintf(kernelName, "kernel_round%d", i);
+            k_rounds[i] = clCreateKernel(_program, kernelName, &error);
+        }
+
+        sols = (sols_t *)malloc(sizeof(*sols));
+
+        k_sols = clCreateKernel(_program, "kernel_sols", &error);
+        return true;
+
+
+    }
+
+    ~silentarmy_context() {
+        clReleaseMemObject(buf_dbg);
+        clReleaseMemObject(buf_ht[0]);
+        clReleaseMemObject(buf_ht[1]);
+        clReleaseMemObject(rowCounters[0]);
+        clReleaseMemObject(rowCounters[1]);
+        free(sols);
+    }
+
+
+};
+
+}
+}
+}
\ No newline at end of file
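[reviewer note] The destructor above releases the hash-table and debug buffers but not buf_sols, the kernels, the program, the queue or the context that init() created. If a complete teardown were wanted, a sketch of mine (same standard OpenCL release calls, in reverse order of creation) would be:

    ~silentarmy_context() {
        clReleaseKernel(k_sols);
        for (unsigned i = 0; i < SA_PARAM_K; i++)
            clReleaseKernel(k_rounds[i]);
        clReleaseKernel(k_init_ht);
        clReleaseMemObject(buf_sols);
        clReleaseMemObject(buf_dbg);
        clReleaseMemObject(buf_ht[0]);
        clReleaseMemObject(buf_ht[1]);
        clReleaseMemObject(rowCounters[0]);
        clReleaseMemObject(rowCounters[1]);
        clReleaseCommandQueue(queue);
        clReleaseProgram(_program);
        clReleaseContext(_context);
        free(sols);
    }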
diff --git a/contrib/ocl/algorithm/detail/silentarmy_detail.hpp b/contrib/ocl/algorithm/detail/silentarmy_detail.hpp
new file mode 100644
index 000000000..00f4ac35c
--- /dev/null
+++ b/contrib/ocl/algorithm/detail/silentarmy_detail.hpp
@@ -0,0 +1,114 @@
+#pragma once
+#include
+#include
+#include
+
+namespace ocl {
+namespace algorithm {
+namespace algorithm_detail {
+
+inline void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht, cl_mem rowCounters)
+{
+    size_t global_ws = SA_NR_ROWS / SA_ROWS_PER_UINT;
+    size_t local_ws = 256;
+    cl_int status;
+#if 0
+    uint32_t pat = -1;
+    status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0,
+        SA_NR_ROWS * SA_NR_SLOTS * SA_SLOT_LEN,
+        0,     // cl_uint num_events_in_wait_list
+        NULL,  // cl_event *event_wait_list
+        NULL); // cl_event *event
+    if (status != CL_SUCCESS)
+        fatal("clEnqueueFillBuffer (%d)\n", status);
+#endif
+    status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht);
+    status = clSetKernelArg(k_init_ht, 1, sizeof(rowCounters), &rowCounters);
+    if (status != CL_SUCCESS)
+        printf("clSetKernelArg (%d)\n", status);
+    check_clEnqueueNDRangeKernel(queue, k_init_ht,
+        1,          // cl_uint work_dim
+        NULL,       // size_t *global_work_offset
+        &global_ws, // size_t *global_work_size
+        &local_ws,  // size_t *local_work_size
+        0,          // cl_uint num_events_in_wait_list
+        NULL,       // cl_event *event_wait_list
+        NULL);      // cl_event *event
+}
+
+
+/*
+** Sort a pair of binary blobs (a, b) which are consecutive in memory and
+** occupy a total of 2*len 32-bit words.
+**
+** a    points to the pair
+** len  number of 32-bit words in each pair
+*/
+inline void sort_pair(uint32_t *a, uint32_t len)
+{
+    uint32_t *b = a + len;
+    uint32_t tmp, need_sorting = 0;
+    for (uint32_t i = 0; i < len; i++)
+        if (need_sorting || a[i] > b[i])
+        {
+            need_sorting = 1;
+            tmp = a[i];
+            a[i] = b[i];
+            b[i] = tmp;
+        }
+        else if (a[i] < b[i])
+            return;
+}
+
+inline uint32_t verify_sol(sols_t *sols, unsigned sol_i)
+{
+    uint32_t *inputs = sols->values[sol_i];
+    uint32_t seen_len = (1 << (SA_PREFIX + 1)) / 8;
+    uint8_t seen[(1 << (SA_PREFIX + 1)) / 8];
+    uint32_t i;
+    uint8_t tmp;
+    // look for duplicate inputs
+    memset(seen, 0, seen_len);
+    for (i = 0; i < (1 << SA_PARAM_K); i++)
+    {
+        tmp = seen[inputs[i] / 8];
+        seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
+        if (tmp == seen[inputs[i] / 8])
+        {
+            // at least one input value is a duplicate
+            sols->valid[sol_i] = 0;
+            return 0;
+        }
+    }
+    // the valid flag is already set by the GPU, but set it again because
+    // I plan to change the GPU code to not set it
+    sols->valid[sol_i] = 1;
+    // sort the pairs in place
+    for (uint32_t level = 0; level < SA_PARAM_K; level++)
+        for (i = 0; i < (1 << SA_PARAM_K); i += (2 << level))
+            sort_pair(&inputs[i], 1 << level);
+    return 1;
+}
+
+
+inline size_t select_work_size_blake(cl_device_id device_id)
+{
+
+    size_t work_size =
+        64 *            /* thread per wavefront */
+        SA_BLAKE_WPS *  /* wavefront per simd */
+        4 *             /* simd per compute unit */
+        nr_compute_units(device_id);
+    // Make the work group size a multiple of the nr of wavefronts, while
+    // dividing the number of inputs. This results in the worksize being a
+    // power of 2.
+    while (SA_NR_INPUTS % work_size)
+        work_size += 64;
+
+    return work_size;
+}
+
+
+}
+}
+}
\ No newline at end of file
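[reviewer note] sort_pair() is a lexicographic compare-and-swap of two equal-length blobs that keeps swapping once an order has been established. A tiny worked example of mine:

    uint32_t pair[4] = { 7, 2, 5, 9 };  // a = {7, 2}, b = {5, 9}
    ocl::algorithm::algorithm_detail::sort_pair(pair, 2);
    // b < a at the first word, so the blobs swap wholesale: pair == { 5, 9, 7, 2 }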
diff --git a/contrib/ocl/algorithm/gatelessgate.hpp b/contrib/ocl/algorithm/gatelessgate.hpp
new file mode 100644
index 000000000..3c6480c65
--- /dev/null
+++ b/contrib/ocl/algorithm/gatelessgate.hpp
@@ -0,0 +1,114 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace ocl {
+namespace algorithm {
+
+struct gatelessgate {
+
+    int blocks;
+    int device_id;
+    int platform_id;
+
+    algorithm_detail::gatelessgate_context* oclc;
+    // threads
+    unsigned threadsNum; // TMP
+    unsigned wokrsize;
+
+    bool is_init_success = false;
+
+    gatelessgate(int platf_id, int dev_id)
+        : blocks(0)
+        , device_id(dev_id)
+        , platform_id(platf_id)
+        , oclc(nullptr)
+        , threadsNum(8192U)
+        , wokrsize(128)
+    {
+
+    }
+
+    static int getcount() {
+        static auto devices = utility::GetAllDevices();
+        return devices.size();
+    }
+
+    static void getinfo(int platf_id, int d_id, ::std::string& gpu_name, int& sm_count, ::std::string& version) {
+        static auto devices = utility::GetAllDevices();
+
+        if (devices.size() <= d_id) {
+            return;
+        }
+        auto device = devices[d_id];
+
+        ::std::vector<char> name(256, 0);
+        cl_uint compute_units = 0;
+
+        size_t nActualSize = 0;
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+        if (rc == CL_SUCCESS) {
+            gpu_name.assign(&name[0], nActualSize);
+        }
+
+        rc = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, &nActualSize);
+        if (rc == CL_SUCCESS) {
+            sm_count = (int)compute_units;
+        }
+
+        memset(&name[0], 0, name.size());
+        rc = clGetDeviceInfo(device, CL_DEVICE_VERSION, name.size(), &name[0], &nActualSize);
+        if (rc == CL_SUCCESS) {
+            version.assign(&name[0], nActualSize);
+        }
+    }
+
+    static void start(gatelessgate& device_context) {
+
+    }
+
+    static void stop(gatelessgate& device_context) {
+
+    }
+
+    static void solve(const char *tequihash_header,
+        unsigned int tequihash_header_len,
+        const char* nonce,
+        unsigned int nonce_len,
+        ::std::function<bool()> cancelf,
+        ::std::function<void(const ::std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+        ::std::function<void(void)> hashdonef,
+        gatelessgate& device_context) {
+
+    }
+
+    std::string getname() const { return "OCL_GATELESSGATE"; }
+
+    std::string getdevinfo() {
+        static auto devices = ocl::utility::GetAllDevices();
+        auto device = devices[device_id];
+        std::vector<char> name(256, 0);
+        size_t nActualSize = 0;
+        std::string gpu_name;
+
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+        gpu_name.assign(&name[0], nActualSize);
+
+        return "GPU_ID( " + gpu_name + ")";
+    }
+
+private:
+    ::std::string m_gpu_name;
+    ::std::string m_version;
+
+};
+
+
+}
+}
+
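[reviewer note] start()/stop()/solve() are still stubs in this revision, but the static helpers are already usable. An enumeration sketch (the main() is mine; getcount()/getinfo() come from the header above, and getinfo() needs the CL_DEVICE_MAX_COMPUTE_UNITS query, not CL_DEVICE_NAME, for the compute-unit count):

    #include <cstdio>
    #include <string>

    int main()
    {
        int n = ocl::algorithm::gatelessgate::getcount();
        for (int i = 0; i < n; i++) {
            std::string name, version;
            int sm = 0;
            ocl::algorithm::gatelessgate::getinfo(0, i, name, sm, version);
            printf("GPU %d: %s (%d CUs, OpenCL %s)\n", i, name.c_str(), sm, version.c_str());
        }
        return 0;
    }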
diff --git a/contrib/ocl/algorithm/silentarmy.hpp b/contrib/ocl/algorithm/silentarmy.hpp
new file mode 100644
index 000000000..173609285
--- /dev/null
+++ b/contrib/ocl/algorithm/silentarmy.hpp
@@ -0,0 +1,254 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#define SA_COLLISION_BIT_LENGTH (SA_PARAM_N / (SA_PARAM_K+1))
+#define SA_COLLISION_BYTE_LENGTH ((SA_COLLISION_BIT_LENGTH+7)/8)
+#define SA_FINAL_FULL_WIDTH (2*SA_COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (SA_PARAM_K)))
+
+#define SA_NDIGITS (SA_PARAM_K+1)
+#define SA_DIGITBITS (SA_PARAM_N/(SA_NDIGITS))
+#define SA_PROOFSIZE (1u<<SA_PARAM_K)
+#define SA_COMPRESSED_PROOFSIZE ((SA_COLLISION_BIT_LENGTH+1)*SA_PROOFSIZE*4/(8*sizeof(uint32_t)))
+
+namespace ocl {
+namespace algorithm {
+
+struct silentarmy {
+
+    int blocks;
+    int device_id;
+    int platform_id;
+
+    algorithm_detail::silentarmy_context* oclc;
+    // threads
+    unsigned threadsNum; // TMP
+    unsigned wokrsize;
+
+    bool is_init_success = false;
+
+    silentarmy(int platf_id, int dev_id)
+        : blocks(0)
+        , device_id(dev_id)
+        , platform_id(platf_id)
+        , oclc(nullptr)
+        , threadsNum(8192U)
+        , wokrsize(128)
+    {
+
+    }
+
+    static int getcount() {
+        static auto devices = utility::GetAllDevices();
+        return devices.size();
+    }
+
+    static void getinfo(int platf_id, int d_id, ::std::string& gpu_name, int& sm_count, ::std::string& version) {
+        static auto devices = utility::GetAllDevices();
+
+        if (devices.size() <= d_id) {
+            return;
+        }
+        auto device = devices[d_id];
+
+        ::std::vector<char> name(256, 0);
+        cl_uint compute_units = 0;
+
+        size_t nActualSize = 0;
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+        if (rc == CL_SUCCESS) {
+            gpu_name.assign(&name[0], nActualSize);
+        }
+
+        rc = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, &nActualSize);
+        if (rc == CL_SUCCESS) {
+            sm_count = (int)compute_units;
+        }
+
+        memset(&name[0], 0, name.size());
+        rc = clGetDeviceInfo(device, CL_DEVICE_VERSION, name.size(), &name[0], &nActualSize);
+        if (rc == CL_SUCCESS) {
+            version.assign(&name[0], nActualSize);
+        }
+    }
+
+    static void start(silentarmy& device_context) {
+        device_context.is_init_success = false;
+        device_context.oclc = new algorithm_detail::silentarmy_context;
+        auto devices = utility::GetAllDevices();
+
+        auto& device = devices[device_context.device_id];
+
+        size_t nActualSize = 0;
+        cl_platform_id platform_id = nullptr;
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, nullptr);
+
+
+        device_context.oclc->_dev_id = device;
+        device_context.oclc->platform_id = platform_id;
+
+        // context create
+        cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)device_context.oclc->platform_id, 0 };
+        cl_int error;
+        device_context.oclc->_context = clCreateContext(props, 1, &device, 0, 0, &error);
+        //OCLR(error, false);
+        if (cl_int err = error) {
+            printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
+            return;
+        }
+
+        cl_int binstatus;
+
+        char kernelName[64];
+        sprintf(kernelName, "silentarmy_gpu_%u.bin", (unsigned)device_context.device_id);
+        if (!utility::clCompileKernel(device_context.oclc->_context,
+            device,
+            kernelName,
+            { "zcash/gpu/silentarmy.cl" },
+            "",
+            &binstatus,
+            &device_context.oclc->_program)) {
+            return;
+        }
+
+        if (binstatus == CL_SUCCESS) {
+            if (!device_context.oclc->init(device, device_context.threadsNum, device_context.wokrsize)) {
+                printf("Init failed");
+                return;
+            }
+        } else {
+            printf("GPU %d: failed to load kernel\n", device_context.device_id);
+            return;
+        }
+
+        device_context.is_init_success = true;
+    }
+
+    static void stop(silentarmy& device_context) {
+        if (device_context.oclc != nullptr) delete device_context.oclc;
+    }
+
+    static void solve(const char *tequihash_header,
+        unsigned int tequihash_header_len,
+        const char* nonce,
+        unsigned int nonce_len,
+        std::function<bool()> cancelf,
+        std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+        std::function<void(void)> hashdonef,
+        silentarmy& device_context) {
+        using namespace ocl::crypto;
+        using namespace algorithm_detail;
+
+        unsigned char context[140];
+        memset(context, 0, 140);
+        memcpy(context, tequihash_header, tequihash_header_len);
+        memcpy(context + tequihash_header_len, nonce, nonce_len);
+
+        auto *miner = device_context.oclc;
+        clFlush(miner->queue);
+
+        blake2b_state_t initialCtx;
+        zcash_blake2b_init(&initialCtx, SA_ZCASH_HASH_LEN, SA_PARAM_N, SA_PARAM_K);
+        zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0);
+
+        cl_mem buf_blake_st;
+        buf_blake_st = check_clCreateBuffer(miner->_context, CL_MEM_READ_ONLY |
+            CL_MEM_COPY_HOST_PTR, sizeof(blake2b_state_s), &initialCtx);
+
+        for (unsigned round = 0; round < SA_PARAM_K; round++)
+        {
+            init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round & 1], miner->rowCounters[round & 1]);
+            if (!round)
+            {
+                check_clSetKernelArg(miner->k_rounds[round], 0, &buf_blake_st);
+                check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round & 1]);
+                check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[round & 2]);
+                miner->global_ws = select_work_size_blake(miner->_dev_id);
+            }
+            else
+            {
+                check_clSetKernelArg(miner->k_rounds[round], 0, &miner->buf_ht[(round - 1) & 1]);
+                check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round & 1]);
+                check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[(round - 1) & 1]);
+                check_clSetKernelArg(miner->k_rounds[round], 3, &miner->rowCounters[round & 1]);
+                miner->global_ws = SA_NR_ROWS;
+            }
+            check_clSetKernelArg(miner->k_rounds[round], round == 0 ? 3 : 4, &miner->buf_dbg);
+            if (round == SA_PARAM_K - 1)
+                check_clSetKernelArg(miner->k_rounds[round], 5, &miner->buf_sols);
+            check_clEnqueueNDRangeKernel(miner->queue, miner->k_rounds[round], 1, NULL,
+                &miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
+            // cancel function
+            if (cancelf()) return;
+        }
+        check_clSetKernelArg(miner->k_sols, 0, &miner->buf_ht[0]);
+        check_clSetKernelArg(miner->k_sols, 1, &miner->buf_ht[1]);
+        check_clSetKernelArg(miner->k_sols, 2, &miner->buf_sols);
+        check_clSetKernelArg(miner->k_sols, 3, &miner->rowCounters[0]);
+        check_clSetKernelArg(miner->k_sols, 4, &miner->rowCounters[1]);
+        miner->global_ws = SA_NR_ROWS;
+        check_clEnqueueNDRangeKernel(miner->queue, miner->k_sols, 1, NULL,
+            &miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
+
+        check_clEnqueueReadBuffer(miner->queue, miner->buf_sols,
+            CL_TRUE,               // cl_bool blocking_read
+            0,                     // size_t offset
+            sizeof(*miner->sols), // size_t size
+            miner->sols,          // void *ptr
+            0,                     // cl_uint num_events_in_wait_list
+            NULL,                  // cl_event *event_wait_list
+            NULL);                 // cl_event *event
+
+        if (miner->sols->nr > SA_MAX_SOLS)
+            miner->sols->nr = SA_MAX_SOLS;
+
+        clReleaseMemObject(buf_blake_st);
+
+        for (unsigned sol_i = 0; sol_i < miner->sols->nr; sol_i++) {
+            verify_sol(miner->sols, sol_i);
+        }
+
+        uint8_t proof[SA_COMPRESSED_PROOFSIZE * 2];
+        for (uint32_t i = 0; i < miner->sols->nr; i++) {
+            if (miner->sols->valid[i]) {
+                compress<SA_PREFIX>(proof, (uint32_t *)(miner->sols->values[i]), 1 << SA_PARAM_K);
+                solutionf(std::vector<uint32_t>(0), 1344, proof);
+            }
+        }
+        hashdonef();
+    }
+
+    std::string getname() const { return "OCL_SILENTARMY"; }
+
+    std::string getdevinfo() {
+        static auto devices = ocl::utility::GetAllDevices();
+        auto device = devices[device_id];
+        std::vector<char> name(256, 0);
+        size_t nActualSize = 0;
+        std::string gpu_name;
+
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+        gpu_name.assign(&name[0], nActualSize);
+
+        return "GPU_ID( " + gpu_name + ")";
+    }
+
+private:
+    std::string m_gpu_name;
+    std::string m_version;
+
+};
+
+}
+}
\ No newline at end of file
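[reviewer note] solve() ping-pongs between the two hash tables: round r reads the table round r-1 wrote and overwrites the other, which is why only two tables (and two row-counter buffers) are ever allocated for the K rounds. A print-out of the indexing, my own snippet with K = 9:

    #include <cstdio>
    int main()
    {
        for (unsigned round = 0; round < 9; round++) {
            if (round == 0)
                printf("round 0: reads blake state, writes ht[0]\n");
            else
                printf("round %u: reads ht[%u], writes ht[%u]\n",
                       round, (round - 1) & 1, round & 1);
        }
        return 0;
    }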
diff --git a/contrib/ocl/cl_ext.hpp b/contrib/ocl/cl_ext.hpp
new file mode 100644
index 000000000..507598171
--- /dev/null
+++ b/contrib/ocl/cl_ext.hpp
@@ -0,0 +1,12355 @@
+/*******************************************************************************
+* Copyright (c) 2008-2013 The Khronos Group Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and/or associated documentation files (the
+* "Materials"), to deal in the Materials without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Materials, and to
+* permit persons to whom the Materials are furnished to do so, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Materials.
+*
+* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+******************************************************************************/
+
+/*! \file
+*
+*   \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and
+*       OpenCL 1.2 (rev 15)
+*   \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
+*
+*   Additions and fixes from:
+*       Brian Cole, March 3rd 2010 and April 2012
+*       Matt Gruenke, April 2012.
+*       Bruce Merry, February 2013.
+*
+*   \version 1.2.5
+*   \date June 2013
+*
+*   Optional extension support
+*
+*       cl
+*       cl_ext_device_fission
+*           #define USE_CL_DEVICE_FISSION
+*/
+
+/*! \mainpage
+* \section intro Introduction
+* For many large applications C++ is the language of choice and so it seems
+* reasonable to define C++ bindings for OpenCL.
+*
+*
+* The interface is contained with a single C++ header file \em cl.hpp and all
+* definitions are contained within the namespace \em cl. There is no additional
+* requirement to include \em cl.h and to use either the C++ or original C
+* bindings it is enough to simply include \em cl.hpp.
+*
+* The bindings themselves are lightweight and correspond closely to the
+* underlying C API. Using the C++ bindings introduces no additional execution
+* overhead.
+*
+* For detail documentation on the bindings see:
+*
+* The OpenCL C++ Wrapper API 1.2 (revision 09)
+*  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
+*
+* \section example Example
+*
+* The following example shows a general use case for the C++
+* bindings, including support for the optional exception feature and
+* also the supplied vector and string classes, see following sections for
+* decriptions of these features.
+*
+* \code
+* #define __CL_ENABLE_EXCEPTIONS
+*
+* #if defined(__APPLE__) || defined(__MACOSX)
+* #include <OpenCL/cl.hpp>
+* #else
+* #include <CL/cl.hpp>
+* #endif
+* #include <cstdio>
+* #include <cstdlib>
+* #include <iostream>
+*
+*   const char * helloStr  = "__kernel void "
+*                            "hello(void) "
+*                            "{ "
+*                            "  "
+*                            "} ";
+*
+*   int
+*   main(void)
+*   {
+*      cl_int err = CL_SUCCESS;
+*      try {
+*
+*        std::vector<cl::Platform> platforms;
+*        cl::Platform::get(&platforms);
+*        if (platforms.size() == 0) {
+*            std::cout << "Platform size 0\n";
+*            return -1;
+*        }
+*
+*        cl_context_properties properties[] =
+*           { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+*        cl::Context context(CL_DEVICE_TYPE_CPU, properties);
+*
+*        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+*
+*        cl::Program::Sources source(1,
+*            std::make_pair(helloStr,strlen(helloStr)));
+*        cl::Program program_ = cl::Program(context, source);
+*        program_.build(devices);
+*
+*        cl::Kernel kernel(program_, "hello", &err);
+*
+*        cl::Event event;
+*        cl::CommandQueue queue(context, devices[0], 0, &err);
+*        queue.enqueueNDRangeKernel(
+*            kernel,
+*            cl::NullRange,
+*            cl::NDRange(4,4),
+*            cl::NullRange,
+*            NULL,
+*            &event);
+*
+*        event.wait();
+*      }
+*      catch (cl::Error err) {
+*         std::cerr
+*            << "ERROR: "
+*            << err.what()
+*            << "("
+*            << err.err()
+*            << ")"
+*            << std::endl;
+*      }
+*
+*      return EXIT_SUCCESS;
+*   }
+*
+* \endcode
+*
+*/
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <malloc.h>
+#include <iterator>
+#include <intrin.h>
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
+
+#pragma push_macro("max")
+#undef max
+#if defined(USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#include <CL/cl_dx9_media_sharing.h>
+#endif
+#endif // _WIN32
+
+//
+#if defined(USE_CL_DEVICE_FISSION)
+#include <CL/cl_ext.h> // AMD topology not needed here
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/opencl.h>
+#include <AvailabilityMacros.h>
+#else
+#include <GL/gl.h>
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+// To avoid accidentally taking ownership of core OpenCL types
+// such as cl_kernel constructors are made explicit
+// under OpenCL 1.2
+#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS explicit
+#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+// Define deprecated prefixes and suffixes to ensure compilation
+// in case they are not pre-defined
+#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <utility>
+#include <limits>
+
+#if !defined(__NO_STD_VECTOR)
+#include <vector>
+#endif
+
+#if !defined(__NO_STD_STRING)
+#include <string>
+#endif
+
+#if defined(__linux__) || defined(__APPLE__) || defined(__MACOSX)
+#include <alloca.h>
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif // __linux__
+
+#include <cstring>
+
+
+/*! \namespace cl
+*
+* \brief The OpenCL C++ bindings are defined within this namespace.
+*
+*/
+namespace cl {
+
+    class Memory;
+
+    /**
+     * Deprecated APIs for 1.2
+     */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+#define __INIT_CL_EXT_FCN_PTR(name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddress(#name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+#if defined(CL_VERSION_1_2)
+#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddressForPlatform(platform, #name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+    class Program;
+    class Device;
+    class Context;
+    class CommandQueue;
+    class Memory;
+    class Buffer;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+    /*! \brief Exception class
+     *
+     *  This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
+     */
+    class Error : public std::exception
+    {
+    private:
+        cl_int err_;
+        const char * errStr_;
+    public:
+        /*! \brief Create a new CL error exception for a given error code
+         *  and corresponding message.
+         *
+         *  \param err error code value.
+         *
+         *  \param errStr a descriptive string that must remain in scope until
+         *                handling of the exception has concluded. If set, it
+         *                will be returned by what().
+         */
+        Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+        {}
+
+        ~Error() throw() {}
+
+        /*! \brief Get error string associated with exception
+         *
+         * \return A memory pointer to the error message string.
+         */
+        virtual const char * what() const throw ()
+        {
+            if (errStr_ == NULL) {
+                return "empty";
+            }
+            else {
+                return errStr_;
+            }
+        }
+
+        /*! \brief Get error code associated with exception
+         *
+         *  \return The error code.
+         */
+        cl_int err(void) const { return err_; }
+    };
+
+#define __ERR_STR(x) #x
+#else
+#define __ERR_STR(x) NULL
+#endif // __CL_ENABLE_EXCEPTIONS
+
+
+    namespace detail
+    {
+#if defined(__CL_ENABLE_EXCEPTIONS)
+        static inline cl_int errHandler(
+            cl_int err,
+            const char * errStr = NULL)
+        {
+            if (err != CL_SUCCESS) {
+                throw Error(err, errStr);
+            }
+            return err;
+        }
+#else
+        static inline cl_int errHandler(cl_int err, const char * errStr = NULL)
+        {
+            (void)errStr; // suppress unused variable warning
+            return err;
+        }
+#endif // __CL_ENABLE_EXCEPTIONS
+    }
+
+
+    //! \cond DOXYGEN_DETAIL
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo)
+#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo)
+#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs)
+#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs)
+#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo)
+#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo)
+#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo)
+#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo)
+#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo)
+#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo)
+#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo)
+#if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo)
+#endif // #if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo)
+#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo)
+#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo)
+#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo)
+
+#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext)
+#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType)
+#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats)
+
+#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer)
+#define __COPY_ERR __ERR_STR(cl::copy)
+#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer)
+#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
+#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
+#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage)
+#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture)
+#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions)
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler)
+#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
+
+#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent)
+#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus)
+#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback)
+#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents)
+
+#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel)
+#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg)
+#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource)
+#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels)
+#endif // #if defined(CL_VERSION_1_2)
+#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram)
+#if defined(CL_VERSION_1_2)
+#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram)
+
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram)
+
+#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue)
+#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty)
+#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer)
+#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect)
+#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer)
+#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect)
+#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer)
+#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect)
+#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer)
+#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage)
+#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage)
+#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage)
+#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage)
+#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer)
+#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage)
+#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage)
+#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject)
+#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel)
+#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask)
+#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel)
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects)
+#endif // #if defined(CL_VERSION_1_2)
+
+#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects)
+#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects)
+
+
+#define __RETAIN_ERR __ERR_STR(Retain Object)
+#define __RELEASE_ERR __ERR_STR(Release Object)
+#define __FLUSH_ERR __ERR_STR(clFlush)
+#define __FINISH_ERR __ERR_STR(clFinish)
+#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error)
+
+    /**
+     * CL 1.2 version that uses device fission.
+     */
+#if defined(CL_VERSION_1_2)
+#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices)
+#else
+#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT)
+#endif // #if defined(CL_VERSION_1_2)
+
+    /**
+     * Deprecated APIs for 1.2
+     */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier)
+#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler)
+#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D)
+#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D)
+#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D)
+#endif // #if defined(CL_VERSION_1_1)
+
+#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
+    //! \endcond
+
+    /**
+     * CL 1.2 marker and barrier commands
+     */
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
+    typedef std::string STRING_CLASS;
+#elif !defined(__USE_DEV_STRING)
+
+    /*! \class string
+     * \brief Simple string class, that provides a limited subset of std::string
+     * functionality but avoids many of the issues that come with that class.
+
+     *  \note Deprecated. Please use std::string as default or
+     *  re-define the string class to match the std::string
+     *  interface by defining STRING_CLASS
+     */
+    class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+    private:
+        ::size_t size_;
+        char * str_;
+    public:
+        //! \brief Constructs an empty string, allocating no memory.
+        string(void) : size_(0), str_(NULL)
+        {
+        }
+
+        /*! \brief Constructs a string populated from an arbitrary value of
+         *  specified size.
+         *
+         *  An extra '\0' is added, in case none was contained in str.
+         *
+         *  \param str the initial value of the string instance. Note that '\0'
+         *             characters receive no special treatment. If NULL,
+         *             the string is left empty, with a size of 0.
+         *
+         *  \param size the number of characters to copy from str.
+         */
+        string(const char * str, ::size_t size) :
+            size_(size),
+            str_(NULL)
+        {
+            if (size > 0) {
+                str_ = new char[size_ + 1];
+                if (str_ != NULL) {
+                    memcpy(str_, str, size_ * sizeof(char));
+                    str_[size_] = '\0';
+                }
+                else {
+                    size_ = 0;
+                }
+            }
+        }
+
+        /*! \brief Constructs a string populated from a null-terminated value.
+         *
+         *  \param str the null-terminated initial value of the string instance.
+         *             If NULL, the string is left empty, with a size of 0.
+         */
+        string(const char * str) :
+            size_(0),
+            str_(NULL)
+        {
+            if (str) {
+                size_ = ::strlen(str);
+            }
+            if (size_ > 0) {
+                str_ = new char[size_ + 1];
+                if (str_ != NULL) {
+                    memcpy(str_, str, (size_ + 1) * sizeof(char));
+                }
+            }
+        }
+
+        void resize(::size_t n)
+        {
+            if (size_ == n) {
+                return;
+            }
+            if (n == 0) {
+                if (str_) {
+                    delete[] str_;
+                }
+                str_ = NULL;
+                size_ = 0;
+            }
+            else {
+                char *newString = new char[n + 1];
+                int copySize = n;
+                if (size_ < n) {
+                    copySize = size_;
+                }
+                size_ = n;
+
+                if (str_) {
+                    memcpy(newString, str_, (copySize + 1) * sizeof(char));
+                }
+                if (copySize < size_) {
+                    memset(newString + copySize, 0, size_ - copySize);
+                }
+                newString[size_] = '\0';
+
+                delete[] str_;
+                str_ = newString;
+            }
+        }
+
+        const char& operator[] (::size_t pos) const
+        {
+            return str_[pos];
+        }
+
+        char& operator[] (::size_t pos)
+        {
+            return str_[pos];
+        }
+
+        /*! \brief Copies the value of another string to this one.
+         *
+         *  \param rhs the string to copy.
+         *
+         *  \returns a reference to the modified instance.
+         */
+        string& operator=(const string& rhs)
+        {
+            if (this == &rhs) {
+                return *this;
+            }
+
+            if (str_ != NULL) {
+                delete[] str_;
+                str_ = NULL;
+                size_ = 0;
+            }
+
+            if (rhs.size_ == 0 || rhs.str_ == NULL) {
+                str_ = NULL;
+                size_ = 0;
+            }
+            else {
+                str_ = new char[rhs.size_ + 1];
+                size_ = rhs.size_;
+
+                if (str_ != NULL) {
+                    memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
+                }
+                else {
+                    size_ = 0;
+                }
+            }
+
+            return *this;
+        }
+
+        /*! \brief Constructs a string by copying the value of another instance.
+         *
+         *  \param rhs the string to copy.
+         */
+        string(const string& rhs) :
+            size_(0),
+            str_(NULL)
+        {
+            *this = rhs;
+        }
+
+        //! \brief Destructor - frees memory used to hold the current value.
+        ~string()
+        {
+            delete[] str_;
+            str_ = NULL;
+        }
+
+        //! \brief Queries the length of the string, excluding any added '\0's.
+        ::size_t size(void) const { return size_; }
+
+        //! \brief Queries the length of the string, excluding any added '\0's.
+        ::size_t length(void) const { return size(); }
+
+        /*! \brief Returns a pointer to the private copy held by this instance,
+         *  or "" if empty/unset.
+         */
+        const char * c_str(void) const { return (str_) ? str_ : ""; }
+    };
+    typedef cl::string STRING_CLASS;
+#endif // #elif !defined(__USE_DEV_STRING)
+
+#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+#define VECTOR_CLASS std::vector
+#elif !defined(__USE_DEV_VECTOR)
+#define VECTOR_CLASS cl::vector
+
+#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
+#define __MAX_DEFAULT_VECTOR_SIZE 10
+#endif
+
+    /*! \class vector
+     * \brief Fixed sized vector implementation that mirroring
+     *
+     * \note Deprecated. Please use std::vector as default or
+     *       re-define the vector class to match the std::vector
+     *       interface by defining VECTOR_CLASS
+
+     * \note Not recommended for use with custom objects as
+     *       current implementation will construct N elements
+     *
+     * std::vector functionality.
+     * \brief Fixed sized vector compatible with std::vector.
+     *
+     * \note
+     * This differs from std::vector<> not just in memory allocation,
+     * but also in terms of when members are constructed, destroyed,
+     * and assigned instead of being copy constructed.
+     *
+     * \param T type of element contained in the vector.
+     *
+     * \param N maximum size of the vector.
+     */
+    template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
+    class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+    private:
+        T data_[N];
+        unsigned int size_;
+
+    public:
+        //! \brief Constructs an empty vector with no memory allocated.
+        vector() :
+            size_(static_cast<unsigned int>(0))
+        {}
+
+        //! \brief Deallocates the vector's memory and destroys all of its elements.
+        ~vector()
+        {
+            clear();
+        }
+
+        //! \brief Returns the number of elements currently contained.
+        unsigned int size(void) const
+        {
+            return size_;
+        }
+
+        /*! \brief Empties the vector of all elements.
+         *  \note
+         *  This does not deallocate memory but will invoke destructors
+         *  on contained elements.
+         */
+        void clear()
+        {
+            while (!empty()) {
+                pop_back();
+            }
+        }
+
+        /*! \brief Appends an element after the last valid element.
+         * Calling this on a vector that has reached capacity will throw an
+         * exception if exceptions are enabled.
+         */
+        void push_back(const T& x)
+        {
+            if (size() < N) {
+                new (&data_[size_]) T(x);
+                size_++;
+            }
+            else {
+                detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+            }
+        }
+
+        /*! \brief Removes the last valid element from the vector.
+         * Calling this on an empty vector will throw an exception
+         * if exceptions are enabled.
+         */
+        void pop_back(void)
+        {
+            if (size_ != 0) {
+                --size_;
+                data_[size_].~T();
+            }
+            else {
+                detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+            }
+        }
+
+        /*! \brief Constructs with a value copied from another.
+         *
+         *  \param vec the vector to copy.
+         */
+        vector(const vector<T, N>& vec) :
+            size_(vec.size_)
+        {
+            if (size_ != 0) {
+                assign(vec.begin(), vec.end());
+            }
+        }
+
+        /*! \brief Constructs with a specified number of initial elements.
+         *
+         *  \param size number of initial elements.
+         *
+         *  \param val value of initial elements.
+         */
+        vector(unsigned int size, const T& val = T()) :
+            size_(0)
+        {
+            for (unsigned int i = 0; i < size; i++) {
+                push_back(val);
+            }
+        }
+
+        /*! \brief Overwrites the current content with that copied from another
+         *         instance.
+         *
+         *  \param rhs vector to copy.
+         *
+         *  \returns a reference to this.
+         */
+        vector<T, N>& operator=(const vector<T, N>& rhs)
+        {
+            if (this == &rhs) {
+                return *this;
+            }
+
+            if (rhs.size_ != 0) {
+                assign(rhs.begin(), rhs.end());
+            }
+            else {
+                clear();
+            }
+
+            return *this;
+        }
+
+        /*! \brief Tests equality against another instance.
+         *
+         *  \param vec the vector against which to compare.
+         */
+        bool operator==(vector<T, N> &vec)
+        {
+            if (size() != vec.size()) {
+                return false;
+            }
+
+            for (unsigned int i = 0; i < size(); ++i) {
+                if (operator[](i) != vec[i]) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        //! \brief Conversion operator to T*.
+        operator T* () { return data_; }
+
+        //! \brief Conversion operator to const T*.
+        operator const T* () const { return data_; }
+
+        //! \brief Tests whether this instance has any elements.
+        bool empty(void) const
+        {
+            return size_ == 0;
+        }
+
+        //! \brief Returns the maximum number of elements this instance can hold.
+        unsigned int max_size(void) const
+        {
+            return N;
+        }
+
+        //! \brief Returns the maximum number of elements this instance can hold.
+        unsigned int capacity() const
+        {
+            return N;
+        }
+
+        /*! \brief Returns a reference to a given element.
+         *
+         *  \param index which element to access.     *
+         *  \note
+         *  The caller is responsible for ensuring index is >= 0 and < size().
+         */
+        T& operator[](int index)
+        {
+            return data_[index];
+        }
+
+        /*! \brief Returns a const reference to a given element.
+         *
+         *  \param index which element to access.
+         *
+         *  \note
+         *  The caller is responsible for ensuring index is >= 0 and < size().
+         */
+        const T& operator[](int index) const
+        {
+            return data_[index];
+        }
+
+        /*! \brief Assigns elements of the vector based on a source iterator range.
+         *
+         *  \param start Beginning iterator of source range
+         *  \param end Enditerator of source range
+         *
+         *  \note
+         *  Will throw an exception if exceptions are enabled and size exceeded.
+         */
+        template<class I>
+        void assign(I start, I end)
+        {
+            clear();
+            while (start != end) {
+                push_back(*start);
+                start++;
+            }
+        }
+
+        /*! \class iterator
+         * \brief Const iterator class for vectors
+         */
+        class iterator
+        {
+        private:
+            const vector<T, N> *vec_;
+            int index_;
+
+            /**
+             * Internal iterator constructor to capture reference
+             * to the vector it iterates over rather than taking
+             * the vector by copy.
+             */
+            iterator(const vector<T, N> &vec, int index) :
+                vec_(&vec)
+            {
+                if (!vec.empty()) {
+                    index_ = index;
+                }
+                else {
+                    index_ = -1;
+                }
+            }
+
+        public:
+            iterator(void) :
+                index_(-1),
+                vec_(NULL)
+            {
+            }
+
+            iterator(const iterator& rhs) :
+                vec_(rhs.vec_),
+                index_(rhs.index_)
+            {
+            }
+
+            ~iterator(void) {}
+
+            static iterator begin(const cl::vector<T, N> &vec)
+            {
+                iterator i(vec, 0);
+
+                return i;
+            }
+
+            static iterator end(const cl::vector<T, N> &vec)
+            {
+                iterator i(vec, vec.size());
+
+                return i;
+            }
+
+            bool operator==(iterator i)
+            {
+                return ((vec_ == i.vec_) &&
+                    (index_ == i.index_));
+            }
+
+            bool operator!=(iterator i)
+            {
+                return (!(*this == i));
+            }
+
+            iterator& operator++()
+            {
+                ++index_;
+                return *this;
+            }
+
+            iterator operator++(int)
+            {
+                iterator retVal(*this);
+                ++index_;
+                return retVal;
+            }
+
+            iterator& operator--()
+            {
+                --index_;
+                return *this;
+            }
+
+            iterator operator--(int)
+            {
+                iterator retVal(*this);
+                --index_;
+                return retVal;
+            }
+
+            const T& operator *() const
+            {
+                return (*vec_)[index_];
+            }
+        };
+
+        iterator begin(void)
+        {
+            return iterator::begin(*this);
+        }
+
+        iterator begin(void) const
+        {
+            return iterator::begin(*this);
+        }
+
+        iterator end(void)
+        {
+            return iterator::end(*this);
+        }
+
+        iterator end(void) const
+        {
+            return iterator::end(*this);
+        }
+
+        T& front(void)
+        {
+            return data_[0];
+        }
+
+        T& back(void)
+        {
+            return data_[size_ - 1];
+        }
+
+        const T& front(void) const
+        {
+            return data_[0];
+        }
+
+        const T& back(void) const
+        {
+            return data_[size_ - 1];
+        }
+    };
+#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+
+
+
+
+
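[reviewer note] The deprecated cl::vector above is a fixed-capacity, stack-backed stand-in for std::vector; with __NO_STD_VECTOR defined, VECTOR_CLASS maps to it. A short usage sketch of mine:

    #include <cstdio>
    #define __NO_STD_VECTOR
    // #include <CL/cl.hpp>             // i.e. this vendored header

    int main()
    {
        cl::vector<int, 4> v;           // capacity fixed at N = 4, no heap allocation
        v.push_back(1);
        v.push_back(2);
        for (cl::vector<int, 4>::iterator it = v.begin(); it != v.end(); ++it)
            printf("%d\n", *it);        // iteration is read-only (const iterator)
        return 0;
    }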
+    namespace detail {
+#define __DEFAULT_NOT_INITIALIZED 1
+#define __DEFAULT_BEING_INITIALIZED 2
+#define __DEFAULT_INITIALIZED 4
+
+        /*
+         * Compare and exchange primitives are needed for handling of defaults
+         */
+        inline int compare_exchange(volatile int * dest, int exchange, int comparand)
+        {
+#ifdef _WIN32
+            return (int)(InterlockedCompareExchange(
+                (volatile long*)dest,
+                (long)exchange,
+                (long)comparand));
+#elif defined(__APPLE__) || defined(__MACOSX)
+            return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest);
+#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
+            return (__sync_val_compare_and_swap(
+                dest,
+                comparand,
+                exchange));
+#endif // !_WIN32
+        }
+
+        inline void fence() { _mm_mfence(); }
+    }; // namespace detail
+
+
+    /*! \brief class used to interface between C++ and
+     *  OpenCL C calls that require arrays of size_t values, whose
+     *  size is known statically.
+     */
+    template <int N>
+    class size_t
+    {
+    private:
+        ::size_t data_[N];
+
+    public:
+        //! \brief Initialize size_t to all 0s
+        size_t()
+        {
+            for (int i = 0; i < N; ++i) {
+                data_[i] = 0;
+            }
+        }
+
+        ::size_t& operator[](int index)
+        {
+            return data_[index];
+        }
+
+        const ::size_t& operator[](int index) const
+        {
+            return data_[index];
+        }
+
+        //! \brief Conversion operator to T*.
+        operator ::size_t* ()             { return data_; }
+
+        //! \brief Conversion operator to const T*.
+        operator const ::size_t* () const { return data_; }
+    };
+
+    namespace detail {
+
+        // Generic getInfoHelper. The final parameter is used to guide overload
+        // resolution: the actual parameter passed is an int, which makes this
+        // a worse conversion sequence than a specialization that declares the
+        // parameter as an int.
+        template<typename Functor, typename T>
+        inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+        {
+            return f(name, sizeof(T), param, NULL);
+        }
+
+        // Specialized getInfoHelper for VECTOR_CLASS params
+        template <typename Func, typename T>
+        inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
+        {
+            ::size_t required;
+            cl_int err = f(name, 0, NULL, &required);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            T* value = (T*)alloca(required);
+            err = f(name, required, value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            param->assign(&value[0], &value[required / sizeof(T)]);
+            return CL_SUCCESS;
+        }
+
+        /* Specialization for reference-counted types. This depends on the
+         * existence of Wrapper<T>::cl_type, and none of the other types having the
+         * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+         * does not work, because when using a derived type (e.g. Context) the generic
+         * template will provide a better match.
+         */
+        template <typename Func, typename T>
+        inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
+        {
+            ::size_t required;
+            cl_int err = f(name, 0, NULL, &required);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            typename T::cl_type * value = (typename T::cl_type *) alloca(required);
+            err = f(name, required, value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            ::size_t elements = required / sizeof(typename T::cl_type);
+            param->assign(&value[0], &value[elements]);
+            for (::size_t i = 0; i < elements; i++)
+            {
+                if (value[i] != NULL)
+                {
+                    err = (*param)[i].retain();
+                    if (err != CL_SUCCESS) {
+                        return err;
+                    }
+                }
+            }
+            return CL_SUCCESS;
+        }
+
+        // Specialized for getInfo<CL_PROGRAM_BINARIES>
+        template <typename Func>
+        inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
+        {
+            cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
+
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            return CL_SUCCESS;
+        }
+
+        // Specialized GetInfoHelper for STRING_CLASS params
+        template <typename Func>
+        inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
+        {
+            ::size_t required;
+            cl_int err = f(name, 0, NULL, &required);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            char* value = (char*)alloca(required);
+            err = f(name, required, value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            *param = value;
+            return CL_SUCCESS;
+        }
+
+        // Specialized GetInfoHelper for cl::size_t params
+        template <typename Func, ::size_t N>
+        inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
+        {
+            ::size_t required;
+            cl_int err = f(name, 0, NULL, &required);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            ::size_t* value = (::size_t*) alloca(required);
+            err = f(name, required, value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            for (int i = 0; i < N; ++i) {
+                (*param)[i] = value[i];
+            }
+
+            return CL_SUCCESS;
+        }
+
+        template<typename T> struct ReferenceHandler;
+
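[reviewer note] These overloads are the plumbing behind the typed getInfo<> accessors defined further down in this 12k-line header: overload resolution picks the STRING_CLASS, vector, size_t<N> or reference-counted variant from the parameter type. Typical calls through the standard cl.hpp API look like:

    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    std::vector<cl::Device> devices;
    platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
    std::string name = devices[0].getInfo<CL_DEVICE_NAME>();                  // STRING_CLASS overload
    cl_uint cus = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();          // scalar overload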
+        /* Specialization for reference-counted types. This depends on the
+         * existence of Wrapper<T>::cl_type, and none of the other types having the
+         * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+         * does not work, because when using a derived type (e.g. Context) the generic
+         * template will provide a better match.
+         */
+        template <typename Func, typename T>
+        inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+        {
+            typename T::cl_type value;
+            cl_int err = f(name, sizeof(value), &value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+            *param = value;
+            if (value != NULL)
+            {
+                err = param->retain();
+                if (err != CL_SUCCESS) {
+                    return err;
+                }
+            }
+            return CL_SUCCESS;
+        }
+
+#define __PARAM_NAME_INFO_1_0(F) \
+    F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
+    F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
cl_platform_id) \ + F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \ + F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \ + \ + F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ + F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS) \ + F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS) \ + \ + F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ + F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ + F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ + F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \ + \ + F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ + \ + F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ + F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ + F(cl_mem_info, CL_MEM_SIZE, ::size_t) \ + F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ + F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ + \ + F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ + F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \ + F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \ + F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \ + F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \ + \ + F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ + F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ + F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \ + F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \ + F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \ + \ + F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ + F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ + F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ + F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS) \ + F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \ + F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \ + F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \ + \ + F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \ + F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ + F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \ + F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ + \ + F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ + F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ + F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) + +#if defined(CL_VERSION_1_1) +#define __PARAM_NAME_INFO_1_1(F) \ + F(cl_context_info, CL_CONTEXT_NUM_DEVICES, 
cl_uint)\ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \ + \ + F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ + F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ + \ + F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) +#endif // CL_VERSION_1_1 + + +#if defined(CL_VERSION_1_2) +#define __PARAM_NAME_INFO_1_2(F) \ + F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \ + \ + F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \ + F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ + \ + F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \ + \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \ + \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \ + F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ + F(cl_device_info, CL_DEVICE_TOPOLOGY_AMD, cl_device_topology_amd) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS) +#endif // #if defined(CL_VERSION_1_2) + +#if defined(USE_CL_DEVICE_FISSION) +#define __PARAM_NAME_DEVICE_FISSION(F) \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ + F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS) +#endif // USE_CL_DEVICE_FISSION + + template + struct param_traits {}; + +#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \ +struct token; \ +template<> \ +struct param_traits \ + { \ + enum { value = param_name }; \ + typedef T param_type; \ + }; + + __PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS) +#if defined(CL_VERSION_1_1) + __PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 +#if defined(CL_VERSION_1_2) + __PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 + +#if defined(USE_CL_DEVICE_FISSION) + __PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS); +#endif // USE_CL_DEVICE_FISSION + +#ifdef CL_PLATFORM_ICD_SUFFIX_KHR + 
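+ // The invocations below extend the same param_traits mapping to extension
+ // tokens; e.g. param_traits<cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR>::param_type
+ // becomes STRING_CLASS, which is what the typed getInfo<token>() wrappers
+ // later in this header use to pick their return type.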
__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS) +#endif + +#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) +#endif + +#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>) +#endif +#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_WIDTH_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) +#endif + +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint) +#endif +#ifdef CL_DEVICE_WARP_SIZE_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint) +#endif +#ifdef CL_DEVICE_GPU_OVERLAP_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool) +#endif +#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool) +#endif +#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) +#endif + + // Convenience functions + + template + inline cl_int + getInfo(Func f, cl_uint name, T* param) + { + return getInfoHelper(f, name, param, 0); + } + + template + struct GetInfoFunctor0 + { + Func f_; const Arg0& arg0_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { + return f_(arg0_, param, size, value, size_ret); + } + }; + + template + struct GetInfoFunctor1 + { + Func f_; const Arg0& arg0_; const Arg1& arg1_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { + return f_(arg0_, arg1_, param, size, value, size_ret); + } + }; + + template + inline cl_int + getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) + { + GetInfoFunctor0 f0 = { f, arg0 }; + return getInfoHelper(f0, name, param, 0); + } + + template + inline cl_int + getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) + { + GetInfoFunctor1 f0 = { f, arg0, arg1 }; + 
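+         // f0 binds the leading OpenCL object arguments, so every overload of
+         // getInfoHelper sees the same (name, size, value, size_ret) call shape.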
return getInfoHelper(f0, name, param, 0);
+ }
+
+ template <typename T>
+ struct ReferenceHandler
+ { };
+
+#if defined(CL_VERSION_1_2)
+ /**
+  * OpenCL 1.2 devices do have retain/release.
+  */
+ template <>
+ struct ReferenceHandler<cl_device_id>
+ {
+     /**
+      * Retain the device.
+      * \param device A valid device created using createSubDevices
+      * \return
+      *   CL_SUCCESS if the function executed successfully.
+      *   CL_INVALID_DEVICE if device was not a valid subdevice
+      *   CL_OUT_OF_RESOURCES
+      *   CL_OUT_OF_HOST_MEMORY
+      */
+     static cl_int retain(cl_device_id device)
+     {
+         return ::clRetainDevice(device);
+     }
+     /**
+      * Release the device.
+      * \param device A valid device created using createSubDevices
+      * \return
+      *   CL_SUCCESS if the function executed successfully.
+      *   CL_INVALID_DEVICE if device was not a valid subdevice
+      *   CL_OUT_OF_RESOURCES
+      *   CL_OUT_OF_HOST_MEMORY
+      */
+     static cl_int release(cl_device_id device)
+     {
+         return ::clReleaseDevice(device);
+     }
+ };
+#else // #if defined(CL_VERSION_1_2)
+ /**
+  * OpenCL 1.1 devices do not have retain/release.
+  */
+ template <>
+ struct ReferenceHandler<cl_device_id>
+ {
+     // cl_device_id does not have retain().
+     static cl_int retain(cl_device_id)
+     {
+         return CL_SUCCESS;
+     }
+     // cl_device_id does not have release().
+     static cl_int release(cl_device_id)
+     {
+         return CL_SUCCESS;
+     }
+ };
+#endif // #if defined(CL_VERSION_1_2)
+
+ template <>
+ struct ReferenceHandler<cl_platform_id>
+ {
+     // cl_platform_id does not have retain().
+     static cl_int retain(cl_platform_id)
+     {
+         return CL_SUCCESS;
+     }
+     // cl_platform_id does not have release().
+     static cl_int release(cl_platform_id)
+     {
+         return CL_SUCCESS;
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_context>
+ {
+     static cl_int retain(cl_context context)
+     {
+         return ::clRetainContext(context);
+     }
+     static cl_int release(cl_context context)
+     {
+         return ::clReleaseContext(context);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_command_queue>
+ {
+     static cl_int retain(cl_command_queue queue)
+     {
+         return ::clRetainCommandQueue(queue);
+     }
+     static cl_int release(cl_command_queue queue)
+     {
+         return ::clReleaseCommandQueue(queue);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_mem>
+ {
+     static cl_int retain(cl_mem memory)
+     {
+         return ::clRetainMemObject(memory);
+     }
+     static cl_int release(cl_mem memory)
+     {
+         return ::clReleaseMemObject(memory);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_sampler>
+ {
+     static cl_int retain(cl_sampler sampler)
+     {
+         return ::clRetainSampler(sampler);
+     }
+     static cl_int release(cl_sampler sampler)
+     {
+         return ::clReleaseSampler(sampler);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_program>
+ {
+     static cl_int retain(cl_program program)
+     {
+         return ::clRetainProgram(program);
+     }
+     static cl_int release(cl_program program)
+     {
+         return ::clReleaseProgram(program);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_kernel>
+ {
+     static cl_int retain(cl_kernel kernel)
+     {
+         return ::clRetainKernel(kernel);
+     }
+     static cl_int release(cl_kernel kernel)
+     {
+         return ::clReleaseKernel(kernel);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_event>
+ {
+     static cl_int retain(cl_event event)
+     {
+         return ::clRetainEvent(event);
+     }
+     static cl_int release(cl_event event)
+     {
+         return ::clReleaseEvent(event);
+     }
+ };
+
+
+ // Extracts version number with major in the upper 16 bits, minor in the lower 16
+ static cl_uint getVersion(const char *versionInfo)
+ {
+     int highVersion = 0;
+     int lowVersion = 0;
+     int index = 7; // skip the "OpenCL " prefix of the version string
+     while (versionInfo[index] != '.') {
+         highVersion *= 10;
+         highVersion += versionInfo[index] - '0';
+         ++index;
+     }
+     ++index;
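+     // the minor version digits run from just past the '.' to the space
+     // before the vendor-specific part of the version string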
while (versionInfo[index] != ' ') { + lowVersion *= 10; + lowVersion += versionInfo[index] - '0'; + ++index; + } + return (highVersion << 16) | lowVersion; + } + + static cl_uint getPlatformVersion(cl_platform_id platform) + { + ::size_t size = 0; + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); + char *versionInfo = (char *)alloca(size); + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size); + return getVersion(versionInfo); + } + + static cl_uint getDevicePlatformVersion(cl_device_id device) + { + cl_platform_id platform; + clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); + return getPlatformVersion(platform); + } + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + static cl_uint getContextPlatformVersion(cl_context context) + { + // The platform cannot be queried directly, so we first have to grab a + // device and obtain its context + ::size_t size = 0; + clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); + if (size == 0) + return 0; + cl_device_id *devices = (cl_device_id *)alloca(size); + clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL); + return getDevicePlatformVersion(devices[0]); + } +#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + + template + class Wrapper + { + public: + typedef T cl_type; + + protected: + cl_type object_; + + public: + Wrapper() : object_(NULL) { } + + Wrapper(const cl_type &obj) : object_(obj) { } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper& operator = (const Wrapper& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + return *this; + } + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + + protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + cl_int retain() const + { + return ReferenceHandler::retain(object_); + } + + cl_int release() const + { + return ReferenceHandler::release(object_); + } + }; + + template <> + class Wrapper + { + public: + typedef cl_device_id cl_type; + + protected: + cl_type object_; + bool referenceCountable_; + + static bool isReferenceCountable(cl_device_id device) + { + bool retVal = false; + if (device != NULL) { + int version = getDevicePlatformVersion(device); + if (version > ((1 << 16) + 1)) { + retVal = true; + } + } + return retVal; + } + + public: + Wrapper() : object_(NULL), referenceCountable_(false) + { + } + + Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) + { + referenceCountable_ = isReferenceCountable(obj); + } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + referenceCountable_ = isReferenceCountable(object_); + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper& operator = (const Wrapper& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + referenceCountable_ = rhs.referenceCountable_; + if (object_ != NULL) { 
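+             // copying a non-NULL handle takes an extra reference on the
+             // underlying OpenCL object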
detail::errHandler(retain(), __RETAIN_ERR); } + return *this; + } + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + referenceCountable_ = isReferenceCountable(object_); + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + + protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + template + friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS*, int, typename U::cl_type); + + cl_int retain() const + { + if (referenceCountable_) { + return ReferenceHandler::retain(object_); + } + else { + return CL_SUCCESS; + } + } + + cl_int release() const + { + if (referenceCountable_) { + return ReferenceHandler::release(object_); + } + else { + return CL_SUCCESS; + } + } + }; + + } // namespace detail + //! \endcond + + /*! \stuct ImageFormat + * \brief Adds constructors and member functions for cl_image_format. + * + * \see cl_image_format + */ + struct ImageFormat : public cl_image_format + { + //! \brief Default constructor - performs no initialization. + ImageFormat(){} + + //! \brief Initializing constructor. + ImageFormat(cl_channel_order order, cl_channel_type type) + { + image_channel_order = order; + image_channel_data_type = type; + } + + //! \brief Assignment operator. + ImageFormat& operator = (const ImageFormat& rhs) + { + if (this != &rhs) { + this->image_channel_data_type = rhs.image_channel_data_type; + this->image_channel_order = rhs.image_channel_order; + } + return *this; + } + }; + + /*! \brief Class interface for cl_device_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_device_id + */ + class Device : public detail::Wrapper + { + public: + //! \brief Default constructor - initializes to NULL. + Device() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const Device& device) : detail::Wrapper(device) { } + + /*! \brief Constructor from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const cl_device_id &device) : detail::Wrapper(device) { } + + /*! \brief Returns the first device on the default context. + * + * \see Context::getDefault() + */ + static Device getDefault(cl_int * err = NULL); + + /*! \brief Assignment operator from Device. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const Device& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const cl_device_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetDeviceInfo(). + template + cl_int getInfo(cl_device_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetDeviceInfo, object_, name, param), + __GET_DEVICE_INFO_ERR); + } + + //! \brief Wrapper for clGetDeviceInfo() that returns by value. 
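+ //! A call sketch (illustrative): STRING_CLASS name = device.getInfo<CL_DEVICE_NAME>();
+ //! param_traits supplies the return type, and the error code, when wanted,
+ //! is reported through the optional out-parameter.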
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_device_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+     typename detail::param_traits<
+         detail::cl_device_info, name>::param_type param;
+     cl_int result = getInfo(name, &param);
+     if (err != NULL) {
+         *err = result;
+     }
+     return param;
+ }
+
+ /**
+  * CL 1.2 version
+  */
+#if defined(CL_VERSION_1_2)
+ //! \brief Wrapper for clCreateSubDevices().
+ cl_int createSubDevices(
+     const cl_device_partition_property * properties,
+     VECTOR_CLASS<Device>* devices)
+ {
+     cl_uint n = 0;
+     cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
+     if (err != CL_SUCCESS) {
+         return detail::errHandler(err, __CREATE_SUB_DEVICES);
+     }
+
+     cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id));
+     err = clCreateSubDevices(object_, properties, n, ids, NULL);
+     if (err != CL_SUCCESS) {
+         return detail::errHandler(err, __CREATE_SUB_DEVICES);
+     }
+
+     devices->assign(&ids[0], &ids[n]);
+     return CL_SUCCESS;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ /**
+  * CL 1.1 version that uses device fission.
+  */
+#if defined(CL_VERSION_1_1)
+#if defined(USE_CL_DEVICE_FISSION)
+ cl_int createSubDevices(
+     const cl_device_partition_property_ext * properties,
+     VECTOR_CLASS<Device>* devices)
+ {
+     typedef CL_API_ENTRY cl_int
+         (CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+             cl_device_id /*in_device*/,
+             const cl_device_partition_property_ext * /* properties */,
+             cl_uint /*num_entries*/,
+             cl_device_id * /*out_devices*/,
+             cl_uint * /*num_devices*/) CL_EXT_SUFFIX__VERSION_1_1;
+
+     static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+     __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
+
+     cl_uint n = 0;
+     cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
+     if (err != CL_SUCCESS) {
+         return detail::errHandler(err, __CREATE_SUB_DEVICES);
+     }
+
+     cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id));
+     err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
+     if (err != CL_SUCCESS) {
+         return detail::errHandler(err, __CREATE_SUB_DEVICES);
+     }
+
+     devices->assign(&ids[0], &ids[n]);
+     return CL_SUCCESS;
+ }
+#endif // #if defined(USE_CL_DEVICE_FISSION)
+#endif // #if defined(CL_VERSION_1_1)
+ };
+
+ /*! \brief Class interface for cl_platform_id.
+  *
+  * \note Copies of these objects are inexpensive, since they don't 'own'
+  * any underlying resources or data structures.
+  *
+  * \see cl_platform_id
+  */
+ class Platform : public detail::Wrapper<cl_platform_id>
+ {
+ public:
+     //! \brief Default constructor - initializes to NULL.
+     Platform() : detail::Wrapper<cl_type>() { }
+
+     /*! \brief Copy constructor.
+      *
+      * This simply copies the platform ID value, which is an inexpensive operation.
+      */
+     Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
+
+     /*! \brief Constructor from cl_platform_id.
+      *
+      * This simply copies the platform ID value, which is an inexpensive operation.
+      */
+     Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
+
+     /*! \brief Assignment operator from Platform.
+      *
+      * This simply copies the platform ID value, which is an inexpensive operation.
+      */
+     Platform& operator = (const Platform& rhs)
+     {
+         if (this != &rhs) {
+             detail::Wrapper<cl_type>::operator=(rhs);
+         }
+         return *this;
+     }
+
+     /*! \brief Assignment operator from cl_platform_id.
+      *
+      * This simply copies the platform ID value, which is an inexpensive operation.
+      */
+     Platform& operator = (const cl_platform_id& rhs)
+     {
+         detail::Wrapper<cl_type>::operator=(rhs);
+         return *this;
+     }
+
+     //! \brief Wrapper for clGetPlatformInfo().
+ cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetPlatformInfo, object_, name, param), + __GET_PLATFORM_INFO_ERR); + } + + //! \brief Wrapper for clGetPlatformInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_platform_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of devices for this platform. + * + * Wraps clGetDeviceIDs(). + */ + cl_int getDevices( + cl_device_type type, + VECTOR_CLASS* devices) const + { + cl_uint n = 0; + if (devices == NULL) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id)); + err = ::clGetDeviceIDs(object_, type, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + +#if defined(USE_DX_INTEROP) + /*! \brief Get the list of available D3D10 devices. + * + * \param d3d_device_source. + * + * \param d3d_object. + * + * \param d3d_device_set. + * + * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device + * values returned in devices can be used to identify a specific OpenCL + * device. If \a devices argument is NULL, this argument is ignored. + * + * \return One of the following values: + * - CL_SUCCESS if the function is executed successfully. + * + * The application can query specific capabilities of the OpenCL device(s) + * returned by cl::getDevices. This can be used by the application to + * determine which device(s) to use. + * + * \note In the case that exceptions are enabled and a return value + * other than CL_SUCCESS is generated, then cl::Error exception is + * generated. + */ + cl_int getDevices( + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + VECTOR_CLASS* devices) const + { + typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint* num_devices); + + if (devices == NULL) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + + static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; + __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR); + + cl_uint n = 0; + cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + 0, + NULL, + &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id)); + err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + n, + ids, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif + + /*! \brief Gets a list of available platforms. + * + * Wraps clGetPlatformIDs(). 
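+ *
+ * A discovery sketch (illustrative):
+ * \code
+ * VECTOR_CLASS<cl::Platform> platforms;
+ * if (cl::Platform::get(&platforms) == CL_SUCCESS && platforms.size() != 0) {
+ *     VECTOR_CLASS<cl::Device> devices;
+ *     platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
+ * }
+ * \endcode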
+ */ + static cl_int get( + VECTOR_CLASS* platforms) + { + cl_uint n = 0; + + if (platforms == NULL) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*)alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + platforms->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static cl_int get( + Platform * platform) + { + cl_uint n = 0; + + if (platform == NULL) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*)alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + *platform = ids[0]; + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform, returning it by value. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static Platform get( + cl_int * errResult = NULL) + { + Platform platform; + cl_uint n = 0; + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + if (errResult != NULL) { + *errResult = err; + } + } + + cl_platform_id* ids = (cl_platform_id*)alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + if (errResult != NULL) { + *errResult = err; + } + + return ids[0]; + } + + static Platform getDefault( + cl_int *errResult = NULL) + { + return get(errResult); + } + + +#if defined(CL_VERSION_1_2) + //! \brief Wrapper for clUnloadCompiler(). + cl_int + unloadCompiler() + { + return ::clUnloadPlatformCompiler(object_); + } +#endif // #if defined(CL_VERSION_1_2) + }; // class Platform + + /** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + /** + * Unload the OpenCL compiler. + * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. + */ + inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int + UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + inline cl_int + UnloadCompiler() + { + return ::clUnloadCompiler(); + } +#endif // #if defined(CL_VERSION_1_1) + + /*! \brief Class interface for cl_context. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_context as the original. For details, see + * clRetainContext() and clReleaseContext(). + * + * \see cl_context + */ + class Context + : public detail::Wrapper + { + private: + static volatile int default_initialized_; + static Context default_; + static volatile cl_int default_error_; + public: + /*! \brief Destructor. + * + * This calls clReleaseContext() on the value held by this instance. + */ + ~Context() { } + + /*! \brief Constructs a context including a list of specified devices. + * + * Wraps clCreateContext(). 
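+ *
+ * A construction sketch (illustrative; \a devices as obtained from
+ * Platform::getDevices()):
+ * \code
+ * cl_int err = CL_SUCCESS;
+ * cl::Context context(devices, NULL, NULL, NULL, &err);
+ * \endcode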
+ */ + Context( + const VECTOR_CLASS& devices, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id)); + for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateContext( + properties, (cl_uint)numDevices, + deviceIDs, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + Context( + const Device& device, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + cl_device_id deviceID = device(); + + object_ = ::clCreateContext( + properties, 1, + &deviceID, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a context including all devices of a specified type. + * + * Wraps clCreateContextFromType(). + */ + Context( + cl_device_type type, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + +#if !defined(__APPLE__) || !defined(__MACOS) + cl_context_properties prop[4] = { CL_CONTEXT_PLATFORM, 0, 0, 0 }; + if (properties == NULL) { + prop[1] = (cl_context_properties)Platform::get(&error)(); + if (error != CL_SUCCESS) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + return; + } + } + + properties = &prop[0]; + } +#endif + object_ = ::clCreateContextFromType( + properties, type, notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. + * + * \note All calls to this function return the same cl_context as the first. + */ + static Context getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... + while (default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + default_ = Context( + CL_DEVICE_TYPE_DEFAULT, + NULL, + NULL, + NULL, + &error); + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... + default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + //! \brief Default constructor - initializes to NULL. + Context() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This calls clRetainContext() on the parameter's cl_context. + */ + Context(const Context& context) : detail::Wrapper(context) { } + + /*! \brief Constructor from cl_context - takes ownership. 
+ * + * This effectively transfers ownership of a refcount on the cl_context + * into the new Context object. + */ + __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper(context) { } + + /*! \brief Assignment operator from Context. + * + * This calls clRetainContext() on the parameter and clReleaseContext() on + * the previous value held by this instance. + */ + Context& operator = (const Context& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_context - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseContext() on the value previously held by this instance. + */ + Context& operator = (const cl_context& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetContextInfo(). + template + cl_int getInfo(cl_context_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetContextInfo, object_, name, param), + __GET_CONTEXT_INFO_ERR); + } + + //! \brief Wrapper for clGetContextInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_context_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of supported image formats. + * + * Wraps clGetSupportedImageFormats(). + */ + cl_int getSupportedImageFormats( + cl_mem_flags flags, + cl_mem_object_type type, + VECTOR_CLASS* formats) const + { + cl_uint numEntries; + cl_int err = ::clGetSupportedImageFormats( + object_, + flags, + type, + 0, + NULL, + &numEntries); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + ImageFormat* value = (ImageFormat*) + alloca(numEntries * sizeof(ImageFormat)); + err = ::clGetSupportedImageFormats( + object_, + flags, + type, + numEntries, + (cl_image_format*)value, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + formats->assign(&value[0], &value[numEntries]); + return CL_SUCCESS; + } + }; + + inline Device Device::getDefault(cl_int * err) + { + cl_int error; + Device device; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + device = context.getInfo()[0]; + if (err != NULL) { + *err = CL_SUCCESS; + } + } + + return device; + } + + +#ifdef _WIN32 + __declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; + __declspec(selectany) Context Context::default_; + __declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS; +#else + __attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; + __attribute__((weak)) Context Context::default_; + __attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS; +#endif + + /*! \brief Class interface for cl_event. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_event as the original. For details, see + * clRetainEvent() and clReleaseEvent(). + * + * \see cl_event + */ + class Event : public detail::Wrapper + { + public: + /*! \brief Destructor. + * + * This calls clReleaseEvent() on the value held by this instance. 
+ */ + ~Event() { } + + //! \brief Default constructor - initializes to NULL. + Event() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This calls clRetainEvent() on the parameter's cl_event. + */ + Event(const Event& event) : detail::Wrapper(event) { } + + /*! \brief Constructor from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_event + * into the new Event object. + */ + Event(const cl_event& event) : detail::Wrapper(event) { } + + /*! \brief Assignment operator from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseEvent() on the value previously held by this instance. + */ + Event& operator = (const Event& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_event. + * + * This calls clRetainEvent() on the parameter and clReleaseEvent() on + * the previous value held by this instance. + */ + Event& operator = (const cl_event& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetEventInfo(). + template + cl_int getInfo(cl_event_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetEventInfo, object_, name, param), + __GET_EVENT_INFO_ERR); + } + + //! \brief Wrapper for clGetEventInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_event_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + //! \brief Wrapper for clGetEventProfilingInfo(). + template + cl_int getProfilingInfo(cl_profiling_info name, T* param) const + { + return detail::errHandler(detail::getInfo( + &::clGetEventProfilingInfo, object_, name, param), + __GET_EVENT_PROFILE_INFO_ERR); + } + + //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. + template typename + detail::param_traits::param_type + getProfilingInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_profiling_info, name>::param_type param; + cl_int result = getProfilingInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Blocks the calling thread until this event completes. + * + * Wraps clWaitForEvents(). + */ + cl_int wait() const + { + return detail::errHandler( + ::clWaitForEvents(1, &object_), + __WAIT_FOR_EVENTS_ERR); + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a user callback function for a specific command execution status. + * + * Wraps clSetEventCallback(). + */ + cl_int setCallback( + cl_int type, + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetEventCallback( + object_, + type, + pfn_notify, + user_data), + __SET_EVENT_CALLBACK_ERR); + } +#endif + + /*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ + static cl_int + waitForEvents(const VECTOR_CLASS& events) + { + return detail::errHandler( + ::clWaitForEvents( + (cl_uint)events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); + } + }; + +#if defined(CL_VERSION_1_1) + /*! \brief Class interface for user events (a subset of cl_event's). + * + * See Event for details about copy semantics, etc. + */ + class UserEvent : public Event + { + public: + /*! 
\brief Constructs a user event on a given context. + * + * Wraps clCreateUserEvent(). + */ + UserEvent( + const Context& context, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateUserEvent( + context(), + &error); + + detail::errHandler(error, __CREATE_USER_EVENT_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + UserEvent() : Event() { } + + //! \brief Copy constructor - performs shallow copy. + UserEvent(const UserEvent& event) : Event(event) { } + + //! \brief Assignment Operator - performs shallow copy. + UserEvent& operator = (const UserEvent& rhs) + { + if (this != &rhs) { + Event::operator=(rhs); + } + return *this; + } + + /*! \brief Sets the execution status of a user event object. + * + * Wraps clSetUserEventStatus(). + */ + cl_int setStatus(cl_int status) + { + return detail::errHandler( + ::clSetUserEventStatus(object_, status), + __SET_USER_EVENT_STATUS_ERR); + } + }; +#endif + + /*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ + inline static cl_int + WaitForEvents(const VECTOR_CLASS& events) + { + return detail::errHandler( + ::clWaitForEvents( + (cl_uint)events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); + } + + /*! \brief Class interface for cl_mem. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_mem as the original. For details, see + * clRetainMemObject() and clReleaseMemObject(). + * + * \see cl_mem + */ + class Memory : public detail::Wrapper + { + public: + + /*! \brief Destructor. + * + * This calls clReleaseMemObject() on the value held by this instance. + */ + ~Memory() {} + + //! \brief Default constructor - initializes to NULL. + Memory() : detail::Wrapper() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainMemObject() on the parameter's cl_mem. + */ + Memory(const Memory& memory) : detail::Wrapper(memory) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_mem + * into the new Memory object. + */ + __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper(memory) { } + + /*! \brief Assignment operator from Memory. + * + * This calls clRetainMemObject() on the parameter and clReleaseMemObject() + * on the previous value held by this instance. + */ + Memory& operator = (const Memory& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseMemObject() on the value previously held by this instance. + */ + Memory& operator = (const cl_mem& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetMemObjectInfo(). + template + cl_int getInfo(cl_mem_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetMemObjectInfo, object_, name, param), + __GET_MEM_OBJECT_INFO_ERR); + } + + //! \brief Wrapper for clGetMemObjectInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_mem_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_1) + /*! 
\brief Registers a callback function to be called when the memory object + * is no longer needed. + * + * Wraps clSetMemObjectDestructorCallback(). + * + * Repeated calls to this function, for a given cl_mem value, will append + * to the list of functions called (in reverse order) when memory object's + * resources are freed and the memory object is deleted. + * + * \note + * The registered callbacks are associated with the underlying cl_mem + * value - not the Memory class instance. + */ + cl_int setDestructorCallback( + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetMemObjectDestructorCallback( + object_, + pfn_notify, + user_data), + __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); + } +#endif + + }; + + // Pre-declare copy functions + class Buffer; + template< typename IteratorType > + cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer); + template< typename IteratorType > + cl_int copy(const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator); + + /*! \brief Class interface for Buffer Memory Objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Buffer : public Memory + { + public: + + /*! \brief Constructs a Buffer in a specified context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + */ + Buffer( + const Context& context, + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a Buffer in the default context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + * + * \see Context::getDefault() + */ + Buffer( + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(err); + + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators. + * If useHostPtr is specified iterators must be random access. 
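+ *
+ * A usage sketch (illustrative; the element type and sizes are placeholders):
+ * \code
+ * std::vector<float> host(1024, 0.0f);
+ * cl_int err = CL_SUCCESS;
+ * cl::Buffer buf(host.begin(), host.end(), true, false, &err);
+ * \endcode
+ * With useHostPtr == false the elements are copied into the new buffer via
+ * cl::copy(), so the host container may be released afterwards.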
+ */ + template< typename IteratorType > + Buffer( + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr = false, + cl_int* err = NULL) + { + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if (readOnly) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if (useHostPtr) { + flags |= CL_MEM_USE_HOST_PTR; + } + + ::size_t size = sizeof(DataType)*(endIterator - startIterator); + + Context context = Context::getDefault(err); + + if (useHostPtr) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } + else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + if (!useHostPtr) { + error = cl::copy(startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + } + + //! \brief Default constructor - initializes to NULL. + Buffer() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Buffer(const Buffer& buffer) : Memory(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { } + + /*! \brief Assignment from Buffer - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const Buffer& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + +#if defined(CL_VERSION_1_1) + /*! \brief Creates a new buffer object from this. + * + * Wraps clCreateSubBuffer(). + */ + Buffer createSubBuffer( + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * err = NULL) + { + Buffer result; + cl_int error; + result.object_ = ::clCreateSubBuffer( + object_, + flags, + buffer_create_type, + buffer_create_info, + &error); + + detail::errHandler(error, __CREATE_SUBBUFFER_ERR); + if (err != NULL) { + *err = error; + } + + return result; + } +#endif + }; + +#if defined (USE_DX_INTEROP) + /*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's. + * + * This is provided to facilitate interoperability with Direct3D. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class BufferD3D10 : public Buffer + { + public: + typedef CL_API_ENTRY cl_mem(CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, + cl_int* errcode_ret); + + /*! \brief Constructs a BufferD3D10, in a specified context, from a + * given ID3D10Buffer. + * + * Wraps clCreateFromD3D10BufferKHR(). 
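+ *
+ * The extension entry point is resolved lazily at run time through the
+ * __INIT_CL_EXT_FCN_PTR* machinery below, so the header introduces no
+ * link-time dependency on the D3D10 sharing extension.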
+ */ + BufferD3D10( + const Context& context, + cl_mem_flags flags, + ID3D10Buffer* bufobj, + cl_int * err = NULL) + { + static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL; + +#if defined(CL_VERSION_1_2) + vector props = context.getInfo(); + cl_platform platform = -1; + for (int i = 0; i < props.size(); ++i) { + if (props[i] == CL_CONTEXT_PLATFORM) { + platform = props[i + 1]; + } + } + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR); +#endif + + cl_int error; + object_ = pfn_clCreateFromD3D10BufferKHR( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferD3D10() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferD3D10 - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const BufferD3D10& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + }; +#endif + + /*! \brief Class interface for GL Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class BufferGL : public Buffer + { + public: + /*! \brief Constructs a BufferGL in a specified context, from a given + * GL buffer. + * + * Wraps clCreateFromGLBuffer(). + */ + BufferGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLBuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const BufferGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_, type, gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } + }; + + /*! \brief Class interface for GL Render Buffer Memory Objects. 
+ * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class BufferRenderGL : public Buffer + { + public: + /*! \brief Constructs a BufferRenderGL in a specified context, from a given + * GL Renderbuffer. + * + * Wraps clCreateFromGLRenderbuffer(). + */ + BufferRenderGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLRenderbuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferRenderGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const BufferRenderGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_, type, gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } + }; + + /*! \brief C++ base class for Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image : public Memory + { + protected: + //! \brief Default constructor - initializes to NULL. + Image() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image(const Image& image) : Memory(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { } + + /*! \brief Assignment from Image - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const Image& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + + public: + //! \brief Wrapper for clGetImageInfo(). + template + cl_int getImageInfo(cl_image_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetImageInfo, object_, name, param), + __GET_IMAGE_INFO_ERR); + } + + //! \brief Wrapper for clGetImageInfo() that returns by value. + template typename + detail::param_traits::param_type + getImageInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_image_info, name>::param_type param; + cl_int result = getImageInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + }; + +#if defined(CL_VERSION_1_2) + /*! \brief Class interface for 1D Image Memory objects. 
+ * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image1D : public Image + { + public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image1D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D; + desc.image_width = width; + desc.image_row_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image1D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image1D(const Image1D& image1D) : Image(image1D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { } + + /*! \brief Assignment from Image1D - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const Image1D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; + + /*! \class Image1DBuffer + * \brief Image interface for 1D buffer images. + */ + class Image1DBuffer : public Image + { + public: + Image1DBuffer( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + const Buffer &buffer, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + desc.image_width = width; + desc.image_row_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = buffer(); + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + NULL, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DBuffer() { } + + Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { } + + Image1DBuffer& operator = (const Image1DBuffer& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DBuffer& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; + + /*! \class Image1DArray + * \brief Image interface for arrays of 1D images. 
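+ *
+ * A minimal creation sketch (assumes a valid `context`; a row pitch of 0
+ * lets the runtime derive it from the width and format):
+ * \code
+ * cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8);
+ * cl_int err;
+ * cl::Image1DArray arr(context, CL_MEM_READ_WRITE, fmt,
+ *                      4, 512, 0, NULL, &err); // arraySize = 4, width = 512
+ * \endcode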
+ */ + class Image1DArray : public Image + { + public: + Image1DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t rowPitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; + desc.image_array_size = arraySize; + desc.image_width = width; + desc.image_row_pitch = rowPitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DArray() { } + + Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image1DArray& operator = (const Image1DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; +#endif // #if defined(CL_VERSION_1_2) + + + /*! \brief Class interface for 2D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image2D : public Image + { + public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image2D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t row_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage2D( + context(), flags, &format, width, height, row_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE2D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2D(const Image2D& image2D) : Image(image2D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { } + + /*! \brief Assignment from Image2D - performs shallow copy. + * + * See Memory for further details. 
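+ *
+ * The copy is shallow: after assignment both wrappers reference the same
+ * underlying cl_mem, with its reference count bumped accordingly. A sketch
+ * (assumes a valid `context` and an ImageFormat `fmt`):
+ * \code
+ * cl::Image2D a(context, CL_MEM_READ_WRITE, fmt, 64, 64); // owns one reference
+ * cl::Image2D b;
+ * b = a; // a and b now share the same cl_mem
+ * \endcode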
+ */ + Image2D& operator = (const Image2D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; + + +#if !defined(CL_VERSION_1_2) + /*! \brief Class interface for GL 2D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. + */ + class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D + { + public: + /*! \brief Constructs an Image2DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture2D(). + */ + Image2DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture2D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); + if (err != NULL) { + *err = error; + } + + } + + //! \brief Default constructor - initializes to NULL. + Image2DGL() : Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL(const Image2DGL& image) : Image2D(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { } + + /*! \brief Assignment from Image2DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const Image2DGL& rhs) + { + if (this != &rhs) { + Image2D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const cl_mem& rhs) + { + Image2D::operator=(rhs); + return *this; + } + }; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) + /*! \class Image2DArray + * \brief Image interface for arrays of 2D images. 
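+ *
+ * A minimal creation sketch (assumes a valid `context`; pitches of 0 let
+ * the runtime derive them):
+ * \code
+ * cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8);
+ * cl_int err;
+ * cl::Image2DArray arr(context, CL_MEM_READ_ONLY, fmt,
+ *                      6, 256, 256, 0, 0, NULL, &err); // six 256x256 layers
+ * \endcode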
+ */ + class Image2DArray : public Image + { + public: + Image2DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t height, + ::size_t rowPitch, + ::size_t slicePitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + desc.image_array_size = arraySize; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = rowPitch; + desc.image_slice_pitch = slicePitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image2DArray() { } + + Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image2DArray& operator = (const Image2DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image2DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; +#endif // #if defined(CL_VERSION_1_2) + + /*! \brief Class interface for 3D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image3D : public Image + { + public: + /*! \brief Constructs a 3D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image3D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t depth, + ::size_t row_pitch = 0, + ::size_t slice_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE3D; + desc.image_width = width; + desc.image_height = height; + desc.image_depth = depth; + desc.image_row_pitch = row_pitch; + desc.image_slice_pitch = slice_pitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage3D( + context(), flags, &format, width, height, depth, row_pitch, + slice_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE3D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3D(const Image3D& image3D) : Image(image3D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. 
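+ *
+ * Because ownership of the refcount is transferred, do not add an extra
+ * retain and do not release the raw handle separately. A sketch (assumes
+ * `raw` is a valid cl_mem for a 3D image):
+ * \code
+ * cl::Image3D img(raw); // img now owns raw's reference
+ * \endcode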
+ */ + __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { } + + /*! \brief Assignment from Image3D - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const Image3D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; + +#if !defined(CL_VERSION_1_2) + /*! \brief Class interface for GL 3D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image3DGL : public Image3D + { + public: + /*! \brief Constructs an Image3DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture3D(). + */ + Image3DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture3D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image3DGL() : Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL(const Image3DGL& image) : Image3D(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { } + + /*! \brief Assignment from Image3DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const Image3DGL& rhs) + { + if (this != &rhs) { + Image3D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const cl_mem& rhs) + { + Image3D::operator=(rhs); + return *this; + } + }; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) + /*! \class ImageGL + * \brief general image interface for GL interop. + * We abstract the 2D and 3D GL images into a single instance here + * that wraps all GL sourced images on the grounds that setup information + * was performed by OpenCL anyway. + */ + class ImageGL : public Image + { + public: + ImageGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); + if (err != NULL) { + *err = error; + } + } + + ImageGL() : Image() { } + + ImageGL(const ImageGL& image) : Image(image) { } + + __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { } + + ImageGL& operator = (const ImageGL& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + ImageGL& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; +#endif // #if defined(CL_VERSION_1_2) + + /*! \brief Class interface for cl_sampler. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_sampler as the original. 
For details, see
+ *       clRetainSampler() and clReleaseSampler().
+ *
+ * \see cl_sampler
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseSampler() on the value held by this instance.
+     */
+    ~Sampler() { }
+
+    //! \brief Default constructor - initializes to NULL.
+    Sampler() { }
+
+    /*! \brief Constructs a Sampler in a specified context.
+     *
+     *  Wraps clCreateSampler().
+     */
+    Sampler(
+        const Context& context,
+        cl_bool normalized_coords,
+        cl_addressing_mode addressing_mode,
+        cl_filter_mode filter_mode,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateSampler(
+            context(),
+            normalized_coords,
+            addressing_mode,
+            filter_mode,
+            &error);
+
+        detail::errHandler(error, __CREATE_SAMPLER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  This calls clRetainSampler() on the parameter's cl_sampler.
+     */
+    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+    /*! \brief Constructor from cl_sampler - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the cl_sampler
+     *  into the new Sampler object.
+     */
+    Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+    /*! \brief Assignment operator from Sampler.
+     *
+     *  This calls clRetainSampler() on the parameter and clReleaseSampler()
+     *  on the previous value held by this instance.
+     */
+    Sampler& operator = (const Sampler& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_sampler - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseSampler() on the value previously held by this instance.
+     */
+    Sampler& operator = (const cl_sampler& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetSamplerInfo().
+    template <typename T>
+    cl_int getInfo(cl_sampler_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetSamplerInfo, object_, name, param),
+            __GET_SAMPLER_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetSamplerInfo() that returns by value.
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_sampler_info, name>::param_type
+        getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_sampler_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+};
+
+class Program;
+class CommandQueue;
+class Kernel;
+
+//! \brief Class interface for specifying NDRange values.
+class NDRange
+{
+private:
+    size_t<3> sizes_;
+    cl_uint dimensions_;
+
+public:
+    //! \brief Default constructor - resulting range has zero dimensions.
+    NDRange()
+        : dimensions_(0)
+    { }
+
+    //! \brief Constructs one-dimensional range.
+    NDRange(::size_t size0)
+        : dimensions_(1)
+    {
+        sizes_[0] = size0;
+    }
+
+    //! \brief Constructs two-dimensional range.
+    NDRange(::size_t size0, ::size_t size1)
+        : dimensions_(2)
+    {
+        sizes_[0] = size0;
+        sizes_[1] = size1;
+    }
+
+    //! \brief Constructs three-dimensional range.
+    NDRange(::size_t size0, ::size_t size1, ::size_t size2)
+        : dimensions_(3)
+    {
+        sizes_[0] = size0;
+        sizes_[1] = size1;
+        sizes_[2] = size2;
+    }
+
+    /*! \brief Conversion operator to const ::size_t *.
+     *
+     *  \returns a pointer to the size of the first dimension.
+     */
+    operator const ::size_t*() const {
+        return (const ::size_t*) sizes_;
+    }
+
+    //! \brief Queries the number of dimensions in the range.
+    ::size_t dimensions() const { return dimensions_; }
+};
+
+//! \brief A zero-dimensional range.
+static const NDRange NullRange;
+
+//! \brief Local address wrapper for use with Kernel::setArg
+struct LocalSpaceArg
+{
+    ::size_t size_;
+};
+
+namespace detail {
+
+template <typename T>
+struct KernelArgumentHandler
+{
+    static ::size_t size(const T&) { return sizeof(T); }
+    static T* ptr(T& value) { return &value; }
+};
+
+template <>
+struct KernelArgumentHandler<LocalSpaceArg>
+{
+    static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
+    static void* ptr(LocalSpaceArg&) { return NULL; }
+};
+
+}
+//! \endcond
+
+/*! __local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ * Deprecated. Replaced with Local.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg
+__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline LocalSpaceArg
+__local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
+
+/*! Local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ */
+inline LocalSpaceArg
+Local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
+
+//class KernelFunctor;
+
+/*! \brief Class interface for cl_kernel.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ *       to the same underlying cl_kernel as the original. For details, see
+ *       clRetainKernel() and clReleaseKernel().
+ *
+ * \see cl_kernel
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
+{
+public:
+    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseKernel() on the value held by this instance.
+     */
+    ~Kernel() { }
+
+    //! \brief Default constructor - initializes to NULL.
+    Kernel() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  This calls clRetainKernel() on the parameter's cl_kernel.
+     */
+    Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    /*! \brief Constructor from cl_kernel - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the cl_kernel
+     *  into the new Kernel object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    /*! \brief Assignment operator from Kernel.
+     *
+     *  This calls clRetainKernel() on the parameter and clReleaseKernel()
+     *  on the previous value held by this instance.
+     */
+    Kernel& operator = (const Kernel& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_kernel - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseKernel() on the value previously held by this instance.
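+     *
+     *  A sketch of the transfer (illustrative; assumes `prog` is a built
+     *  cl::Program):
+     *  \code
+     *  cl_kernel raw = clCreateKernel(prog(), "my_kernel", NULL); // refcount 1
+     *  cl::Kernel k;
+     *  k = raw; // k now owns that reference; no extra clRetainKernel() needed
+     *  \endcode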
+     */
+    Kernel& operator = (const cl_kernel& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_kernel_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetKernelInfo, object_, name, param),
+            __GET_KERNEL_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_kernel_info, name>::param_type
+        getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+#if defined(CL_VERSION_1_2)
+    template <typename T>
+    cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
+            __GET_KERNEL_ARG_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
+        getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_arg_info, name>::param_type param;
+        cl_int result = getArgInfo(argIndex, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    template <typename T>
+    cl_int getWorkGroupInfo(
+        const Device& device, cl_kernel_work_group_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetKernelWorkGroupInfo, object_, device(), name, param),
+            __GET_KERNEL_WORK_GROUP_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+        getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_work_group_info, name>::param_type param;
+        cl_int result = getWorkGroupInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int setArg(cl_uint index, T value)
+    {
+        return detail::errHandler(
+            ::clSetKernelArg(
+                object_,
+                index,
+                detail::KernelArgumentHandler<T>::size(value),
+                detail::KernelArgumentHandler<T>::ptr(value)),
+            __SET_KERNEL_ARGS_ERR);
+    }
+
+    cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
+    {
+        return detail::errHandler(
+            ::clSetKernelArg(object_, index, size, argPtr),
+            __SET_KERNEL_ARGS_ERR);
+    }
+};
+
+/*! \class Program
+ * \brief Program interface that implements cl_program.
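+ *
+ * A typical create-build-use sketch (illustrative; assumes a valid `context`
+ * and a kernel source string `src`):
+ * \code
+ * cl_int err;
+ * cl::Program prog(context, src, true, &err); // create and build in one step
+ * cl::Kernel k(prog, "my_kernel", &err);
+ * \endcode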
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+    typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+    typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+    Program(
+        const STRING_CLASS& source,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length = source.size();
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const STRING_CLASS& source,
+        bool build,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length = source.size();
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const STRING_CLASS& source,
+        bool build = false,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length = source.size();
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const Sources& sources,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const ::size_t n = (::size_t)sources.size();
+        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+        const char** strings = (const char**) alloca(n * sizeof(const char*));
+
+        for (::size_t i = 0; i < n; ++i) {
+            strings[i] = sources[(int)i].first;
+            lengths[i] = sources[(int)i].second;
+        }
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)n, strings, lengths, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /**
+     * Construct a program object from a list of devices and a per-device list of binaries.
+     * \param context A valid OpenCL context in which to construct the program.
+     * \param devices A vector of OpenCL device objects for which the program will be created.
+     * \param binaries A vector of pairs of a pointer to a binary object and its length.
+     * \param binaryStatus An optional vector that on completion will be resized to
+     *   match the size of binaries and filled with values to specify if each binary
+     *   was successfully loaded.
+     *   Set to CL_SUCCESS if the binary was successfully loaded.
+     *   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+     *   Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
+     * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+     *   CL_INVALID_CONTEXT if context is not a valid context.
+     *   CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices;
+     *     or if any entry in binaries is NULL or has length 0.
+     *   CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
+     *   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
+     *   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const Binaries& binaries,
+        VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const ::size_t numDevices = devices.size();
+
+        // Catch size mismatch early and return
+        if (binaries.size() != numDevices) {
+            error = CL_INVALID_VALUE;
+            detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+            return;
+        }
+
+        ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
+        const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char*));
+
+        for (::size_t i = 0; i < numDevices; ++i) {
+            images[i] = (const unsigned char*)binaries[i].first;
+            lengths[i] = binaries[(int)i].second;
+        }
+
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        if (binaryStatus) {
+            binaryStatus->resize(numDevices);
+        }
+
+        object_ = ::clCreateProgramWithBinary(
+            context(), (cl_uint) devices.size(),
+            deviceIDs,
+            lengths, images, binaryStatus != NULL
+                ? &binaryStatus->front()
+                : NULL, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Create program using builtin kernels.
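+     *
+     * A sketch (illustrative only; built-in kernels are device-specific, so
+     * the semi-colon separated names below are placeholders):
+     * \code
+     * cl_int err;
+     * cl::Program prog(context, devices, "vendor_kernel_a;vendor_kernel_b", &err);
+     * \endcode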
+     * \param kernelNames Semi-colon separated list of builtin kernel names
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const STRING_CLASS& kernelNames,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        object_ = ::clCreateProgramWithBuiltInKernels(
+            context(),
+            (cl_uint) devices.size(),
+            deviceIDs,
+            kernelNames.c_str(),
+            &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    Program() { }
+
+    Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
+
+    Program& operator = (const Program& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Program& operator = (const cl_program& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    cl_int build(
+        const VECTOR_CLASS<Device>& devices,
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                (cl_uint) devices.size(),
+                deviceIDs,
+                options,
+                notifyFptr,
+                data),
+            __BUILD_PROGRAM_ERR);
+    }
+
+    cl_int build(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                options,
+                notifyFptr,
+                data),
+            __BUILD_PROGRAM_ERR);
+    }
+
+#if defined(CL_VERSION_1_2)
+    cl_int compile(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clCompileProgram(
+                object_,
+                0,
+                NULL,
+                options,
+                0,
+                NULL,
+                NULL,
+                notifyFptr,
+                data),
+            __COMPILE_PROGRAM_ERR);
+    }
+#endif
+
+    template <typename T>
+    cl_int getInfo(cl_program_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetProgramInfo, object_, name, param),
+            __GET_PROGRAM_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_program_info, name>::param_type
+        getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int getBuildInfo(
+        const Device& device, cl_program_build_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetProgramBuildInfo, object_, device(), name, param),
+            __GET_PROGRAM_BUILD_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_program_build_info, name>::param_type
+        getBuildInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_build_info, name>::param_type param;
+        cl_int result = getBuildInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+    {
+        cl_uint numKernels;
+        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
+        err = ::clCreateKernelsInProgram(
+            object_, numKernels, (cl_kernel*) value, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        kernels->assign(&value[0], &value[numKernels]);
+        return CL_SUCCESS;
+    }
+};
+
+#if defined(CL_VERSION_1_2)
+inline Program linkProgram(
+    Program input1,
+    Program input2,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL)
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program programs[2] = { input1(), input2() };
+
+    Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();
+
+    cl_program prog = ::clLinkProgram(
+        ctx(),
+        0,
+        NULL,
+        options,
+        2,
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local, __COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
+
+inline Program linkProgram(
+    VECTOR_CLASS<Program> inputPrograms,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL)
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
+
+    if (programs != NULL) {
+        for (unsigned int i = 0; i < inputPrograms.size(); i++) {
+            programs[i] = inputPrograms[i]();
+        }
+    }
+
+    cl_program prog = ::clLinkProgram(
+        Context::getDefault()(),
+        0,
+        NULL,
+        options,
+        (cl_uint) inputPrograms.size(),
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local, __COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
+#endif
+
+template<>
+inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
+{
+    VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
+    VECTOR_CLASS<char *> binaries;
+    for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s)
+    {
+        char *ptr = NULL;
+        if (*s != 0)
+            ptr = new char[*s];
+        binaries.push_back(ptr);
+    }
+
+    cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
+    if (err != NULL) {
+        *err = result;
+    }
+    return binaries;
+}
+
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+    cl_int error;
+
+    object_ = ::clCreateKernel(program(), name, &error);
+    detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+    if (err != NULL) {
+        *err = error;
+    }
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
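+ *
+ * A minimal dispatch sketch (illustrative; assumes a valid `context`,
+ * `device`, a built `kernel`, and a one-dimensional global size of 1024):
+ * \code
+ * cl_int err;
+ * cl::CommandQueue q(context, device, 0, &err);
+ * err = q.enqueueNDRangeKernel(kernel, cl::NullRange,
+ *                              cl::NDRange(1024), cl::NullRange);
+ * err = q.finish();
+ * \endcode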
+ */ + class CommandQueue : public detail::Wrapper + { + private: + static volatile int default_initialized_; + static CommandQueue default_; + static volatile cl_int default_error_; + public: + CommandQueue( + cl_command_queue_properties properties, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + } + + CommandQueue( + const Context& context, + const Device& device, + cl_command_queue_properties properties = 0, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + + static CommandQueue getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... + while (default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + + default_ = CommandQueue(context, device, 0, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... 
+ default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + CommandQueue() { } + + CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper(commandQueue) { } + + CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper(commandQueue) { } + + CommandQueue& operator = (const CommandQueue& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + CommandQueue& operator = (const cl_command_queue& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + template + cl_int getInfo(cl_command_queue_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetCommandQueueInfo, object_, name, param), + __GET_COMMAND_QUEUE_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_command_queue_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + const void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + ::size_t src_offset, + ::size_t dst_offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBuffer( + object_, src(), dst(), src_offset, dst_offset, size, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQEUE_COPY_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + ::size_t src_row_pitch, + ::size_t src_slice_pitch, + ::size_t dst_row_pitch, + ::size_t dst_slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferRect( + object_, + src(), + dst(), + (const ::size_t *)src_origin, + (const ::size_t *)dst_origin, + (const ::size_t *)region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQEUE_COPY_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill a buffer object with a pattern + * of a given size. The pattern is specified a as vector. + * \tparam PatternType The datatype of the pattern field. + * The pattern type must be an accepted OpenCL data type. 
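+     *
+     * For example (illustrative; fills 256 floats with zero, assuming a
+     * valid `context` and `queue`):
+     * \code
+     * cl::Buffer buf(context, CL_MEM_READ_WRITE, 256 * sizeof(cl_float));
+     * queue.enqueueFillBuffer(buf, 0.0f, 0, 256 * sizeof(cl_float));
+     * \endcode
+     * Note that offset and size must each be a multiple of sizeof(PatternType).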
+ */ + template + cl_int enqueueFillBuffer( + const Buffer& buffer, + PatternType pattern, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillBuffer( + object_, + buffer(), + static_cast(&pattern), + sizeof(PatternType), + offset, + size, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImage( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *)dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA floating-point color value if + * the image channel data type is not an unnormalized signed or + * unsigned data type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_float4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? 
(cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA signed integer color value if + * the image channel data type is an unnormalized signed integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_int4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA unsigned integer color value if + * the image channel data type is an unnormalized unsigned integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_uint4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImageToBuffer( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *) region, dst_offset, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferToImage( + object_, src(), dst(), src_offset, + (const ::size_t *) dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapBuffer( + object_, buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (cl_event*)event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + void* enqueueMapImage( + const Image& buffer, + cl_bool blocking, + cl_map_flags flags, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t * row_pitch, + ::size_t * slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapImage( + object_, buffer(), blocking, flags, + (const ::size_t *) origin, (const ::size_t *) region, + row_pitch, slice_pitch, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (cl_event*)event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + object_, memory(), mapped_ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueues a marker command which waits for either a list of events to complete, + * or all previously enqueued commands to complete. + * + * Enqueues a marker command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command returns an event which can be waited on, + * i.e. this event can be waited on to insure that all events either in the event_wait_list + * or all previously enqueued commands, queued before this command to command_queue, + * have completed. + */ + cl_int enqueueMarkerWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueMarkerWithWaitList( + object_, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MARKER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * A synchronization point that enqueues a barrier operation. + * + * Enqueues a barrier command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command blocks command execution, that is, any + * following commands enqueued after it do not execute until it completes. 
This command + * returns an event which can be waited on, i.e. this event can be waited on to insure that + * all events either in the event_wait_list or all previously enqueued commands, queued + * before this command to command_queue, have completed. + */ + cl_int enqueueBarrierWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueBarrierWithWaitList( + object_, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_BARRIER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command to indicate with which device a set of memory objects + * should be associated. + */ + cl_int enqueueMigrateMemObjects( + const VECTOR_CLASS &memObjects, + cl_mem_migration_flags flags, + const VECTOR_CLASS* events = NULL, + Event* event = NULL + ) + { + cl_event tmp; + + cl_mem* localMemObjects = static_cast(alloca(memObjects.size() * sizeof(cl_mem))); + for (int i = 0; i < (int)memObjects.size(); ++i) { + localMemObjects[i] = memObjects[i](); + } + + + cl_int err = detail::errHandler( + ::clEnqueueMigrateMemObjects( + object_, + (cl_uint)memObjects.size(), + static_cast(localMemObjects), + flags, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueNDRangeKernel( + const Kernel& kernel, + const NDRange& offset, + const NDRange& global, + const NDRange& local = NullRange, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNDRangeKernel( + object_, kernel(), (cl_uint)global.dimensions(), + offset.dimensions() != 0 ? (const ::size_t*) offset : NULL, + (const ::size_t*) global, + local.dimensions() != 0 ? (const ::size_t*) local : NULL, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NDRANGE_KERNEL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueTask( + const Kernel& kernel, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueTask( + object_, kernel(), + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_TASK_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueNativeKernel( + void (CL_CALLBACK *userFptr)(void *), + std::pair args, + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* mem_locs = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) + ? 
+
+    cl_int enqueueNDRangeKernel(
+        const Kernel& kernel,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local = NullRange,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNDRangeKernel(
+                object_, kernel(), (cl_uint)global.dimensions(),
+                offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
+                (const ::size_t*) global,
+                local.dimensions() != 0 ? (const ::size_t*) local : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NDRANGE_KERNEL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueTask(
+        const Kernel& kernel,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueTask(
+                object_, kernel(),
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_TASK_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueNativeKernel(
+        void (CL_CALLBACK *userFptr)(void *),
+        std::pair<void*, ::size_t> args,
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<const void*>* mem_locs = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0)
+            ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
+            : NULL;
+
+        if (mems != NULL) {
+            for (unsigned int i = 0; i < mem_objects->size(); i++) {
+                mems[i] = ((*mem_objects)[i])();
+            }
+        }
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNativeKernel(
+                object_, userFptr, args.first, args.second,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                mems,
+                (mem_locs != NULL) ? (const void **)&mem_locs->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NATIVE_KERNEL);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * Deprecated APIs for 1.2
+     */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueMarker(object_, (cl_event*)event),
+            __ENQUEUE_MARKER_ERR);
+    }
+
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueWaitForEvents(
+                object_,
+                (cl_uint)events.size(),
+                (const cl_event*)&events.front()),
+            __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+    cl_int enqueueAcquireGLObjects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueAcquireGLObjects(
+                object_,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueReleaseGLObjects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReleaseGLObjects(
+                object_,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined (USE_DX_INTEROP)
+    typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+        cl_command_queue command_queue, cl_uint num_objects,
+        const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+        const cl_event* event_wait_list, cl_event* event);
+    typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+        cl_command_queue command_queue, cl_uint num_objects,
+        const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+        const cl_event* event_wait_list, cl_event* event);
+
+    cl_int enqueueAcquireD3D10Objects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            pfn_clEnqueueAcquireD3D10ObjectsKHR(
+                object_,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueReleaseD3D10Objects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_2)
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_1)
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            pfn_clEnqueueReleaseD3D10ObjectsKHR(
+                object_,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+#endif
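+
+    /*
+     * Usage sketch of the shared-object handshake (`queue`, `kernel` and a
+     * GL-backed `sharedBuf` are assumed names; the D3D10 path above is
+     * analogous): acquire before the kernel touches the object, release after.
+     *
+     *     VECTOR_CLASS<Memory> shared;
+     *     shared.push_back(sharedBuf);
+     *     queue.enqueueAcquireGLObjects(&shared);   // GL must be finished with the object
+     *     queue.enqueueNDRangeKernel(kernel, NullRange, NDRange(4096));
+     *     queue.enqueueReleaseGLObjects(&shared);   // hand the object back to GL
+     *     queue.finish();
+     */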
+
+    /**
+     * Deprecated APIs for 1.2
+     */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueBarrier(object_),
+            __ENQUEUE_BARRIER_ERR);
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+    cl_int flush() const
+    {
+        return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
+    }
+
+    cl_int finish() const
+    {
+        return detail::errHandler(::clFinish(object_), __FINISH_ERR);
+    }
+};
+
+#ifdef _WIN32
+__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) CommandQueue CommandQueue::default_;
+__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) CommandQueue CommandQueue::default_;
+__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#endif
+
+inline cl_int enqueueReadBuffer(
+    const Buffer& buffer,
+    cl_bool blocking,
+    ::size_t offset,
+    ::size_t size,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline cl_int enqueueWriteBuffer(
+    const Buffer& buffer,
+    cl_bool blocking,
+    ::size_t offset,
+    ::size_t size,
+    const void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline void* enqueueMapBuffer(
+    const Buffer& buffer,
+    cl_bool blocking,
+    cl_map_flags flags,
+    ::size_t offset,
+    ::size_t size,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL,
+    cl_int* err = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+
+    void * result = ::clEnqueueMapBuffer(
+        queue(), buffer(), blocking, flags, offset, size,
+        (events != NULL) ? (cl_uint)events->size() : 0,
+        (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+        (cl_event*)event,
+        &error);
+
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+    return result;
+}
+
+inline cl_int enqueueUnmapMemObject(
+    const Memory& memory,
+    void* mapped_ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    cl_event tmp;
+    cl_int err = detail::errHandler(
+        ::clEnqueueUnmapMemObject(
+            queue(), memory(), mapped_ptr,
+            (events != NULL) ? (cl_uint)events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+            (event != NULL) ? &tmp : NULL),
+        __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+    if (event != NULL && err == CL_SUCCESS)
+        *event = tmp;
+
+    return err;
+}
+
+inline cl_int enqueueCopyBuffer(
+    const Buffer& src,
+    const Buffer& dst,
+    ::size_t src_offset,
+    ::size_t dst_offset,
+    ::size_t size,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
+}
+
+/**
+ * Blocking copy operation from iterators into a buffer.
+ */
+template< typename IteratorType >
+inline cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer)
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+
+    ::size_t length = endIterator - startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer =
+        static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+#if defined(_MSC_VER)
+    std::copy(
+        startIterator,
+        endIterator,
+        stdext::checked_array_iterator<DataType*>(
+            pointer, length));
+#else
+    std::copy(startIterator, endIterator, pointer);
+#endif
+    Event endEvent;
+    error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
+
+/**
+ * Blocking copy operation from a buffer into iterators.
+ */
+template< typename IteratorType >
+inline cl_int copy(const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator)
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+
+    ::size_t length = endIterator - startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer =
+        static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+    std::copy(pointer, pointer + length, startIterator);
+    Event endEvent;
+    error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
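+
+/*
+ * Usage sketch (`context` is an assumed name; the default command queue is
+ * assumed to be initialized): round-tripping a host vector through a buffer
+ * with the blocking copy() helpers defined above.
+ *
+ *     std::vector<int> src(256, 42), dst(256);
+ *     Buffer buf(context, CL_MEM_READ_WRITE, 256 * sizeof(int));
+ *     cl::copy(src.begin(), src.end(), buf);   // host -> device via map/unmap
+ *     cl::copy(buf, dst.begin(), dst.end());   // device -> host via map/unmap
+ */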
+
+#if defined(CL_VERSION_1_1)
+inline cl_int enqueueReadBufferRect(
+    const Buffer& buffer, cl_bool blocking,
+    const size_t<3>& buffer_offset, const size_t<3>& host_offset, const size_t<3>& region,
+    ::size_t buffer_row_pitch, ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch, ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBufferRect(
+        buffer, blocking, buffer_offset, host_offset, region,
+        buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+        ptr, events, event);
+}
+
+inline cl_int enqueueWriteBufferRect(
+    const Buffer& buffer, cl_bool blocking,
+    const size_t<3>& buffer_offset, const size_t<3>& host_offset, const size_t<3>& region,
+    ::size_t buffer_row_pitch, ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch, ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBufferRect(
+        buffer, blocking, buffer_offset, host_offset, region,
+        buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+        ptr, events, event);
+}
+
+inline cl_int enqueueCopyBufferRect(
+    const Buffer& src, const Buffer& dst,
+    const size_t<3>& src_origin, const size_t<3>& dst_origin, const size_t<3>& region,
+    ::size_t src_row_pitch, ::size_t src_slice_pitch,
+    ::size_t dst_row_pitch, ::size_t dst_slice_pitch,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferRect(
+        src, dst, src_origin, dst_origin, region,
+        src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
+        events, event);
+}
+#endif
+
+inline cl_int enqueueReadImage(
+    const Image& image, cl_bool blocking,
+    const size_t<3>& origin, const size_t<3>& region,
+    ::size_t row_pitch, ::size_t slice_pitch, void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadImage(image, blocking, origin, region, row_pitch, slice_pitch, ptr, events, event);
+}
+
+inline cl_int enqueueWriteImage(
+    const Image& image, cl_bool blocking,
+    const size_t<3>& origin, const size_t<3>& region,
+    ::size_t row_pitch, ::size_t slice_pitch, void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteImage(image, blocking, origin, region, row_pitch, slice_pitch, ptr, events, event);
+}
+
+inline cl_int enqueueCopyImage(
+    const Image& src, const Image& dst,
+    const size_t<3>& src_origin, const size_t<3>& dst_origin, const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImage(src, dst, src_origin, dst_origin, region, events, event);
+}
+
+inline cl_int enqueueCopyImageToBuffer(
+    const Image& src, const Buffer& dst,
+    const size_t<3>& src_origin, const size_t<3>& region, ::size_t dst_offset,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImageToBuffer(src, dst, src_origin, region, dst_offset, events, event);
+}
+
+inline cl_int enqueueCopyBufferToImage(
+    const Buffer& src, const Image& dst,
+    ::size_t src_offset, const size_t<3>& dst_origin, const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferToImage(src, dst, src_offset, dst_origin, region, events, event);
+}
+
+inline cl_int flush(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.flush();
+}
+
+inline cl_int finish(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.finish();
+}
+
+// Kernel Functor support
+// New interface as of September 2011
+// Requires std::function from C++11 (std::tr1::function is not supported)
+// Minimum compilers: Visual Studio 2010 and GCC 4.2
+
+struct EnqueueArgs
+{
+    CommandQueue queue_;
+    const NDRange offset_;
+    const NDRange global_;
+    const NDRange local_;
+    VECTOR_CLASS<Event> events_;
+
+    EnqueueArgs(NDRange global) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(NullRange)
+    { }
+
+    EnqueueArgs(NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(local)
+    { }
+
+    EnqueueArgs(NDRange offset, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(offset), global_(global), local_(local)
+    { }
+
+    EnqueueArgs(Event e, NDRange global) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(NullRange)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(offset), global_(global), local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(NullRange),
+        events_(events)
+    { }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(local),
+        events_(events)
+    { }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(offset), global_(global), local_(local),
+        events_(events)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, NDRange global) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(NullRange)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(local)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(offset), global_(global), local_(local)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(NullRange)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(offset), global_(global), local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(NullRange),
+        events_(events)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(local),
+        events_(events)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(offset), global_(global), local_(local),
+        events_(events)
+    { }
+};
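+
+/*
+ * Usage sketch: EnqueueArgs bundles the launch configuration a kernel functor
+ * needs (`queue` and `evt` are assumed names).
+ *
+ *     EnqueueArgs simple(NDRange(1 << 20));                      // default queue, global size only
+ *     EnqueueArgs shaped(queue, NDRange(1 << 20), NDRange(256)); // explicit queue, global + local
+ *     EnqueueArgs gated(evt, NDRange(1 << 20));                  // wait on evt before launching
+ */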
+
+namespace detail {
+
+class NullType {};
+
+template<int index, typename T0>
+struct SetArg
+{
+    static void set(Kernel kernel, T0 arg)
+    {
+        kernel.setArg(index, arg);
+    }
+};
+
+template<int index>
+struct SetArg<index, NullType>
+{
+    static void set(Kernel, NullType)
+    {
+    }
+};
+
+template <
+    typename T0, typename T1, typename T2, typename T3,
+    typename T4, typename T5, typename T6, typename T7,
+    typename T8, typename T9, typename T10, typename T11,
+    typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19,
+    typename T20, typename T21, typename T22, typename T23,
+    typename T24, typename T25, typename T26, typename T27,
+    typename T28, typename T29, typename T30, typename T31
+>
+class KernelFunctorGlobal
+{
+private:
+    Kernel kernel_;
+
+public:
+    KernelFunctorGlobal(
+        Kernel kernel) :
+        kernel_(kernel)
+    {}
+
+    KernelFunctorGlobal(
+        const Program& program,
+        const STRING_CLASS name,
+        cl_int * err = NULL) :
+        kernel_(program, name.c_str(), err)
+    {}
+
+    Event operator() (
+        const EnqueueArgs& args,
+        T0 t0,
+        T1 t1 = NullType(), T2 t2 = NullType(), T3 t3 = NullType(),
+        T4 t4 = NullType(), T5 t5 = NullType(), T6 t6 = NullType(),
+        T7 t7 = NullType(), T8 t8 = NullType(), T9 t9 = NullType(),
+        T10 t10 = NullType(), T11 t11 = NullType(), T12 t12 = NullType(),
+        T13 t13 = NullType(), T14 t14 = NullType(), T15 t15 = NullType(),
+        T16 t16 = NullType(), T17 t17 = NullType(), T18 t18 = NullType(),
+        T19 t19 = NullType(), T20 t20 = NullType(), T21 t21 = NullType(),
+        T22 t22 = NullType(), T23 t23 = NullType(), T24 t24 = NullType(),
+        T25 t25 = NullType(), T26 t26 = NullType(), T27 t27 = NullType(),
+        T28 t28 = NullType(), T29 t29 = NullType(), T30 t30 = NullType(),
+        T31 t31 = NullType())
+    {
+        Event event;
+        SetArg<0, T0>::set(kernel_, t0);
+        SetArg<1, T1>::set(kernel_, t1);
+        SetArg<2, T2>::set(kernel_, t2);
+        SetArg<3, T3>::set(kernel_, t3);
+        SetArg<4, T4>::set(kernel_, t4);
+        SetArg<5, T5>::set(kernel_, t5);
+        SetArg<6, T6>::set(kernel_, t6);
+        SetArg<7, T7>::set(kernel_, t7);
+        SetArg<8, T8>::set(kernel_, t8);
+        SetArg<9, T9>::set(kernel_, t9);
+        SetArg<10, T10>::set(kernel_, t10);
+        SetArg<11, T11>::set(kernel_, t11);
+        SetArg<12, T12>::set(kernel_, t12);
+        SetArg<13, T13>::set(kernel_, t13);
+        SetArg<14, T14>::set(kernel_, t14);
+        SetArg<15, T15>::set(kernel_, t15);
+        SetArg<16, T16>::set(kernel_, t16);
+        SetArg<17, T17>::set(kernel_, t17);
+        SetArg<18, T18>::set(kernel_, t18);
+        SetArg<19, T19>::set(kernel_, t19);
+        SetArg<20, T20>::set(kernel_, t20);
+        SetArg<21, T21>::set(kernel_, t21);
+        SetArg<22, T22>::set(kernel_, t22);
+        SetArg<23, T23>::set(kernel_, t23);
+        SetArg<24, T24>::set(kernel_, t24);
+        SetArg<25, T25>::set(kernel_, t25);
+        SetArg<26, T26>::set(kernel_, t26);
+        SetArg<27, T27>::set(kernel_, t27);
+        SetArg<28, T28>::set(kernel_, t28);
+        SetArg<29, T29>::set(kernel_, t29);
+        SetArg<30, T30>::set(kernel_, t30);
+        SetArg<31, T31>::set(kernel_, t31);
+
+        args.queue_.enqueueNDRangeKernel(
+            kernel_,
+            args.offset_,
+            args.global_,
+            args.local_,
+            &args.events_,
+            &event);
+
+        return event;
+    }
+
+};
+
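+/*
+ * Sketch of how this machinery is meant to be driven (assuming the usual
+ * make_kernel-style front-end that the functionImplementation_ specializations
+ * below feed; `program` with a kernel "vadd", plus `aBuf`/`bBuf`, are assumed
+ * names): trailing parameters default to NullType, whose SetArg specialization
+ * is a no-op, so one 32-slot functor serves kernels of any smaller arity.
+ *
+ *     cl::make_kernel<Buffer, Buffer> vadd(program, "vadd");
+ *     Event done = vadd(EnqueueArgs(NDRange(1024)), aBuf, bBuf);
+ *     done.wait();
+ */
+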
//------------------------------------------------------------------------------------------------------ + + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30, + typename T31> + struct functionImplementation_ + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30, + T31 arg31) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30, + arg31); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, 
+ T29, + T30, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. 
If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! 
\brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3); + } + + + }; + + template< + typename T0, + typename T1, + typename T2> + struct functionImplementation_ + < T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! 
\brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2); + } + + + }; + + template< + typename T0, + typename T1> + struct functionImplementation_ + < T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1) + { + return functor_( + enqueueArgs, + arg0, + arg1); + } + + + }; + + template< + typename T0> + struct functionImplementation_ + < T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0) + { + return functor_( + enqueueArgs, + arg0); + } + + + }; + + + + + + } // namespace detail + + //---------------------------------------------------------------------------------------------- + + template < + typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType, + typename T3 = detail::NullType, typename T4 = detail::NullType, + typename T5 = detail::NullType, typename T6 = detail::NullType, + typename T7 = detail::NullType, typename T8 = detail::NullType, + typename T9 = detail::NullType, typename T10 = detail::NullType, + typename T11 = detail::NullType, typename T12 = detail::NullType, + typename T13 = detail::NullType, typename T14 = detail::NullType, + typename T15 = detail::NullType, typename T16 = detail::NullType, + typename T17 = detail::NullType, typename T18 = detail::NullType, + typename T19 = detail::NullType, typename T20 = detail::NullType, + typename T21 = detail::NullType, typename T22 = detail::NullType, + typename T23 = detail::NullType, typename T24 = detail::NullType, + typename T25 = detail::NullType, typename T26 = detail::NullType, + typename T27 = detail::NullType, typename T28 = detail::NullType, + typename T29 = detail::NullType, typename T30 = detail::NullType, + typename T31 = detail::NullType + > + struct make_kernel : + public detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + > + { + public: + typedef detail::KernelFunctorGlobal< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + > FunctorType; + + make_kernel( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(program, name, err)) + {} + + make_kernel( + const Kernel kernel) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(kernel)) + {} + }; + + + //---------------------------------------------------------------------------------------------------------------------- + +#undef __ERR_STR +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#undef __GET_DEVICE_INFO_ERR +#undef __GET_PLATFORM_INFO_ERR +#undef __GET_DEVICE_IDS_ERR +#undef __GET_CONTEXT_INFO_ERR +#undef __GET_EVENT_INFO_ERR +#undef __GET_EVENT_PROFILE_INFO_ERR +#undef __GET_MEM_OBJECT_INFO_ERR +#undef __GET_IMAGE_INFO_ERR +#undef __GET_SAMPLER_INFO_ERR +#undef __GET_KERNEL_INFO_ERR +#undef __GET_KERNEL_ARG_INFO_ERR +#undef __GET_KERNEL_WORK_GROUP_INFO_ERR +#undef __GET_PROGRAM_INFO_ERR +#undef __GET_PROGRAM_BUILD_INFO_ERR +#undef __GET_COMMAND_QUEUE_INFO_ERR + +#undef __CREATE_CONTEXT_ERR +#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR + +#undef __CREATE_BUFFER_ERR +#undef __CREATE_SUBBUFFER_ERR +#undef __CREATE_IMAGE2D_ERR +#undef __CREATE_IMAGE3D_ERR +#undef __CREATE_SAMPLER_ERR +#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR + +#undef __CREATE_USER_EVENT_ERR +#undef __SET_USER_EVENT_STATUS_ERR +#undef 
__SET_EVENT_CALLBACK_ERR +#undef __SET_PRINTF_CALLBACK_ERR + +#undef __WAIT_FOR_EVENTS_ERR + +#undef __CREATE_KERNEL_ERR +#undef __SET_KERNEL_ARGS_ERR +#undef __CREATE_PROGRAM_WITH_SOURCE_ERR +#undef __CREATE_PROGRAM_WITH_BINARY_ERR +#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR +#undef __BUILD_PROGRAM_ERR +#undef __CREATE_KERNELS_IN_PROGRAM_ERR + +#undef __CREATE_COMMAND_QUEUE_ERR +#undef __SET_COMMAND_QUEUE_PROPERTY_ERR +#undef __ENQUEUE_READ_BUFFER_ERR +#undef __ENQUEUE_WRITE_BUFFER_ERR +#undef __ENQUEUE_READ_BUFFER_RECT_ERR +#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR +#undef __ENQEUE_COPY_BUFFER_ERR +#undef __ENQEUE_COPY_BUFFER_RECT_ERR +#undef __ENQUEUE_READ_IMAGE_ERR +#undef __ENQUEUE_WRITE_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR +#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR +#undef __ENQUEUE_MAP_BUFFER_ERR +#undef __ENQUEUE_MAP_IMAGE_ERR +#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR +#undef __ENQUEUE_NDRANGE_KERNEL_ERR +#undef __ENQUEUE_TASK_ERR +#undef __ENQUEUE_NATIVE_KERNEL + +#undef __CL_EXPLICIT_CONSTRUCTORS + +#undef __UNLOAD_COMPILER_ERR +#endif //__CL_USER_OVERRIDE_ERROR_STRINGS + +#undef __CL_FUNCTION_TYPE + + // Extensions + /** + * Deprecated APIs for 1.2 + */ +#if defined(CL_VERSION_1_1) +#undef __INIT_CL_EXT_FCN_PTR +#endif // #if defined(CL_VERSION_1_1) +#undef __CREATE_SUB_DEVICES + +#if defined(USE_CL_DEVICE_FISSION) +#undef __PARAM_NAME_DEVICE_FISSION +#endif // USE_CL_DEVICE_FISSION + +#undef __DEFAULT_NOT_INITIALIZED +#undef __DEFAULT_BEING_INITIALIZED +#undef __DEFAULT_INITIALIZED + +} // namespace cl + +#ifdef _WIN32 +#pragma pop_macro("max") +#endif // _WIN32 + +#endif // CL_HPP_ diff --git a/contrib/ocl/crypto/blake.hpp b/contrib/ocl/crypto/blake.hpp new file mode 100644 index 000000000..230cad2ef --- /dev/null +++ b/contrib/ocl/crypto/blake.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include + +namespace ocl { +namespace crypto { + +typedef struct blake2b_state_s +{ + uint64_t h[8]; + uint64_t bytes; +} blake2b_state_t; + +inline void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k) { + using namespace crypto_detail; + + assert(n > k); + assert(hash_len <= 64); + st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); + for (uint32_t i = 1; i <= 5; i++) + st->h[i] = blake2b_iv[i]; + st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; + st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); + st->bytes = 0; +} + +inline void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, uint32_t msg_len, uint32_t is_final) { + using namespace crypto_detail; + + const uint64_t *m = (const uint64_t *)_msg; + uint64_t v[16]; + assert(msg_len <= 128); + assert(st->bytes <= UINT64_MAX - msg_len); + memcpy(v + 0, st->h, 8 * sizeof (*v)); + memcpy(v + 8, blake2b_iv, 8 * sizeof (*v)); + v[12] ^= (st->bytes += msg_len); + v[14] ^= is_final ? 
-1 : 0; + for (uint32_t round = 0; round < blake2b_rounds; round++) + { + const uint8_t *s = blake2b_sigma[round]; + mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]); + mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]); + mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]); + mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]); + mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]); + mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]); + mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]); + mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]); + } + for (uint32_t i = 0; i < 8; i++) + st->h[i] ^= v[i] ^ v[i + 8]; +} + +inline void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen) { + assert(outlen <= 64); + memcpy(out, st->h, outlen); +} + +} +} \ No newline at end of file diff --git a/contrib/ocl/crypto/detail/blake.hpp b/contrib/ocl/crypto/detail/blake.hpp new file mode 100644 index 000000000..cd21d4c01 --- /dev/null +++ b/contrib/ocl/crypto/detail/blake.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include + +namespace ocl { + namespace crypto { + namespace crypto_detail { + + inline uint64_t rotr64(uint64_t a, uint8_t bits) + { + return (a >> bits) | (a << (64 - bits)); + } + + inline void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd, + uint64_t x, uint64_t y) + { + *va = (*va + *vb + x); + *vd = rotr64(*vd ^ *va, 32); + *vc = (*vc + *vd); + *vb = rotr64(*vb ^ *vc, 24); + *va = (*va + *vb + y); + *vd = rotr64(*vd ^ *va, 16); + *vc = (*vc + *vd); + *vb = rotr64(*vb ^ *vc, 63); + } + + static const uint32_t blake2b_block_len = 128; + static const uint32_t blake2b_rounds = 12; + static const uint64_t blake2b_iv[8] = + { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, + }; + static const uint8_t blake2b_sigma[12][16] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + }; + + } + } +} \ No newline at end of file diff --git a/contrib/ocl/hex.hpp b/contrib/ocl/hex.hpp new file mode 100644 index 000000000..056286171 --- /dev/null +++ b/contrib/ocl/hex.hpp @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace ocl { + +inline void hexdump(uint8_t *a, uint32_t a_len) +{ + for (uint32_t i = 0; i < a_len; i++) + fprintf(stderr, "%02x", a[i]); +} + +inline char *s_hexdump(const void *_a, uint32_t a_len) +{ + const uint8_t *a = (uint8_t *)_a; + static char buf[1024]; + uint32_t i; + for (i = 0; i < a_len && i + 2 < sizeof(buf); i++) + sprintf(buf + i * 2, "%02x", a[i]); + buf[i * 2] = 0; + return buf; +} + +inline uint8_t hex2val(const char *base, size_t off) +{ + const char c = base[off]; + if (c >= '0' && c <= '9') return c - '0'; + else if (c >= 'a' && c <= 'f') return 10 + c - 'a'; + else if (c >= 'A' && c <= 'F') return 10 + c - 'A'; + printf("Invalid hex 
char at offset %zd: ...%c...\n", off, c); + return 0; +} + +} diff --git a/contrib/ocl/include/blake.hpp b/contrib/ocl/include/blake.hpp new file mode 100644 index 000000000..509500256 --- /dev/null +++ b/contrib/ocl/include/blake.hpp @@ -0,0 +1,103 @@ +#pragma once +#include +#include +#include + +namespace gg { + namespace impl { + static const uint32_t blake2b_block_len = 128; + static const uint32_t blake2b_rounds = 12; + static const uint64_t blake2b_iv[8] = + { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, + }; + static const uint8_t blake2b_sigma[12][16] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + }; + + inline uint64_t rotr64(uint64_t a, uint8_t bits) + { + return (a >> bits) | (a << (64 - bits)); + } + + inline void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd, + uint64_t x, uint64_t y) + { + *va = (*va + *vb + x); + *vd = rotr64(*vd ^ *va, 32); + *vc = (*vc + *vd); + *vb = rotr64(*vb ^ *vc, 24); + *va = (*va + *vb + y); + *vd = rotr64(*vd ^ *va, 16); + *vc = (*vc + *vd); + *vb = rotr64(*vb ^ *vc, 63); + } + } + + typedef struct blake2b_state_s + { + uint64_t h[8]; + uint64_t bytes; + } blake2b_state_t; + + inline void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k) + { + assert(n > k); + assert(hash_len <= 64); + st->h[0] = impl::blake2b_iv[0] ^ (0x01010000 | hash_len); + for (uint32_t i = 1; i <= 5; i++) + st->h[i] = impl::blake2b_iv[i]; + st->h[6] = impl::blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; + st->h[7] = impl::blake2b_iv[7] ^ (((uint64_t)k << 32) | n); + st->bytes = 0; + } + + inline void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, uint32_t msg_len, uint32_t is_final) + { + using namespace gg::impl; + + const uint64_t *m = (const uint64_t *)_msg; + uint64_t v[16]; + assert(msg_len <= 128); + assert(st->bytes <= UINT64_MAX - msg_len); + memcpy(v + 0, st->h, 8 * sizeof(*v)); + memcpy(v + 8, blake2b_iv, 8 * sizeof(*v)); + v[12] ^= (st->bytes += msg_len); + v[14] ^= is_final ? 
-1 : 0;
+		for (uint32_t round = 0; round < impl::blake2b_rounds; round++) {
+			const uint8_t *s = blake2b_sigma[round];
+			mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]);
+			mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]);
+			mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]);
+			mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]);
+			mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]);
+			mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]);
+			mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]);
+			mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]);
+		}
+		for (uint32_t i = 0; i < 8; i++) {
+			st->h[i] ^= v[i] ^ v[i + 8];
+		}
+	}
+
+	inline void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
+	{
+		assert(outlen <= 64);
+		memcpy(out, st->h, outlen);
+	}
+}
\ No newline at end of file
diff --git a/contrib/ocl/include/ocl_gatelessgate.hpp b/contrib/ocl/include/ocl_gatelessgate.hpp
new file mode 100644
index 000000000..69fcfdaab
--- /dev/null
+++ b/contrib/ocl/include/ocl_gatelessgate.hpp
@@ -0,0 +1,73 @@
+#pragma once
+
+#include "ocl_gg_context.hpp"
+#include <functional>
+
+namespace gg {
+namespace impl {
+
+}
+	struct ocl_gatelessgate
+	{
+		//int threadsperblock;
+		int blocks;
+		int device_id;
+		int platform_id;
+
+		ocl_gg_context* oclc;
+		// threads
+		unsigned threadsNum; // TMP
+		unsigned wokrsize;
+
+		bool is_init_success = false;
+
+		ocl_gatelessgate(int platf_id, int dev_id)
+			: blocks(0)
+			, device_id(dev_id)
+			, platform_id(platf_id)
+			, oclc(nullptr)
+			, threadsNum(8192U)
+			, wokrsize(128U)
+			, is_init_success(false)
+		{
+		}
+
+		std::string getdevinfo() {
+			static auto devices = GetAllDevices();
+			auto device = devices[device_id];
+			std::vector<char> name(256, 0);
+			size_t nActualSize = 0;
+			std::string gpu_name;
+
+			cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+			gpu_name.assign(&name[0], nActualSize);
+
+			return "GPU_ID( " + gpu_name + ")";
+		}
+
+		static int getcount();
+
+		static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version);
+
+		static void start(ocl_gatelessgate& device_context);
+
+		static void stop(ocl_gatelessgate& device_context);
+
+		static void solve(const char *tequihash_header,
+			unsigned int tequihash_header_len,
+			const char* nonce,
+			unsigned int nonce_len,
+			std::function<bool()> cancelf,
+			std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+			std::function<void(void)> hashdonef,
+			ocl_gatelessgate& device_context);
+
+		std::string getname() { return "OCL_GATELESSGATE"; }
+
+	private:
+		std::string m_gpu_name;
+		std::string m_version;
+	};
+
+}
\ No newline at end of file
diff --git a/contrib/ocl/include/ocl_gg_context.hpp b/contrib/ocl/include/ocl_gg_context.hpp
new file mode 100644
index 000000000..162d96196
--- /dev/null
+++ b/contrib/ocl/include/ocl_gg_context.hpp
@@ -0,0 +1,34 @@
+#pragma once
+#include "param.h"
+#include <CL/cl.h>
+
+struct ocl_gg_context {
+	cl_context _context;
+	cl_program _program;
+	cl_device_id _dev_id;
+
+	cl_platform_id platform_id = 0;
+
+	cl_command_queue queue;
+
+	cl_kernel k_init_ht;
+	cl_kernel k_rounds[PARAM_K];
+	cl_kernel k_sols;
+
+	cl_mem buf_ht[2], buf_sols, buf_dbg, rowCounters[2];
+	size_t global_ws;
+	size_t local_work_size = 64;
+
+	sols_t *sols;
+
+	bool init(cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock);
+
+	~ocl_gg_context() {
+		clReleaseMemObject(buf_dbg);
+		clReleaseMemObject(buf_ht[0]);
+		clReleaseMemObject(buf_ht[1]);
+		clReleaseMemObject(rowCounters[0]);
+		clReleaseMemObject(rowCounters[1]);
+		free(sols);
+	}
+};
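For orientation, the solver declared above is driven the same way as the other nheqminer backends: the stratum layer hands it a header and nonce plus three callbacks. A minimal sketch of such a call is shown here; the callback shapes come from the declaration above, while the wrapper function, buffers, and prior start()/context setup are assumed, not part of the patch.

    // Hypothetical driver for ocl_gatelessgate::solve (from the header above).
    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <vector>

    void example_solve_call(ocl_gatelessgate& ctx,
                            const char* header, unsigned header_len,
                            const char* nonce, unsigned nonce_len)
    {
        ocl_gatelessgate::solve(
            header, header_len, nonce, nonce_len,
            []() { return false; },                    // cancelf: never cancel
            [](const std::vector<uint32_t>& indices,   // solutionf: one solution
               size_t cbitlen, const unsigned char* compressed)
            {
                std::printf("solution with %zu indices\n", indices.size());
                (void)cbitlen; (void)compressed;
            },
            []() { /* hashdonef: one full Equihash run finished */ },
            ctx);
    }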
diff --git a/contrib/ocl/include/param.h b/contrib/ocl/include/param.h
new file mode 100644
index 000000000..fd08ba0e0
--- /dev/null
+++ b/contrib/ocl/include/param.h
@@ -0,0 +1,198 @@
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//#define ENABLE_DEBUG
+
+#define NR_ROWS_LOG 15
+#define NR_SLOTS 120
+#define LOCAL_WORK_SIZE 256
+#define THREADS_PER_ROW 256
+#define LOCAL_WORK_SIZE_SOLS 64
+#define THREADS_PER_ROW_SOLS 64
+#define GLOBAL_WORK_SIZE_RATIO 512
+#define SLOT_CACHE_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE/THREADS_PER_ROW) * 75 / 100)
+#define LDS_COLL_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE / THREADS_PER_ROW) * 120 / 100)
+
+#define SLOT_CACHE_INDEX_TYPE uchar
+
+#define PARAM_N 200
+#define PARAM_K 9
+#define PREFIX (PARAM_N / (PARAM_K + 1))
+#define NR_INPUTS (1 << PREFIX)
+// Approximate log base 2 of number of elements in hash tables
+#define APX_NR_ELMS_LOG (PREFIX + 1)
+
+// Setting this to 1 might make Gateless Gate faster, see TROUBLESHOOTING.md
+#define OPTIM_SIMPLIFY_ROUND 1
+
+// Ratio of time of sleeping before rechecking if task is done (0-1)
+#define SLEEP_RECHECK_RATIO 0.60
+// Ratio of time to busy wait for the solution (0-1)
+// The higher the value, the higher the CPU usage with Nvidia
+#define SLEEP_SKIP_RATIO 0.005
+
+// Make hash tables OVERHEAD times larger than necessary to store the average
+// number of elements per row. The ideal value is as small as possible to
+// reduce memory usage, but not too small or else elements are dropped from the
+// hash tables.
+//
+// The actual number of elements per row is closer to the theoretical average
+// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be
+// smaller.
+//
+// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
+// performance as they cause VRAM channel conflicts.
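For concreteness: with Zcash's n = 200, k = 9 these parameters give PREFIX = 20, roughly 2^21 hash-table elements per round (APX_NR_ELMS_LOG = 21), an average of 64 elements per 32768-row table, and a 120 MiB table. A quick host-side check of that arithmetic, mirroring the macros (SLOT_LEN = 32 is defined further down in this header); the #if chain that follows then selects OVERHEAD per NR_ROWS_LOG:

    // Sanity check of the geometry implied by param.h (values hard-coded
    // here to match the macros above; this is a sketch, not part of the patch).
    #include <cstdio>

    int main()
    {
        const long prefix   = 200 / (9 + 1);        // PREFIX = 20
        const long elements = 1L << (prefix + 1);   // ~2^21, cf. APX_NR_ELMS_LOG
        const long nr_rows  = 1L << 15;             // NR_ROWS_LOG = 15
        const long nr_slots = 120, slot_len = 32;

        std::printf("avg elements per row: %ld\n", elements / nr_rows);  // 64
        std::printf("one hash table: %ld MiB\n",
                    nr_rows * nr_slots * slot_len >> 20);                // 120 MiB
        return 0;
    }

NR_SLOTS = 120 against an average of 64 is where the roughly 2x OVERHEAD headroom described above comes from.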
+#if NR_ROWS_LOG <= 16 +#define OVERHEAD 2 +#elif NR_ROWS_LOG == 17 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 18 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 19 +#define OVERHEAD 5 +#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND +#define OVERHEAD 6 +#elif NR_ROWS_LOG == 20 +#define OVERHEAD 9 +#endif + +#define NR_ROWS (1 << NR_ROWS_LOG) +#ifndef NR_SLOTS +#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) +#endif +// Length of 1 element (slot) in byte +#define SLOT_LEN 32 +// Total size of hash table +#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) +// Length of Zcash block header, nonce (part of header) +#define ZCASH_BLOCK_HEADER_LEN 140 +// Offset of nTime in header +#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) +// Length of nonce +#define ZCASH_NONCE_LEN 32 +// Length of encoded representation of solution size +#define ZCASH_SOLSIZE_LEN 3 +// Solution size (1344 = 0x540) represented as a compact integer, in hex +#define ZCASH_SOLSIZE_HEX "fd4005" +// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) +#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) +// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization +#define N_ZERO_BYTES 12 +// Number of bytes Zcash needs out of Blake +#define ZCASH_HASH_LEN 50 +// Number of wavefronts per SIMD for the Blake kernel. +// Blake is ALU-bound (beside the atomic counter being incremented) so we need +// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer +// instructions. 10 is the max supported by the hw. +#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 10 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot. 
+*/
+#define xi_offset_for_round(round) 4
+
+// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
+#define SOL_SIZE ((1 << PARAM_K) * 4)
+typedef struct sols_s
+{
+	uint nr;
+	uint likely_invalids;
+	uchar valid[MAX_SOLS];
+	uint values[MAX_SOLS][(1 << PARAM_K)];
+} sols_t;
+
+#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+	((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
+#define DECODE_ROW(REF) (REF >> 16)
+#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
+#define DECODE_SLOT0(REF) (REF & 0xff)
+
+#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+	((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
+#define DECODE_ROW(REF) (REF >> 14)
+#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
+#define DECODE_SLOT0(REF) (REF & 0x7f)
+
+#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+	((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
+#define DECODE_ROW(REF) (REF >> 13)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+	((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
+#define DECODE_ROW(REF) (REF >> 12)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#else
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+// Windows only for now
+#define DEFAULT_NUM_MINING_MODE_THREADS 1
+#define MAX_NUM_MINING_MODE_THREADS 16
+
+#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN / 2)
+#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O1"
+#define OPENCL_BUILD_OPTIONS "-I.. -I."
\ No newline at end of file
diff --git a/contrib/ocl/kernels/gatelessgate.cl b/contrib/ocl/kernels/gatelessgate.cl
new file mode 100644
index 000000000..69d842039
--- /dev/null
+++ b/contrib/ocl/kernels/gatelessgate.cl
@@ -0,0 +1,993 @@
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
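Before the kernel source proper, it is worth seeing how the host consumes the sols_t that param.h declares above: the sols kernel bumps nr for every candidate it finds (possibly past MAX_SOLS), and in SILENTARMY-derived miners a host-side verification pass fills valid[]. A sketch of the host-side walk, assuming those conventions; the function name and output are illustrative only:

    // Hypothetical host-side walk over a sols_t read back from the device.
    #include <cstdint>
    #include <cstdio>

    static void report_solutions(const sols_t *sols)
    {
        uint32_t nr = sols->nr;
        if (nr > MAX_SOLS)
            nr = MAX_SOLS;              // extra candidates beyond MAX_SOLS are dropped
        for (uint32_t i = 0; i < nr; i++) {
            if (!sols->valid[i])
                continue;               // failed host-side verification
            // values[i][0 .. (1 << PARAM_K) - 1] are the 512 input indices
            std::printf("sol %u: first index %u\n", i, sols->values[i][0]);
        }
    }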
+
+//#define ENABLE_DEBUG
+
+#define NR_ROWS_LOG 15
+#define NR_SLOTS 120
+#define LOCAL_WORK_SIZE 256
+#define THREADS_PER_ROW 256
+#define LOCAL_WORK_SIZE_SOLS 64
+#define THREADS_PER_ROW_SOLS 64
+#define GLOBAL_WORK_SIZE_RATIO 512
+#define SLOT_CACHE_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE/THREADS_PER_ROW) * 75 / 100)
+#define LDS_COLL_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE / THREADS_PER_ROW) * 120 / 100)
+
+#define SLOT_CACHE_INDEX_TYPE uchar
+
+#define PARAM_N 200
+#define PARAM_K 9
+#define PREFIX (PARAM_N / (PARAM_K + 1))
+#define NR_INPUTS (1 << PREFIX)
+// Approximate log base 2 of number of elements in hash tables
+#define APX_NR_ELMS_LOG (PREFIX + 1)
+
+// Setting this to 1 might make Gateless Gate faster, see TROUBLESHOOTING.md
+#define OPTIM_SIMPLIFY_ROUND 1
+
+// Ratio of time of sleeping before rechecking if task is done (0-1)
+#define SLEEP_RECHECK_RATIO 0.60
+// Ratio of time to busy wait for the solution (0-1)
+// The higher the value, the higher the CPU usage with Nvidia
+#define SLEEP_SKIP_RATIO 0.005
+
+// Make hash tables OVERHEAD times larger than necessary to store the average
+// number of elements per row. The ideal value is as small as possible to
+// reduce memory usage, but not too small or else elements are dropped from the
+// hash tables.
+//
+// The actual number of elements per row is closer to the theoretical average
+// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be
+// smaller.
+//
+// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
+// performance as they cause VRAM channel conflicts.
+#if NR_ROWS_LOG <= 16
+#define OVERHEAD 2
+#elif NR_ROWS_LOG == 17
+#define OVERHEAD 3
+#elif NR_ROWS_LOG == 18
+#define OVERHEAD 3
+#elif NR_ROWS_LOG == 19
+#define OVERHEAD 5
+#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND
+#define OVERHEAD 6
+#elif NR_ROWS_LOG == 20
+#define OVERHEAD 9
+#endif
+
+#define NR_ROWS (1 << NR_ROWS_LOG)
+#ifndef NR_SLOTS
+#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD))
+#endif
+// Length of 1 element (slot) in byte
+#define SLOT_LEN 32
+// Total size of hash table
+#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
+// Length of Zcash block header, nonce (part of header)
+#define ZCASH_BLOCK_HEADER_LEN 140
+// Offset of nTime in header
+#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
+// Length of nonce
+#define ZCASH_NONCE_LEN 32
+// Length of encoded representation of solution size
+#define ZCASH_SOLSIZE_LEN 3
+// Solution size (1344 = 0x540) represented as a compact integer, in hex
+#define ZCASH_SOLSIZE_HEX "fd4005"
+// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
+#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
+// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
+#define N_ZERO_BYTES 12
+// Number of bytes Zcash needs out of Blake
+#define ZCASH_HASH_LEN 50
+// Number of wavefronts per SIMD for the Blake kernel.
+// Blake is ALU-bound (beside the atomic counter being incremented) so we need
+// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
+// instructions. 10 is the max supported by the hw.
+#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 10 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot. +*/ +#define xi_offset_for_round(round) 4 + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN / 2) +#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O1" +#define OPENCL_BUILD_OPTIONS "-I.. -I." 
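+
+// Worked example of the reference encoding above: with this file's
+// NR_ROWS_LOG = 15 and NR_SLOTS = 120 the first branch applies, so
+// ENCODE_INPUTS(0x2af3, 5, 9) = (0x2af3 << 16) | (9 << 8) | 5 = 0x2af30905,
+// and DECODE_ROW / DECODE_SLOT1 / DECODE_SLOT0 recover 0x2af3, 9 and 5 by
+// shifting and masking the packed reference.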
+ +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +typedef union { + struct { + uint i; + uint xi[6]; + uint padding; + } slot; + uint8 ui8; + uint4 ui4[2]; +} slot_t; + +/* +** The new hash table has this layout (length in bytes in parens): +** +** round 0, table 0: i(4) pad(0) Xi(24) pad(4) +** round 1, table 1: i(4) pad(3) Xi(20) pad(5) +** round 2, table 2: i(4) pad(0) Xi(19) pad(9) +** round 3, table 3: i(4) pad(3) Xi(15) pad(10) +** round 4, table 4: i(4) pad(0) Xi(14) pad(14) +** round 5, table 5: i(4) pad(3) Xi(10) pad(15) +** round 6, table 6: i(4) pad(0) Xi( 9) pad(19) +** round 7, table 7: i(4) pad(3) Xi( 5) pad(20) +** round 8, table 8: i(4) pad(0) Xi( 4) pad(24) +** +*/ + +__constant ulong blake_iv_const[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +/* +** Reset counters in hash table. +*/ +__kernel +void kernel_init_ht(__global char *ht, __global uint *rowCounters) +{ + rowCounters[get_global_id(0)] = 0; +} + +/* +** OBSOLETE +** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they +** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): +** aa aa ab bb bb cc cc cd dd... [round 0] +** -------------------- +** ...ab bb bb cc cc cd dd... [odd round] +** -------------- +** ...cc cc cd dd... [next even round] +** ----- +** Bytes underlined are going to be stored in the slot. Preceding bytes +** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are +** used to compute the row number. +** +** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) +** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble) +** TODO: update lines below with padding nibbles +** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) +** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) +** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) +** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) +** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) +** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) +** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) +** +** Return 0 if successfully stored, or 1 if the row overflowed. 
+*/ + +__global char *get_slot_ptr(__global char *ht, uint round, uint row, uint slot) +{ +#if 1 + return ht + (row * NR_SLOTS + slot) * ADJUSTED_SLOT_LEN(round); +#else + return ht + (slot * NR_ROWS + row) * ADJUSTED_SLOT_LEN(round); +#endif +} + +__global char *get_xi_ptr(__global char *ht, uint round, uint row, uint slot) +{ + return get_slot_ptr(ht, round, row, slot) + xi_offset_for_round(round); +} + +void get_row_counters_index(uint *rowIdx, uint *rowOffset, uint row) +{ + *rowIdx = row / ROWS_PER_UINT; + *rowOffset = BITS_PER_ROW * (row % ROWS_PER_UINT); +} + +uint get_row(uint round, uint xi0) +{ + uint row; +#if NR_ROWS_LOG == 14 + if (!(round % 2)) + row = (xi0 & 0x3fff); + else + row = ((xi0 & 0x3f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 15 + if (!(round % 2)) + row = (xi0 & 0x7fff); + else + row = ((xi0 & 0x7f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 16 + if (!(round % 2)) + row = (xi0 & 0xffff); + else + row = ((xi0 & 0xff0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#else +#error "unsupported NR_ROWS_LOG" +#endif + return row; +} + +uint inc_row_counter(__global uint *rowCounters, uint row) +{ + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, row); + uint cnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); + cnt = (cnt >> rowOffset) & ROW_MASK; + if (cnt >= NR_SLOTS) { + // avoid overflows + atomic_sub(rowCounters + rowIdx, 1 << rowOffset); + } + return cnt; +} + +uint ht_store(uint round, __global char *ht, uint i, + uint xi0, uint xi1, uint xi2, uint xi3, uint xi4, uint xi5, uint xi6, __global uint *rowCounters) +{ + uint row = get_row(round, xi0); + uint cnt = inc_row_counter(rowCounters, row); + if (cnt >= NR_SLOTS) + return 0; + __global char *p = get_slot_ptr(ht, round, row, cnt); + slot_t slot; + slot.slot.i = i; + slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8)); + slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8)); + slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8)); + slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8)); + slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8)); + slot.slot.xi[5] = ((xi6 << 24) | (xi5 >> 8)); + if (round <= 5) { + *(__global uint8 *)p = slot.ui8; + } + else { + *(__global uint4 *)p = slot.ui4[0]; + } + return 0; +} + +#define mix(va, vb, vc, vd, x, y) \ + va = (va + vb + x); \ +vd = rotate((vd ^ va), (ulong)64 - 32); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 24); \ +va = (va + vb + y); \ +vd = rotate((vd ^ va), (ulong)64 - 16); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 63); + +/* +** Execute round 0 (blake). 
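+** Each work item hashes NR_INPUTS / get_global_size(0) consecutive input
+** indexes, storing two Xi values per index via ht_store().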
+** +** Note: making the work group size less than or equal to the wavefront size +** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local +** Memory (LDS) Optimization 2-10" in: +** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ +*/ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) +void kernel_round0(__constant ulong *blake_state_const, __global char *ht, + __global uint *rowCounters, __global uint *debug) +{ + __local ulong blake_state[64]; + __local ulong blake_iv[8]; + uint tid = get_global_id(0); + ulong v[16]; + uint inputs_per_thread = NR_INPUTS / get_global_size(0); + uint input = tid * inputs_per_thread; + uint input_end = (tid + 1) * inputs_per_thread; + uint dropped = 0; + if (get_local_id(0) < 64) + blake_state[get_local_id(0)] = blake_state_const[get_local_id(0)]; + if (get_local_id(0) < 8) + blake_iv[get_local_id(0)] = blake_iv_const[get_local_id(0)]; + barrier(CLK_LOCAL_MEM_FENCE); + while (input < input_end) { + // shift "i" to occupy the high 32 bits of the second ulong word in the + // message block + ulong word1 = (ulong)input << 32; + // init vector v + v[0] = blake_state[0]; + v[1] = blake_state[1]; + v[2] = blake_state[2]; + v[3] = blake_state[3]; + v[4] = blake_state[4]; + v[5] = blake_state[5]; + v[6] = blake_state[6]; + v[7] = blake_state[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // mix in length of data + v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + + // round 1 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 2 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 3 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, word1); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 4 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, word1); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 5 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, word1); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 6 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + 
mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], word1, 0); + // round 7 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], word1, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 8 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, word1); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 9 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], word1, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 10 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], word1, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 11 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 12 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + + // compress v into the blake state; this produces the 50-byte hash + // (two Xi values) + ulong h[7]; + h[0] = blake_state[0] ^ v[0] ^ v[8]; + h[1] = blake_state[1] ^ v[1] ^ v[9]; + h[2] = blake_state[2] ^ v[2] ^ v[10]; + h[3] = blake_state[3] ^ v[3] ^ v[11]; + h[4] = blake_state[4] ^ v[4] ^ v[12]; + h[5] = blake_state[5] ^ v[5] ^ v[13]; + h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; + + // store the two Xi values in the hash table +#if ZCASH_HASH_LEN == 50 + dropped += ht_store(0, ht, input * 2, + h[0] & 0xffffffff, h[0] >> 32, + h[1] & 0xffffffff, h[1] >> 32, + h[2] & 0xffffffff, h[2] >> 32, + h[3] & 0xffffffff, + rowCounters); + dropped += ht_store(0, ht, input * 2 + 1, + ((h[3] >> 8) | (h[4] << (64 - 8))) & 0xffffffff, + ((h[3] >> 8) | (h[4] << (64 - 8))) >> 32, + ((h[4] >> 8) | (h[5] << (64 - 8))) & 0xffffffff, + ((h[4] >> 8) | (h[5] << (64 - 8))) >> 32, + ((h[5] >> 8) | (h[6] << (64 - 8))) & 0xffffffff, + ((h[5] >> 8) | (h[6] << (64 - 8))) >> 32, + (h[6] >> 8) & 0xffffffff, + rowCounters); +#else +#error "unsupported ZCASH_HASH_LEN" +#endif + + input++; + } +#ifdef ENABLE_DEBUG + debug[tid * 2] = 0; + debug[tid * 2 + 1] = dropped; +#endif +} + +/* +** XOR a pair of Xi values computed at "round - 1" and store the result in the +** hash table being built for "round". 
Note that when building the table for +** even rounds we need to skip 1 padding byte present in the "round - 1" table +** (the "0xAB" byte mentioned in the description at the top of this file.) But +** also note we can't load data directly past this byte because this would +** cause an unaligned memory access which is undefined per the OpenCL spec. +** +** Return 0 if successfully stored, or 1 if the row overflowed. +*/ +uint xor_and_store(uint round, __global char *ht_dst, uint row, + uint slot_a, uint slot_b, __local uint *ai, __local uint *bi, + __global uint *rowCounters) +{ + ulong xi0, xi1, xi2, xi3, xi4, xi5; +#if NR_ROWS_LOG >= 8 && NR_ROWS_LOG <= 20 + // xor 24 bytes + xi0 = *(ai++); + xi1 = *(ai++); + if (round <= 7) xi2 = *(ai++); + if (round <= 6) xi3 = *(ai++); + if (round <= 4) xi4 = *(ai++); + if (round <= 2) xi5 = *ai; + + xi0 ^= *(bi++); + xi1 ^= *(bi++); + if (round <= 7) xi2 ^= *(bi++); + if (round <= 6) xi3 ^= *(bi++); + if (round <= 4) xi4 ^= *(bi++); + if (round <= 2) xi5 ^= *bi; + + if (!(round & 0x1)) { + // skip padding bytes + xi0 = (xi0 >> 24) | (xi1 << (32 - 24)); + xi1 = (xi1 >> 24) | (xi2 << (32 - 24)); + if (round <= 7) xi2 = (xi2 >> 24) | (xi3 << (32 - 24)); + if (round <= 6) xi3 = (xi3 >> 24) | (xi4 << (32 - 24)); + if (round <= 4) xi4 = (xi4 >> 24) | (xi5 << (32 - 24)); + if (round <= 2) xi5 = (xi5 >> 24); + } + + // invalid solutions (which start happenning in round 5) have duplicate + // inputs and xor to zero, so discard them + if (!xi0 && !xi1) + return 0; +#else +#error "unsupported NR_ROWS_LOG" +#endif + return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), xi0, xi1, xi2, xi3, xi4, xi5, 0, rowCounters); +} + +/* +** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, +** store them in ht_dst. +*/ + +#define UINTS_IN_XI(round) (((round) == 0) ? 6 : \ + ((round) == 1) ? 6 : \ + ((round) == 2) ? 5 : \ + ((round) == 3) ? 5 : \ + ((round) == 4) ? 4 : \ + ((round) == 5) ? 4 : \ + ((round) == 6) ? 3 : \ + ((round) == 7) ? 2 : \ + 1) + +#define RESERVED_FOR_XI(round) (((round) == 0) ? 6 : \ + ((round) == 1) ? 6 : \ + ((round) == 2) ? 6 : \ + ((round) == 3) ? 6 : \ + ((round) == 4) ? 4 : \ + ((round) == 5) ? 4 : \ + ((round) == 6) ? 4 : \ + ((round) == 7) ? 2 : \ + 2) + +void equihash_round(uint round, + __global char *ht_src, + __global char *ht_dst, + __global uint *debug, + __local uint *slot_cache, + __local uint *slot_cache_counter, + __local SLOT_CACHE_INDEX_TYPE *slot_cache_indexes, + __local uint *collisionsData, + __local uint *collisionsNum, + __global uint *rowCountersSrc, + __global uint *rowCountersDst, + uint threadsPerRow, + __local uint *nr_slots_array, + __local uchar *bins_data, + __local uint *bin_counters_data) +{ + uint globalTid = get_global_id(0) / threadsPerRow; + uint localTid = get_local_id(0) / threadsPerRow; + uint localGroupId = get_local_id(0) % threadsPerRow; + + __global char *p; + uint cnt; + uint i, j; + uint dropped_coll = 0; + uint dropped_stor = 0; + __local uint *a, *b; + // the mask is also computed to read data from the previous round +#define BIN_MASK(round) ((((round) + 1) % 2) ? 0xf000 : 0xf0000) +#define BIN_MASK_OFFSET(round) ((((round) + 1) % 2) ? 3 * 4 : 4 * 4) +#if NR_ROWS_LOG == 14 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x00c0 : 0xc000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 2 : 10) +#elif NR_ROWS_LOG == 15 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x0080 : 0x8000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 
3 : 11) +#elif NR_ROWS_LOG == 16 +#define BIN_MASK2(round) 0 +#define BIN_MASK2_OFFSET(round) 0 +#else +#error "unsupported NR_ROWS_LOG" +#endif +#define NR_BINS (64 >> (NR_ROWS_LOG - 14)) + __local uchar *bins = &bins_data[localTid * NR_SLOTS * NR_BINS]; + __local uint *bin_counters = &bin_counters_data[localTid * NR_BINS]; + + uint rows_per_work_item = (NR_ROWS + get_global_size(0) / threadsPerRow - 1) / (get_global_size(0) / threadsPerRow); + uint rows_per_chunk = get_global_size(0) / threadsPerRow; + + for (uint chunk = 0; chunk < rows_per_work_item; chunk++) { + uint tid = globalTid + rows_per_chunk * chunk; + uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); + + if (tid < NR_ROWS) { + if (!get_local_id(0)) { + *collisionsNum = 0; + *slot_cache_counter = 0; + } + for (i = localGroupId; i < NR_BINS; i += threadsPerRow) + bin_counters[i] = 0; + if (localGroupId == 0) { + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, tid); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + nr_slots_array[localTid] = cnt; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (localGroupId) + cnt = nr_slots_array[localTid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Perform a radix sort as slots get loaded into LDS. + if (tid < NR_ROWS) { + for (i = localGroupId; i < cnt; i += threadsPerRow) { + uint xi_first_bytes = *(__global uint *)get_xi_ptr(ht_src, round - 1, tid, i); + + uint bin_to_use = + ((xi_first_bytes & BIN_MASK(round - 1)) >> BIN_MASK_OFFSET(round - 1)) + | ((xi_first_bytes & BIN_MASK2(round - 1)) >> BIN_MASK2_OFFSET(round - 1)); + uint bin_counter_copy = atomic_inc(&bin_counters[bin_to_use]); + bins[bin_to_use * NR_SLOTS + bin_counter_copy] = i; + + if (bin_counter_copy) { + uint slot_cache_counter_copy = atomic_inc(slot_cache_counter); + if (slot_cache_counter_copy >= SLOT_CACHE_SIZE) { + atomic_dec(slot_cache_counter); + ++dropped_coll; + slot_cache_indexes[localTid * NR_SLOTS + i] = SLOT_CACHE_SIZE; + } + else { + slot_cache[slot_cache_counter_copy * RESERVED_FOR_XI(round - 1)] = xi_first_bytes; + for (j = 1; j < UINTS_IN_XI(round - 1); ++j) + slot_cache[slot_cache_counter_copy * RESERVED_FOR_XI(round - 1) + j] = *((__global uint *)get_xi_ptr(ht_src, round - 1, tid, i) + j); + slot_cache_indexes[localTid * NR_SLOTS + i] = slot_cache_counter_copy; + } + + if (bin_counter_copy == 1) { + slot_cache_counter_copy = atomic_inc(slot_cache_counter); + uint first_slot_index = bins[bin_to_use * NR_SLOTS]; + if (slot_cache_counter_copy >= SLOT_CACHE_SIZE) { + atomic_dec(slot_cache_counter); + ++dropped_coll; + slot_cache_indexes[localTid * NR_SLOTS + first_slot_index] = SLOT_CACHE_SIZE; + } + else { + for (j = 0; j < UINTS_IN_XI(round - 1); ++j) + slot_cache[slot_cache_counter_copy * RESERVED_FOR_XI(round - 1) + j] = *((__global uint *)get_xi_ptr(ht_src, round - 1, tid, first_slot_index) + j); + slot_cache_indexes[localTid * NR_SLOTS + first_slot_index] = slot_cache_counter_copy; + } + } + } + + for (j = 0; j < bin_counter_copy; ++j) { + uint index = atomic_inc(collisionsNum); + if (index >= LDS_COLL_SIZE) { + atomic_dec(collisionsNum); + ++dropped_coll; + } + else { + collisionsData[index] = (localTid << 24) | (i << 12) | bins[bin_to_use * NR_SLOTS + j]; + } + } + } + } + + part2: + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + uint totalCollisions = *collisionsNum; + for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0)) { + 
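+ // each collision record packs three fields: bits 31..24 hold the row's
+ // local thread id, bits 23..12 the slot index i, and bits 11..0 the
+ // colliding slot index j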
uint collision = collisionsData[index]; + uint collisionLocalThreadId = collision >> 24; + uint collisionThreadId = gid + collisionLocalThreadId; + uint i = (collision >> 12) & 0xfff; + uint j = collision & 0xfff; + uint slot_cache_index_i = slot_cache_indexes[collisionLocalThreadId * NR_SLOTS + i]; + if (slot_cache_index_i >= SLOT_CACHE_SIZE) + continue; + uint slot_cache_index_j = slot_cache_indexes[collisionLocalThreadId * NR_SLOTS + j]; + if (slot_cache_index_j >= SLOT_CACHE_SIZE) + continue; + a = (__local uint *)&slot_cache[slot_cache_index_i * RESERVED_FOR_XI(round - 1)]; + b = (__local uint *)&slot_cache[slot_cache_index_j * RESERVED_FOR_XI(round - 1)]; + dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j, a, b, rowCountersDst); + } + } + } + +#ifdef ENABLE_DEBUG + uint tid = get_global_id(0); + debug[tid * 2] = dropped_coll; + debug[tid * 2 + 1] = dropped_stor; +#endif +} + +/* +** This defines kernel_round1, kernel_round2, ..., kernel_round7. +*/ +#define KERNEL_ROUND(N) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) \ +void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ + __global uint *rowCountersSrc, __global uint *rowCountersDst, \ + __global uint *debug) \ +{ \ + __local uint slot_cache[RESERVED_FOR_XI(N - 1) * SLOT_CACHE_SIZE]; \ + __local uint slot_cache_counter; \ + __local SLOT_CACHE_INDEX_TYPE slot_cache_indexes[NR_SLOTS * (LOCAL_WORK_SIZE/THREADS_PER_ROW)]; \ + __local uint collisionsData[LDS_COLL_SIZE]; \ + __local uint collisionsNum; \ + __local uint nr_slots_array[LOCAL_WORK_SIZE / THREADS_PER_ROW]; \ + __local uchar bins_data[(LOCAL_WORK_SIZE / THREADS_PER_ROW) * NR_SLOTS * NR_BINS]; \ + __local uint bin_counters_data[(LOCAL_WORK_SIZE / THREADS_PER_ROW) * NR_BINS]; \ + equihash_round(N, ht_src, ht_dst, debug, slot_cache, &slot_cache_counter, slot_cache_indexes, collisionsData, \ + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW, nr_slots_array, bins_data, bin_counters_data); \ +} +KERNEL_ROUND(1) +KERNEL_ROUND(2) +KERNEL_ROUND(3) +KERNEL_ROUND(4) +KERNEL_ROUND(5) +KERNEL_ROUND(6) +KERNEL_ROUND(7) +KERNEL_ROUND(8) + +uint expand_ref(__global char *ht, uint round, uint row, uint slot) +{ + return ((__global slot_t *)get_slot_ptr(ht, round, row, slot))->slot.i; +} + +/* +** Expand references to inputs. Return 1 if so far the solution appears valid, +** or 0 otherwise (an invalid solution would be a solution with duplicate +** inputs, which can be detected at the last step: round == 0). +*/ +uint expand_refs(__local uint *ins, uint nr_inputs, __global char **htabs, + uint round) +{ + __global char *ht = htabs[round]; + uint i = nr_inputs - 1; + uint j = nr_inputs * 2 - 1; + int dup_to_watch = -1; + do { + ins[j] = expand_ref(ht, round, + DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); + ins[j - 1] = expand_ref(ht, round, + DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); + if (!round) { + if (dup_to_watch == -1) + dup_to_watch = ins[j]; + else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) + return 0; + } + if (!i) + break; + i--; + j -= 2; + } while (1); + return 1; +} + +/* +** Verify if a potential solution is in fact valid. 
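+** Starting from the two final-round references, each expand_refs() pass
+** doubles nr_values, so after eight passes the candidate holds
+** 1 << PARAM_K = 512 leaf inputs; a duplicate input detected at round 0
+** invalidates it.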
+*/ +void potential_sol(__global char **htabs, __global sols_t *sols, + uint ref0, uint ref1, __local uint *values_tmp) +{ + uint nr_values; + uint sol_i; + uint i; + nr_values = 0; + values_tmp[nr_values++] = ref0; + values_tmp[nr_values++] = ref1; + uint round = PARAM_K - 1; + do { + round--; + if (!expand_refs(values_tmp, nr_values, htabs, round)) + return; + nr_values *= 2; + } while (round > 0); + // solution appears valid, copy it to sols + sol_i = atomic_inc(&sols->nr); + if (sol_i >= MAX_SOLS) + return; + for (i = 0; i < (1 << PARAM_K); i++) + sols->values[sol_i][i] = values_tmp[i]; + sols->valid[sol_i] = 1; +} + +/* +** Scan the hash tables to find Equihash solutions. +*/ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_SOLS, 1, 1))) +void kernel_sols(__global char *ht0, + __global char *ht1, + __global char *ht2, + __global char *ht3, + __global char *ht4, + __global char *ht5, + __global char *ht6, + __global char *ht7, + __global char *ht8, + __global sols_t *sols, + __global uint *rowCountersSrc) +{ + __local uint refs[NR_SLOTS*(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS)]; + __local uint data[NR_SLOTS*(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS)]; + __local uint values_tmp[(1 << PARAM_K)]; + __local uint semaphoe; + + uint globalTid = get_global_id(0) / THREADS_PER_ROW_SOLS; + uint localTid = get_local_id(0) / THREADS_PER_ROW_SOLS; + uint localGroupId = get_local_id(0) % THREADS_PER_ROW_SOLS; + __local uint *refsPtr = &refs[NR_SLOTS*localTid]; + __local uint *dataPtr = &data[NR_SLOTS*localTid]; + + __global char *htabs[] = { ht0, ht1, ht2, ht3, ht4, ht5, ht6, ht7, ht8 }; + uint ht_i = (PARAM_K - 1); // table filled at last round + uint cnt; + uint i, j; + __global char *p; + uint ref_i, ref_j; + __local uchar bins_data[(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) * NR_SLOTS * NR_BINS]; + __local uint bin_counters_data[(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) * NR_BINS]; + __local uchar *bins = &bins_data[localTid * NR_SLOTS * NR_BINS]; + __local uint *bin_counters = &bin_counters_data[localTid * NR_BINS]; + + if (!get_global_id(0)) + sols->nr = sols->likely_invalids = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + uint rows_per_work_item = (NR_ROWS + get_global_size(0) / THREADS_PER_ROW_SOLS - 1) / (get_global_size(0) / THREADS_PER_ROW_SOLS); + uint rows_per_chunk = get_global_size(0) / THREADS_PER_ROW_SOLS; + + for (uint chunk = 0; chunk < rows_per_work_item; chunk++) { + uint tid = globalTid + rows_per_chunk * chunk; + uint gid = tid & ~(get_local_size(0) / THREADS_PER_ROW_SOLS - 1); + + __local uint nr_slots_array[LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS]; + if (tid < NR_ROWS) { + if (!get_local_id(0)) + semaphoe = 0; + for (i = localGroupId; i < NR_BINS; i += THREADS_PER_ROW_SOLS) + bin_counters[i] = 0; + if (localGroupId == 0) { + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, tid); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + nr_slots_array[localTid] = cnt; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (localGroupId) + cnt = nr_slots_array[localTid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
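+ // (with only 4 bytes of Xi left after round 8, comparing the full first
+ // 32-bit word of each slot covers both groups of bits at once)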
+ __local ulong coll; + if (tid < NR_ROWS) { + for (i = localGroupId; i < cnt && !semaphoe; i += THREADS_PER_ROW_SOLS) { + p = get_slot_ptr(htabs[ht_i], PARAM_K - 1, tid, i); + refsPtr[i] = ((__global slot_t *)p)->slot.i; + uint xi_first_bytes = dataPtr[i] = ((__global slot_t *)p)->slot.xi[0]; + uint bin_to_use = + ((xi_first_bytes & BIN_MASK(PARAM_K - 1)) >> BIN_MASK_OFFSET(PARAM_K - 1)) + | ((xi_first_bytes & BIN_MASK2(PARAM_K - 1)) >> BIN_MASK2_OFFSET(PARAM_K - 1)); + uint bin_counter_copy = atomic_inc(&bin_counters[bin_to_use]); + bins[bin_to_use * NR_SLOTS + bin_counter_copy] = i; + if (bin_counter_copy) { + for (j = 0; j < bin_counter_copy && !semaphoe; ++j) { + uint slot_index_j = bins[bin_to_use * NR_SLOTS + j]; + if (xi_first_bytes == dataPtr[slot_index_j]) { + if (atomic_inc(&semaphoe) == 0) + coll = ((ulong)refsPtr[i] << 32) | refsPtr[slot_index_j]; + } + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (get_local_id(0) == 0 && semaphoe) + potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff, values_tmp); + } + } +} + diff --git a/contrib/ocl/kernels/silentarmy.cl b/contrib/ocl/kernels/silentarmy.cl new file mode 100644 index 000000000..8ba3d6283 --- /dev/null +++ b/contrib/ocl/kernels/silentarmy.cl @@ -0,0 +1,946 @@ +#define THRD 64 +#define PARAM_N 200 +#define PARAM_K 9 +#define PREFIX (PARAM_N / (PARAM_K + 1)) +#define NR_INPUTS (1 << PREFIX) +// Approximate log base 2 of number of elements in hash tables +#define APX_NR_ELMS_LOG (PREFIX + 1) +// Number of rows and slots is affected by this. 20 offers the best performance +// but occasionally misses ~1% of solutions. +#define NR_ROWS_LOG 20 + +// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md +#define OPTIM_SIMPLIFY_ROUND 1 + +// Number of collision items to track, per thread +#ifdef cl_nv_pragma_unroll // NVIDIA +#define THREADS_PER_ROW 16 +#define LDS_COLL_SIZE (NR_SLOTS * 24 * (THRD / THREADS_PER_ROW)) +#else +#define THREADS_PER_ROW 8 +#define LDS_COLL_SIZE (NR_SLOTS * 8 * (THRD / THREADS_PER_ROW)) +#endif + +// Ratio of time of sleeping before rechecking if task is done (0-1) +#define SLEEP_RECHECK_RATIO 0.60 +// Ratio of time to busy wait for the solution (0-1) +// The higher value the higher CPU usage with Nvidia +#define SLEEP_SKIP_RATIO 0.005 + +// Make hash tables OVERHEAD times larger than necessary to store the average +// number of elements per row. The ideal value is as small as possible to +// reduce memory usage, but not too small or else elements are dropped from the +// hash tables. +// +// The actual number of elements per row is closer to the theoretical average +// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be +// smaller. +// +// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease +// performance as they cause VRAM channel conflicts. 
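+//
+// Example with the defaults above (NR_ROWS_LOG = 20, OPTIM_SIMPLIFY_ROUND = 1,
+// hence OVERHEAD = 6 below): round 0 stores 2 * NR_INPUTS = 2^21 elements into
+// 2^20 rows, i.e. 2 per row on average, and NR_SLOTS becomes
+// (1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD = 2 * 6 = 12 slots per row.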
+#if NR_ROWS_LOG == 16
+// #error "NR_ROWS_LOG = 16 is currently broken - do not use"
+#define OVERHEAD 2
+#elif NR_ROWS_LOG == 18
+#define OVERHEAD 3
+#elif NR_ROWS_LOG == 19
+#define OVERHEAD 5
+#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND
+#define OVERHEAD 6
+#elif NR_ROWS_LOG == 20
+#define OVERHEAD 9
+#endif
+
+#define NR_ROWS (1 << NR_ROWS_LOG)
+#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD))
+// Length of 1 element (slot) in bytes
+#define SLOT_LEN 32
+// Total size of hash table
+#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
+// Length of Zcash block header, nonce (part of header)
+#define ZCASH_BLOCK_HEADER_LEN 140
+// Offset of nTime in header
+#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
+// Length of nonce
+#define ZCASH_NONCE_LEN 32
+// Length of encoded representation of solution size
+#define ZCASH_SOLSIZE_LEN 3
+// Solution size (1344 = 0x540) represented as a compact integer, in hex
+#define ZCASH_SOLSIZE_HEX "fd4005"
+// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
+#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
+// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
+#define N_ZERO_BYTES 12
+// Number of bytes Zcash needs out of Blake
+#define ZCASH_HASH_LEN 50
+// Number of wavefronts per SIMD for the Blake kernel.
+// Blake is ALU-bound (beside the atomic counter being incremented) so we need
+// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
+// instructions. 10 is the max supported by the hw.
+#define BLAKE_WPS 10
+// Maximum number of solutions reported by kernel to host
+#define MAX_SOLS 10
+// Length of SHA256 target
+#define SHA256_TARGET_LEN (256 / 8)
+
+#if (NR_SLOTS < 16)
+#define BITS_PER_ROW 4
+#define ROWS_PER_UINT 8
+#define ROW_MASK 0x0F
+#else
+#define BITS_PER_ROW 8
+#define ROWS_PER_UINT 4
+#define ROW_MASK 0xFF
+#endif
+
+// Optional features
+#undef ENABLE_DEBUG
+
+/*
+** Return the offset of Xi in bytes from the beginning of the slot.
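+** For example xi_offset_for_round(0) = 8, (2) = 12 and (8) = 24: the offset
+** grows by 4 bytes every two rounds, matching the extra input reference
+** prepended to the slot (see the layout table below).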
+*/ +#define xi_offset_for_round(round) (8 + ((round) / 2) * 4) + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +/* +** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in +** bytes in parens): +** +** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1) +** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3) +** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2) +** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4) +** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3) +** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5) +** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4) +** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6) +** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5) +** +** If the first byte of Xi is 0xAB then: +** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi +** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but +** 'A' is considered redundant padding as it was used to compute the row # +** +** - cnt is an atomic counter keeping track of the number of used slots. +** it is used in the first slot only; subsequent slots replace it with +** 4 padding bytes +** - i encodes either the 21-bit input value (round 0) or a reference to two +** inputs from the previous round +** +** Formula for Xi length and pad length above: +** > for i in range(9): +** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi +** +** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds +** is the 4 most significant bits of the last byte of Xi. +*/ + +__constant ulong blake_iv[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +/* +** Reset counters in hash table. +*/ +__kernel +void kernel_init_ht(__global char *ht, __global uint *rowCounters) +{ + rowCounters[get_global_id(0)] = 0; +} + +/* +** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they +** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): +** aa aa ab bb bb cc cc cd dd... [round 0] +** -------------------- +** ...ab bb bb cc cc cd dd... [odd round] +** -------------- +** ...cc cc cd dd... [next even round] +** ----- +** Bytes underlined are going to be stored in the slot. Preceding bytes +** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are +** used to compute the row number. +** +** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) +** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. 
the colliding PREFIX nibble) +** TODO: update lines below with padding nibbles +** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) +** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) +** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) +** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) +** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) +** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) +** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) +** +** Return 0 if successfully stored, or 1 if the row overflowed. +*/ +uint ht_store(uint round, __global char *ht, uint i, + ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters) +{ + uint row; + __global char *p; + uint cnt; +#if NR_ROWS_LOG == 16 + if (!(round % 2)) + row = (xi0 & 0xffff); + else + // if we have in hex: "ab cd ef..." (little endian xi0) then this + // formula computes the row as 0xdebc. it skips the 'a' nibble as it + // is part of the PREFIX. The Xi will be stored starting with "ef..."; + // 'e' will be considered padding and 'f' is part of the current PREFIX + row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 18 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6); + else + row = ((xi0 & 0xc0000) >> 2) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 19 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5); + else + row = ((xi0 & 0xe0000) >> 1) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 20 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); + else + row = ((xi0 & 0xf0000) >> 0) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#else +#error "unsupported NR_ROWS_LOG" +#endif + xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); + xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); + xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); + p = ht + row * NR_SLOTS * SLOT_LEN; + uint rowIdx = row / ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT); + uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); + xcnt = (xcnt >> rowOffset) & ROW_MASK; + cnt = xcnt; + if (cnt >= NR_SLOTS) + { + // avoid overflows + atomic_sub(rowCounters + rowIdx, 1 << rowOffset); + return 1; + } + p += cnt * SLOT_LEN + xi_offset_for_round(round); + // store "i" (always 4 bytes before Xi) + *(__global uint *)(p - 4) = i; + if (round == 0 || round == 1) + { + // store 24 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global ulong *)(p + 8) = xi1; + *(__global ulong *)(p + 16) = xi2; + } + else if (round == 2) + { + // store 20 bytes + *(__global uint *)(p + 0) = xi0; + *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); + *(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32); + } + else if (round == 3) + { + // store 16 bytes + *(__global uint *)(p + 0) = xi0; + *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); + *(__global uint *)(p + 12) = (xi1 >> 32); + } + else if (round == 4) + { + // store 16 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global ulong *)(p + 8) = xi1; + } + else if (round == 5) + { + // store 12 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global uint *)(p + 8) = xi1; + } + else if (round == 6 || round == 7) + { + // store 8 bytes + *(__global 
uint *)(p + 0) = xi0; + *(__global uint *)(p + 4) = (xi0 >> 32); + } + else if (round == 8) + { + // store 4 bytes + *(__global uint *)(p + 0) = xi0; + } + return 0; +} + +#define mix(va, vb, vc, vd, x, y) \ + va = (va + vb + x); \ +vd = rotate((vd ^ va), (ulong)64 - 32); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 24); \ +va = (va + vb + y); \ +vd = rotate((vd ^ va), (ulong)64 - 16); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 63); + +/* +** Execute round 0 (blake). +** +** Note: making the work group size less than or equal to the wavefront size +** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local +** Memory (LDS) Optimization 2-10" in: +** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ +*/ +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void kernel_round0(__global ulong *blake_state, __global char *ht, + __global uint *rowCounters, __global uint *debug) +{ + uint tid = get_global_id(0); + ulong v[16]; + uint inputs_per_thread = NR_INPUTS / get_global_size(0); + uint input = tid * inputs_per_thread; + uint input_end = (tid + 1) * inputs_per_thread; + uint dropped = 0; + while (input < input_end) + { + // shift "i" to occupy the high 32 bits of the second ulong word in the + // message block + ulong word1 = (ulong)input << 32; + // init vector v + v[0] = blake_state[0]; + v[1] = blake_state[1]; + v[2] = blake_state[2]; + v[3] = blake_state[3]; + v[4] = blake_state[4]; + v[5] = blake_state[5]; + v[6] = blake_state[6]; + v[7] = blake_state[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // mix in length of data + v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + + // round 1 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 2 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 3 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, word1); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 4 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, word1); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 5 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, word1); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 6 + 
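+ // (the only nonzero message word is word1 = (ulong)input << 32, so per
+ // the BLAKE2b sigma schedule it feeds exactly one mix() per round --
+ // here the x argument of the final mix -- and every other slot stays 0)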
mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], word1, 0); + // round 7 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], word1, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 8 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, word1); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 9 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], word1, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 10 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], word1, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 11 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 12 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + + // compress v into the blake state; this produces the 50-byte hash + // (two Xi values) + ulong h[7]; + h[0] = blake_state[0] ^ v[0] ^ v[8]; + h[1] = blake_state[1] ^ v[1] ^ v[9]; + h[2] = blake_state[2] ^ v[2] ^ v[10]; + h[3] = blake_state[3] ^ v[3] ^ v[11]; + h[4] = blake_state[4] ^ v[4] ^ v[12]; + h[5] = blake_state[5] ^ v[5] ^ v[13]; + h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; + + // store the two Xi values in the hash table +#if ZCASH_HASH_LEN == 50 + dropped += ht_store(0, ht, input * 2, + h[0], + h[1], + h[2], + h[3], rowCounters); + dropped += ht_store(0, ht, input * 2 + 1, + (h[3] >> 8) | (h[4] << (64 - 8)), + (h[4] >> 8) | (h[5] << (64 - 8)), + (h[5] >> 8) | (h[6] << (64 - 8)), + (h[6] >> 8), rowCounters); +#else +#error "unsupported ZCASH_HASH_LEN" +#endif + + input++; + } +#ifdef ENABLE_DEBUG + debug[tid * 2] = 0; + debug[tid * 2 + 1] = dropped; +#endif +} + +#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 
0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +/* +** Access a half-aligned long, that is a long aligned on a 4-byte boundary. +*/ +ulong half_aligned_long(__global ulong *p, uint offset) +{ + return + (((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) | + (((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32); +} + +/* +** Access a well-aligned int. +*/ +uint well_aligned_int(__global ulong *_p, uint offset) +{ + __global char *p = (__global char *)_p; + return *(__global uint *)(p + offset); +} + +/* +** XOR a pair of Xi values computed at "round - 1" and store the result in the +** hash table being built for "round". Note that when building the table for +** even rounds we need to skip 1 padding byte present in the "round - 1" table +** (the "0xAB" byte mentioned in the description at the top of this file.) But +** also note we can't load data directly past this byte because this would +** cause an unaligned memory access which is undefined per the OpenCL spec. +** +** Return 0 if successfully stored, or 1 if the row overflowed. 
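+** For example at round 2 the code below drops the padding byte with
+** xi0 = (xi0 >> 8) | (xi1 << 56), shifting the whole Xi right by one byte.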
+*/ +uint xor_and_store(uint round, __global char *ht_dst, uint row, + uint slot_a, uint slot_b, __global ulong *a, __global ulong *b, + __global uint *rowCounters) +{ + ulong xi0, xi1, xi2; +#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 + // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not + // storing the byte containing bits from the previous PREFIX block for + if (round == 1 || round == 2) + { + // xor 24 bytes + xi0 = *(a++) ^ *(b++); + xi1 = *(a++) ^ *(b++); + xi2 = *a ^ *b; + if (round == 2) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); + xi2 = (xi2 >> 8); + } + } + else if (round == 3) + { + // xor 20 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); + xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16); + } + else if (round == 4 || round == 5) + { + // xor 16 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); + xi2 = 0; + if (round == 4) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8); + } + } + else if (round == 6) + { + // xor 12 bytes + xi0 = *a++ ^ *b++; + xi1 = *(__global uint *)a ^ *(__global uint *)b; + xi2 = 0; + if (round == 6) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8); + } + } + else if (round == 7 || round == 8) + { + // xor 8 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = 0; + xi2 = 0; + if (round == 8) + { + // skip padding byte + xi0 = (xi0 >> 8); + } + } + // invalid solutions (which start happenning in round 5) have duplicate + // inputs and xor to zero, so discard them + if (!xi0 && !xi1) + return 0; +#else +#error "unsupported NR_ROWS_LOG" +#endif + return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), + xi0, xi1, xi2, 0, rowCounters); +} + +/* +** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, +** store them in ht_dst. +*/ +void equihash_round(uint round, + __global char *ht_src, + __global char *ht_dst, + __global uint *debug, + __local uchar *first_words_data, + __local uint *collisionsData, + __local uint *collisionsNum, + __global uint *rowCountersSrc, + __global uint *rowCountersDst, + uint threadsPerRow) +{ + uint globalTid = get_global_id(0) / threadsPerRow; + uint localTid = get_local_id(0) / threadsPerRow; + uint localGroupId = get_local_id(0) % threadsPerRow; + __local uchar *first_words = &first_words_data[NR_SLOTS*localTid]; + + __global char *p; + uint cnt; + uchar mask; + uint i, j; + // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to + // make it even larger + uint n; + uint dropped_coll = 0; + uint dropped_stor = 0; + __global ulong *a, *b; + uint xi_offset; + // read first words of Xi from the previous (round - 1) hash table + xi_offset = xi_offset_for_round(round - 1); + // the mask is also computed to read data from the previous round +#if NR_ROWS_LOG <= 16 + mask = ((!(round % 2)) ? 0x0f : 0xf0); +#elif NR_ROWS_LOG == 18 + mask = ((!(round % 2)) ? 0x03 : 0x30); +#elif NR_ROWS_LOG == 19 + mask = ((!(round % 2)) ? 
0x01 : 0x10); +#elif NR_ROWS_LOG == 20 + mask = 0; /* we can vastly simplify the code below */ +#else +#error "unsupported NR_ROWS_LOG" +#endif + + for (uint chunk = 0; chunk < threadsPerRow; chunk++) { + uint tid = globalTid + NR_ROWS / threadsPerRow*chunk; + uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); + // for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) { + + uint rowIdx = tid / ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round + + *collisionsNum = 0; + p = (ht_src + tid * NR_SLOTS * SLOT_LEN); + p += xi_offset; + p += SLOT_LEN*localGroupId; + for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow) + first_words[i] = (*(__global uchar *)p) & mask; + barrier(CLK_LOCAL_MEM_FENCE); + + if (cnt == 0) + // no elements in row, no collisions + goto part2; + // find collisions + for (i = 0; i < cnt - 1; i++) + { + uchar data_i = first_words[i]; + uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId); + for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow) + { + if (data_i == first_words[j]) + { + uint index = atomic_inc(collisionsNum); + if (index >= LDS_COLL_SIZE) { + atomic_dec(collisionsNum); + goto part2; + } + collisionsData[index] = collision; + } + collision += threadsPerRow; + } + } + + part2: + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = *collisionsNum; + for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0)) + { + uint collision = collisionsData[index]; + uint collisionThreadId = gid + (collision >> 24); + uint i = (collision >> 12) & 0xFFF; + uint j = collision & 0xFFF; + __global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN + + xi_offset; + a = (__global ulong *)(ptr + i * SLOT_LEN); + b = (__global ulong *)(ptr + j * SLOT_LEN); + dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j, + a, b, rowCountersDst); + } + } + +#ifdef ENABLE_DEBUG + debug[tid * 2] = dropped_coll; + debug[tid * 2 + 1] = dropped_stor; +#endif +} + +/* +** This defines kernel_round1, kernel_round2, ..., kernel_round7. 
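+** (kernel_round8 is defined separately below: it takes the extra "sols"
+** argument and resets the solution counters.)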
+*/ +#define KERNEL_ROUND(N) \ +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) \ +void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ + __global uint *rowCountersSrc, __global uint *rowCountersDst, \ + __global uint *debug) \ +{ \ + __local uchar first_words_data[NR_SLOTS*(64/THREADS_PER_ROW)]; \ + __local uint collisionsData[LDS_COLL_SIZE]; \ + __local uint collisionsNum; \ + equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \ + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \ +} +KERNEL_ROUND(1) +KERNEL_ROUND(2) +KERNEL_ROUND(3) +KERNEL_ROUND(4) +KERNEL_ROUND(5) +KERNEL_ROUND(6) +KERNEL_ROUND(7) + +// kernel_round8 takes an extra argument, "sols" +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void kernel_round8(__global char *ht_src, __global char *ht_dst, + __global uint *rowCountersSrc, __global uint *rowCountersDst, + __global uint *debug, __global sols_t *sols) +{ + uint tid = get_global_id(0); + __local uchar first_words_data[NR_SLOTS*(64 / THREADS_PER_ROW)]; + __local uint collisionsData[LDS_COLL_SIZE]; + __local uint collisionsNum; + equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData, + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); + if (!tid) + sols->nr = sols->likely_invalids = 0; +} + +uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) +{ + return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + + slot * SLOT_LEN + xi_offset - 4); +} + +/* +** Expand references to inputs. Return 1 if so far the solution appears valid, +** or 0 otherwise (an invalid solution would be a solution with duplicate +** inputs, which can be detected at the last step: round == 0). +*/ +uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs, + uint round) +{ + __global char *ht = htabs[round % 2]; + uint i = nr_inputs - 1; + uint j = nr_inputs * 2 - 1; + uint xi_offset = xi_offset_for_round(round); + int dup_to_watch = -1; + do + { + ins[j] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); + ins[j - 1] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); + if (!round) + { + if (dup_to_watch == -1) + dup_to_watch = ins[j]; + else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) + return 0; + } + if (!i) + break; + i--; + j -= 2; + } while (1); + return 1; +} + +/* +** Verify if a potential solution is in fact valid. +*/ +void potential_sol(__global char **htabs, __global sols_t *sols, + uint ref0, uint ref1) +{ + uint nr_values; + uint values_tmp[(1 << PARAM_K)]; + uint sol_i; + uint i; + nr_values = 0; + values_tmp[nr_values++] = ref0; + values_tmp[nr_values++] = ref1; + uint round = PARAM_K - 1; + do + { + round--; + if (!expand_refs(values_tmp, nr_values, htabs, round)) + return; + nr_values *= 2; + } while (round > 0); + // solution appears valid, copy it to sols + sol_i = atomic_inc(&sols->nr); + if (sol_i >= MAX_SOLS) + return; + for (i = 0; i < (1 << PARAM_K); i++) + sols->values[sol_i][i] = values_tmp[i]; + sols->valid[sol_i] = 1; +} + +/* +** Scan the hash tables to find Equihash solutions. 
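+** Colliding pairs found in the last table are handed to potential_sol(),
+** which walks back through the two alternating tables to round 0 via
+** expand_refs().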
+*/ +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols, + __global uint *rowCountersSrc, __global uint *rowCountersDst) +{ + __local uint counters[64 / THREADS_PER_ROW]; + __local uint refs[NR_SLOTS*(64 / THREADS_PER_ROW)]; + __local uint data[NR_SLOTS*(64 / THREADS_PER_ROW)]; + __local uint collisionsNum; + __local ulong collisions[64 * 4]; + + uint globalTid = get_global_id(0) / THREADS_PER_ROW; + uint localTid = get_local_id(0) / THREADS_PER_ROW; + uint localGroupId = get_local_id(0) % THREADS_PER_ROW; + __local uint *refsPtr = &refs[NR_SLOTS*localTid]; + __local uint *dataPtr = &data[NR_SLOTS*localTid]; + + __global char *htabs[2] = { ht0, ht1 }; + __global char *hcounters[2] = { rowCountersSrc, rowCountersDst }; + uint ht_i = (PARAM_K - 1) % 2; // table filled at last round + uint cnt; + uint xi_offset = xi_offset_for_round(PARAM_K - 1); + uint i, j; + __global char *p; + uint ref_i, ref_j; + // it's ok for the collisions array to be so small, as if it fills up + // the potential solutions are likely invalid (many duplicate inputs) + // ulong collisions; +#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. + uint mask = 0xffffff; +#else +#error "unsupported NR_ROWS_LOG" +#endif + + collisionsNum = 0; + + for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) { + uint tid = globalTid + NR_ROWS / THREADS_PER_ROW*chunk; + p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; + uint rowIdx = tid / ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + p += xi_offset; + p += SLOT_LEN*localGroupId; + + for (i = get_local_id(0); i < 64 / THREADS_PER_ROW; i += get_local_size(0)) + counters[i] = 0; + for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) { + refsPtr[i] = *(__global uint *)(p - 4); + dataPtr[i] = (*(__global uint *)p) & mask; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (i = 0; i < cnt; i++) + { + uint a_data = dataPtr[i]; + ref_i = refsPtr[i]; + for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW) + { + if (a_data == dataPtr[j]) + { + if (atomic_inc(&counters[localTid]) == 0) + collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j]; + goto part2; + } + } + } + + part2: + continue; + } + + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = collisionsNum; + if (get_local_id(0) < totalCollisions) { + ulong coll = collisions[get_local_id(0)]; + potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff); + } +} \ No newline at end of file diff --git a/contrib/ocl/opencl.hpp b/contrib/ocl/opencl.hpp new file mode 100644 index 000000000..dcb0e63e9 --- /dev/null +++ b/contrib/ocl/opencl.hpp @@ -0,0 +1,72 @@ +#pragma once + +#include + +#if defined(_MSC_VER) +#pragma comment (lib, "opencl.lib") +#endif + +#include +#include +#include + +#include + +namespace ocl { + +inline cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size, void *host_ptr) +{ + cl_int status; + cl_mem ret; + ret = clCreateBuffer(ctx, flags, size, host_ptr, &status); + if (status != CL_SUCCESS || !ret) + printf("clCreateBuffer (%d)\n", status); + return ret; +} + +inline void check_clSetKernelArg(cl_kernel k, cl_uint a_pos, cl_mem *a) +{ + cl_int status; + status = 
clSetKernelArg(k, a_pos, sizeof(*a), a);
+	if (status != CL_SUCCESS)
+		printf("clSetKernelArg (%d)\n", status);
+}
+
+inline void check_clEnqueueNDRangeKernel(cl_command_queue queue, cl_kernel k, cl_uint
+	work_dim, const size_t *global_work_offset, const size_t
+	*global_work_size, const size_t *local_work_size, cl_uint
+	num_events_in_wait_list, const cl_event *event_wait_list, cl_event
+	*event)
+{
+	cl_int status;
+	status = clEnqueueNDRangeKernel(queue, k, work_dim, global_work_offset,
+		global_work_size, local_work_size, num_events_in_wait_list,
+		event_wait_list, event);
+	if (status != CL_SUCCESS)
+		printf("clEnqueueNDRangeKernel (%d)\n", status);
+}
+
+inline void check_clEnqueueReadBuffer(cl_command_queue queue, cl_mem buffer, cl_bool
+	blocking_read, size_t offset, size_t size, void *ptr, cl_uint
+	num_events_in_wait_list, const cl_event *event_wait_list, cl_event
+	*event)
+{
+	cl_int status;
+	status = clEnqueueReadBuffer(queue, buffer, blocking_read, offset,
+		size, ptr, num_events_in_wait_list, event_wait_list, event);
+	if (status != CL_SUCCESS)
+		printf("clEnqueueReadBuffer (%d)\n", status);
+}
+
+
+inline unsigned nr_compute_units(cl_device_id device_id)
+{
+	cl_uint retval;
+	cl_int status = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &retval, nullptr);
+	if (status != CL_SUCCESS)
+		printf("nr_compute_units (%d)\n", status);
+	return retval;
+}
+
+
+}
diff --git a/contrib/ocl/sols.hpp b/contrib/ocl/sols.hpp
new file mode 100644
index 000000000..e337ebcef
--- /dev/null
+++ b/contrib/ocl/sols.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+namespace ocl {
+	typedef uint8_t uchar;
+	typedef uint32_t uint;
+	typedef uint64_t ulong;
+
+
+	template <int MAXSOLS, int PARAMK>
+	struct sols_s
+	{
+		uint nr;
+		uint likely_invalids;
+		uchar valid[MAXSOLS];
+		uint values[MAXSOLS][(1 << PARAMK)];
+	};
+}
+
+typedef ocl::sols_s<10, 9> sols_t;
+
diff --git a/contrib/ocl/utility/device_utils.hpp b/contrib/ocl/utility/device_utils.hpp
new file mode 100644
index 000000000..7544a294a
--- /dev/null
+++ b/contrib/ocl/utility/device_utils.hpp
@@ -0,0 +1,181 @@
+#pragma once
+#include
+#include
+#include //reinclude just in case..
+#include
+
+typedef std::vector<cl_device_id> ocl_devices;
+
+#define OCL(error) \
+	if(cl_int err = error){ \
+		printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
+		return; \
+	}
+
+#define OCLR(error, ret) \
+	if(cl_int err = error){ \
+		printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
+		return ret; \
+	}
+
+#define OCLE(error) \
+	if(cl_int err = error){ \
+		printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
+		exit(err); \
+	}
+
+namespace ocl {
+namespace utility {
+	inline ocl_devices GetAllDevices() {
+		ocl_devices retval;
+		retval.reserve(8);
+
+		cl_platform_id platforms[64];
+		cl_uint numPlatforms;
+		cl_int rc = clGetPlatformIDs(sizeof(platforms) / sizeof(cl_platform_id), platforms, &numPlatforms);
+
+		for (cl_uint i = 0; i < numPlatforms; i++) {
+			cl_uint numDevices = 0;
+			cl_device_id devices[64];
+			rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
+			for (cl_uint n = 0; n < numDevices; n++) {
+				retval.push_back(devices[n]);
+			}
+		}
+
+		return retval;
+	}
+
+	inline void PrintDevices() {
+		using namespace ::std;
+
+		auto devices = GetAllDevices();
+		cout << "Number of OpenCL devices found: " << devices.size() << endl;
+		for (unsigned int i = 0; i < devices.size(); ++i) {
+			cl::Device device(devices[i]);
+			cl::Platform platform(device.getInfo<CL_DEVICE_PLATFORM>());
+			cout << "Device #" << i << " | " << platform.getInfo<CL_PLATFORM_NAME>() << " | " << device.getInfo<CL_DEVICE_NAME>();
+
+			switch (device.getInfo<CL_DEVICE_TYPE>()) {
+			case CL_DEVICE_TYPE_CPU:
+				cout << " | CPU";
+				break;
+			case CL_DEVICE_TYPE_GPU:
+				cout << " | GPU";
+				break;
+			case CL_DEVICE_TYPE_ACCELERATOR:
+				cout << " | ACCELERATOR";
+				break;
+			default:
+				cout << " | DEFAULT";
+				break;
+			}
+			cout << " | " << device.getInfo<CL_DEVICE_VERSION>();
+			cout << endl;
+		}
+	}
+
+	inline bool clCompileKernel(cl_context gContext,
+		cl_device_id gpu,
+		const char *binaryName,
+		const std::vector<std::string> &sources,
+		const char *arguments,
+		cl_int *binstatus,
+		cl_program *gProgram)
+	{
+		std::ifstream testfile(binaryName);
+
+		if (!testfile) {
+			printf(" compiling ...\n");
+
+			std::string sourceFile;
+			for (auto &i : sources) {
+				std::ifstream stream;
+				stream.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+				try {
+					stream.open(i);
+				}
+				catch (std::system_error& e) {
+					fprintf(stderr, " %s\n", e.code().message().c_str());
+					return false;
+				}
+				std::string str((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
+				sourceFile.append(str);
+			}
+
+			printf(" source: %u bytes\n", (unsigned)sourceFile.size());
+			if (sourceFile.size() < 1) {
+				fprintf(stderr, " source files not found or empty\n");
+				return false;
+			}
+
+			cl_int error;
+			const char *sources[] = { sourceFile.c_str(), 0 };
+			*gProgram = clCreateProgramWithSource(gContext, 1, sources, 0, &error);
+			OCLR(error, false);
+
+			if (clBuildProgram(*gProgram, 1, &gpu, arguments, 0, 0) != CL_SUCCESS) {
+				size_t logSize;
+				clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);
+
+				std::unique_ptr<char[]> log(new char[logSize]);
+				clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, logSize, log.get(), 0);
+				printf("%s\n", log.get());
+
+				return false;
+			}
+
+			size_t binsize;
+			OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binsize, 0), false);
+			// for (size_t i = 0; i < 1; i++) {
+			if (!binsize) {
+				printf(" no binary available!\n");
+				return false;
+			}
+			// }
+
+			printf(" binsize = %u bytes\n", (unsigned)binsize);
+			std::unique_ptr<unsigned char[]> binary(new unsigned
char[binsize + 1]); + + OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARIES, sizeof(void*), &binary, 0), false); + + { + std::ofstream bin(binaryName, std::ofstream::binary | std::ofstream::trunc); + bin.write((const char*)binary.get(), binsize); + bin.close(); + } + + OCLR(clReleaseProgram(*gProgram), false); + } + + std::ifstream bfile(binaryName, std::ifstream::binary); + if (!bfile) { + printf(" %s not found\n", binaryName); + return false; + } + + bfile.seekg(0, bfile.end); + size_t binsize = bfile.tellg(); + bfile.seekg(0, bfile.beg); + if (!binsize) { + printf(" %s empty\n", binaryName); + return false; + } + + std::vector binary(binsize + 1); + bfile.read(&binary[0], binsize); + bfile.close(); + + cl_int error; + // binstatus.resize(gpus.size(), 0); + // std::vector binsizes(gpus.size(), binsize); + // std::vector binaries(gpus.size(), (const unsigned char*)&binary[0]); + const unsigned char *binaryPtr = (const unsigned char*)&binary[0]; + + *gProgram = clCreateProgramWithBinary(gContext, 1, &gpu, &binsize, &binaryPtr, binstatus, &error); + OCLR(error, false); + OCLR(clBuildProgram(*gProgram, 1, &gpu, 0, 0, 0), false); + return true; + } +} +} \ No newline at end of file diff --git a/contrib/sha256/sha256.hpp b/contrib/sha256/sha256.hpp new file mode 100644 index 000000000..419a723df --- /dev/null +++ b/contrib/sha256/sha256.hpp @@ -0,0 +1,228 @@ +#pragma once + +/* Sha256.h -- SHA-256 Hash +2016-11-04 : Marc Bevand : A few changes to make it more self-contained +2010-06-11 : Igor Pavlov : Public domain */ + +#define SHA256_DIGEST_SIZE 32 + +#include +#include + +#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) +#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) +#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) +#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) + +#define blk0(i) (W[i] = data[i]) +#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15])) + +#define Ch(x,y,z) (z^(x&(y^z))) +#define Maj(x,y,z) ((x&y)|(z&(x|y))) + +#define a(i) T[(0-(i))&7] +#define b(i) T[(1-(i))&7] +#define c(i) T[(2-(i))&7] +#define d(i) T[(3-(i))&7] +#define e(i) T[(4-(i))&7] +#define f(i) T[(5-(i))&7] +#define g(i) T[(6-(i))&7] +#define h(i) T[(7-(i))&7] + +#ifdef _SHA256_UNROLL2 + +#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\ + d += h; h += S0(a) + Maj(a, b, c) + +#define RX_8(i) \ + R(a,b,c,d,e,f,g,h, i); \ + R(h,a,b,c,d,e,f,g, i+1); \ + R(g,h,a,b,c,d,e,f, i+2); \ + R(f,g,h,a,b,c,d,e, i+3); \ + R(e,f,g,h,a,b,c,d, i+4); \ + R(d,e,f,g,h,a,b,c, i+5); \ + R(c,d,e,f,g,h,a,b, i+6); \ + R(b,c,d,e,f,g,h,a, i+7) + +#else + +#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\ + d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i)) + +#ifdef _SHA256_UNROLL + +#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7); + +#endif + +#endif + + + +namespace sha256 { + +typedef struct +{ + uint32_t state[8]; + uint64_t count; + uint8_t buffer[64]; +} CSha256; + +namespace impl { + static const uint32_t K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + +inline void Sha256_Transform(uint32_t *state, const uint32_t *data) +{ + uint32_t W[16]; + unsigned j; + #ifdef _SHA256_UNROLL2 + uint32_t a,b,c,d,e,f,g,h; + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + #else + uint32_t T[8]; + for (j = 0; j < 8; j++) + T[j] = state[j]; + #endif + + for (j = 0; j < 64; j += 16) + { + #if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2) + RX_8(0); RX_8(8); + #else + unsigned i; + for (i = 0; i < 16; i++) { R(i); } + #endif + } + + #ifdef _SHA256_UNROLL2 + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + #else + for (j = 0; j < 8; j++) + state[j] += T[j]; + #endif + + /* Wipe variables */ + /* memset(W, 0, sizeof(W)); */ + /* memset(T, 0, sizeof(T)); */ +} + +#undef S0 +#undef S1 +#undef s0 +#undef s1 + +inline void Sha256_WriteByteBlock(CSha256 *p) +{ + uint32_t data32[16]; + unsigned i; + for (i = 0; i < 16; i++) + data32[i] = + ((uint32_t)(p->buffer[i * 4 ]) << 24) + + ((uint32_t)(p->buffer[i * 4 + 1]) << 16) + + ((uint32_t)(p->buffer[i * 4 + 2]) << 8) + + ((uint32_t)(p->buffer[i * 4 + 3])); + Sha256_Transform(p->state, data32); +} + +} + +inline void Sha256_Init(CSha256 *p) { + p->state[0] = 0x6a09e667; + p->state[1] = 0xbb67ae85; + p->state[2] = 0x3c6ef372; + p->state[3] = 0xa54ff53a; + p->state[4] = 0x510e527f; + p->state[5] = 0x9b05688c; + p->state[6] = 0x1f83d9ab; + p->state[7] = 0x5be0cd19; + p->count = 0; +} + +inline void Sha256_Update(CSha256 *p, const uint8_t *data, size_t size) +{ + uint32_t curBufferPos = (uint32_t)p->count & 0x3F; + while (size > 0) + { + p->buffer[curBufferPos++] = *data++; + p->count++; + size--; + if (curBufferPos == 64) + { + curBufferPos = 0; + impl::Sha256_WriteByteBlock(p); + } + } +} + +inline void Sha256_Final(CSha256 *p, uint8_t *digest) +{ + uint64_t lenInBits = (p->count << 3); + uint32_t curBufferPos = (uint32_t)p->count & 0x3F; + unsigned i; + p->buffer[curBufferPos++] = 0x80; + while (curBufferPos != (64 - 8)) + { + curBufferPos &= 0x3F; + if (curBufferPos == 0) + impl::Sha256_WriteByteBlock(p); + p->buffer[curBufferPos++] = 0; + } + for (i = 0; i < 8; i++) + { + p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56); + lenInBits <<= 8; + } + impl::Sha256_WriteByteBlock(p); + + for (i = 0; i < 8; i++) + { + *digest++ = (uint8_t)(p->state[i] >> 24); + *digest++ = (uint8_t)(p->state[i] >> 16); + *digest++ = (uint8_t)(p->state[i] >> 8); + *digest++ = (uint8_t)(p->state[i]); + } + Sha256_Init(p); +} + +inline void Sha256_Onestep(const uint8_t *data, size_t size, uint8_t *digest) +{ + CSha256 p; + Sha256_Init(&p); + Sha256_Update(&p, data, size); + Sha256_Final(&p, digest); +} + +} diff --git a/cpu_tromp/equi.h b/cpu_tromp/equi.h index b9237c359..6cb07bc74 100644 --- a/cpu_tromp/equi.h +++ b/cpu_tromp/equi.h @@ -49,7 +49,7 @@ typedef u32 proof[PROOFSIZE]; void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, 
const u32 nonceLen) { uint32_t le_N = WN; uint32_t le_K = WK; - uchar personal[] = "DeepWebCa01230123"; + uchar personal[] = "ZcashPoW01230123"; memcpy(personal+8, &le_N, 4); memcpy(personal+12, &le_K, 4); blake2b_param P[1]; diff --git a/cuda_silentarmy/cuda_silentarmy.vcxproj b/cuda_silentarmy/cuda_silentarmy.vcxproj index 76f286295..4f6bd8cf5 100644 --- a/cuda_silentarmy/cuda_silentarmy.vcxproj +++ b/cuda_silentarmy/cuda_silentarmy.vcxproj @@ -74,10 +74,14 @@ true + $(ProjectDir)../contrib/;$(IncludePath) $(SolutionDir)$(Platform)\$(Configuration)\ + + $(ProjectDir)../contrib/;$(IncludePath) + Level3 @@ -176,11 +180,9 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - diff --git a/cuda_silentarmy/kernel.cu b/cuda_silentarmy/kernel.cu index 5f773a9a9..1b6605395 100644 --- a/cuda_silentarmy/kernel.cu +++ b/cuda_silentarmy/kernel.cu @@ -8,7 +8,9 @@ #include "sa_cuda_context.hpp" #include "param.h" -#include "sa_blake.h" + +#include +using namespace blake; #define WN PARAM_N #define WK PARAM_K @@ -59,7 +61,7 @@ __device__ char rowCounter0[NR_ROWS]; __device__ char rowCounter1[NR_ROWS]; __device__ sols_t sols; -__device__ blake2b_state_t blake; +__device__ blake2b_state_t blake_obj; __constant__ ulong blake_iv[] = { @@ -431,14 +433,14 @@ void kernel_round0(char *ht, uint *debug) // message block ulong word1 = (ulong)input << 32; // init vector v - v[0] = blake.h[0]; - v[1] = blake.h[1]; - v[2] = blake.h[2]; - v[3] = blake.h[3]; - v[4] = blake.h[4]; - v[5] = blake.h[5]; - v[6] = blake.h[6]; - v[7] = blake.h[7]; + v[0] = blake_obj.h[0]; + v[1] = blake_obj.h[1]; + v[2] = blake_obj.h[2]; + v[3] = blake_obj.h[3]; + v[4] = blake_obj.h[4]; + v[5] = blake_obj.h[5]; + v[6] = blake_obj.h[6]; + v[7] = blake_obj.h[7]; v[8] = blake_iv[0]; v[9] = blake_iv[1]; v[10] = blake_iv[2]; @@ -564,13 +566,13 @@ void kernel_round0(char *ht, uint *debug) // compress v into the blake state; this produces the 50-byte hash // (two Xi values) ulong h[7]; - h[0] = blake.h[0] ^ v[0] ^ v[8]; - h[1] = blake.h[1] ^ v[1] ^ v[9]; - h[2] = blake.h[2] ^ v[2] ^ v[10]; - h[3] = blake.h[3] ^ v[3] ^ v[11]; - h[4] = blake.h[4] ^ v[4] ^ v[12]; - h[5] = blake.h[5] ^ v[5] ^ v[13]; - h[6] = (blake.h[6] ^ v[6] ^ v[14]) & 0xffff; + h[0] = blake_obj.h[0] ^ v[0] ^ v[8]; + h[1] = blake_obj.h[1] ^ v[1] ^ v[9]; + h[2] = blake_obj.h[2] ^ v[2] ^ v[10]; + h[3] = blake_obj.h[3] ^ v[3] ^ v[11]; + h[4] = blake_obj.h[4] ^ v[4] ^ v[12]; + h[5] = blake_obj.h[5] ^ v[5] ^ v[13]; + h[6] = (blake_obj.h[6] ^ v[6] ^ v[14]) & 0xffff; // store the two Xi values in the hash table #if ZCASH_HASH_LEN == 50 @@ -2474,7 +2476,7 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas zcash_blake2b_init(&initialCtx, ZCASH_HASH_LEN, PARAM_N, PARAM_K); zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0); - checkCudaErrors(cudaMemcpyToSymbol(blake, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(blake_obj, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice)); for (unsigned round = 0; round < PARAM_K; round++) { if (bUseOld) { diff --git a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj index 9efc78965..8c07b5f7d 100644 --- a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj +++ b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj @@ -36,7 +36,7 @@ DynamicLibrary true MultiByte - v140 + v120 Application @@ -50,7 +50,7 @@ false true MultiByte - v140 + v120 @@ -74,6 +74,10 @@ true + 
$(ProjectDir)../contrib/;$(IncludePath) + + + $(ProjectDir)../contrib/;$(IncludePath) diff --git a/cuda_silentarmy_sm30/kernel.cu b/cuda_silentarmy_sm30/kernel.cu index 459b94b3a..da0504291 100644 --- a/cuda_silentarmy_sm30/kernel.cu +++ b/cuda_silentarmy_sm30/kernel.cu @@ -13,118 +13,8 @@ #include #include - -//*blake header */ - -typedef struct blake2b_state_s -{ - uint64_t h[8]; - uint64_t bytes; -} blake2b_state_t; - -void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k); -void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, - uint32_t msg_len, uint32_t is_final); -void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen); - -/* blake.cpp **/ - -//static const uint32_t blake2b_block_len = 128; -static const uint32_t blake2b_rounds = 12; -static const uint64_t blake2b_iv[8] = -{ - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, -}; -static const uint8_t blake2b_sigma[12][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, -}; - -/* -** Init the state according to Zcash parameters. -*/ -void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, - uint32_t n, uint32_t k) -{ - st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); - for (uint32_t i = 1; i <= 5; i++) - st->h[i] = blake2b_iv[i]; - st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa"; - st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); - st->bytes = 0; -} - -static uint64_t rotr64(uint64_t a, uint8_t bits) -{ - return (a >> bits) | (a << (64 - bits)); -} - -static inline void mix64(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd, - uint64_t x, uint64_t y) -{ - *va = (*va + *vb + x); - *vd = rotr64(*vd ^ *va, 32); - *vc = (*vc + *vd); - *vb = rotr64(*vb ^ *vc, 24); - *va = (*va + *vb + y); - *vd = rotr64(*vd ^ *va, 16); - *vc = (*vc + *vd); - *vb = rotr64(*vb ^ *vc, 63); -} - -/* -** Process either a full message block or the final partial block. -** Note that v[13] is not XOR'd because st->bytes is assumed to never overflow. -** -** _msg pointer to message (must be zero-padded to 128 bytes if final block) -** msg_len must be 128 (<= 128 allowed only for final partial block) -** is_final indicate if this is the final block -*/ -void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, - uint32_t msg_len, uint32_t is_final) -{ - const uint64_t *m = (const uint64_t *)_msg; - uint64_t v[16]; - memcpy(v + 0, st->h, 8 * sizeof(*v)); - memcpy(v + 8, blake2b_iv, 8 * sizeof(*v)); - v[12] ^= (st->bytes += msg_len); - v[14] ^= is_final ? 
-1 : 0; - for (uint32_t round = 0; round < blake2b_rounds; round++) - { - const uint8_t *s = blake2b_sigma[round]; - mix64(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]); - mix64(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]); - mix64(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]); - mix64(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]); - mix64(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]); - mix64(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]); - mix64(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]); - mix64(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]); - } - for (uint32_t i = 0; i < 8; i++) - st->h[i] ^= v[i] ^ v[i + 8]; -} - -void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen) -{ - memcpy(out, st->h, outlen); -} - -/* end of blake cpp*/ +#include +using namespace blake; #define PARAM_N 200 #define PARAM_K 9 @@ -178,7 +68,7 @@ typedef struct __align__(64) sols_s __device__ uint32_t rowCounter0[1 << NR_ROWS_LOG]; __device__ uint32_t rowCounter1[1 << NR_ROWS_LOG]; __device__ uint32_t* rowCounters[2] = { rowCounter0 , rowCounter1 }; -__device__ blake2b_state_t blake; +__device__ blake2b_state_t blake_obj; __device__ sols_t sols; @@ -263,6 +153,7 @@ __device__ uint well_aligned_int(ulong *_p, uint offset) return *(uint *)(p + offset); } +#if 0 __device__ uint xor_and_store3(char* ht, uint tid, uint slot_a, uint slot_b, ulong* a, ulong* b, uint* rowCounters) { ulong xi0, xi1, xi2, xi3; @@ -294,9 +185,6 @@ __device__ uint xor_and_store3(char* ht, uint tid, uint slot_a, uint slot_b, ulo ulong test1 = half_aligned_long(a, 0); - printf("test1 %lX | %lX | %02X %02X %02X %02X\n", test1, test3, a0, a1, a2, a3); - - // xor 20 bytes xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); @@ -337,6 +225,8 @@ __device__ uint xor_and_store3(char* ht, uint tid, uint slot_a, uint slot_b, ulo return 0; } +#endif + __device__ uint ht_store(uint round, char *ht, uint i, ulong xi0, ulong xi1, ulong xi2, ulong xi3, uint *rowCounters) { @@ -475,14 +365,14 @@ void kernel_round0(char *ht, uint32_t inputs_per_thread, int offset) // message block ulong word1 = (ulong)input << 32; // init vector v - v[0] = blake.h[0]; - v[1] = blake.h[1]; - v[2] = blake.h[2]; - v[3] = blake.h[3]; - v[4] = blake.h[4]; - v[5] = blake.h[5]; - v[6] = blake.h[6]; - v[7] = blake.h[7]; + v[0] = blake_obj.h[0]; + v[1] = blake_obj.h[1]; + v[2] = blake_obj.h[2]; + v[3] = blake_obj.h[3]; + v[4] = blake_obj.h[4]; + v[5] = blake_obj.h[5]; + v[6] = blake_obj.h[6]; + v[7] = blake_obj.h[7]; v[8] = blake_iv[0]; v[9] = blake_iv[1]; v[10] = blake_iv[2]; @@ -608,13 +498,13 @@ void kernel_round0(char *ht, uint32_t inputs_per_thread, int offset) // compress v into the blake state; this produces the 50-byte hash // (two Xi values) ulong h[7]; - h[0] = blake.h[0] ^ v[0] ^ v[8]; - h[1] = blake.h[1] ^ v[1] ^ v[9]; - h[2] = blake.h[2] ^ v[2] ^ v[10]; - h[3] = blake.h[3] ^ v[3] ^ v[11]; - h[4] = blake.h[4] ^ v[4] ^ v[12]; - h[5] = blake.h[5] ^ v[5] ^ v[13]; - h[6] = (blake.h[6] ^ v[6] ^ v[14]) & 0xffff; + h[0] = blake_obj.h[0] ^ v[0] ^ v[8]; + h[1] = blake_obj.h[1] ^ v[1] ^ v[9]; + h[2] = blake_obj.h[2] ^ v[2] ^ v[10]; + h[3] = blake_obj.h[3] ^ v[3] ^ v[11]; + h[4] = blake_obj.h[4] ^ v[4] ^ v[12]; + h[5] = blake_obj.h[5] ^ v[5] ^ v[13]; + h[6] = (blake_obj.h[6] ^ v[6] ^ v[14]) & 0xffff; // store the two Xi values in the hash table #if ZCASH_HASH_LEN == 50 @@ -657,9 +547,12 @@ __device__ uint xor_and_store(uint round, char *ht_dst, uint row, uint slot_a, uint slot_b, 
ulong *a, ulong *b, uint *rowCounters) { + +#if 0 if (round == 3) { return xor_and_store3(ht_dst, row, slot_a, slot_b, a, b, rowCounters); } +#endif ulong xi0, xi1, xi2; #if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 @@ -1072,7 +965,7 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas zcash_blake2b_init(&initialCtx, ZCASH_HASH_LEN, PARAM_N, PARAM_K); zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0); - checkCudaErrors(cudaMemcpyToSymbol(blake, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(blake_obj, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice)); //const uint32_t THREAD_SHIFT = 8; //const uint32_t THREAD_COUNT = 1 << THREAD_SHIFT; diff --git a/nheqminer/libstratum/StratumClient.cpp b/nheqminer/libstratum/StratumClient.cpp index c8c00d338..df6d31f88 100644 --- a/nheqminer/libstratum/StratumClient.cpp +++ b/nheqminer/libstratum/StratumClient.cpp @@ -434,4 +434,11 @@ template class StratumClient; template class StratumClient; template class StratumClient; template class StratumClient; -template class StratumClient; \ No newline at end of file +template class StratumClient; +// Gatelessgate +template class StratumClient; +template class StratumClient; +template class StratumClient; +template class StratumClient; +template class StratumClient; +template class StratumClient; \ No newline at end of file diff --git a/nheqminer/libstratum/ZcashStratum.cpp b/nheqminer/libstratum/ZcashStratum.cpp index d7cab675d..c64c31cab 100644 --- a/nheqminer/libstratum/ZcashStratum.cpp +++ b/nheqminer/libstratum/ZcashStratum.cpp @@ -623,6 +623,16 @@ template class ZcashMiner; template class ZcashMiner; template class ZcashMiner; +// Gatelessgate +template class ZcashMiner; +template class ZcashMiner; +template class ZcashMiner; +template class ZcashMiner; +template class ZcashMiner; +template class ZcashMiner; + + + std::mutex benchmark_work; std::vector benchmark_nonces; std::atomic_int benchmark_solutions; @@ -896,4 +906,28 @@ void ZMinerSSE2CUDASA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_cou int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { ZMinerSSE2CUDASA80_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); } - +// ocl_gatelessgate +void ZMinerAVXCUDA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerAVXCUDA80_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerSSE2CUDA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerSSE2CUDA80_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerAVXCUDA75_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerAVXCUDA75_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerSSE2CUDA75_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int 
opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerSSE2CUDA75_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerAVXCUDASA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerAVXCUDASA80_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerSSE2CUDASA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerSSE2CUDASA80_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} diff --git a/nheqminer/libstratum/ZcashStratum.h b/nheqminer/libstratum/ZcashStratum.h index b8d686d98..4ebe55404 100644 --- a/nheqminer/libstratum/ZcashStratum.h +++ b/nheqminer/libstratum/ZcashStratum.h @@ -50,6 +50,11 @@ CREATE_SOLVER_STUB(ocl_xmp, "ocl_xmp_STUB") #else CREATE_SOLVER_STUB(ocl_silentarmy, "ocl_silentarmy_STUB") #endif +#ifdef USE_OCL_GATELESSGATE +#include "../ocl_gatelessgate/ocl_gatelessgate.hpp" +#else +CREATE_SOLVER_STUB(ocl_gatelessgate, "ocl_gatelessgate_STUB") +#endif #include "../cuda_silentarmy/cuda_silentarmy.hpp" @@ -171,6 +176,15 @@ typedef ZcashMiner ZMinerSSE2CUDA75_SA typedef ZcashMiner ZMinerSSE2CUDASA80_SA; typedef ZcashMiner ZMinerAVXCUDASA80_SA; +//ocl_gatelessgate +typedef ZcashMiner ZMinerAVXCUDA80_GG; +typedef ZcashMiner ZMinerSSE2CUDA80_GG; +typedef ZcashMiner ZMinerAVXCUDA75_GG; +typedef ZcashMiner ZMinerSSE2CUDA75_GG; +typedef ZcashMiner ZMinerSSE2CUDASA80_GG; +typedef ZcashMiner ZMinerAVXCUDASA80_GG; + + // ocl_xmp // gcc static undefined reference workaround void ZMinerAVXCUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, @@ -197,4 +211,17 @@ void ZMinerSSE2CUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count void ZMinerAVXCUDASA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); void ZMinerSSE2CUDASA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); \ No newline at end of file + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +// ocl_gatelessgate +void ZMinerAVXCUDA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerSSE2CUDA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerAVXCUDA75_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerSSE2CUDA75_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerAVXCUDASA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int 
opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerSSE2CUDASA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); diff --git a/nheqminer/main.cpp b/nheqminer/main.cpp index ee46f69a0..45d3db2b4 100644 --- a/nheqminer/main.cpp +++ b/nheqminer/main.cpp @@ -7,7 +7,7 @@ #include "libstratum/StratumClient.h" -#if defined(USE_OCL_XMP) || defined(USE_OCL_SILENTARMY) +#if defined(USE_OCL_XMP) || defined(USE_OCL_SILENTARMY) || defined(USE_OCL_GATELESSGATE) #include "../ocl_device_utils/ocl_device_utils.h" #define PRINT_OCL_INFO #endif @@ -29,8 +29,6 @@ #include #include -#include - namespace logging = boost::log; namespace sinks = boost::log::sinks; namespace src = boost::log::sources; @@ -271,6 +269,7 @@ int main(int argc, char* argv[]) int opencl_device_count = 0; int force_cpu_ext = -1; int opencl_t = 0; + int use_gg = 1; for (int i = 1; i < argc; ++i) { @@ -348,7 +347,52 @@ int main(int argc, char* argv[]) return 0; case 'v': use_old_xmp = atoi(argv[++i]); + use_gg = 0; + break; + case 'd': + while (opencl_device_count < 8 && i + 1 < argc) + { + try + { + opencl_enabled[opencl_device_count] = std::stol(argv[++i]); + ++opencl_device_count; + } + catch (...) + { + --i; + break; + } + } + break; + case 'p': + opencl_platform = atoi(argv[++i]); + break; + case 't': + while (opencl_t < 8 && i + 1 < argc) + { + try + { + opencl_threads[opencl_t] = std::stol(argv[++i]); + ++opencl_t; + } + catch (...) + { + --i; + break; + } + } break; + // TODO extra parameters for OpenCL + } + break; + } + case 'g': + { + switch (argv[i][2]) + { + case 'i': + print_opencl_info(); + return 0; case 'd': while (opencl_device_count < 8 && i + 1 < argc) { @@ -363,6 +407,7 @@ int main(int argc, char* argv[]) break; } } + use_gg = 1; break; case 'p': opencl_platform = atoi(argv[++i]); @@ -537,6 +582,30 @@ int main(int argc, char* argv[]) ZMinerSSE2CUDA80_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); } } + } + else if (use_gg == 1) { + if (use_avx) { + if (use_cuda_sa) { + ZMinerAVXCUDASA80_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + else if (use_old_cuda) { + ZMinerAVXCUDA75_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + else { + ZMinerAVXCUDA80_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + } + else { + if (use_cuda_sa) { + ZMinerSSE2CUDASA80_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + else if (use_old_cuda) { + ZMinerSSE2CUDA75_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + else { + ZMinerSSE2CUDA80_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + } } else { // sarmy if (use_avx) { if (use_cuda_sa) { diff --git 
a/nheqminer/nheqminer.sln b/nheqminer/nheqminer.sln index 67e3a8b88..cb2e2c463 100644 --- a/nheqminer/nheqminer.sln +++ b/nheqminer/nheqminer.sln @@ -30,6 +30,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_silentarmy", "..\cuda_ EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_silentarmy_sm30", "..\cuda_silentarmy_sm30\cuda_silentarmy_sm30.vcxproj", "{53E62B3D-3FA6-4B53-8175-2B93753D98C4}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_gatelessgate", "..\ocl_gatelessgate\ocl_gatelessgate.vcxproj", "{6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Mixed Platforms = Debug|Mixed Platforms @@ -184,6 +186,24 @@ Global {53E62B3D-3FA6-4B53-8175-2B93753D98C4}.ReleaseSlow|Win32.Build.0 = Release|Win32 {53E62B3D-3FA6-4B53-8175-2B93753D98C4}.ReleaseSlow|x64.ActiveCfg = Release|x64 {53E62B3D-3FA6-4B53-8175-2B93753D98C4}.ReleaseSlow|x64.Build.0 = Release|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|Win32.ActiveCfg = Debug|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|Win32.Build.0 = Debug|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|x64.ActiveCfg = Debug|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|x64.Build.0 = Debug|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|Mixed Platforms.ActiveCfg = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|Mixed Platforms.Build.0 = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|Win32.ActiveCfg = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|Win32.Build.0 = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|x64.ActiveCfg = Release|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|x64.Build.0 = Release|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|Mixed Platforms.Build.0 = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|Win32.ActiveCfg = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|Win32.Build.0 = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|x64.ActiveCfg = Release|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/nheqminer/nheqminer.vcxproj b/nheqminer/nheqminer.vcxproj index 47972a117..fb685dfca 100644 --- a/nheqminer/nheqminer.vcxproj +++ b/nheqminer/nheqminer.vcxproj @@ -85,7 +85,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_OCL_XMP;USE_OCL_SILENTARMY;CONSOLE_COLORS;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_OCL_XMP;USE_OCL_SILENTARMY;USE_OCL_GATELESSGATE;CONSOLE_COLORS;%(PreprocessorDefinitions) NotSet -D_WIN32_WINNT=0x0601 %(AdditionalOptions) 4068;4996;4503;4267;4180;4290;4244;4800;4334;4251 @@ -97,7 +97,7 @@ true true true - cuda_tromp.lib;cuda_tromp_75.lib;cpu_xenoncat.lib;cpu_tromp_SSE2.lib;cpu_tromp_AVX.lib;ocl_device_utils.lib;ocl_xpm.lib;ocl_silentarmy.lib;cuda_silentarmy.lib;OpenCL.lib + 
cuda_tromp.lib;cuda_tromp_75.lib;cpu_xenoncat.lib;cpu_tromp_SSE2.lib;cpu_tromp_AVX.lib;ocl_device_utils.lib;ocl_xpm.lib;ocl_silentarmy.lib;ocl_gatelessgate.lib;cuda_silentarmy.lib;OpenCL.lib
 .\trompequihash\pthreads\x64;..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\;%(AdditionalLibraryDirectories)
diff --git a/ocl_gatelessgate/gatelessgate.cl b/ocl_gatelessgate/gatelessgate.cl
new file mode 100644
index 000000000..9257e8f6d
--- /dev/null
+++ b/ocl_gatelessgate/gatelessgate.cl
@@ -0,0 +1,1245 @@
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//#define ENABLE_DEBUG
+
+//
+// Parameters for Hash Tables
+//
+
+// There are PARAM_K - 1 hash tables, and each hash table has NR_ROWS rows.
+// Each row contains NR_SLOTS slots.
+
+#define NR_ROWS_LOG 14 // 12, 13, 14, 15, or 16. 12 and 13 are not practically usable.
+#define NR_SLOTS 199 // Prime numbers are preferable.
+#define LOCAL_WORK_SIZE 64
+#define THREADS_PER_ROW 64
+#define LOCAL_WORK_SIZE_SOLS 64
+#define THREADS_PER_ROW_SOLS 64
+#define GLOBAL_WORK_SIZE_RATIO 512 // global_work_size = GLOBAL_WORK_SIZE_RATIO * nr_compute_units * LOCAL_WORK_SIZE
+#define THREADS_PER_WRITE 1 // 1, 2, 4, or 8
+#define SLOT_CACHE_SIZE (NR_SLOTS * ROWS_IN_WORK_ITEM)
+#define LDS_COLL_SIZE (NR_SLOTS * ROWS_IN_WORK_ITEM * 140 / 100)
+#define BIN_SIZE (NR_SLOTS * 6 / 100)
+#define EXTRA_BITS_FOR_BINS_SOLS 1
+#define BIN_SIZE_SOLS ((BIN_SIZE >> EXTRA_BITS_FOR_BINS_SOLS) * 250 / 100)
+
+#define PARAM_N 200
+#define PARAM_K 9
+#define PREFIX (PARAM_N / (PARAM_K + 1))
+#define NR_INPUTS (1 << PREFIX)
+#define NR_ROWS (1 << NR_ROWS_LOG)
+// Length of 1 element (slot) in bytes
+#define SLOT_LEN 32
+// Total size of hash table
+#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
+// Length of Zcash block header, nonce (part of header)
+#define ZCASH_BLOCK_HEADER_LEN 140
+// Offset of nTime in header
+#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
+// Length of nonce
+#define ZCASH_NONCE_LEN 32
+// Length of encoded representation of solution size
+#define ZCASH_SOLSIZE_LEN 3
+// Solution size (1344 = 0x540) represented as a compact integer, in hex
+#define ZCASH_SOLSIZE_HEX "fd4005"
+// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
+#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
+// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
+#define N_ZERO_BYTES 12
+// Number of bytes Zcash needs out of Blake
+#define ZCASH_HASH_LEN 50
+// Number of wavefronts per SIMD for the Blake kernel.
+// Blake is ALU-bound (besides the atomic counter being incremented) so we need
+// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
+// instructions. 10 is the max supported by the hw.
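+// A worked example with the defaults above (the figures follow directly from
+// the defines): one hash table occupies HT_SIZE = NR_ROWS * NR_SLOTS *
+// SLOT_LEN = 16384 * 199 * 32 = 104,333,312 bytes, i.e. roughly 99.5 MiB of
+// device memory per table.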
+#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 11 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot. +*/ +#define xi_offset_for_round(round) 4 + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +#if NR_ROWS_LOG <= 12 && NR_SLOTS <= (1 << 10) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 20) | ((slot1 & 0x3ff) << 10) | (slot0 & 0x3ff)) +#define DECODE_ROW(REF) (REF >> 20) +#define DECODE_SLOT1(REF) ((REF >> 10) & 0x3ff) +#define DECODE_SLOT0(REF) (REF & 0x3ff) + +#elif NR_ROWS_LOG <= 14 && NR_SLOTS <= (1 << 9) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 18) | ((slot1 & 0x1ff) << 9) | (slot0 & 0x1ff)) +#define DECODE_ROW(REF) (REF >> 18) +#define DECODE_SLOT1(REF) ((REF >> 9) & 0x1ff) +#define DECODE_SLOT0(REF) (REF & 0x1ff) + +#elif NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#if THREADS_PER_WRITE == 1 +#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN - 16) +#else +#define ADJUSTED_SLOT_LEN(round) SLOT_LEN +#endif + +#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O5" +#define OPENCL_BUILD_OPTIONS "-I.. -I." + +#define NEXT_PRIME_NO(n) \ + (((n) <= 2) ? 2 : \ + ((n) <= 3) ? 3 : \ + ((n) <= 5) ? 5 : \ + ((n) <= 7) ? 7 : \ + ((n) <= 11) ? 11 : \ + ((n) <= 13) ? 13 : \ + ((n) <= 17) ? 17 : \ + ((n) <= 19) ? 
19 : \ + ((n) <= 23) ? 23 : \ + ((n) <= 29) ? 29 : \ + ((n) <= 31) ? 31 : \ + ((n) <= 37) ? 37 : \ + ((n) <= 41) ? 41 : \ + ((n) <= 43) ? 43 : \ + ((n) <= 47) ? 47 : \ + ((n) <= 53) ? 53 : \ + ((n) <= 59) ? 59 : \ + ((n) <= 61) ? 61 : \ + ((n) <= 67) ? 67 : \ + ((n) <= 71) ? 71 : \ + ((n) <= 73) ? 73 : \ + ((n) <= 79) ? 79 : \ + ((n) <= 83) ? 83 : \ + ((n) <= 89) ? 89 : \ + ((n) <= 97) ? 97 : \ + ((n) <= 101) ? 101 : \ + ((n) <= 103) ? 103 : \ + ((n) <= 107) ? 107 : \ + ((n) <= 109) ? 109 : \ + ((n) <= 113) ? 113 : \ + ((n) <= 127) ? 127 : \ + ((n) <= 131) ? 131 : \ + ((n) <= 137) ? 137 : \ + ((n) <= 139) ? 139 : \ + ((n) <= 149) ? 149 : \ + ((n) <= 151) ? 151 : \ + ((n) <= 157) ? 157 : \ + ((n) <= 163) ? 163 : \ + ((n) <= 167) ? 167 : \ + ((n) <= 173) ? 173 : \ + ((n) <= 179) ? 179 : \ + ((n) <= 181) ? 181 : \ + ((n) <= 191) ? 191 : \ + ((n) <= 193) ? 193 : \ + ((n) <= 197) ? 197 : \ + ((n) <= 199) ? 199 : \ + ((n) <= 211) ? 211 : \ + ((n) <= 223) ? 223 : \ + ((n) <= 227) ? 227 : \ + ((n) <= 229) ? 229 : \ + ((n) <= 233) ? 233 : \ + ((n) <= 239) ? 239 : \ + ((n) <= 241) ? 241 : \ + ((n) <= 251) ? 251 : \ + ((n) <= 257) ? 257 : \ + ((n) <= 263) ? 263 : \ + ((n) <= 269) ? 269 : \ + ((n) <= 271) ? 271 : \ + ((n) <= 277) ? 277 : \ + ((n) <= 281) ? 281 : \ + ((n) <= 283) ? 283 : \ + ((n) <= 293) ? 293 : \ + ((n) <= 307) ? 307 : \ + ((n) <= 311) ? 311 : \ + ((n) <= 313) ? 313 : \ + ((n) <= 317) ? 317 : \ + ((n) <= 331) ? 331 : \ + ((n) <= 337) ? 337 : \ + ((n) <= 347) ? 347 : \ + ((n) <= 349) ? 349 : \ + ((n) <= 353) ? 353 : \ + ((n) <= 359) ? 359 : \ + ((n) <= 367) ? 367 : \ + ((n) <= 373) ? 373 : \ + ((n) <= 379) ? 379 : \ + ((n) <= 383) ? 383 : \ + ((n) <= 389) ? 389 : \ + ((n) <= 397) ? 397 : \ + ((n) <= 401) ? 401 : \ + ((n) <= 409) ? 409 : \ + ((n) <= 419) ? 419 : \ + ((n) <= 421) ? 421 : \ + ((n) <= 431) ? 431 : \ + ((n) <= 433) ? 433 : \ + ((n) <= 439) ? 439 : \ + ((n) <= 443) ? 443 : \ + ((n) <= 449) ? 449 : \ + ((n) <= 457) ? 457 : \ + ((n) <= 461) ? 461 : \ + ((n) <= 463) ? 463 : \ + ((n) <= 467) ? 467 : \ + ((n) <= 479) ? 479 : \ + ((n) <= 487) ? 487 : \ + ((n) <= 491) ? 491 : \ + ((n) <= 499) ? 499 : \ + ((n) <= 503) ? 503 : \ + ((n) <= 509) ? 509 : \ + ((n) <= 521) ? 521 : \ + ((n) <= 523) ? 523 : \ + ((n) <= 541) ? 541 : \ + ((n) <= 547) ? 547 : \ + ((n) <= 557) ? 557 : \ + ((n) <= 563) ? 563 : \ + ((n) <= 569) ? 569 : \ + ((n) <= 571) ? 571 : \ + ((n) <= 577) ? 577 : \ + ((n) <= 587) ? 587 : \ + ((n) <= 593) ? 593 : \ + ((n) <= 599) ? 599 : \ + ((n) <= 601) ? 601 : \ + ((n) <= 607) ? 607 : \ + ((n) <= 613) ? 613 : \ + ((n) <= 617) ? 617 : \ + ((n) <= 619) ? 619 : \ + ((n) <= 631) ? 631 : \ + ((n) <= 641) ? 641 : \ + ((n) <= 643) ? 643 : \ + ((n) <= 647) ? 647 : \ + ((n) <= 653) ? 653 : \ + ((n) <= 659) ? 659 : \ + ((n) <= 661) ? 661 : \ + ((n) <= 673) ? 673 : \ + ((n) <= 677) ? 677 : \ + ((n) <= 683) ? 683 : \ + ((n) <= 691) ? 691 : \ + ((n) <= 701) ? 701 : \ + ((n) <= 709) ? 709 : \ + ((n) <= 719) ? 719 : \ + ((n) <= 727) ? 727 : \ + ((n) <= 733) ? 733 : \ + ((n) <= 739) ? 739 : \ + ((n) <= 743) ? 743 : \ + ((n) <= 751) ? 751 : \ + ((n) <= 757) ? 757 : \ + ((n) <= 761) ? 761 : \ + ((n) <= 769) ? 769 : \ + ((n) <= 773) ? 773 : \ + ((n) <= 787) ? 787 : \ + ((n) <= 797) ? 797 : \ + ((n) <= 809) ? 809 : \ + ((n) <= 811) ? 811 : \ + ((n) <= 821) ? 821 : \ + ((n) <= 823) ? 823 : \ + ((n) <= 827) ? 827 : \ + ((n) <= 829) ? 829 : \ + ((n) <= 839) ? 839 : \ + ((n) <= 853) ? 853 : \ + ((n) <= 857) ? 857 : \ + ((n) <= 859) ? 859 : \ + ((n) <= 863) ? 863 : \ + ((n) <= 877) ? 
877 : \ + ((n) <= 881) ? 881 : \ + ((n) <= 883) ? 883 : \ + ((n) <= 887) ? 887 : \ + ((n) <= 907) ? 907 : \ + ((n) <= 911) ? 911 : \ + ((n) <= 919) ? 919 : \ + ((n) <= 929) ? 929 : \ + ((n) <= 937) ? 937 : \ + ((n) <= 941) ? 941 : \ + ((n) <= 947) ? 947 : \ + ((n) <= 953) ? 953 : \ + ((n) <= 967) ? 967 : \ + ((n) <= 971) ? 971 : \ + ((n) <= 977) ? 977 : \ + ((n) <= 983) ? 983 : \ + ((n) <= 991) ? 991 : \ + ((n) <= 997) ? 997 : \ + ((n) <= 1009) ? 1009 : \ + (n)) + +#define ROWS_IN_WORK_ITEM (LOCAL_WORK_SIZE / THREADS_PER_ROW ) +#define ROWS_IN_WORK_ITEM_SOLS (LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +typedef union { + struct { + uint i; + uint xi[6]; + uint padding; + } slot; + uint8 ui8; + uint4 ui4[2]; + uint2 ui2[4]; + uint ui[8]; +} slot_t; + +#if THREADS_PER_WRITE != 1 +typedef __global slot_t *global_pointer_to_slot_t; +#endif + + +/* +** The new hash table has this layout (length in bytes in parens): +** +** round 0, table 0: i(4) pad(0) Xi(24) pad(4) +** round 1, table 1: i(4) pad(3) Xi(20) pad(5) +** round 2, table 2: i(4) pad(0) Xi(19) pad(9) +** round 3, table 3: i(4) pad(3) Xi(15) pad(10) +** round 4, table 4: i(4) pad(0) Xi(14) pad(14) +** round 5, table 5: i(4) pad(3) Xi(10) pad(15) +** round 6, table 6: i(4) pad(0) Xi( 9) pad(19) +** round 7, table 7: i(4) pad(3) Xi( 5) pad(20) +** round 8, table 8: i(4) pad(0) Xi( 4) pad(24) +** +*/ + +__constant ulong blake_iv_const[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +/* +** Reset counters in hash table. +*/ +__kernel +void kernel_init_ht(__global char *ht, __global uint *rowCounters) +{ + rowCounters[get_global_id(0)] = 0; +} + +/* +** OBSOLETE +** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they +** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): +** aa aa ab bb bb cc cc cd dd... [round 0] +** -------------------- +** ...ab bb bb cc cc cd dd... [odd round] +** -------------- +** ...cc cc cd dd... [next even round] +** ----- +** Bytes underlined are going to be stored in the slot. Preceding bytes +** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are +** used to compute the row number. +** +** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) +** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble) +** TODO: update lines below with padding nibbles +** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) +** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) +** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) +** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) +** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) +** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) +** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) +** +** Return 0 if successfully stored, or 1 if the row overflowed. 
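+**
+** Callers accumulate this return value to count dropped slots; e.g.
+** kernel_round0 below does, for each stored Xi pair,
+**
+**     dropped += ht_store(0, ht, input * 2, ..., rowCounters);
+**
+** and reports the total in debug[] when ENABLE_DEBUG is defined.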
+*/ + +__global char *get_slot_ptr(__global char *ht, uint round, uint row, uint slot) +{ +#if 1 + return ht + (row * NR_SLOTS + slot) * ADJUSTED_SLOT_LEN(round); +#else + return ht + (slot * NR_ROWS + row) * ADJUSTED_SLOT_LEN(round); +#endif +} + +__global char *get_xi_ptr(__global char *ht, uint round, uint row, uint slot) +{ + return get_slot_ptr(ht, round, row, slot) + xi_offset_for_round(round); +} + +void get_row_counters_index(uint *rowIdx, uint *rowOffset, uint row) +{ + *rowIdx = row / ROWS_PER_UINT; + *rowOffset = BITS_PER_ROW * (row % ROWS_PER_UINT); +} + +uint get_row(uint round, uint xi0) +{ + uint row; +#if NR_ROWS_LOG == 12 + if (!(round % 2)) + row = (xi0 & 0xfff); + else + row = ((xi0 & 0x0f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 13 + if (!(round % 2)) + row = (xi0 & 0x1fff); + else + row = ((xi0 & 0x1f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 14 + if (!(round % 2)) + row = (xi0 & 0x3fff); + else + row = ((xi0 & 0x3f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 15 + if (!(round % 2)) + row = (xi0 & 0x7fff); + else + row = ((xi0 & 0x7f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 16 + if (!(round % 2)) + row = (xi0 & 0xffff); + else + row = ((xi0 & 0xff0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#else +#error "unsupported NR_ROWS_LOG" +#endif + return row; +} + +uint inc_row_counter(__global uint *rowCounters, uint row) +{ + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, row); + uint cnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); + cnt = (cnt >> rowOffset) & ROW_MASK; + if (cnt >= NR_SLOTS) { + // avoid overflows + atomic_sub(rowCounters + rowIdx, 1 << rowOffset); + } + return cnt; +} + +uint ht_store(uint round, __global char *ht, uint i, + uint xi0, uint xi1, uint xi2, uint xi3, uint xi4, uint xi5, uint xi6, __global uint *rowCounters) +{ + uint row = get_row(round, xi0); + uint cnt = inc_row_counter(rowCounters, row); + if (cnt >= NR_SLOTS) + return 1; + __global char *p = get_slot_ptr(ht, round, row, cnt); + slot_t slot; + slot.slot.i = i; + slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8)); + slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8)); + slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8)); + slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8)); + slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8)); + slot.slot.xi[5] = ((xi6 << 24) | (xi5 >> 8)); + if (round <= 5) { + *(__global uint8 *)p = slot.ui8; + } + else { + *(__global uint4 *)p = slot.ui4[0]; + } + return 0; +} + +#define mix(va, vb, vc, vd, x, y) \ + va = (va + vb + x); \ +vd = rotate((vd ^ va), (ulong)64 - 32); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 24); \ +va = (va + vb + y); \ +vd = rotate((vd ^ va), (ulong)64 - 16); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 63); + +/* +** Execute round 0 (blake). 
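+**
+** Each input index i contributes ZCASH_HASH_LEN = 50 bytes of BLAKE2b
+** output, i.e. two 25-byte Xi values, which the two ht_store() calls at the
+** end of this kernel store under inputs 2*i and 2*i+1.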
+** +** Note: making the work group size less than or equal to the wavefront size +** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local +** Memory (LDS) Optimization 2-10" in: +** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ +*/ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) +void kernel_round0(__constant ulong *blake_state_const, __global char *ht, + __global uint *rowCounters, __global uint *debug) +{ + __local ulong blake_state[64]; + __local ulong blake_iv[8]; + uint tid = get_global_id(0); + ulong v[16]; + uint inputs_per_thread = NR_INPUTS / get_global_size(0); + uint input = tid * inputs_per_thread; + uint input_end = (tid + 1) * inputs_per_thread; + uint dropped = 0; + if (get_local_id(0) < 64) + blake_state[get_local_id(0)] = blake_state_const[get_local_id(0)]; + if (get_local_id(0) < 8) + blake_iv[get_local_id(0)] = blake_iv_const[get_local_id(0)]; + barrier(CLK_LOCAL_MEM_FENCE); + while (input < input_end) { + // shift "i" to occupy the high 32 bits of the second ulong word in the + // message block + ulong word1 = (ulong)input << 32; + // init vector v + v[0] = blake_state[0]; + v[1] = blake_state[1]; + v[2] = blake_state[2]; + v[3] = blake_state[3]; + v[4] = blake_state[4]; + v[5] = blake_state[5]; + v[6] = blake_state[6]; + v[7] = blake_state[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // mix in length of data + v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + + // round 1 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 2 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 3 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, word1); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 4 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, word1); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 5 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, word1); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 6 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + 
mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], word1, 0); + // round 7 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], word1, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 8 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, word1); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 9 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], word1, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 10 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], word1, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 11 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 12 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + + // compress v into the blake state; this produces the 50-byte hash + // (two Xi values) + ulong h[7]; + h[0] = blake_state[0] ^ v[0] ^ v[8]; + h[1] = blake_state[1] ^ v[1] ^ v[9]; + h[2] = blake_state[2] ^ v[2] ^ v[10]; + h[3] = blake_state[3] ^ v[3] ^ v[11]; + h[4] = blake_state[4] ^ v[4] ^ v[12]; + h[5] = blake_state[5] ^ v[5] ^ v[13]; + h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; + + // store the two Xi values in the hash table +#if ZCASH_HASH_LEN == 50 + dropped += ht_store(0, ht, input * 2, + h[0] & 0xffffffff, h[0] >> 32, + h[1] & 0xffffffff, h[1] >> 32, + h[2] & 0xffffffff, h[2] >> 32, + h[3] & 0xffffffff, + rowCounters); + dropped += ht_store(0, ht, input * 2 + 1, + ((h[3] >> 8) | (h[4] << (64 - 8))) & 0xffffffff, + ((h[3] >> 8) | (h[4] << (64 - 8))) >> 32, + ((h[4] >> 8) | (h[5] << (64 - 8))) & 0xffffffff, + ((h[4] >> 8) | (h[5] << (64 - 8))) >> 32, + ((h[5] >> 8) | (h[6] << (64 - 8))) & 0xffffffff, + ((h[5] >> 8) | (h[6] << (64 - 8))) >> 32, + (h[6] >> 8) & 0xffffffff, + rowCounters); +#else +#error "unsupported ZCASH_HASH_LEN" +#endif + + input++; + } +#ifdef ENABLE_DEBUG + debug[tid * 2] = 0; + debug[tid * 2 + 1] = dropped; +#endif +} + +/* +** XOR a pair of Xi values computed at "round - 1" and store the result in the +** hash table being built for "round". 
Note that when building the table for +** even rounds we need to skip 1 padding byte present in the "round - 1" table +** (the "0xAB" byte mentioned in the description at the top of this file.) But +** also note we can't load data directly past this byte because this would +** cause an unaligned memory access which is undefined per the OpenCL spec. +** +** Return 0 if successfully stored, or 1 if the row overflowed. +*/ +// single-thread reads, parallel writes +uint xor_and_store(uint round, __global char *ht_dst, uint row, + uint slot_a, uint slot_b, __local uint *ai, __local uint *bi, + __global uint *rowCounters +#if THREADS_PER_WRITE != 1 + , __local slot_t *slot_write_cache, + __local global_pointer_to_slot_t *slot_ptrs +#endif +) { + uint ret = 0; + uint xi0, xi1, xi2, xi3, xi4, xi5; + uint thread_index = get_local_id(0) % THREADS_PER_WRITE; + +#if NR_ROWS_LOG < 8 && NR_ROWS_LOG > 20 +#error "unsupported NR_ROWS_LOG" +#endif + slot_t slot; + __global slot_t *p = 0; +#if THREADS_PER_WRITE != 1 + slot_ptrs[get_local_id(0)] = 0; + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (ai && bi) { + xi0 = *(ai++); + xi1 = *(ai++); + if (round <= 7) xi2 = *(ai++); + if (round <= 6) xi3 = *(ai++); + if (round <= 4) xi4 = *(ai++); + if (round <= 2) xi5 = *ai; + + xi0 ^= *(bi++); + xi1 ^= *(bi++); + if (round <= 7) xi2 ^= *(bi++); + if (round <= 6) xi3 ^= *(bi++); + if (round <= 4) xi4 ^= *(bi++); + if (round <= 2) xi5 ^= *bi; + + if (!(round & 0x1)) { + // skip padding bytes + xi0 = (xi0 >> 24) | (xi1 << (32 - 24)); + xi1 = (xi1 >> 24) | (xi2 << (32 - 24)); + if (round <= 7) xi2 = (xi2 >> 24) | (xi3 << (32 - 24)); + if (round <= 6) xi3 = (xi3 >> 24) | (xi4 << (32 - 24)); + if (round <= 4) xi4 = (xi4 >> 24) | (xi5 << (32 - 24)); + if (round <= 2) xi5 = (xi5 >> 24); + } + + // invalid solutions (which start happenning in round 5) have duplicate + // inputs and xor to zero, so discard them + if (xi0 || xi1) { + uint new_row = get_row(round, xi0); + uint new_slot_index = inc_row_counter(rowCounters, new_row); + if (new_slot_index >= NR_SLOTS) { + ret = 1; + } + else { +#if THREADS_PER_WRITE == 1 + p +#else + slot_ptrs[get_local_id(0)] +#endif + = (__global slot_t *)get_slot_ptr(ht_dst, round, new_row, new_slot_index); + } + } + } + +#if THREADS_PER_WRITE == 1 + if (p) { + slot.slot.i = ENCODE_INPUTS(row, slot_a, slot_b); + slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8)); + slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8)); + slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8)); + slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8)); + slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8)); + slot.slot.xi[5] = ((xi5 >> 8)); + if (round <= 5) + *(__global uint8 *)p = slot.ui8; + else + *(__global uint4 *)p = slot.ui4[0]; + } +#else + barrier(CLK_LOCAL_MEM_FENCE); + if (slot_ptrs[get_local_id(0)]) { + slot_write_cache[get_local_id(0)].slot.i = ENCODE_INPUTS(row, slot_a, slot_b); + slot_write_cache[get_local_id(0)].slot.xi[0] = ((xi1 << 24) | (xi0 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[1] = ((xi2 << 24) | (xi1 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[2] = ((xi3 << 24) | (xi2 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[3] = ((xi4 << 24) | (xi3 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[4] = ((xi5 << 24) | (xi4 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[5] = ((xi5 >> 8)); + } + barrier(CLK_LOCAL_MEM_FENCE); + uint local_id_base = get_local_id(0) - get_local_id(0) % THREADS_PER_WRITE; + for (uint write_index = local_id_base; write_index < local_id_base + THREADS_PER_WRITE; ++write_index) { + if 
(slot_ptrs[write_index]) { +#if THREADS_PER_WRITE == 2 + * ((__global uint4 *)slot_ptrs[write_index] + thread_index) = slot_write_cache[write_index].ui4[thread_index]; +#elif THREADS_PER_WRITE == 4 + * ((__global uint2 *)slot_ptrs[write_index] + thread_index) = slot_write_cache[write_index].ui2[thread_index]; +#elif THREADS_PER_WRITE == 8 + * ((__global uint *)slot_ptrs[write_index] + thread_index) = slot_write_cache[write_index].ui[thread_index]; +#else +#error "unsupported THREADS_PER_WRITE" +#endif + } + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + return ret; +} + +/* +** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, +** store them in ht_dst. +*/ + +#define UINTS_IN_XI(round) (((round) == 0) ? 6 : \ + ((round) == 1) ? 6 : \ + ((round) == 2) ? 5 : \ + ((round) == 3) ? 5 : \ + ((round) == 4) ? 4 : \ + ((round) == 5) ? 4 : \ + ((round) == 6) ? 3 : \ + ((round) == 7) ? 2 : \ + 1) + +#define RESERVED_FOR_XI(round) (((round) == 0) ? 6 : \ + ((round) == 1) ? 6 : \ + ((round) == 2) ? 6 : \ + ((round) == 3) ? 6 : \ + ((round) == 4) ? 4 : \ + ((round) == 5) ? 4 : \ + ((round) == 6) ? 4 : \ + ((round) == 7) ? 2 : \ + 2) + +void equihash_round(uint round, + __global char *ht_src, + __global char *ht_dst, + __global uint *debug, + __local uint *slot_cache, + __local uint *collisionsData, + __local uint *collisionsNum, + __global uint *rowCountersSrc, + __global uint *rowCountersDst, + uint threadsPerRow, + __local uint *nr_slots_array, + __local uchar *bins_data, + __local uint *bin_counters_data) +{ + uint globalTid = get_global_id(0) / threadsPerRow; + uint localTid = get_local_id(0) / threadsPerRow; + uint localGroupId = get_local_id(0) % threadsPerRow; + __global char *p; + uint i, j; + uint dropped_coll = 0; + uint dropped_stor = 0; + __local uint *a, *b; + // the mask is also computed to read data from the previous round +#define BIN_MASK(round) ((((round) + 1) % 2) ? 0xf000 : 0xf0000) +#define BIN_MASK_OFFSET(round) ((((round) + 1) % 2) ? 3 * 4 : 4 * 4) +#if NR_ROWS_LOG == 12 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x00f0 : 0xf000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 0 : 8) +#elif NR_ROWS_LOG == 13 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x00e0 : 0xe000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 1 : 9) +#elif NR_ROWS_LOG == 14 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x00c0 : 0xc000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 2 : 10) +#elif NR_ROWS_LOG == 15 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x0080 : 0x8000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 
3 : 11) +#elif NR_ROWS_LOG == 16 +#define BIN_MASK2(round) 0 +#define BIN_MASK2_OFFSET(round) 0 +#else +#error "unsupported NR_ROWS_LOG" +#endif +#define NR_BINS (256 >> (NR_ROWS_LOG - 12)) + __local uchar *bins = &bins_data[localTid * BIN_SIZE * NR_BINS]; + __local uint *bin_counters = &bin_counters_data[localTid * NR_BINS]; +#if THREADS_PER_WRITE != 1 + __local slot_t slot_write_cache[LOCAL_WORK_SIZE]; + __local global_pointer_to_slot_t slot_ptrs[LOCAL_WORK_SIZE]; +#endif + + uint rows_per_work_item = (NR_ROWS + get_global_size(0) / threadsPerRow - 1) / (get_global_size(0) / threadsPerRow); + uint rows_per_chunk = get_global_size(0) / threadsPerRow; + + for (uint chunk = 0; chunk < rows_per_work_item; chunk++) { + uint cnt = 0; + uint tid = globalTid + rows_per_chunk * chunk; + uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); + + if (!get_local_id(0)) + *collisionsNum = 0; + for (i = localGroupId; i < NR_BINS; i += threadsPerRow) + bin_counters[i] = 0; + if (tid < NR_ROWS && localGroupId == 0) { + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, tid); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + nr_slots_array[localTid] = cnt; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS && localGroupId) { + cnt = nr_slots_array[localTid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Perform a radix sort as slots get loaded into LDS. + uint xi_first_bytes; + uint bin_to_use; + uint bin_counter_copy; + // Make sure all the work items in the work group enter the loop. + uint i_max = cnt + (get_local_size(0) - cnt % get_local_size(0)) - 1; + for (i = localGroupId; i < i_max; i += threadsPerRow) { + if (tid < NR_ROWS && i < cnt) { + xi_first_bytes = *(__global uint *)get_xi_ptr(ht_src, round - 1, tid, i); + slot_cache[(localTid * NR_SLOTS + i) * RESERVED_FOR_XI(round - 1)] = xi_first_bytes; + for (j = 1; j < UINTS_IN_XI(round - 1); ++j) + slot_cache[(localTid * NR_SLOTS + i) * RESERVED_FOR_XI(round - 1) + j] = *((__global uint *)get_xi_ptr(ht_src, round - 1, tid, i) + j); + + bin_to_use = + ((xi_first_bytes & BIN_MASK(round - 1)) >> BIN_MASK_OFFSET(round - 1)) + | ((xi_first_bytes & BIN_MASK2(round - 1)) >> BIN_MASK2_OFFSET(round - 1)); + bin_counter_copy = atomic_inc(&bin_counters[bin_to_use]); + if (bin_counter_copy >= BIN_SIZE) { + atomic_dec(&bin_counters[bin_to_use]); + ++dropped_coll; + } + else { + bins[bin_to_use * BIN_SIZE + bin_counter_copy] = i; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < NR_ROWS && i < cnt) { + for (j = 0; j < bin_counter_copy; ++j) { + uint index = atomic_inc(collisionsNum); + if (index >= LDS_COLL_SIZE) { + atomic_dec(collisionsNum); + ++dropped_coll; + } + else { + collisionsData[index] = (localTid << 24) | (i << 12) | bins[bin_to_use * BIN_SIZE + j]; + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = *collisionsNum; + // Make sure all the work items in the work group enter and leave the loop at the same time. 
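+		// Illustration of the round-up: with LOCAL_WORK_SIZE = 64 and, say,
+		// totalCollisions = 5, max_index = 5 + (64 - 5 % 64) - 1 = 63, so every
+		// work item performs exactly one (mostly idle) pass and all of them
+		// reach the barrier inside the loop.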
+ uint max_index = totalCollisions + (LOCAL_WORK_SIZE - totalCollisions % LOCAL_WORK_SIZE) - 1; + for (uint index = get_local_id(0); index <= max_index; index += LOCAL_WORK_SIZE) { + uint collision, collisionLocalThreadId, collisionThreadId; + uint i, j, slot_cache_index_i, slot_cache_index_j; + a = 0; + b = 0; + + if (tid < NR_ROWS && index < totalCollisions) { + collision = collisionsData[index]; + collisionLocalThreadId = collision >> 24; + collisionThreadId = gid + collisionLocalThreadId; + i = (collision >> 12) & 0xfff; + j = collision & 0xfff; + a = (__local uint *)&slot_cache[(collisionLocalThreadId * NR_SLOTS + i) * RESERVED_FOR_XI(round - 1)]; + b = (__local uint *)&slot_cache[(collisionLocalThreadId * NR_SLOTS + j) * RESERVED_FOR_XI(round - 1)]; + } + + dropped_stor += xor_and_store( + round, + ht_dst, + collisionThreadId, + i, j, + a, b, + rowCountersDst +#if THREADS_PER_WRITE == 1 + ); +#else + , slot_write_cache, slot_ptrs); +#endif + barrier(CLK_LOCAL_MEM_FENCE); + } + } + +#ifdef ENABLE_DEBUG + uint tid = get_global_id(0); + debug[tid * 2] = dropped_coll; + debug[tid * 2 + 1] = dropped_stor; +#endif +} + +/* +** This defines kernel_round1, kernel_round2, ..., kernel_round7. +*/ +#define KERNEL_ROUND(N) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) \ +void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ + __global uint *rowCountersSrc, __global uint *rowCountersDst, \ + __global uint *debug) \ +{ \ + __local uint slot_cache[NEXT_PRIME_NO(RESERVED_FOR_XI(N - 1) * SLOT_CACHE_SIZE)]; \ + __local uint collisionsData[NEXT_PRIME_NO(LDS_COLL_SIZE)]; \ + __local uint collisionsNum; \ + __local uint nr_slots_array[NEXT_PRIME_NO(LOCAL_WORK_SIZE / THREADS_PER_ROW)]; \ + __local uchar bins_data[NEXT_PRIME_NO((LOCAL_WORK_SIZE / THREADS_PER_ROW) * BIN_SIZE * NR_BINS)]; \ + __local uint bin_counters_data[NEXT_PRIME_NO((LOCAL_WORK_SIZE / THREADS_PER_ROW) * NR_BINS)]; \ + equihash_round(N, ht_src, ht_dst, debug, slot_cache, collisionsData, \ + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW, nr_slots_array, bins_data, bin_counters_data); \ +} +KERNEL_ROUND(1) +KERNEL_ROUND(2) +KERNEL_ROUND(3) +KERNEL_ROUND(4) +KERNEL_ROUND(5) +KERNEL_ROUND(6) +KERNEL_ROUND(7) +KERNEL_ROUND(8) + +uint expand_ref(__global char *ht, uint round, uint row, uint slot) +{ + return ((__global slot_t *)get_slot_ptr(ht, round, row, slot))->slot.i; +} + +/* +** Expand references to inputs. Return 1 if so far the solution appears valid, +** or 0 otherwise (an invalid solution would be a solution with duplicate +** inputs, which can be detected at the last step: round == 0). +*/ +uint expand_refs(__local uint *ins, uint nr_inputs, __global char **htabs, + uint round) +{ + __global char *ht = htabs[round]; + uint i = nr_inputs - 1; + uint j = nr_inputs * 2 - 1; + int dup_to_watch = -1; + do { + ins[j] = expand_ref(ht, round, + DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); + ins[j - 1] = expand_ref(ht, round, + DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); + if (!round) { + if (dup_to_watch == -1) + dup_to_watch = ins[j]; + else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) + return 0; + } + if (!i) + break; + i--; + j -= 2; + } while (1); + return 1; +} + +/* +** Verify if a potential solution is in fact valid. 
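+** Starting from the two references found at the last round, expand_refs()
+** is applied PARAM_K - 1 times, doubling nr_values each time:
+** 2 -> 4 -> 8 -> ... -> 1 << PARAM_K = 512 leaf inputs for the standard
+** (200,9) parameters.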
+*/ +void potential_sol(__global char **htabs, __global sols_t *sols, + uint ref0, uint ref1, __local uint *values_tmp) +{ + uint nr_values; + uint sol_i; + uint i; + nr_values = 0; + values_tmp[nr_values++] = ref0; + values_tmp[nr_values++] = ref1; + uint round = PARAM_K - 1; + do { + round--; + if (!expand_refs(values_tmp, nr_values, htabs, round)) + return; + nr_values *= 2; + } while (round > 0); + // solution appears valid, copy it to sols + sol_i = atomic_inc(&sols->nr); + if (sol_i >= MAX_SOLS) + return; + for (i = 0; i < (1 << PARAM_K); i++) + sols->values[sol_i][i] = values_tmp[i]; + sols->valid[sol_i] = 1; +} + +/* +** Scan the hash tables to find Equihash solutions. +*/ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_SOLS, 1, 1))) +void kernel_sols(__global char *ht0, + __global char *ht1, + __global char *ht2, + __global char *ht3, + __global char *ht4, + __global char *ht5, + __global char *ht6, + __global char *ht7, + __global char *ht8, + __global sols_t *sols, + __global uint *rowCountersSrc) +{ + __local uint refs[NEXT_PRIME_NO(NR_SLOTS*(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS))]; + __local uint data[NEXT_PRIME_NO(NR_SLOTS*(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS))]; + __local uint values_tmp[NEXT_PRIME_NO(1 << PARAM_K)]; + __local uint semaphoe; + + uint globalTid = get_global_id(0) / THREADS_PER_ROW_SOLS; + uint localTid = get_local_id(0) / THREADS_PER_ROW_SOLS; + uint localGroupId = get_local_id(0) % THREADS_PER_ROW_SOLS; + __local uint *refsPtr = &refs[NR_SLOTS*localTid]; + __local uint *dataPtr = &data[NR_SLOTS*localTid]; + + __global char *htabs[] = { ht0, ht1, ht2, ht3, ht4, ht5, ht6, ht7, ht8 }; + uint ht_i = (PARAM_K - 1); // table filled at last round + uint cnt; + uint i, j; + __global char *p; + uint ref_i, ref_j; + __local uchar bins_data[(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) * NR_SLOTS * NR_BINS]; + __local uint bin_counters_data[(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) * NR_BINS]; + __local uchar *bins = &bins_data[localTid * NR_SLOTS * NR_BINS]; + __local uint *bin_counters = &bin_counters_data[localTid * NR_BINS]; + + if (!get_global_id(0)) + sols->nr = sols->likely_invalids = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + uint rows_per_work_item = (NR_ROWS + get_global_size(0) / THREADS_PER_ROW_SOLS - 1) / (get_global_size(0) / THREADS_PER_ROW_SOLS); + uint rows_per_chunk = get_global_size(0) / THREADS_PER_ROW_SOLS; + + for (uint chunk = 0; chunk < rows_per_work_item; chunk++) { + uint tid = globalTid + rows_per_chunk * chunk; + uint gid = tid & ~(get_local_size(0) / THREADS_PER_ROW_SOLS - 1); + + __local uint nr_slots_array[LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS]; + if (tid < NR_ROWS) { + if (!get_local_id(0)) + semaphoe = 0; + for (i = localGroupId; i < NR_BINS; i += THREADS_PER_ROW_SOLS) + bin_counters[i] = 0; + if (localGroupId == 0) { + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, tid); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + nr_slots_array[localTid] = cnt; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (localGroupId) + cnt = nr_slots_array[localTid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
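+		// A single 32-bit compare of the stored Xi word is what selects
+		// candidate pairs below; occasional false positives are acceptable
+		// because every candidate is re-checked afterwards (duplicate-input
+		// test in expand_refs() and host-side verify_sol()).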
+ __local ulong coll; + if (tid < NR_ROWS) { + for (i = localGroupId; i < cnt && !semaphoe; i += THREADS_PER_ROW_SOLS) { + p = get_slot_ptr(htabs[ht_i], PARAM_K - 1, tid, i); + refsPtr[i] = ((__global slot_t *)p)->slot.i; + uint xi_first_bytes = dataPtr[i] = ((__global slot_t *)p)->slot.xi[0]; + uint bin_to_use = + ((xi_first_bytes & BIN_MASK(PARAM_K - 1)) >> BIN_MASK_OFFSET(PARAM_K - 1)) + | ((xi_first_bytes & BIN_MASK2(PARAM_K - 1)) >> BIN_MASK2_OFFSET(PARAM_K - 1)); + uint bin_counter_copy = atomic_inc(&bin_counters[bin_to_use]); + bins[bin_to_use * NR_SLOTS + bin_counter_copy] = i; + if (bin_counter_copy) { + for (j = 0; j < bin_counter_copy && !semaphoe; ++j) { + uint slot_index_j = bins[bin_to_use * NR_SLOTS + j]; + if (xi_first_bytes == dataPtr[slot_index_j]) { + if (atomic_inc(&semaphoe) == 0) + coll = ((ulong)refsPtr[i] << 32) | refsPtr[slot_index_j]; + } + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (get_local_id(0) == 0 && semaphoe) + potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff, values_tmp); + } + } +} \ No newline at end of file diff --git a/ocl_gatelessgate/gettimeofday.h b/ocl_gatelessgate/gettimeofday.h new file mode 100644 index 000000000..0af4feffb --- /dev/null +++ b/ocl_gatelessgate/gettimeofday.h @@ -0,0 +1,43 @@ +// Gateless Gate, a Zcash miner +// Copyright 2016 zawawa @ bitcointalk.org +// +// The initial version of this software was based on: +// SILENTARMY v5 +// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil +// +// This program is free software : you can redistribute it and / or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program.If not, see . 
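+
+// Background for the EPOCH constant used below: Windows FILETIME counts
+// 100-nanosecond ticks since 1601-01-01, while the Unix epoch starts at
+// 1970-01-01. The offset between the two is 11644473600 seconds, i.e.
+// 11644473600 * 10^7 = 116444736000000000 ticks, and dividing ticks by
+// 10,000,000 converts them back to whole seconds.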
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <time.h>
+#include <stdint.h> // portable: uint64_t   MSVC: __int64
+
+inline int gettimeofday(struct timeval * tp, struct timezone * tzp)
+{
+    // Note: some broken versions only have 8 trailing zeros; the correct epoch has 9 trailing zeros
+    static const uint64_t EPOCH = ((uint64_t)116444736000000000ULL);
+
+    SYSTEMTIME system_time;
+    FILETIME file_time;
+    uint64_t time;
+
+    GetSystemTime(&system_time);
+    SystemTimeToFileTime(&system_time, &file_time);
+    time = ((uint64_t)file_time.dwLowDateTime);
+    time += ((uint64_t)file_time.dwHighDateTime) << 32;
+
+    tp->tv_sec = (long)((time - EPOCH) / 10000000L);
+    tp->tv_usec = (long)(system_time.wMilliseconds * 1000);
+    return 0;
+}
diff --git a/ocl_gatelessgate/ocl_gatelessgate.cpp b/ocl_gatelessgate/ocl_gatelessgate.cpp
new file mode 100644
index 000000000..9830ebfdb
--- /dev/null
+++ b/ocl_gatelessgate/ocl_gatelessgate.cpp
@@ -0,0 +1,912 @@
+#include "ocl_gatelessgate.hpp"
+
+#pragma comment(lib, "winmm.lib")
+#define _CRT_RAND_S
+
+
+//#define _CRT_SECURE_NO_WARNINGS
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+//#include
+#include
+#include
+//#include
+//#include
+#include
+
+#include
+
+#include
+
+#include "gettimeofday.h"
+#include
+
+#include
+using namespace blake;
+#include
+
+#include
+
+typedef uint8_t uchar;
+typedef uint32_t uint;
+typedef uint64_t ulong;
+#include "param.h"
+
+#define MIN(A, B) (((A) < (B)) ? (A) : (B))
+#define MAX(A, B) (((A) > (B)) ? (A) : (B))
+
+#define WN PARAM_N
+#define WK PARAM_K
+
+#define COLLISION_BIT_LENGTH (WN / (WK+1))
+#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
+#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
+
+#define NDIGITS (WK+1)
+#define DIGITBITS (WN/(NDIGITS))
+#define PROOFSIZE (1u<<WK)
+#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
+
+int verbose = 0;
+uint32_t show_encoded = 0;
+static struct timeval kern_avg_run_time;
+
+static char *s_hexdump(const void *_a, uint32_t a_len)
+{
+    const uint8_t *a = (const uint8_t *)_a;
+    static char buf[1024];
+    uint32_t i;
+    for (i = 0; i < a_len && i + 2 < sizeof(buf); i++)
+        sprintf(buf + i * 2, "%02x", a[i]);
+    buf[i * 2] = 0;
+    return buf;
+}
+
+static uint8_t hex2val(const char *base, size_t off)
+{
+    const char c = base[off];
+    if (c >= '0' && c <= '9') return c - '0';
+    else if (c >= 'a' && c <= 'f') return 10 + c - 'a';
+    else if (c >= 'A' && c <= 'F') return 10 + c - 'A';
+    printf("Invalid hex char at offset %zd: ...%c...\n", off, c);
+    return 0;
+}
+
+static unsigned nr_compute_units(const char *gpu)
+{
+    if (!strcmp(gpu, "rx480")) return 36;
+    fprintf(stderr, "Unknown GPU: %s\n", gpu);
+    return 0;
+}
+
+static void compress(uint8_t *out, uint32_t *inputs, uint32_t n)
+{
+    uint32_t byte_pos = 0;
+    int32_t bits_left = PREFIX + 1;
+    uint8_t x = 0;
+    uint8_t x_bits_used = 0;
+    uint8_t *pOut = out;
+    while (byte_pos < n)
+    {
+        if (bits_left >= 8 - x_bits_used)
+        {
+            x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
+            bits_left -= 8 - x_bits_used;
+            x_bits_used = 8;
+        }
+        else if (bits_left > 0)
+        {
+            uint32_t mask = ~(-1 << (8 - x_bits_used));
+            mask = ((~mask) >> bits_left) & mask;
+            x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
+            x_bits_used += bits_left;
+            bits_left = 0;
+        }
+        else if (bits_left <= 0)
+        {
+            assert(!bits_left);
+            byte_pos++;
+            bits_left = PREFIX + 1;
+        }
+        if (x_bits_used == 8)
+        {
+            *pOut++ = x;
+            x = x_bits_used = 0;
+        }
+    }
+}
+
+static void get_program_build_log(cl_program program, cl_device_id device)
+{
+    cl_int status;
+    char val[2 * 1024 * 1024];
+    size_t ret = 0;
+    status = clGetProgramBuildInfo(program, device,
+        CL_PROGRAM_BUILD_LOG,
+        sizeof(val), // size_t param_value_size
+        &val,        // void *param_value
+        &ret);       // size_t *param_value_size_ret
+    if (status != CL_SUCCESS)
+        printf("clGetProgramBuildInfo (%d)\n", status);
+    fprintf(stderr, "%s\n", val);
+}
+
+static size_t select_work_size_blake(void)
+{
+    size_t work_size =
+        64 * /* thread per wavefront */
+        BLAKE_WPS * /* wavefront
per simd */ + 4 * /* simd per compute unit */ + nr_compute_units("rx480"); + // Make the work group size a multiple of the nr of wavefronts, while + // dividing the number of inputs. This results in the worksize being a + // power of 2. + while (NR_INPUTS % work_size) + work_size += 64; + //debug("Blake: work size %zd\n", work_size); + return work_size; +} + +static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht, cl_mem rowCounters) +{ + size_t global_ws = RC_SIZE / sizeof(cl_uint); + size_t local_ws = 256; + cl_int status; +#if 0 + uint32_t pat = -1; + status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0, + NR_ROWS * NR_SLOTS * SLOT_LEN, + 0, // cl_uint num_events_in_wait_list + NULL, // cl_event *event_wait_list + NULL); // cl_event *event + if (status != CL_SUCCESS) + fatal("clEnqueueFillBuffer (%d)\n", status); +#endif + status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht); + status = clSetKernelArg(k_init_ht, 1, sizeof(rowCounters), &rowCounters); + if (status != CL_SUCCESS) + printf("clSetKernelArg (%d)\n", status); + OCL(clEnqueueNDRangeKernel(queue, k_init_ht, + 1, // cl_uint work_dim + NULL, // size_t *global_work_offset + &global_ws, // size_t *global_work_size + &local_ws, // size_t *local_work_size + 0, // cl_uint num_events_in_wait_list + NULL, // cl_event *event_wait_list + NULL)); // cl_event *event +} + + +/* +** Sort a pair of binary blobs (a, b) which are consecutive in memory and +** occupy a total of 2*len 32-bit words. +** +** a points to the pair +** len number of 32-bit words in each pair +*/ +static void sort_pair(uint32_t *a, uint32_t len) +{ + uint32_t *b = a + len; + uint32_t tmp, need_sorting = 0; + for (uint32_t i = 0; i < len; i++) + if (need_sorting || a[i] > b[i]) + { + need_sorting = 1; + tmp = a[i]; + a[i] = b[i]; + b[i] = tmp; + } + else if (a[i] < b[i]) + return; +} + + +#define SEEN_LEN (1 << (PREFIX + 1)) / 8 + +static uint32_t verify_sol(sols_t *sols, unsigned sol_i) +{ + uint32_t *inputs = sols->values[sol_i]; + //uint32_t seen_len = (1 << (PREFIX + 1)) / 8; + //uint8_t seen[(1 << (PREFIX + 1)) / 8]; + uint8_t seen[SEEN_LEN]; + uint32_t i; + uint8_t tmp; + // look for duplicate inputs + memset(seen, 0, SEEN_LEN); + for (i = 0; i < (1 << PARAM_K); i++) + { + if (inputs[i] / 8 >= SEEN_LEN) + { + printf("Invalid input retrieved from device: %d\n", inputs[i]); + sols->valid[sol_i] = 0; + return 0; + } + tmp = seen[inputs[i] / 8]; + seen[inputs[i] / 8] |= 1 << (inputs[i] & 7); + if (tmp == seen[inputs[i] / 8]) + { + // at least one input value is a duplicate + sols->valid[sol_i] = 0; + return 0; + } + } + // the valid flag is already set by the GPU, but set it again because + // I plan to change the GPU code to not set it + sols->valid[sol_i] = 1; + // sort the pairs in place + for (uint32_t level = 0; level < PARAM_K; level++) + for (i = 0; i < (1 << PARAM_K); i += (2 << level)) + sort_pair(&inputs[i], 1 << level); + return 1; +} + + +static struct timeval time_diff(struct timeval start, struct timeval end) +{ + struct timeval temp; + if ((end.tv_usec - start.tv_usec)<0) { + temp.tv_sec = end.tv_sec - start.tv_sec - 1; + temp.tv_usec = 1000000 + end.tv_usec - start.tv_usec; + } + else { + temp.tv_sec = end.tv_sec - start.tv_sec; + temp.tv_usec = end.tv_usec - start.tv_usec; + } + return temp; +} + +/* +** Write ZCASH_SOL_LEN bytes representing the encoded solution as per the +** Zcash protocol specs (512 x 21-bit inputs). 
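+**
+** (As a sanity check on the sizes involved: 1 << PARAM_K = 512 inputs of
+** PREFIX + 1 = 21 bits each is 10752 bits, i.e. exactly ZCASH_SOL_LEN =
+** 1344 bytes.)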
+**
+** out     ZCASH_SOL_LEN-byte buffer where the solution will be stored
+** inputs  array of 32-bit inputs
+** n       number of elements in array
+*/
+static void store_encoded_sol(uint8_t *out, uint32_t *inputs, uint32_t n)
+{
+    uint32_t byte_pos = 0;
+    int32_t bits_left = PREFIX + 1;
+    uint8_t x = 0;
+    uint8_t x_bits_used = 0;
+    while (byte_pos < n)
+    {
+        if (bits_left >= 8 - x_bits_used)
+        {
+            x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
+            bits_left -= 8 - x_bits_used;
+            x_bits_used = 8;
+        }
+        else if (bits_left > 0)
+        {
+            uint32_t mask = ~(-1 << (8 - x_bits_used));
+            mask = ((~mask) >> bits_left) & mask;
+            x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
+            x_bits_used += bits_left;
+            bits_left = 0;
+        }
+        else if (bits_left <= 0)
+        {
+            assert(!bits_left);
+            byte_pos++;
+            bits_left = PREFIX + 1;
+        }
+        if (x_bits_used == 8)
+        {
+            *out++ = x;
+            x = x_bits_used = 0;
+        }
+    }
+}
+
+/*
+** Compare two 256-bit values interpreted as little-endian 256-bit integers.
+*/
+static int32_t cmp_target_256(void *_a, void *_b)
+{
+    uint8_t *a = static_cast<uint8_t *>(_a);
+    uint8_t *b = static_cast<uint8_t *>(_b);
+    int32_t i;
+    for (i = SHA256_TARGET_LEN - 1; i >= 0; i--)
+        if (a[i] != b[i])
+            return (int32_t)a[i] - b[i];
+    return 0;
+}
+
+/*
+** Verify if the solution's block hash is under the target, and if yes print
+** it formatted as:
+** "sol: <job_id> <ntime> <nonce_rightpart> <solSize+sol>"
+**
+** Return 1 iff the block hash is under the target.
+*/
+uint32_t print_solver_line(uint32_t *values, uint8_t *header,
+    size_t fixed_nonce_bytes, uint8_t *target, char *job_id)
+{
+    uint8_t buffer[ZCASH_BLOCK_HEADER_LEN + ZCASH_SOLSIZE_LEN +
+        ZCASH_SOL_LEN];
+    uint8_t hash0[SHA256_DIGEST_SIZE];
+    uint8_t hash1[SHA256_DIGEST_SIZE];
+    uint8_t *p;
+    p = buffer;
+    memcpy(p, header, ZCASH_BLOCK_HEADER_LEN);
+    p += ZCASH_BLOCK_HEADER_LEN;
+    memcpy(p, "\xfd\x40\x05", ZCASH_SOLSIZE_LEN);
+    p += ZCASH_SOLSIZE_LEN;
+    store_encoded_sol(p, values, 1 << PARAM_K);
+    sha256::Sha256_Onestep(buffer, sizeof(buffer), hash0);
+    sha256::Sha256_Onestep(hash0, sizeof(hash0), hash1);
+    // compare the double SHA256 hash with the target
+    if (cmp_target_256(target, hash1) < 0)
+    {
+        printf("Hash is above target\n");
+        return 0;
+    }
+    printf("Hash is under target\n");
+    printf("sol: %s ", job_id);
+    p = header + ZCASH_BLOCK_OFFSET_NTIME;
+    printf("%02x%02x%02x%02x ", p[0], p[1], p[2], p[3]);
+    printf("%s ", s_hexdump(header + ZCASH_BLOCK_HEADER_LEN - ZCASH_NONCE_LEN +
+        fixed_nonce_bytes, ZCASH_NONCE_LEN - fixed_nonce_bytes));
+    printf("%s%s\n", ZCASH_SOLSIZE_HEX,
+        s_hexdump(buffer + ZCASH_BLOCK_HEADER_LEN + ZCASH_SOLSIZE_LEN,
+            ZCASH_SOL_LEN));
+    fflush(stdout);
+    return 1;
+}
+
+int sol_cmp(const void *_a, const void *_b)
+{
+    const uint32_t *a = static_cast<const uint32_t *>(_a);
+    const uint32_t *b = static_cast<const uint32_t *>(_b);
+    for (uint32_t i = 0; i < (1 << PARAM_K); i++)
+    {
+        if (*a != *b)
+            return (*a < *b) ? -1 : 1;
+        a++;
+        b++;
+    }
+    return 0;
+}
+
+/*
+** Print on stdout a hex representation of the encoded solution as per the
+** zcash protocol specs (512 x 21-bit inputs).
+**
+** inputs  array of 32-bit inputs
+** n       number of elements in array
+*/
+static void print_encoded_sol(uint32_t *inputs, uint32_t n)
+{
+    uint8_t sol[ZCASH_SOL_LEN];
+    uint32_t i;
+    store_encoded_sol(sol, inputs, n);
+    for (i = 0; i < sizeof(sol); i++)
+        printf("%02x", sol[i]);
+    printf("\n");
+    fflush(stdout);
+}
+
+static void print_sol(uint32_t *values, uint64_t *nonce)
+{
+    uint32_t show_n_sols;
+    show_n_sols = (1 << PARAM_K);
+    if (verbose < 2)
+        show_n_sols = MIN(10, show_n_sols);
+    fprintf(stderr, "Soln:");
+    // for brevity, only print "small" nonces
+    if (*nonce < (1ULL << 32))
+        fprintf(stderr, " 0x%" PRIx64 ":", *nonce);
+    for (unsigned i = 0; i < show_n_sols; i++)
+        fprintf(stderr, " %x", values[i]);
+    fprintf(stderr, "%s\n", (show_n_sols != (1 << PARAM_K) ? "..." : ""));
+}
+
+/*
+** Print all solutions.
+**
+** In mining mode, return the number of shares, that is the number of
+** solutions that were under the target.
+*/
+static uint32_t print_sols(sols_t *all_sols, uint64_t *nonce, uint32_t nr_valid_sols,
+    uint8_t *header, size_t fixed_nonce_bytes, uint8_t *target,
+    char *job_id)
+{
+    uint8_t *valid_sols;
+    uint32_t counted;
+    uint32_t shares = 0;
+    valid_sols = static_cast<uint8_t *>(malloc(nr_valid_sols * SOL_SIZE));
+    if (!valid_sols)
+        printf("malloc: %s\n", strerror(errno));
+    counted = 0;
+    for (uint32_t i = 0; i < all_sols->nr; i++)
+        if (all_sols->valid[i])
+        {
+            if (counted >= nr_valid_sols)
+                printf("Bug: more than %d solutions\n", nr_valid_sols);
+            memcpy(valid_sols + counted * SOL_SIZE, all_sols->values[i],
+                SOL_SIZE);
+            counted++;
+        }
+    assert(counted == nr_valid_sols);
+    // sort the solutions amongst each other, to make the solver's output
+    // deterministic and testable
+    qsort(valid_sols, nr_valid_sols, SOL_SIZE, sol_cmp);
+    for (uint32_t i = 0; i < nr_valid_sols; i++)
+    {
+        uint32_t *inputs = (uint32_t *)(valid_sols + i * SOL_SIZE);
+        if (show_encoded)
+            print_encoded_sol(inputs, 1 << PARAM_K);
+        if (verbose)
+            print_sol(inputs, nonce);
+        shares += print_solver_line(inputs, header, fixed_nonce_bytes,
+            target, job_id);
+    }
+    free(valid_sols);
+    return shares;
+}
+
+/*
+** Return the number of valid solutions.
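+**
+** As a side effect this also updates kern_avg_run_time, an exponential
+** moving average of the wall-clock time per solver iteration, computed as
+** 0.70 * old + 0.28 * new (deliberately ~2% below the true average), so the
+** pre-read Sleep() used on non-AMD platforms can shrink again when the
+** kernels speed up.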
+*/
+static uint32_t verify_sols(cl_command_queue queue, cl_mem buf_sols, uint64_t *nonce,
+    uint8_t *header, size_t fixed_nonce_bytes, uint8_t *target,
+    char *job_id, uint32_t *shares, struct timeval *start_time, bool is_amd)
+{
+    sols_t *sols;
+    uint32_t nr_valid_sols;
+    sols = (sols_t *)malloc(sizeof(*sols));
+    if (!sols)
+        printf("malloc: %s\n", strerror(errno));
+#ifdef WIN32
+    timeBeginPeriod(1);
+    DWORD duration = (DWORD)kern_avg_run_time.tv_sec * 1000 + (DWORD)kern_avg_run_time.tv_usec / 1000;
+    if (!is_amd && duration < 1000)
+        Sleep(duration);
+#endif
+    check_clEnqueueReadBuffer(queue, buf_sols,
+        CL_TRUE,        // cl_bool blocking_read
+        0,              // size_t offset
+        sizeof(*sols),  // size_t size
+        sols,           // void *ptr
+        0,              // cl_uint num_events_in_wait_list
+        NULL,           // cl_event *event_wait_list
+        NULL);          // cl_event *event
+    struct timeval curr_time;
+    gettimeofday(&curr_time, NULL);
+
+    struct timeval t_diff = time_diff(*start_time, curr_time);
+
+    double a_diff = t_diff.tv_sec * 1e6 + t_diff.tv_usec;
+    double kern_avg = kern_avg_run_time.tv_sec * 1e6 + kern_avg_run_time.tv_usec;
+    if (kern_avg == 0)
+        kern_avg = a_diff;
+    else
+        kern_avg = kern_avg * 70 / 100 + a_diff * 28 / 100; // kept ~2% below the true
+        // average, so the estimate can drift back down over time
+
+    kern_avg_run_time.tv_sec = (time_t)(kern_avg / 1e6);
+    kern_avg_run_time.tv_usec = ((long)kern_avg) % 1000000;
+
+    if (sols->nr > MAX_SOLS)
+    {
+        fprintf(stderr, "%d (probably invalid) solutions were dropped!\n",
+            sols->nr - MAX_SOLS);
+        sols->nr = MAX_SOLS;
+    }
+    printf("Retrieved %d potential solutions\n", sols->nr);
+    nr_valid_sols = 0;
+    for (unsigned sol_i = 0; sol_i < sols->nr; sol_i++)
+        nr_valid_sols += verify_sol(sols, sol_i);
+    uint32_t sh = print_sols(sols, nonce, nr_valid_sols, header, fixed_nonce_bytes, target, job_id);
+    if (shares)
+        *shares = sh;
+    printf("Stats: %d likely invalids\n", sols->likely_invalids);
+    free(sols);
+    return nr_valid_sols;
+}
+
+
+ocl_gatelessgate::ocl_gatelessgate(int platf_id, int dev_id) {
+    platform_id = platf_id;
+    device_id = dev_id;
+    // TODO
+    threadsNum = 8192;
+    wokrsize = 128; // 256;
+}
+
+std::string ocl_gatelessgate::getdevinfo() {
+    static auto devices = GetAllDevices(platform_id);
+    auto device = devices[device_id];
+    std::vector<char> name(256, 0);
+    size_t nActualSize = 0;
+    std::string gpu_name;
+
+    cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+    gpu_name.assign(&name[0], nActualSize);
+
+    return "GPU_ID( " + gpu_name + ")";
+}
+
+// STATICS START
+int ocl_gatelessgate::getcount() {
+    static auto devices = GetAllDevices();
+    return devices.size();
+}
+
+void ocl_gatelessgate::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) {
+    static auto devices = GetAllDevices(platf_id);
+
+    if (devices.size() <= (size_t)d_id) {
+        return;
+    }
+    auto device = devices[d_id];
+
+    std::vector<char> name(256, 0);
+    cl_uint compute_units = 0;
+
+    size_t nActualSize = 0;
+    cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+    if (rc == CL_SUCCESS) {
+        gpu_name.assign(&name[0], nActualSize);
+    }
+
+    rc = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, &nActualSize);
+    if (rc == CL_SUCCESS) {
+        sm_count = (int)compute_units;
+    }
+
+    memset(&name[0], 0, name.size());
+    rc = clGetDeviceInfo(device, CL_DEVICE_VERSION, name.size(), &name[0], &nActualSize);
+    if (rc == CL_SUCCESS) {
+        version.assign(&name[0], nActualSize);
+    }
+}
+
+
+static bool is_platform_amd(cl_platform_id
platform_id)
+{
+    char name[1024];
+    size_t len = 0;
+    int status;
+    status = clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(name), &name,
+        &len);
+    if (status != CL_SUCCESS)
+        printf("clGetPlatformInfo (%d)\n", status);
+    return strncmp(name, "AMD Accelerated Parallel Processing", len) == 0;
+}
+
+
+void ocl_gatelessgate::start(ocl_gatelessgate& device_context) {
+    /*TODO*/
+    device_context.is_init_success = false;
+    device_context.oclc = new OclGGContext;
+    auto devices = GetAllDevices(device_context.platform_id);
+
+    printf("pid %i, size %zu\n", device_context.platform_id, devices.size());
+    auto device = devices[device_context.device_id];
+
+    size_t nActualSize = 0;
+    cl_platform_id platform_id = nullptr;
+    cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, nullptr);
+
+
+    device_context.oclc->_dev_id = device;
+    device_context.oclc->platform_id = platform_id;
+
+    // context create
+    cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)device_context.oclc->platform_id, 0 };
+    cl_int error;
+    device_context.oclc->_context = clCreateContext(props, 1, &device, 0, 0, &error);
+    //OCLR(error, false);
+    if (cl_int err = error) {
+        printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
+        return;
+    }
+
+    cl_int binstatus;
+
+    device_context.is_amd = is_platform_amd(platform_id);
+
+    char kernelName[64];
+    sprintf(kernelName, "gatelessgate_gpu_%u.bin", (unsigned)device_context.device_id);
+    if (!clCompileKernel(device_context.oclc->_context,
+        device,
+        kernelName,
+        { "zcash/gpu/gatelessgate.cl" },
+        device_context.is_amd ? OPENCL_BUILD_OPTIONS_AMD : OPENCL_BUILD_OPTIONS,
+        &binstatus,
+        &device_context.oclc->_program)) {
+        return;
+    }
+
+    if (binstatus == CL_SUCCESS) {
+        if (!device_context.oclc->init(device, device_context.threadsNum, device_context.wokrsize)) {
+            printf("Init failed\n");
+            return;
+        }
+    }
+    else {
+        printf("GPU %d: failed to load kernel\n", device_context.device_id);
+        return;
+    }
+
+    device_context.is_init_success = true;
+}
+
+#include
+
+void ocl_gatelessgate::stop(ocl_gatelessgate& device_context) {
+    if (device_context.oclc != nullptr) delete device_context.oclc;
+}
+
+void ocl_gatelessgate::solve(const char *tequihash_header,
+    unsigned int tequihash_header_len,
+    const char* nonce,
+    unsigned int nonce_len,
+    std::function<bool()> cancelf,
+    std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+    std::function<void(void)> hashdonef,
+    ocl_gatelessgate& device_context) {
+
+    uint64_t *nonce_ptr;
+
+    unsigned char context[140];
+    memset(context, 0, 140);
+    memcpy(context, tequihash_header, tequihash_header_len);
+    memcpy(context + tequihash_header_len, nonce, nonce_len);
+
+    OclGGContext *miner = device_context.oclc;
+    clFlush(miner->queue);
+
+    blake2b_state_t initialCtx;
+    zcash_blake2b_init(&initialCtx, ZCASH_HASH_LEN, PARAM_N, PARAM_K);
+    zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0);
+
+    cl_mem buf_blake_st;
+    buf_blake_st = check_clCreateBuffer(miner->_context, CL_MEM_READ_ONLY |
+        CL_MEM_COPY_HOST_PTR, sizeof(blake2b_state_s), &initialCtx);
+
+    cl_uint compute_units;
+    cl_int status = clGetDeviceInfo(miner->_dev_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
+    if (status != CL_SUCCESS)
+        printf("clGetDeviceInfo (%d)\n", status);
+
+    miner->local_work_size = LOCAL_WORK_SIZE;
+
+    for (unsigned round = 0; round < PARAM_K; round++)
+    {
+        init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round & 1], miner->rowCounters[round & 1]);
+        if (!round)
+        {
+            check_clSetKernelArg(miner->k_rounds[round], 0, &buf_blake_st);
+            check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round]);
+            check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[round % 2]);
+            miner->global_ws = select_work_size_blake();
+        }
+        else
+        {
+            check_clSetKernelArg(miner->k_rounds[round], 0, &miner->buf_ht[round - 1]);
+            check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round]);
+            check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[(round - 1) % 2]);
+            check_clSetKernelArg(miner->k_rounds[round], 3, &miner->rowCounters[round % 2]);
+            miner->global_ws = GLOBAL_WORK_SIZE_RATIO * compute_units * LOCAL_WORK_SIZE;
+            if (miner->global_ws > NR_ROWS * THREADS_PER_ROW)
+                miner->global_ws = NR_ROWS * THREADS_PER_ROW;
+        }
+        check_clSetKernelArg(miner->k_rounds[round], round == 0 ? 3 : 4, &miner->buf_dbg);
+        OCL(clEnqueueNDRangeKernel(miner->queue, miner->k_rounds[round], 1, NULL,
+            &miner->global_ws, &miner->local_work_size, 0, NULL, NULL));
+        // cancel function
+        if (cancelf()) return;
+    }
+
+    check_clSetKernelArg(miner->k_sols, 0, &miner->buf_ht[0]);
+    check_clSetKernelArg(miner->k_sols, 1, &miner->buf_ht[1]);
+    check_clSetKernelArg(miner->k_sols, 2, &miner->buf_ht[2]);
+    check_clSetKernelArg(miner->k_sols, 3, &miner->buf_ht[3]);
+    check_clSetKernelArg(miner->k_sols, 4, &miner->buf_ht[4]);
+    check_clSetKernelArg(miner->k_sols, 5, &miner->buf_ht[5]);
+    check_clSetKernelArg(miner->k_sols, 6, &miner->buf_ht[6]);
+    check_clSetKernelArg(miner->k_sols, 7, &miner->buf_ht[7]);
+    check_clSetKernelArg(miner->k_sols, 8, &miner->buf_ht[8]);
+    check_clSetKernelArg(miner->k_sols, 9, &miner->buf_sols);
+    check_clSetKernelArg(miner->k_sols, 10, &miner->rowCounters[0]);
+    miner->global_ws = GLOBAL_WORK_SIZE_RATIO * compute_units * LOCAL_WORK_SIZE_SOLS;
+    if (miner->global_ws > NR_ROWS * THREADS_PER_ROW_SOLS)
+        miner->global_ws = NR_ROWS * THREADS_PER_ROW_SOLS;
+    miner->local_work_size = LOCAL_WORK_SIZE_SOLS;
+    struct timeval start_time;
+    gettimeofday(&start_time, NULL);
+    OCL(clEnqueueNDRangeKernel(miner->queue, miner->k_sols, 1, NULL,
+        &miner->global_ws, &miner->local_work_size, 0, NULL, NULL));
+
+    OCL(clEnqueueReadBuffer(miner->queue, miner->buf_sols,
+        CL_TRUE,                // cl_bool blocking_read
+        0,                      // size_t offset
+        sizeof(*miner->sols),   // size_t size
+        miner->sols,            // void *ptr
+        0,                      // cl_uint num_events_in_wait_list
+        NULL,                   // cl_event *event_wait_list
+        NULL));                 // cl_event *event
+
+    if (miner->sols->nr > MAX_SOLS)
+        miner->sols->nr = MAX_SOLS;
+
+    clReleaseMemObject(buf_blake_st);
+
+    for (unsigned sol_i = 0; sol_i < miner->sols->nr; sol_i++) {
+        verify_sol(miner->sols, sol_i);
+    }
+
+    uint8_t proof[COMPRESSED_PROOFSIZE * 2];
+    for (uint32_t i = 0; i < miner->sols->nr; i++) {
+        if (miner->sols->valid[i]) {
+            compress(proof, (uint32_t *)(miner->sols->values[i]), 1 << PARAM_K);
+            solutionf(std::vector<uint32_t>(0), 1344, proof);
+        }
+    }
+    hashdonef();
+}
+
+// STATICS END
+
diff --git a/ocl_gatelessgate/ocl_gatelessgate.hpp b/ocl_gatelessgate/ocl_gatelessgate.hpp
new file mode 100644
index 000000000..41cf3ca8f
--- /dev/null
+++ b/ocl_gatelessgate/ocl_gatelessgate.hpp
@@ -0,0 +1,58 @@
+#pragma once
+
+#ifdef _LIB
+#define DLL_OCL_GATELESSGATE __declspec(dllexport)
+#else
+#define DLL_OCL_GATELESSGATE
+#endif
+
+// remove after
+#include <cstdint>
+#include <string>
+#include <vector>
+#include <functional>
+
+struct OclGGContext;
+
+struct DLL_OCL_GATELESSGATE ocl_gatelessgate
+{
+    //int threadsperblock;
+    int blocks;
+    int device_id;
+    int platform_id;
+
+    OclGGContext* oclc;
+    // threads
+ unsigned threadsNum; // TMP + unsigned wokrsize; + + bool is_init_success = false; + bool is_amd = false; + + ocl_gatelessgate(int platf_id, int dev_id); + + std::string getdevinfo(); + + static int getcount(); + + static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version); + + static void start(ocl_gatelessgate& device_context); + + static void stop(ocl_gatelessgate& device_context); + + static void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef, + ocl_gatelessgate& device_context); + + std::string getname() { return "OCL_GATELESSGATE"; } + +private: + std::string m_gpu_name; + std::string m_version; +}; diff --git a/ocl_gatelessgate/ocl_gatelessgate.vcxproj b/ocl_gatelessgate/ocl_gatelessgate.vcxproj new file mode 100644 index 000000000..7db9870b9 --- /dev/null +++ b/ocl_gatelessgate/ocl_gatelessgate.vcxproj @@ -0,0 +1,133 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612} + ocl_gatelessgate + 8.1 + + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + StaticLibrary + true + v120 + MultiByte + + + StaticLibrary + false + v120 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + $(ProjectDir)../contrib/;$(IncludePath) + + + $(ProjectDir)../contrib;$(ProjectDir)../ocl_device_utils;$(AMDAPPSDKROOT)include;$(IncludePath) + + + + Level3 + Disabled + true + + + + + Level3 + Disabled + true + _LIB;%(PreprocessorDefinitions) + + + + + Level3 + MaxSpeed + true + true + true + + + true + true + + + + + Level3 + MaxSpeed + true + true + true + _LIB;%(PreprocessorDefinitions) + + + true + true + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ocl_gatelessgate/ocl_gatelessgate.vcxproj.filters b/ocl_gatelessgate/ocl_gatelessgate.vcxproj.filters new file mode 100644 index 000000000..c9e0a8491 --- /dev/null +++ b/ocl_gatelessgate/ocl_gatelessgate.vcxproj.filters @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ocl_gatelessgate/param.h b/ocl_gatelessgate/param.h new file mode 100644 index 000000000..7f476d982 --- /dev/null +++ b/ocl_gatelessgate/param.h @@ -0,0 +1,373 @@ +// Gateless Gate, a Zcash miner +// Copyright 2016 zawawa @ bitcointalk.org +// +// The initial version of this software was based on: +// SILENTARMY v5 +// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil +// +// This program is free software : you can redistribute it and / or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program.If not, see . + +#pragma once + +#define PARAM_H + +// When you tweak parameters in this file, make sure to uncomment the next line, +// rebuild sa-solver, and run it to see how many slots get dropped +// at each round. Note that performance suffers if too many slots get dropped. 
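+//
+// For orientation: round 0 stores two Xi values per input, i.e.
+// 2 * NR_INPUTS = 2^21 entries spread over NR_ROWS = 2^NR_ROWS_LOG rows.
+// With the defaults below (NR_ROWS_LOG = 14) that is 128 entries per row
+// on average, so NR_SLOTS = 199 leaves roughly 55% headroom before slots
+// start being dropped.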
+ +//#define ENABLE_DEBUG + +// +// Parameters for Hash Tables +// + +// There are PARAM_K - 1 hash tables, and each hash table has NR_ROWS rows. +// Each row contains NR_SLOTS slots. + +#define NR_ROWS_LOG 14 // 12, 13, 14, 15, or 16. 12 and 13 are not practically usable. +#define NR_SLOTS 199 // Prime numbers are preferable. +#define LOCAL_WORK_SIZE 64 +#define THREADS_PER_ROW 64 +#define LOCAL_WORK_SIZE_SOLS 64 +#define THREADS_PER_ROW_SOLS 64 +#define GLOBAL_WORK_SIZE_RATIO 512 // global_work_size = GLOBAL_WORK_SIZE_RATIO * nr_compute_units * LOCAL_WORK_SIZE +#define THREADS_PER_WRITE 1 // 1, 2, 4, or 8 +#define SLOT_CACHE_SIZE (NR_SLOTS * ROWS_IN_WORK_ITEM) +#define LDS_COLL_SIZE (NR_SLOTS * ROWS_IN_WORK_ITEM * 140 / 100) +#define BIN_SIZE (NR_SLOTS * 6 / 100) +#define EXTRA_BITS_FOR_BINS_SOLS 1 +#define BIN_SIZE_SOLS ((BIN_SIZE >> EXTRA_BITS_FOR_BINS_SOLS) * 250 / 100) + + + +#define PARAM_N 200 +#define PARAM_K 9 +#define PREFIX (PARAM_N / (PARAM_K + 1)) +#define NR_INPUTS (1 << PREFIX) +#define NR_ROWS (1 << NR_ROWS_LOG) +// Length of 1 element (slot) in byte +#define SLOT_LEN 32 +// Total size of hash table +#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) +// Length of Zcash block header, nonce (part of header) +#define ZCASH_BLOCK_HEADER_LEN 140 +// Offset of nTime in header +#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) +// Length of nonce +#define ZCASH_NONCE_LEN 32 +// Length of encoded representation of solution size +#define ZCASH_SOLSIZE_LEN 3 +// Solution size (1344 = 0x540) represented as a compact integer, in hex +#define ZCASH_SOLSIZE_HEX "fd4005" +// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) +#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) +// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization +#define N_ZERO_BYTES 12 +// Number of bytes Zcash needs out of Blake +#define ZCASH_HASH_LEN 50 +// Number of wavefronts per SIMD for the Blake kernel. +// Blake is ALU-bound (beside the atomic counter being incremented) so we need +// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer +// instructions. 10 is the max supported by the hw. +#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 11 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot. 
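+**
+** With xi_offset_for_round() fixed at 4, a slot is laid out as a 4-byte
+** reference followed by the (progressively shrinking) Xi. Note that when
+** THREADS_PER_WRITE == 1, ADJUSTED_SLOT_LEN (defined further down) drops
+** from 32 to 16 bytes for rounds 6..8, where Xi fits in 12 bytes or less.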
+*/ +#define xi_offset_for_round(round) 4 + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +#if NR_ROWS_LOG <= 12 && NR_SLOTS <= (1 << 10) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 20) | ((slot1 & 0x3ff) << 10) | (slot0 & 0x3ff)) +#define DECODE_ROW(REF) (REF >> 20) +#define DECODE_SLOT1(REF) ((REF >> 10) & 0x3ff) +#define DECODE_SLOT0(REF) (REF & 0x3ff) + +#elif NR_ROWS_LOG <= 14 && NR_SLOTS <= (1 << 9) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 18) | ((slot1 & 0x1ff) << 9) | (slot0 & 0x1ff)) +#define DECODE_ROW(REF) (REF >> 18) +#define DECODE_SLOT1(REF) ((REF >> 9) & 0x1ff) +#define DECODE_SLOT0(REF) (REF & 0x1ff) + +#elif NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#if THREADS_PER_WRITE == 1 +#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN - 16) +#else +#define ADJUSTED_SLOT_LEN(round) SLOT_LEN +#endif + +#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O5" +#define OPENCL_BUILD_OPTIONS "-I.. -I." + +#define NEXT_PRIME_NO(n) \ + (((n) <= 2) ? 2 : \ + ((n) <= 3) ? 3 : \ + ((n) <= 5) ? 5 : \ + ((n) <= 7) ? 7 : \ + ((n) <= 11) ? 11 : \ + ((n) <= 13) ? 13 : \ + ((n) <= 17) ? 17 : \ + ((n) <= 19) ? 19 : \ + ((n) <= 23) ? 23 : \ + ((n) <= 29) ? 29 : \ + ((n) <= 31) ? 31 : \ + ((n) <= 37) ? 37 : \ + ((n) <= 41) ? 41 : \ + ((n) <= 43) ? 43 : \ + ((n) <= 47) ? 47 : \ + ((n) <= 53) ? 53 : \ + ((n) <= 59) ? 59 : \ + ((n) <= 61) ? 61 : \ + ((n) <= 67) ? 67 : \ + ((n) <= 71) ? 71 : \ + ((n) <= 73) ? 73 : \ + ((n) <= 79) ? 79 : \ + ((n) <= 83) ? 83 : \ + ((n) <= 89) ? 89 : \ + ((n) <= 97) ? 97 : \ + ((n) <= 101) ? 101 : \ + ((n) <= 103) ? 103 : \ + ((n) <= 107) ? 107 : \ + ((n) <= 109) ? 109 : \ + ((n) <= 113) ? 113 : \ + ((n) <= 127) ? 127 : \ + ((n) <= 131) ? 131 : \ + ((n) <= 137) ? 137 : \ + ((n) <= 139) ? 139 : \ + ((n) <= 149) ? 149 : \ + ((n) <= 151) ? 151 : \ + ((n) <= 157) ? 157 : \ + ((n) <= 163) ? 163 : \ + ((n) <= 167) ? 167 : \ + ((n) <= 173) ? 173 : \ + ((n) <= 179) ? 179 : \ + ((n) <= 181) ? 181 : \ + ((n) <= 191) ? 191 : \ + ((n) <= 193) ? 193 : \ + ((n) <= 197) ? 197 : \ + ((n) <= 199) ? 199 : \ + ((n) <= 211) ? 
211 : \ + ((n) <= 223) ? 223 : \ + ((n) <= 227) ? 227 : \ + ((n) <= 229) ? 229 : \ + ((n) <= 233) ? 233 : \ + ((n) <= 239) ? 239 : \ + ((n) <= 241) ? 241 : \ + ((n) <= 251) ? 251 : \ + ((n) <= 257) ? 257 : \ + ((n) <= 263) ? 263 : \ + ((n) <= 269) ? 269 : \ + ((n) <= 271) ? 271 : \ + ((n) <= 277) ? 277 : \ + ((n) <= 281) ? 281 : \ + ((n) <= 283) ? 283 : \ + ((n) <= 293) ? 293 : \ + ((n) <= 307) ? 307 : \ + ((n) <= 311) ? 311 : \ + ((n) <= 313) ? 313 : \ + ((n) <= 317) ? 317 : \ + ((n) <= 331) ? 331 : \ + ((n) <= 337) ? 337 : \ + ((n) <= 347) ? 347 : \ + ((n) <= 349) ? 349 : \ + ((n) <= 353) ? 353 : \ + ((n) <= 359) ? 359 : \ + ((n) <= 367) ? 367 : \ + ((n) <= 373) ? 373 : \ + ((n) <= 379) ? 379 : \ + ((n) <= 383) ? 383 : \ + ((n) <= 389) ? 389 : \ + ((n) <= 397) ? 397 : \ + ((n) <= 401) ? 401 : \ + ((n) <= 409) ? 409 : \ + ((n) <= 419) ? 419 : \ + ((n) <= 421) ? 421 : \ + ((n) <= 431) ? 431 : \ + ((n) <= 433) ? 433 : \ + ((n) <= 439) ? 439 : \ + ((n) <= 443) ? 443 : \ + ((n) <= 449) ? 449 : \ + ((n) <= 457) ? 457 : \ + ((n) <= 461) ? 461 : \ + ((n) <= 463) ? 463 : \ + ((n) <= 467) ? 467 : \ + ((n) <= 479) ? 479 : \ + ((n) <= 487) ? 487 : \ + ((n) <= 491) ? 491 : \ + ((n) <= 499) ? 499 : \ + ((n) <= 503) ? 503 : \ + ((n) <= 509) ? 509 : \ + ((n) <= 521) ? 521 : \ + ((n) <= 523) ? 523 : \ + ((n) <= 541) ? 541 : \ + ((n) <= 547) ? 547 : \ + ((n) <= 557) ? 557 : \ + ((n) <= 563) ? 563 : \ + ((n) <= 569) ? 569 : \ + ((n) <= 571) ? 571 : \ + ((n) <= 577) ? 577 : \ + ((n) <= 587) ? 587 : \ + ((n) <= 593) ? 593 : \ + ((n) <= 599) ? 599 : \ + ((n) <= 601) ? 601 : \ + ((n) <= 607) ? 607 : \ + ((n) <= 613) ? 613 : \ + ((n) <= 617) ? 617 : \ + ((n) <= 619) ? 619 : \ + ((n) <= 631) ? 631 : \ + ((n) <= 641) ? 641 : \ + ((n) <= 643) ? 643 : \ + ((n) <= 647) ? 647 : \ + ((n) <= 653) ? 653 : \ + ((n) <= 659) ? 659 : \ + ((n) <= 661) ? 661 : \ + ((n) <= 673) ? 673 : \ + ((n) <= 677) ? 677 : \ + ((n) <= 683) ? 683 : \ + ((n) <= 691) ? 691 : \ + ((n) <= 701) ? 701 : \ + ((n) <= 709) ? 709 : \ + ((n) <= 719) ? 719 : \ + ((n) <= 727) ? 727 : \ + ((n) <= 733) ? 733 : \ + ((n) <= 739) ? 739 : \ + ((n) <= 743) ? 743 : \ + ((n) <= 751) ? 751 : \ + ((n) <= 757) ? 757 : \ + ((n) <= 761) ? 761 : \ + ((n) <= 769) ? 769 : \ + ((n) <= 773) ? 773 : \ + ((n) <= 787) ? 787 : \ + ((n) <= 797) ? 797 : \ + ((n) <= 809) ? 809 : \ + ((n) <= 811) ? 811 : \ + ((n) <= 821) ? 821 : \ + ((n) <= 823) ? 823 : \ + ((n) <= 827) ? 827 : \ + ((n) <= 829) ? 829 : \ + ((n) <= 839) ? 839 : \ + ((n) <= 853) ? 853 : \ + ((n) <= 857) ? 857 : \ + ((n) <= 859) ? 859 : \ + ((n) <= 863) ? 863 : \ + ((n) <= 877) ? 877 : \ + ((n) <= 881) ? 881 : \ + ((n) <= 883) ? 883 : \ + ((n) <= 887) ? 887 : \ + ((n) <= 907) ? 907 : \ + ((n) <= 911) ? 911 : \ + ((n) <= 919) ? 919 : \ + ((n) <= 929) ? 929 : \ + ((n) <= 937) ? 937 : \ + ((n) <= 941) ? 941 : \ + ((n) <= 947) ? 947 : \ + ((n) <= 953) ? 953 : \ + ((n) <= 967) ? 967 : \ + ((n) <= 971) ? 971 : \ + ((n) <= 977) ? 977 : \ + ((n) <= 983) ? 983 : \ + ((n) <= 991) ? 991 : \ + ((n) <= 997) ? 997 : \ + ((n) <= 1009) ? 
1009 : \ + (n)) + +#define ROWS_IN_WORK_ITEM (LOCAL_WORK_SIZE / THREADS_PER_ROW) +#define ROWS_IN_WORK_ITEM_SOLS (LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) diff --git a/ocl_gatelessgate/param_nr15.h b/ocl_gatelessgate/param_nr15.h new file mode 100644 index 000000000..fd08ba0e0 --- /dev/null +++ b/ocl_gatelessgate/param_nr15.h @@ -0,0 +1,198 @@ +// Gateless Gate, a Zcash miner +// Copyright 2016 zawawa @ bitcointalk.org +// +// The initial version of this software was based on: +// SILENTARMY v5 +// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +//#define ENABLE_DEBUG + +#define NR_ROWS_LOG 15 +#define NR_SLOTS 120 +#define LOCAL_WORK_SIZE 256 +#define THREADS_PER_ROW 256 +#define LOCAL_WORK_SIZE_SOLS 64 +#define THREADS_PER_ROW_SOLS 64 +#define GLOBAL_WORK_SIZE_RATIO 512 +#define SLOT_CACHE_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE / THREADS_PER_ROW) * 75 / 100) +#define LDS_COLL_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE / THREADS_PER_ROW) * 120 / 100) + +#define SLOT_CACHE_INDEX_TYPE uchar + +#define PARAM_N 200 +#define PARAM_K 9 +#define PREFIX (PARAM_N / (PARAM_K + 1)) +#define NR_INPUTS (1 << PREFIX) +// Approximate log base 2 of the number of elements in the hash tables +#define APX_NR_ELMS_LOG (PREFIX + 1) + +// Setting this to 1 might make Gateless Gate faster, see TROUBLESHOOTING.md +#define OPTIM_SIMPLIFY_ROUND 1 + +// Fraction of time to sleep before rechecking whether the task is done (0-1) +#define SLEEP_RECHECK_RATIO 0.60 +// Fraction of time to busy-wait for the solution (0-1) +// The higher the value, the higher the CPU usage with Nvidia +#define SLEEP_SKIP_RATIO 0.005 + +// Make hash tables OVERHEAD times larger than necessary to store the average +// number of elements per row. The ideal value is as small as possible to +// reduce memory usage, but not too small or else elements are dropped from the +// hash tables. +// +// The actual number of elements per row is closer to the theoretical average +// (less variance) when NR_ROWS_LOG is small, so OVERHEAD can accordingly be +// smaller. +// +// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease +// performance as they cause VRAM channel conflicts.
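To put these sizing rules in perspective, here is a minimal host-side C sketch (illustrative only; it assumes nothing beyond the constants defined above and simply evaluates the footprint the way HT_SIZE does):

    #include <stdint.h>
    #include <stdio.h>

    /* Re-evaluates HT_SIZE = NR_ROWS * NR_SLOTS * SLOT_LEN for param_nr15.h:
    ** NR_ROWS_LOG 15 gives 32768 rows, each holding 120 slots of 32 bytes. */
    int main(void)
    {
        const uint64_t nr_rows  = 1ULL << 15; /* NR_ROWS */
        const uint64_t nr_slots = 120;        /* NR_SLOTS */
        const uint64_t slot_len = 32;         /* SLOT_LEN */
        const uint64_t ht_size  = nr_rows * nr_slots * slot_len;
        printf("HT_SIZE = %llu bytes (%.1f MiB)\n",
               (unsigned long long)ht_size, ht_size / (1024.0 * 1024.0));
        return 0;
    }

This reports 125,829,120 bytes (120.0 MiB) per hash table; param.h's values (16384 rows, 199 slots) give 104,333,312 bytes, roughly 99.5 MiB.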
+#if NR_ROWS_LOG <= 16 +#define OVERHEAD 2 +#elif NR_ROWS_LOG == 17 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 18 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 19 +#define OVERHEAD 5 +#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND +#define OVERHEAD 6 +#elif NR_ROWS_LOG == 20 +#define OVERHEAD 9 +#endif + +#define NR_ROWS (1 << NR_ROWS_LOG) +#ifndef NR_SLOTS +#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) +#endif +// Length of 1 element (slot) in bytes +#define SLOT_LEN 32 +// Total size of hash table +#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) +// Length of Zcash block header, nonce (part of header) +#define ZCASH_BLOCK_HEADER_LEN 140 +// Offset of nTime in header +#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) +// Length of nonce +#define ZCASH_NONCE_LEN 32 +// Length of encoded representation of solution size +#define ZCASH_SOLSIZE_LEN 3 +// Solution size (1344 = 0x540) represented as a compact integer, in hex +#define ZCASH_SOLSIZE_HEX "fd4005" +// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) +#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) +// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization +#define N_ZERO_BYTES 12 +// Number of bytes Zcash needs out of Blake +#define ZCASH_HASH_LEN 50 +// Number of wavefronts per SIMD for the Blake kernel. +// Blake is ALU-bound (besides the atomic counter being incremented) so we need +// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer +// instructions. 10 is the max supported by the hardware. +#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 10 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot.
+*/ +#define xi_offset_for_round(round) 4 + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN / 2) +#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O1" +#define OPENCL_BUILD_OPTIONS "-I.. -I." \ No newline at end of file diff --git a/ocl_silentarmy/ocl_silentarmy.cpp b/ocl_silentarmy/ocl_silentarmy.cpp index c4a2cc7b0..a820de8f5 100644 --- a/ocl_silentarmy/ocl_silentarmy.cpp +++ b/ocl_silentarmy/ocl_silentarmy.cpp @@ -16,12 +16,12 @@ //#include #include - #include "opencl.h" #include -#include "sa_blake.h" +#include +using namespace blake; typedef uint8_t uchar; typedef uint32_t uint; diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj b/ocl_silentarmy/ocl_silentarmy.vcxproj index 77771fcb6..251d3b605 100644 --- a/ocl_silentarmy/ocl_silentarmy.vcxproj +++ b/ocl_silentarmy/ocl_silentarmy.vcxproj @@ -12,12 +12,10 @@ - - @@ -56,10 +54,12 @@ $(SolutionDir)$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ + $(ProjectDir)../contrib/;$(IncludePath) $(Platform)\$(Configuration)\ $(SolutionDir)$(Platform)\$(Configuration)\ + $(ProjectDir)../contrib/;$(IncludePath) diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj.filters b/ocl_silentarmy/ocl_silentarmy.vcxproj.filters index 9659f2c07..432f9c4b1 100644 --- a/ocl_silentarmy/ocl_silentarmy.vcxproj.filters +++ b/ocl_silentarmy/ocl_silentarmy.vcxproj.filters @@ -2,12 +2,10 @@ - - diff --git a/ocl_xpm/ocl_xmp.cpp b/ocl_xpm/ocl_xmp.cpp index 4064e3626..d0a96a2a8 100644 --- a/ocl_xpm/ocl_xmp.cpp +++ b/ocl_xpm/ocl_xmp.cpp @@ -105,7 +105,7 @@ static void setheader(blake2b_state *ctx, const char *header, const uint32_t hea { uint32_t le_N = WN; uint32_t le_K = WK; - char personal[] = "DeepWebCa01230123"; + char personal[] = "ZcashPoW01230123"; memcpy(personal + 8, &le_N, 4); memcpy(personal + 12, &le_K, 4); blake2b_param P[1];
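The ENCODE_INPUTS/DECODE_ROW/DECODE_SLOT0/DECODE_SLOT1 macros defined in both headers pack one row index and two slot indices into a single 32-bit reference. A small self-contained C round-trip of the NR_ROWS_LOG <= 16 variant (function name is illustrative, not from the patch):

    #include <assert.h>
    #include <stdint.h>

    /* NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8): row in bits 31..16,
    ** slot1 in bits 15..8, slot0 in bits 7..0. */
    static uint32_t encode_inputs(uint32_t row, uint32_t slot0, uint32_t slot1)
    {
        return (row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff);
    }

    int main(void)
    {
        uint32_t ref = encode_inputs(12345, 37, 119);
        assert((ref >> 16) == 12345);       /* DECODE_ROW */
        assert(((ref >> 8) & 0xff) == 119); /* DECODE_SLOT1 */
        assert((ref & 0xff) == 37);         /* DECODE_SLOT0 */
        return 0;
    }

This is why each #if branch pairs a row-count limit with a slot-count limit: the row field and the two slot fields must together fit in 32 bits, so larger NR_ROWS_LOG values force narrower slot indices.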
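The BITS_PER_ROW/ROWS_PER_UINT/ROW_MASK ladder packs per-row slot counters several to a 32-bit word, RC_SIZE bytes in total. A sketch of the access pattern for the NR_SLOTS < 255 case (8 bits per counter, 4 counters per uint); the kernels themselves would perform the increment with an OpenCL atomic:

    #include <stdint.h>

    /* The counter for row r lives in word r / ROWS_PER_UINT at bit offset
    ** BITS_PER_ROW * (r % ROWS_PER_UINT); here 8 and 4 respectively. */
    static uint32_t row_counter_get(const uint32_t *rc, uint32_t row)
    {
        return (rc[row / 4] >> (8 * (row % 4))) & 0xFF; /* ROW_MASK */
    }

    static void row_counter_inc(uint32_t *rc, uint32_t row)
    {
        rc[row / 4] += 1u << (8 * (row % 4)); /* an atomic add on the GPU */
    }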
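Finally, the ocl_xmp.cpp hunk restores the BLAKE2b personalization that Zcash's Equihash requires: the 16-byte personal field is "ZcashPoW" followed by N and K as 32-bit little-endian integers (the "01230123" tail of the string literal is just placeholder bytes that the two memcpy calls overwrite). A minimal sketch of building that field:

    #include <stdint.h>
    #include <string.h>

    /* "ZcashPoW" || le32(N) || le32(K); with N = 200, K = 9 this matches the
    ** setheader() logic above. Assumes a little-endian host, as the original
    ** code does. */
    static void zcash_personal(uint8_t out[16], uint32_t n, uint32_t k)
    {
        memcpy(out, "ZcashPoW", 8);
        memcpy(out + 8, &n, 4);
        memcpy(out + 12, &k, 4);
    }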