From 1c57e7b5efbe84e1c252544b1da970bb8b298a3b Mon Sep 17 00:00:00 2001
From: maztheman
Date: Tue, 20 Dec 2016 08:19:45 -0700
Subject: [PATCH] AMD/NVIDIA GATELESS GATE v0.0.1

---
 contrib/blake/blake.hpp                            |   112 +
 contrib/ocl/algorithm/compress.hpp                 |    46 +
 .../algorithm/detail/gatelessgate_context.hpp      |    15 +
 .../algorithm/detail/silentarmy_context.hpp        |    82 +
 .../algorithm/detail/silentarmy_detail.hpp         |   114 +
 contrib/ocl/algorithm/gatelessgate.hpp             |   114 +
 contrib/ocl/algorithm/silentarmy.hpp               |   254 +
 contrib/ocl/cl_ext.hpp                             | 12355 ++++++++++++++++
 contrib/ocl/crypto/blake.hpp                       |    62 +
 contrib/ocl/crypto/detail/blake.hpp                |    54 +
 contrib/ocl/hex.hpp                                |    34 +
 contrib/ocl/include/blake.hpp                      |   103 +
 contrib/ocl/include/ocl_gatelessgate.hpp           |    73 +
 contrib/ocl/include/ocl_gg_context.hpp             |    34 +
 contrib/ocl/include/param.h                        |   198 +
 contrib/ocl/kernels/gatelessgate.cl                |   993 ++
 contrib/ocl/kernels/silentarmy.cl                  |   946 ++
 contrib/ocl/opencl.hpp                             |    72 +
 contrib/ocl/sols.hpp                               |    20 +
 contrib/ocl/utility/device_utils.hpp               |   181 +
 contrib/sha256/sha256.hpp                          |   228 +
 cpu_tromp/equi.h                                   |     2 +-
 cuda_silentarmy/cuda_silentarmy.vcxproj            |     6 +-
 cuda_silentarmy/kernel.cu                          |    38 +-
 .../cuda_silentarmy_sm30.vcxproj                   |     8 +-
 cuda_silentarmy_sm30/kernel.cu                     |   157 +-
 nheqminer/libstratum/StratumClient.cpp             |     9 +-
 nheqminer/libstratum/ZcashStratum.cpp              |    36 +-
 nheqminer/libstratum/ZcashStratum.h                |    29 +-
 nheqminer/main.cpp                                 |    75 +-
 nheqminer/nheqminer.sln                            |    20 +
 nheqminer/nheqminer.vcxproj                        |     4 +-
 ocl_gatelessgate/gatelessgate.cl                   |  1245 ++
 ocl_gatelessgate/gettimeofday.h                    |    43 +
 ocl_gatelessgate/ocl_gatelessgate.cpp              |   912 ++
 ocl_gatelessgate/ocl_gatelessgate.hpp              |    58 +
 ocl_gatelessgate/ocl_gatelessgate.vcxproj          |   133 +
 .../ocl_gatelessgate.vcxproj.filters               |    14 +
 ocl_gatelessgate/param.h                           |   373 +
 ocl_gatelessgate/param_nr15.h                      |   198 +
 ocl_silentarmy/ocl_silentarmy.cpp                  |     4 +-
 ocl_silentarmy/ocl_silentarmy.vcxproj              |     4 +-
 ocl_silentarmy/ocl_silentarmy.vcxproj.filters      |     2 -
 ocl_xpm/ocl_xmp.cpp                                |     2 +-
 44 files changed, 19292 insertions(+), 170 deletions(-)
 create mode 100644 contrib/blake/blake.hpp
 create mode 100644 contrib/ocl/algorithm/compress.hpp
 create mode 100644 contrib/ocl/algorithm/detail/gatelessgate_context.hpp
 create mode 100644 contrib/ocl/algorithm/detail/silentarmy_context.hpp
 create mode 100644 contrib/ocl/algorithm/detail/silentarmy_detail.hpp
 create mode 100644 contrib/ocl/algorithm/gatelessgate.hpp
 create mode 100644 contrib/ocl/algorithm/silentarmy.hpp
 create mode 100644 contrib/ocl/cl_ext.hpp
 create mode 100644 contrib/ocl/crypto/blake.hpp
 create mode 100644 contrib/ocl/crypto/detail/blake.hpp
 create mode 100644 contrib/ocl/hex.hpp
 create mode 100644 contrib/ocl/include/blake.hpp
 create mode 100644 contrib/ocl/include/ocl_gatelessgate.hpp
 create mode 100644 contrib/ocl/include/ocl_gg_context.hpp
 create mode 100644 contrib/ocl/include/param.h
 create mode 100644 contrib/ocl/kernels/gatelessgate.cl
 create mode 100644 contrib/ocl/kernels/silentarmy.cl
 create mode 100644 contrib/ocl/opencl.hpp
 create mode 100644 contrib/ocl/sols.hpp
 create mode 100644 contrib/ocl/utility/device_utils.hpp
 create mode 100644 contrib/sha256/sha256.hpp
 create mode 100644 ocl_gatelessgate/gatelessgate.cl
 create mode 100644 ocl_gatelessgate/gettimeofday.h
 create mode 100644 ocl_gatelessgate/ocl_gatelessgate.cpp
 create mode 100644 ocl_gatelessgate/ocl_gatelessgate.hpp
 create mode 100644 ocl_gatelessgate/ocl_gatelessgate.vcxproj
 create mode 100644 ocl_gatelessgate/ocl_gatelessgate.vcxproj.filters
 create mode 100644 ocl_gatelessgate/param.h
 create mode 100644 ocl_gatelessgate/param_nr15.h
diff --git a/contrib/blake/blake.hpp b/contrib/blake/blake.hpp
new file mode 100644
index 000000000..976dc6dbc
--- /dev/null
+++ b/contrib/blake/blake.hpp
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <cstdint>
+
+namespace blake {
+
+
+namespace impl {
+
+static const uint32_t blake2b_block_len = 128;
+static const uint32_t blake2b_rounds = 12;
+static const uint64_t blake2b_iv[8] =
+{
+    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
+};
+
+static const uint8_t blake2b_sigma[12][16] =
+{
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+};
+
+inline uint64_t rotr64(uint64_t a, uint8_t bits)
+{
+    return (a >> bits) | (a << (64 - bits));
+}
+
+inline void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd,
+    uint64_t x, uint64_t y)
+{
+    *va = (*va + *vb + x);
+    *vd = rotr64(*vd ^ *va, 32);
+    *vc = (*vc + *vd);
+    *vb = rotr64(*vb ^ *vc, 24);
+    *va = (*va + *vb + y);
+    *vd = rotr64(*vd ^ *va, 16);
+    *vc = (*vc + *vd);
+    *vb = rotr64(*vb ^ *vc, 63);
+}
+
+}
+
+
+typedef struct blake2b_state_s
+{
+    uint64_t h[8];
+    uint64_t bytes;
+} blake2b_state_t;
+
+inline void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k)
+{
+    using namespace blake::impl;
+
+    assert(n > k);
+    assert(hash_len <= 64);
+    st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
+    for (uint32_t i = 1; i <= 5; i++)
+        st->h[i] = blake2b_iv[i];
+    st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
+    st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
+    st->bytes = 0;
+
+}
+
+inline void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, uint32_t msg_len, uint32_t is_final)
+{
+    using namespace blake::impl;
+
+    const uint64_t *m = (const uint64_t *)_msg;
+    uint64_t v[16];
+    assert(msg_len <= 128);
+    assert(st->bytes <= UINT64_MAX - msg_len);
+    memcpy(v + 0, st->h, 8 * sizeof (*v));
+    memcpy(v + 8, blake2b_iv, 8 * sizeof (*v));
+    v[12] ^= (st->bytes += msg_len);
+    v[14] ^= is_final ? -1 : 0;
+    for (uint32_t round = 0; round < blake2b_rounds; round++)
+    {
+        const uint8_t *s = blake2b_sigma[round];
+        mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]);
+        mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]);
+        mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]);
+        mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]);
+        mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]);
+        mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]);
+        mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]);
+        mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]);
+    }
+    for (uint32_t i = 0; i < 8; i++)
+        st->h[i] ^= v[i] ^ v[i + 8];
+}
+
+inline void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
+{
+    assert(outlen <= 64);
+    memcpy(out, st->h, outlen);
+}
+
+}
\ No newline at end of file
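[reviewer note] The header above is driven exactly the way the solvers later in this patch drive it: init with the Equihash parameters, one 128-byte update, then read out the digest. A minimal host-side sketch of my own follows; the 50-byte digest length and N=200, K=9 match SILENTARMY's usage below but are not constants from this header:

    #include <cstdint>
    #include <cstring>   // blake.hpp uses memcpy
    #include <cassert>   // blake.hpp uses assert
    // #include "contrib/blake/blake.hpp"  // as added by this patch

    int main()
    {
        uint8_t header[140] = {0};  // block header || nonce, zeroed for the sketch
        blake::blake2b_state_t st;
        blake::zcash_blake2b_init(&st, 50, 200, 9);        // 50-byte digest, Equihash(200,9)
        blake::zcash_blake2b_update(&st, header, 128, 0);  // first 128 bytes; GPU kernels hash the tail
        uint8_t digest[50];
        blake::zcash_blake2b_final(&st, digest, 50);
        return 0;
    }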
diff --git a/contrib/ocl/algorithm/compress.hpp b/contrib/ocl/algorithm/compress.hpp
new file mode 100644
index 000000000..36c8e14c0
--- /dev/null
+++ b/contrib/ocl/algorithm/compress.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+namespace ocl {
+namespace algorithm {
+
+template <int _PREFIX>
+inline void compress(uint8_t *out, uint32_t *inputs, uint32_t n)
+{
+    uint32_t byte_pos = 0;
+    int32_t bits_left = _PREFIX + 1;
+    uint8_t x = 0;
+    uint8_t x_bits_used = 0;
+    uint8_t *pOut = out;
+    while (byte_pos < n)
+    {
+        if (bits_left >= 8 - x_bits_used)
+        {
+            x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
+            bits_left -= 8 - x_bits_used;
+            x_bits_used = 8;
+        }
+        else if (bits_left > 0)
+        {
+            uint32_t mask = ~(-1 << (8 - x_bits_used));
+            mask = ((~mask) >> bits_left) & mask;
+            x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
+            x_bits_used += bits_left;
+            bits_left = 0;
+        }
+        else if (bits_left <= 0)
+        {
+            assert(!bits_left);
+            byte_pos++;
+            bits_left = _PREFIX + 1;
+        }
+        if (x_bits_used == 8)
+        {
+            *pOut++ = x;
+            x = x_bits_used = 0;
+        }
+    }
+}
+
+
+}
+}
\ No newline at end of file
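[reviewer note] This packing is what turns the 2^K solution indices into the byte string a stratum server expects. For Equihash(200,9), _PREFIX = 200/(9+1) = 20, so each of the 1 << 9 = 512 indices occupies 21 bits and the packed proof is 512 * 21 / 8 = 1344 bytes, the length handed to solutionf() later in this patch. A usage sketch of mine:

    uint32_t indices[512];  // sorted solution indices, as produced by verify_sol()
    uint8_t  proof[1344];
    ocl::algorithm::compress<20>(proof, indices, 512);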
diff --git a/contrib/ocl/algorithm/detail/gatelessgate_context.hpp b/contrib/ocl/algorithm/detail/gatelessgate_context.hpp
new file mode 100644
index 000000000..47c36c21a
--- /dev/null
+++ b/contrib/ocl/algorithm/detail/gatelessgate_context.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+namespace ocl {
+namespace algorithm {
+namespace algorithm_detail {
+
+struct gatelessgate_context {
+
+
+
+};
+
+}
+}
+}
\ No newline at end of file
diff --git a/contrib/ocl/algorithm/detail/silentarmy_context.hpp b/contrib/ocl/algorithm/detail/silentarmy_context.hpp
new file mode 100644
index 000000000..a72fac862
--- /dev/null
+++ b/contrib/ocl/algorithm/detail/silentarmy_context.hpp
@@ -0,0 +1,82 @@
+#pragma once
+#include
+#include
+#include
+
+namespace ocl {
+namespace algorithm {
+namespace algorithm_detail {
+
+
+
+struct silentarmy_context {
+    cl_context _context;
+    cl_program _program;
+    cl_device_id _dev_id;
+    cl_platform_id platform_id = 0;
+    cl_command_queue queue;
+
+
+    cl_kernel k_init_ht;
+    cl_kernel k_rounds[SA_PARAM_K];
+    cl_kernel k_sols;
+
+    cl_mem buf_ht[2], buf_sols, buf_dbg, rowCounters[2];
+    size_t global_ws;
+    size_t local_work_size = 64;
+
+    sols_t *sols;
+
+    bool init(cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock) {
+        cl_int error;
+
+        queue = clCreateCommandQueue(_context, dev, 0, &error);
+
+#ifdef SA_ENABLE_DEBUG
+        size_t dbg_size = SA_NR_ROWS;
+#else
+        size_t dbg_size = 1;
+#endif
+
+        buf_dbg = check_clCreateBuffer(_context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, dbg_size, NULL);
+        buf_ht[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL);
+        buf_ht[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL);
+        buf_sols = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, sizeof(sols_t), NULL);
+
+        rowCounters[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL);
+        rowCounters[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL);
+
+
+
+        fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * SA_HT_SIZE / 1e6);
+
+        k_init_ht = clCreateKernel(_program, "kernel_init_ht", &error);
+        for (unsigned i = 0; i < SA_PARAM_K; i++) {
+            char kernelName[128];
+            sprintf(kernelName, "kernel_round%d", i);
+            k_rounds[i] = clCreateKernel(_program, kernelName, &error);
+        }
+
+        sols = (sols_t *)malloc(sizeof(*sols));
+
+        k_sols = clCreateKernel(_program, "kernel_sols", &error);
+        return true;
+
+
+    }
+
+    ~silentarmy_context() {
+        clReleaseMemObject(buf_dbg);
+        clReleaseMemObject(buf_ht[0]);
+        clReleaseMemObject(buf_ht[1]);
+        clReleaseMemObject(rowCounters[0]);
+        clReleaseMemObject(rowCounters[1]);
+        free(sols);
+    }
+
+
+};
+
+}
+}
+}
\ No newline at end of file
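[reviewer note] The destructor above releases the hash-table and debug buffers but not buf_sols, the kernels, the program, the queue or the context that init() created. If a complete teardown were wanted, a sketch of mine (same standard OpenCL release calls, in reverse order of creation) would be:

    ~silentarmy_context() {
        clReleaseKernel(k_sols);
        for (unsigned i = 0; i < SA_PARAM_K; i++)
            clReleaseKernel(k_rounds[i]);
        clReleaseKernel(k_init_ht);
        clReleaseMemObject(buf_sols);
        clReleaseMemObject(buf_dbg);
        clReleaseMemObject(buf_ht[0]);
        clReleaseMemObject(buf_ht[1]);
        clReleaseMemObject(rowCounters[0]);
        clReleaseMemObject(rowCounters[1]);
        clReleaseCommandQueue(queue);
        clReleaseProgram(_program);
        clReleaseContext(_context);
        free(sols);
    }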
diff --git a/contrib/ocl/algorithm/detail/silentarmy_detail.hpp b/contrib/ocl/algorithm/detail/silentarmy_detail.hpp
new file mode 100644
index 000000000..00f4ac35c
--- /dev/null
+++ b/contrib/ocl/algorithm/detail/silentarmy_detail.hpp
@@ -0,0 +1,114 @@
+#pragma once
+#include
+#include
+#include
+
+namespace ocl {
+namespace algorithm {
+namespace algorithm_detail {
+
+inline void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht, cl_mem rowCounters)
+{
+    size_t global_ws = SA_NR_ROWS / SA_ROWS_PER_UINT;
+    size_t local_ws = 256;
+    cl_int status;
+#if 0
+    uint32_t pat = -1;
+    status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0,
+        SA_NR_ROWS * SA_NR_SLOTS * SA_SLOT_LEN,
+        0,     // cl_uint num_events_in_wait_list
+        NULL,  // cl_event *event_wait_list
+        NULL); // cl_event *event
+    if (status != CL_SUCCESS)
+        fatal("clEnqueueFillBuffer (%d)\n", status);
+#endif
+    status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht);
+    status = clSetKernelArg(k_init_ht, 1, sizeof(rowCounters), &rowCounters);
+    if (status != CL_SUCCESS)
+        printf("clSetKernelArg (%d)\n", status);
+    check_clEnqueueNDRangeKernel(queue, k_init_ht,
+        1,          // cl_uint work_dim
+        NULL,       // size_t *global_work_offset
+        &global_ws, // size_t *global_work_size
+        &local_ws,  // size_t *local_work_size
+        0,          // cl_uint num_events_in_wait_list
+        NULL,       // cl_event *event_wait_list
+        NULL);      // cl_event *event
+}
+
+
+/*
+** Sort a pair of binary blobs (a, b) which are consecutive in memory and
+** occupy a total of 2*len 32-bit words.
+**
+** a    points to the pair
+** len  number of 32-bit words in each pair
+*/
+inline void sort_pair(uint32_t *a, uint32_t len)
+{
+    uint32_t *b = a + len;
+    uint32_t tmp, need_sorting = 0;
+    for (uint32_t i = 0; i < len; i++)
+        if (need_sorting || a[i] > b[i])
+        {
+            need_sorting = 1;
+            tmp = a[i];
+            a[i] = b[i];
+            b[i] = tmp;
+        }
+        else if (a[i] < b[i])
+            return;
+}
+
+inline uint32_t verify_sol(sols_t *sols, unsigned sol_i)
+{
+    uint32_t *inputs = sols->values[sol_i];
+    uint32_t seen_len = (1 << (SA_PREFIX + 1)) / 8;
+    uint8_t seen[(1 << (SA_PREFIX + 1)) / 8];
+    uint32_t i;
+    uint8_t tmp;
+    // look for duplicate inputs
+    memset(seen, 0, seen_len);
+    for (i = 0; i < (1 << SA_PARAM_K); i++)
+    {
+        tmp = seen[inputs[i] / 8];
+        seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
+        if (tmp == seen[inputs[i] / 8])
+        {
+            // at least one input value is a duplicate
+            sols->valid[sol_i] = 0;
+            return 0;
+        }
+    }
+    // the valid flag is already set by the GPU, but set it again because
+    // I plan to change the GPU code to not set it
+    sols->valid[sol_i] = 1;
+    // sort the pairs in place
+    for (uint32_t level = 0; level < SA_PARAM_K; level++)
+        for (i = 0; i < (1 << SA_PARAM_K); i += (2 << level))
+            sort_pair(&inputs[i], 1 << level);
+    return 1;
+}
+
+
+inline size_t select_work_size_blake(cl_device_id device_id)
+{
+
+    size_t work_size =
+        64 *            /* thread per wavefront */
+        SA_BLAKE_WPS *  /* wavefront per simd */
+        4 *             /* simd per compute unit */
+        nr_compute_units(device_id);
+    // Make the work group size a multiple of the nr of wavefronts, while
+    // dividing the number of inputs. This results in the worksize being a
+    // power of 2.
+    while (SA_NR_INPUTS % work_size)
+        work_size += 64;
+
+    return work_size;
+}
+
+
+}
+}
+}
\ No newline at end of file
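[reviewer note] sort_pair() is a lexicographic compare-and-swap of two equal-length blobs that keeps swapping once an order has been established. A tiny worked example of mine:

    uint32_t pair[4] = { 7, 2, 5, 9 };  // a = {7, 2}, b = {5, 9}
    ocl::algorithm::algorithm_detail::sort_pair(pair, 2);
    // b < a at the first word, so the blobs swap wholesale: pair == { 5, 9, 7, 2 }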
diff --git a/contrib/ocl/algorithm/gatelessgate.hpp b/contrib/ocl/algorithm/gatelessgate.hpp
new file mode 100644
index 000000000..3c6480c65
--- /dev/null
+++ b/contrib/ocl/algorithm/gatelessgate.hpp
@@ -0,0 +1,114 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace ocl {
+namespace algorithm {
+
+struct gatelessgate {
+
+    int blocks;
+    int device_id;
+    int platform_id;
+
+    algorithm_detail::gatelessgate_context* oclc;
+    // threads
+    unsigned threadsNum; // TMP
+    unsigned wokrsize;
+
+    bool is_init_success = false;
+
+    gatelessgate(int platf_id, int dev_id)
+        : blocks(0)
+        , device_id(dev_id)
+        , platform_id(platf_id)
+        , oclc(nullptr)
+        , threadsNum(8192U)
+        , wokrsize(128)
+    {
+
+    }
+
+    static int getcount() {
+        static auto devices = utility::GetAllDevices();
+        return devices.size();
+    }
+
+    static void getinfo(int platf_id, int d_id, ::std::string& gpu_name, int& sm_count, ::std::string& version) {
+        static auto devices = utility::GetAllDevices();
+
+        if (devices.size() <= d_id) {
+            return;
+        }
+        auto device = devices[d_id];
+
+        ::std::vector<char> name(256, 0);
+        cl_uint compute_units = 0;
+
+        size_t nActualSize = 0;
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+        if (rc == CL_SUCCESS) {
+            gpu_name.assign(&name[0], nActualSize);
+        }
+
+        rc = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, &nActualSize);
+        if (rc == CL_SUCCESS) {
+            sm_count = (int)compute_units;
+        }
+
+        memset(&name[0], 0, name.size());
+        rc = clGetDeviceInfo(device, CL_DEVICE_VERSION, name.size(), &name[0], &nActualSize);
+        if (rc == CL_SUCCESS) {
+            version.assign(&name[0], nActualSize);
+        }
+    }
+
+    static void start(gatelessgate& device_context) {
+
+    }
+
+    static void stop(gatelessgate& device_context) {
+
+    }
+
+    static void solve(const char *tequihash_header,
+        unsigned int tequihash_header_len,
+        const char* nonce,
+        unsigned int nonce_len,
+        ::std::function<bool()> cancelf,
+        ::std::function<void(const ::std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+        ::std::function<void(void)> hashdonef,
+        gatelessgate& device_context) {
+
+    }
+
+    std::string getname() const { return "OCL_GATELESSGATE"; }
+
+    std::string getdevinfo() {
+        static auto devices = ocl::utility::GetAllDevices();
+        auto device = devices[device_id];
+        std::vector<char> name(256, 0);
+        size_t nActualSize = 0;
+        std::string gpu_name;
+
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+        gpu_name.assign(&name[0], nActualSize);
+
+        return "GPU_ID( " + gpu_name + ")";
+    }
+
+private:
+    ::std::string m_gpu_name;
+    ::std::string m_version;
+
+};
+
+
+}
+}
+
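[reviewer note] start()/stop()/solve() are still stubs in this revision, but the static helpers are already usable. An enumeration sketch (the main() is mine; getcount()/getinfo() come from the header above, and getinfo() needs the CL_DEVICE_MAX_COMPUTE_UNITS query, not CL_DEVICE_NAME, for the compute-unit count):

    #include <cstdio>
    #include <string>

    int main()
    {
        int n = ocl::algorithm::gatelessgate::getcount();
        for (int i = 0; i < n; i++) {
            std::string name, version;
            int sm = 0;
            ocl::algorithm::gatelessgate::getinfo(0, i, name, sm, version);
            printf("GPU %d: %s (%d CUs, OpenCL %s)\n", i, name.c_str(), sm, version.c_str());
        }
        return 0;
    }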
diff --git a/contrib/ocl/algorithm/silentarmy.hpp b/contrib/ocl/algorithm/silentarmy.hpp
new file mode 100644
index 000000000..173609285
--- /dev/null
+++ b/contrib/ocl/algorithm/silentarmy.hpp
@@ -0,0 +1,254 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#define SA_COLLISION_BIT_LENGTH (SA_PARAM_N / (SA_PARAM_K+1))
+#define SA_COLLISION_BYTE_LENGTH ((SA_COLLISION_BIT_LENGTH+7)/8)
+#define SA_FINAL_FULL_WIDTH (2*SA_COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (SA_PARAM_K)))
+
+#define SA_NDIGITS (SA_PARAM_K+1)
+#define SA_DIGITBITS (SA_PARAM_N/(SA_NDIGITS))
+#define SA_PROOFSIZE (1u<<SA_PARAM_K)
+#define SA_COMPRESSED_PROOFSIZE ((SA_COLLISION_BIT_LENGTH+1)*SA_PROOFSIZE*4/(8*sizeof(uint32_t)))
+
+namespace ocl {
+namespace algorithm {
+
+struct silentarmy {
+
+    int blocks;
+    int device_id;
+    int platform_id;
+
+    algorithm_detail::silentarmy_context* oclc;
+    // threads
+    unsigned threadsNum; // TMP
+    unsigned wokrsize;
+
+    bool is_init_success = false;
+
+    silentarmy(int platf_id, int dev_id)
+        : blocks(0)
+        , device_id(dev_id)
+        , platform_id(platf_id)
+        , oclc(nullptr)
+        , threadsNum(8192U)
+        , wokrsize(128)
+    {
+
+    }
+
+    static int getcount() {
+        static auto devices = utility::GetAllDevices();
+        return devices.size();
+    }
+
+    static void getinfo(int platf_id, int d_id, ::std::string& gpu_name, int& sm_count, ::std::string& version) {
+        static auto devices = utility::GetAllDevices();
+
+        if (devices.size() <= d_id) {
+            return;
+        }
+        auto device = devices[d_id];
+
+        ::std::vector<char> name(256, 0);
+        cl_uint compute_units = 0;
+
+        size_t nActualSize = 0;
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+        if (rc == CL_SUCCESS) {
+            gpu_name.assign(&name[0], nActualSize);
+        }
+
+        rc = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, &nActualSize);
+        if (rc == CL_SUCCESS) {
+            sm_count = (int)compute_units;
+        }
+
+        memset(&name[0], 0, name.size());
+        rc = clGetDeviceInfo(device, CL_DEVICE_VERSION, name.size(), &name[0], &nActualSize);
+        if (rc == CL_SUCCESS) {
+            version.assign(&name[0], nActualSize);
+        }
+    }
+
+    static void start(silentarmy& device_context) {
+        device_context.is_init_success = false;
+        device_context.oclc = new algorithm_detail::silentarmy_context;
+        auto devices = utility::GetAllDevices();
+
+        auto& device = devices[device_context.device_id];
+
+        size_t nActualSize = 0;
+        cl_platform_id platform_id = nullptr;
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, nullptr);
+
+
+        device_context.oclc->_dev_id = device;
+        device_context.oclc->platform_id = platform_id;
+
+        // context create
+        cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)device_context.oclc->platform_id, 0 };
+        cl_int error;
+        device_context.oclc->_context = clCreateContext(props, 1, &device, 0, 0, &error);
+        //OCLR(error, false);
+        if (cl_int err = error) {
+            printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
+            return;
+        }
+
+        cl_int binstatus;
+
+        char kernelName[64];
+        sprintf(kernelName, "silentarmy_gpu_%u.bin", (unsigned)device_context.device_id);
+        if (!utility::clCompileKernel(device_context.oclc->_context,
+            device,
+            kernelName,
+            { "zcash/gpu/silentarmy.cl" },
+            "",
+            &binstatus,
+            &device_context.oclc->_program)) {
+            return;
+        }
+
+        if (binstatus == CL_SUCCESS) {
+            if (!device_context.oclc->init(device, device_context.threadsNum, device_context.wokrsize)) {
+                printf("Init failed");
+                return;
+            }
+        } else {
+            printf("GPU %d: failed to load kernel\n", device_context.device_id);
+            return;
+        }
+
+        device_context.is_init_success = true;
+    }
+
+    static void stop(silentarmy& device_context) {
+        if (device_context.oclc != nullptr) delete device_context.oclc;
+    }
+
+    static void solve(const char *tequihash_header,
+        unsigned int tequihash_header_len,
+        const char* nonce,
+        unsigned int nonce_len,
+        std::function<bool()> cancelf,
+        std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+        std::function<void(void)> hashdonef,
+        silentarmy& device_context) {
+        using namespace ocl::crypto;
+        using namespace algorithm_detail;
+
+        unsigned char context[140];
+        memset(context, 0, 140);
+        memcpy(context, tequihash_header, tequihash_header_len);
+        memcpy(context + tequihash_header_len, nonce, nonce_len);
+
+        auto *miner = device_context.oclc;
+        clFlush(miner->queue);
+
+        blake2b_state_t initialCtx;
+        zcash_blake2b_init(&initialCtx, SA_ZCASH_HASH_LEN, SA_PARAM_N, SA_PARAM_K);
+        zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0);
+
+        cl_mem buf_blake_st;
+        buf_blake_st = check_clCreateBuffer(miner->_context, CL_MEM_READ_ONLY |
+            CL_MEM_COPY_HOST_PTR, sizeof(blake2b_state_s), &initialCtx);
+
+        for (unsigned round = 0; round < SA_PARAM_K; round++)
+        {
+            init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round & 1], miner->rowCounters[round & 1]);
+            if (!round)
+            {
+                check_clSetKernelArg(miner->k_rounds[round], 0, &buf_blake_st);
+                check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round & 1]);
+                check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[round & 2]);
+                miner->global_ws = select_work_size_blake(miner->_dev_id);
+            }
+            else
+            {
+                check_clSetKernelArg(miner->k_rounds[round], 0, &miner->buf_ht[(round - 1) & 1]);
+                check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round & 1]);
+                check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[(round - 1) & 1]);
+                check_clSetKernelArg(miner->k_rounds[round], 3, &miner->rowCounters[round & 1]);
+                miner->global_ws = SA_NR_ROWS;
+            }
+            check_clSetKernelArg(miner->k_rounds[round], round == 0 ? 3 : 4, &miner->buf_dbg);
+            if (round == SA_PARAM_K - 1)
+                check_clSetKernelArg(miner->k_rounds[round], 5, &miner->buf_sols);
+            check_clEnqueueNDRangeKernel(miner->queue, miner->k_rounds[round], 1, NULL,
+                &miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
+            // cancel function
+            if (cancelf()) return;
+        }
+        check_clSetKernelArg(miner->k_sols, 0, &miner->buf_ht[0]);
+        check_clSetKernelArg(miner->k_sols, 1, &miner->buf_ht[1]);
+        check_clSetKernelArg(miner->k_sols, 2, &miner->buf_sols);
+        check_clSetKernelArg(miner->k_sols, 3, &miner->rowCounters[0]);
+        check_clSetKernelArg(miner->k_sols, 4, &miner->rowCounters[1]);
+        miner->global_ws = SA_NR_ROWS;
+        check_clEnqueueNDRangeKernel(miner->queue, miner->k_sols, 1, NULL,
+            &miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
+
+        check_clEnqueueReadBuffer(miner->queue, miner->buf_sols,
+            CL_TRUE,               // cl_bool blocking_read
+            0,                     // size_t offset
+            sizeof(*miner->sols), // size_t size
+            miner->sols,          // void *ptr
+            0,                     // cl_uint num_events_in_wait_list
+            NULL,                  // cl_event *event_wait_list
+            NULL);                 // cl_event *event
+
+        if (miner->sols->nr > SA_MAX_SOLS)
+            miner->sols->nr = SA_MAX_SOLS;
+
+        clReleaseMemObject(buf_blake_st);
+
+        for (unsigned sol_i = 0; sol_i < miner->sols->nr; sol_i++) {
+            verify_sol(miner->sols, sol_i);
+        }
+
+        uint8_t proof[SA_COMPRESSED_PROOFSIZE * 2];
+        for (uint32_t i = 0; i < miner->sols->nr; i++) {
+            if (miner->sols->valid[i]) {
+                compress<SA_PREFIX>(proof, (uint32_t *)(miner->sols->values[i]), 1 << SA_PARAM_K);
+                solutionf(std::vector<uint32_t>(0), 1344, proof);
+            }
+        }
+        hashdonef();
+    }
+
+    std::string getname() const { return "OCL_SILENTARMY"; }
+
+    std::string getdevinfo() {
+        static auto devices = ocl::utility::GetAllDevices();
+        auto device = devices[device_id];
+        std::vector<char> name(256, 0);
+        size_t nActualSize = 0;
+        std::string gpu_name;
+
+        cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+        gpu_name.assign(&name[0], nActualSize);
+
+        return "GPU_ID( " + gpu_name + ")";
+    }
+
+private:
+    std::string m_gpu_name;
+    std::string m_version;
+
+};
+
+}
+}
\ No newline at end of file
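[reviewer note] solve() ping-pongs between the two hash tables: round r reads the table round r-1 wrote and overwrites the other, which is why only two tables (and two row-counter buffers) are ever allocated for the K rounds. A print-out of the indexing, my own snippet with K = 9:

    #include <cstdio>
    int main()
    {
        for (unsigned round = 0; round < 9; round++) {
            if (round == 0)
                printf("round 0: reads blake state, writes ht[0]\n");
            else
                printf("round %u: reads ht[%u], writes ht[%u]\n",
                       round, (round - 1) & 1, round & 1);
        }
        return 0;
    }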
diff --git a/contrib/ocl/cl_ext.hpp b/contrib/ocl/cl_ext.hpp
new file mode 100644
index 000000000..507598171
--- /dev/null
+++ b/contrib/ocl/cl_ext.hpp
@@ -0,0 +1,12355 @@
+/*******************************************************************************
+* Copyright (c) 2008-2013 The Khronos Group Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and/or associated documentation files (the
+* "Materials"), to deal in the Materials without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Materials, and to
+* permit persons to whom the Materials are furnished to do so, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Materials.
+*
+* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+******************************************************************************/
+
+/*! \file
+*
+*   \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and
+*       OpenCL 1.2 (rev 15)
+*   \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
+*
+*   Additions and fixes from:
+*       Brian Cole, March 3rd 2010 and April 2012
+*       Matt Gruenke, April 2012.
+*       Bruce Merry, February 2013.
+*
+*   \version 1.2.5
+*   \date June 2013
+*
+*   Optional extension support
+*
+*       cl
+*       cl_ext_device_fission
+*           #define USE_CL_DEVICE_FISSION
+*/
+
+/*! \mainpage
+* \section intro Introduction
+* For many large applications C++ is the language of choice and so it seems
+* reasonable to define C++ bindings for OpenCL.
+*
+*
+* The interface is contained with a single C++ header file \em cl.hpp and all
+* definitions are contained within the namespace \em cl. There is no additional
+* requirement to include \em cl.h and to use either the C++ or original C
+* bindings it is enough to simply include \em cl.hpp.
+*
+* The bindings themselves are lightweight and correspond closely to the
+* underlying C API. Using the C++ bindings introduces no additional execution
+* overhead.
+*
+* For detail documentation on the bindings see:
+*
+* The OpenCL C++ Wrapper API 1.2 (revision 09)
+*  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
+*
+* \section example Example
+*
+* The following example shows a general use case for the C++
+* bindings, including support for the optional exception feature and
+* also the supplied vector and string classes, see following sections for
+* decriptions of these features.
+*
+* \code
+* #define __CL_ENABLE_EXCEPTIONS
+*
+* #if defined(__APPLE__) || defined(__MACOSX)
+* #include <OpenCL/cl.hpp>
+* #else
+* #include <CL/cl.hpp>
+* #endif
+* #include <cstdio>
+* #include <cstdlib>
+* #include <iostream>
+*
+*   const char * helloStr  = "__kernel void "
+*                            "hello(void) "
+*                            "{ "
+*                            "  "
+*                            "} ";
+*
+*   int
+*   main(void)
+*   {
+*      cl_int err = CL_SUCCESS;
+*      try {
+*
+*        std::vector<cl::Platform> platforms;
+*        cl::Platform::get(&platforms);
+*        if (platforms.size() == 0) {
+*            std::cout << "Platform size 0\n";
+*            return -1;
+*        }
+*
+*        cl_context_properties properties[] =
+*           { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+*        cl::Context context(CL_DEVICE_TYPE_CPU, properties);
+*
+*        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+*
+*        cl::Program::Sources source(1,
+*            std::make_pair(helloStr,strlen(helloStr)));
+*        cl::Program program_ = cl::Program(context, source);
+*        program_.build(devices);
+*
+*        cl::Kernel kernel(program_, "hello", &err);
+*
+*        cl::Event event;
+*        cl::CommandQueue queue(context, devices[0], 0, &err);
+*        queue.enqueueNDRangeKernel(
+*            kernel,
+*            cl::NullRange,
+*            cl::NDRange(4,4),
+*            cl::NullRange,
+*            NULL,
+*            &event);
+*
+*        event.wait();
+*      }
+*      catch (cl::Error err) {
+*         std::cerr
+*            << "ERROR: "
+*            << err.what()
+*            << "("
+*            << err.err()
+*            << ")"
+*            << std::endl;
+*      }
+*
+*      return EXIT_SUCCESS;
+*   }
+*
+* \endcode
+*
+*/
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <malloc.h>
+#include <iterator>
+#include <intrin.h>
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
+
+#pragma push_macro("max")
+#undef max
+#if defined(USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#include <CL/cl_dx9_media_sharing.h>
+#endif
+#endif // _WIN32
+
+//
+#if defined(USE_CL_DEVICE_FISSION)
+#include <CL/cl_ext.h> // AMD topology not needed here
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/opencl.h>
+#include <AvailabilityMacros.h>
+#else
+#include <GL/gl.h>
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+// To avoid accidentally taking ownership of core OpenCL types
+// such as cl_kernel constructors are made explicit
+// under OpenCL 1.2
+#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS explicit
+#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+// Define deprecated prefixes and suffixes to ensure compilation
+// in case they are not pre-defined
+#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <utility>
+#include <limits>
+
+#if !defined(__NO_STD_VECTOR)
+#include <vector>
+#endif
+
+#if !defined(__NO_STD_STRING)
+#include <string>
+#endif
+
+#if defined(__linux__) || defined(__APPLE__) || defined(__MACOSX)
+#include <alloca.h>
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif // __linux__
+
+#include <cstring>
+
+
+/*! \namespace cl
+*
+* \brief The OpenCL C++ bindings are defined within this namespace.
+*
+*/
+namespace cl {
+
+    class Memory;
+
+    /**
+     * Deprecated APIs for 1.2
+     */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+#define __INIT_CL_EXT_FCN_PTR(name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddress(#name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+#if defined(CL_VERSION_1_2)
+#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddressForPlatform(platform, #name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+    class Program;
+    class Device;
+    class Context;
+    class CommandQueue;
+    class Memory;
+    class Buffer;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+    /*! \brief Exception class
+     *
+     *  This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
+     */
+    class Error : public std::exception
+    {
+    private:
+        cl_int err_;
+        const char * errStr_;
+    public:
+        /*! \brief Create a new CL error exception for a given error code
+         *  and corresponding message.
+         *
+         *  \param err error code value.
+         *
+         *  \param errStr a descriptive string that must remain in scope until
+         *                handling of the exception has concluded. If set, it
+         *                will be returned by what().
+         */
+        Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+        {}
+
+        ~Error() throw() {}
+
+        /*! \brief Get error string associated with exception
+         *
+         * \return A memory pointer to the error message string.
+         */
+        virtual const char * what() const throw ()
+        {
+            if (errStr_ == NULL) {
+                return "empty";
+            }
+            else {
+                return errStr_;
+            }
+        }
+
+        /*! \brief Get error code associated with exception
+         *
+         *  \return The error code.
+         */
+        cl_int err(void) const { return err_; }
+    };
+
+#define __ERR_STR(x) #x
+#else
+#define __ERR_STR(x) NULL
+#endif // __CL_ENABLE_EXCEPTIONS
+
+
+    namespace detail
+    {
+#if defined(__CL_ENABLE_EXCEPTIONS)
+        static inline cl_int errHandler(
+            cl_int err,
+            const char * errStr = NULL)
+        {
+            if (err != CL_SUCCESS) {
+                throw Error(err, errStr);
+            }
+            return err;
+        }
+#else
+        static inline cl_int errHandler(cl_int err, const char * errStr = NULL)
+        {
+            (void)errStr; // suppress unused variable warning
+            return err;
+        }
+#endif // __CL_ENABLE_EXCEPTIONS
+    }
+
+
+    //! \cond DOXYGEN_DETAIL
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo)
+#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo)
+#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs)
+#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs)
+#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo)
+#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo)
+#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo)
+#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo)
+#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo)
+#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo)
+#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo)
+#if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo)
+#endif // #if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo)
+#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo)
+#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo)
+#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo)
+
+#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext)
+#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType)
+#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats)
+
+#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer)
+#define __COPY_ERR __ERR_STR(cl::copy)
+#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer)
+#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
+#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
+#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage)
+#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture)
+#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions)
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler)
+#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
+
+#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent)
+#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus)
+#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback)
+#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents)
+
+#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel)
+#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg)
+#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource)
+#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels)
+#endif // #if defined(CL_VERSION_1_2)
+#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram)
+#if defined(CL_VERSION_1_2)
+#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram)
+
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram)
+
+#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue)
+#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty)
+#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer)
+#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect)
+#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer)
+#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect)
+#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer)
+#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect)
+#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer)
+#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage)
+#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage)
+#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage)
+#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage)
+#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer)
+#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage)
+#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage)
+#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject)
+#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel)
+#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask)
+#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel)
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects)
+#endif // #if defined(CL_VERSION_1_2)
+
+#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects)
+#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects)
+
+
+#define __RETAIN_ERR __ERR_STR(Retain Object)
+#define __RELEASE_ERR __ERR_STR(Release Object)
+#define __FLUSH_ERR __ERR_STR(clFlush)
+#define __FINISH_ERR __ERR_STR(clFinish)
+#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error)
+
+    /**
+     * CL 1.2 version that uses device fission.
+     */
+#if defined(CL_VERSION_1_2)
+#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices)
+#else
+#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT)
+#endif // #if defined(CL_VERSION_1_2)
+
+    /**
+     * Deprecated APIs for 1.2
+     */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier)
+#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler)
+#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D)
+#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D)
+#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D)
+#endif // #if defined(CL_VERSION_1_1)
+
+#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
+    //! \endcond
+
+    /**
+     * CL 1.2 marker and barrier commands
+     */
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
+    typedef std::string STRING_CLASS;
+#elif !defined(__USE_DEV_STRING)
+
+    /*! \class string
+     * \brief Simple string class, that provides a limited subset of std::string
+     * functionality but avoids many of the issues that come with that class.
+
+     *  \note Deprecated. Please use std::string as default or
+     *  re-define the string class to match the std::string
+     *  interface by defining STRING_CLASS
+     */
+    class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+    private:
+        ::size_t size_;
+        char * str_;
+    public:
+        //! \brief Constructs an empty string, allocating no memory.
+        string(void) : size_(0), str_(NULL)
+        {
+        }
+
+        /*! \brief Constructs a string populated from an arbitrary value of
+         *  specified size.
+         *
+         *  An extra '\0' is added, in case none was contained in str.
+         *
+         *  \param str the initial value of the string instance. Note that '\0'
+         *             characters receive no special treatment. If NULL,
+         *             the string is left empty, with a size of 0.
+         *
+         *  \param size the number of characters to copy from str.
+         */
+        string(const char * str, ::size_t size) :
+            size_(size),
+            str_(NULL)
+        {
+            if (size > 0) {
+                str_ = new char[size_ + 1];
+                if (str_ != NULL) {
+                    memcpy(str_, str, size_ * sizeof(char));
+                    str_[size_] = '\0';
+                }
+                else {
+                    size_ = 0;
+                }
+            }
+        }
+
+        /*! \brief Constructs a string populated from a null-terminated value.
+         *
+         *  \param str the null-terminated initial value of the string instance.
+         *             If NULL, the string is left empty, with a size of 0.
+         */
+        string(const char * str) :
+            size_(0),
+            str_(NULL)
+        {
+            if (str) {
+                size_ = ::strlen(str);
+            }
+            if (size_ > 0) {
+                str_ = new char[size_ + 1];
+                if (str_ != NULL) {
+                    memcpy(str_, str, (size_ + 1) * sizeof(char));
+                }
+            }
+        }
+
+        void resize(::size_t n)
+        {
+            if (size_ == n) {
+                return;
+            }
+            if (n == 0) {
+                if (str_) {
+                    delete[] str_;
+                }
+                str_ = NULL;
+                size_ = 0;
+            }
+            else {
+                char *newString = new char[n + 1];
+                int copySize = n;
+                if (size_ < n) {
+                    copySize = size_;
+                }
+                size_ = n;
+
+                if (str_) {
+                    memcpy(newString, str_, (copySize + 1) * sizeof(char));
+                }
+                if (copySize < size_) {
+                    memset(newString + copySize, 0, size_ - copySize);
+                }
+                newString[size_] = '\0';
+
+                delete[] str_;
+                str_ = newString;
+            }
+        }
+
+        const char& operator[] (::size_t pos) const
+        {
+            return str_[pos];
+        }
+
+        char& operator[] (::size_t pos)
+        {
+            return str_[pos];
+        }
+
+        /*! \brief Copies the value of another string to this one.
+         *
+         *  \param rhs the string to copy.
+         *
+         *  \returns a reference to the modified instance.
+         */
+        string& operator=(const string& rhs)
+        {
+            if (this == &rhs) {
+                return *this;
+            }
+
+            if (str_ != NULL) {
+                delete[] str_;
+                str_ = NULL;
+                size_ = 0;
+            }
+
+            if (rhs.size_ == 0 || rhs.str_ == NULL) {
+                str_ = NULL;
+                size_ = 0;
+            }
+            else {
+                str_ = new char[rhs.size_ + 1];
+                size_ = rhs.size_;
+
+                if (str_ != NULL) {
+                    memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
+                }
+                else {
+                    size_ = 0;
+                }
+            }
+
+            return *this;
+        }
+
+        /*! \brief Constructs a string by copying the value of another instance.
+         *
+         *  \param rhs the string to copy.
+         */
+        string(const string& rhs) :
+            size_(0),
+            str_(NULL)
+        {
+            *this = rhs;
+        }
+
+        //! \brief Destructor - frees memory used to hold the current value.
+        ~string()
+        {
+            delete[] str_;
+            str_ = NULL;
+        }
+
+        //! \brief Queries the length of the string, excluding any added '\0's.
+        ::size_t size(void) const { return size_; }
+
+        //! \brief Queries the length of the string, excluding any added '\0's.
+        ::size_t length(void) const { return size(); }
+
+        /*! \brief Returns a pointer to the private copy held by this instance,
+         *  or "" if empty/unset.
+         */
+        const char * c_str(void) const { return (str_) ? str_ : ""; }
+    };
+    typedef cl::string STRING_CLASS;
+#endif // #elif !defined(__USE_DEV_STRING)
+
+#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+#define VECTOR_CLASS std::vector
+#elif !defined(__USE_DEV_VECTOR)
+#define VECTOR_CLASS cl::vector
+
+#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
+#define __MAX_DEFAULT_VECTOR_SIZE 10
+#endif
+
+    /*! \class vector
+     * \brief Fixed sized vector implementation that mirroring
+     *
+     * \note Deprecated. Please use std::vector as default or
+     *       re-define the vector class to match the std::vector
+     *       interface by defining VECTOR_CLASS
+
+     * \note Not recommended for use with custom objects as
+     *       current implementation will construct N elements
+     *
+     * std::vector functionality.
+     * \brief Fixed sized vector compatible with std::vector.
+     *
+     * \note
+     * This differs from std::vector<> not just in memory allocation,
+     * but also in terms of when members are constructed, destroyed,
+     * and assigned instead of being copy constructed.
+     *
+     * \param T type of element contained in the vector.
+     *
+     * \param N maximum size of the vector.
+     */
+    template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
+    class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+    private:
+        T data_[N];
+        unsigned int size_;
+
+    public:
+        //! \brief Constructs an empty vector with no memory allocated.
+        vector() :
+            size_(static_cast<unsigned int>(0))
+        {}
+
+        //! \brief Deallocates the vector's memory and destroys all of its elements.
+        ~vector()
+        {
+            clear();
+        }
+
+        //! \brief Returns the number of elements currently contained.
+        unsigned int size(void) const
+        {
+            return size_;
+        }
+
+        /*! \brief Empties the vector of all elements.
+         *  \note
+         *  This does not deallocate memory but will invoke destructors
+         *  on contained elements.
+         */
+        void clear()
+        {
+            while (!empty()) {
+                pop_back();
+            }
+        }
+
+        /*! \brief Appends an element after the last valid element.
+         * Calling this on a vector that has reached capacity will throw an
+         * exception if exceptions are enabled.
+         */
+        void push_back(const T& x)
+        {
+            if (size() < N) {
+                new (&data_[size_]) T(x);
+                size_++;
+            }
+            else {
+                detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+            }
+        }
+
+        /*! \brief Removes the last valid element from the vector.
+         * Calling this on an empty vector will throw an exception
+         * if exceptions are enabled.
+         */
+        void pop_back(void)
+        {
+            if (size_ != 0) {
+                --size_;
+                data_[size_].~T();
+            }
+            else {
+                detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+            }
+        }
+
+        /*! \brief Constructs with a value copied from another.
+         *
+         *  \param vec the vector to copy.
+         */
+        vector(const vector<T, N>& vec) :
+            size_(vec.size_)
+        {
+            if (size_ != 0) {
+                assign(vec.begin(), vec.end());
+            }
+        }
+
+        /*! \brief Constructs with a specified number of initial elements.
+         *
+         *  \param size number of initial elements.
+         *
+         *  \param val value of initial elements.
+         */
+        vector(unsigned int size, const T& val = T()) :
+            size_(0)
+        {
+            for (unsigned int i = 0; i < size; i++) {
+                push_back(val);
+            }
+        }
+
+        /*! \brief Overwrites the current content with that copied from another
+         *         instance.
+         *
+         *  \param rhs vector to copy.
+         *
+         *  \returns a reference to this.
+         */
+        vector<T, N>& operator=(const vector<T, N>& rhs)
+        {
+            if (this == &rhs) {
+                return *this;
+            }
+
+            if (rhs.size_ != 0) {
+                assign(rhs.begin(), rhs.end());
+            }
+            else {
+                clear();
+            }
+
+            return *this;
+        }
+
+        /*! \brief Tests equality against another instance.
+         *
+         *  \param vec the vector against which to compare.
+         */
+        bool operator==(vector<T, N> &vec)
+        {
+            if (size() != vec.size()) {
+                return false;
+            }
+
+            for (unsigned int i = 0; i < size(); ++i) {
+                if (operator[](i) != vec[i]) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        //! \brief Conversion operator to T*.
+        operator T* () { return data_; }
+
+        //! \brief Conversion operator to const T*.
+        operator const T* () const { return data_; }
+
+        //! \brief Tests whether this instance has any elements.
+        bool empty(void) const
+        {
+            return size_ == 0;
+        }
+
+        //! \brief Returns the maximum number of elements this instance can hold.
+        unsigned int max_size(void) const
+        {
+            return N;
+        }
+
+        //! \brief Returns the maximum number of elements this instance can hold.
+        unsigned int capacity() const
+        {
+            return N;
+        }
+
+        /*! \brief Returns a reference to a given element.
+         *
+         *  \param index which element to access.     *
+         *  \note
+         *  The caller is responsible for ensuring index is >= 0 and < size().
+         */
+        T& operator[](int index)
+        {
+            return data_[index];
+        }
+
+        /*! \brief Returns a const reference to a given element.
+         *
+         *  \param index which element to access.
+         *
+         *  \note
+         *  The caller is responsible for ensuring index is >= 0 and < size().
+         */
+        const T& operator[](int index) const
+        {
+            return data_[index];
+        }
+
+        /*! \brief Assigns elements of the vector based on a source iterator range.
+         *
+         *  \param start Beginning iterator of source range
+         *  \param end Enditerator of source range
+         *
+         *  \note
+         *  Will throw an exception if exceptions are enabled and size exceeded.
+         */
+        template<class I>
+        void assign(I start, I end)
+        {
+            clear();
+            while (start != end) {
+                push_back(*start);
+                start++;
+            }
+        }
+
+        /*! \class iterator
+         * \brief Const iterator class for vectors
+         */
+        class iterator
+        {
+        private:
+            const vector<T, N> *vec_;
+            int index_;
+
+            /**
+             * Internal iterator constructor to capture reference
+             * to the vector it iterates over rather than taking
+             * the vector by copy.
+             */
+            iterator(const vector<T, N> &vec, int index) :
+                vec_(&vec)
+            {
+                if (!vec.empty()) {
+                    index_ = index;
+                }
+                else {
+                    index_ = -1;
+                }
+            }
+
+        public:
+            iterator(void) :
+                index_(-1),
+                vec_(NULL)
+            {
+            }
+
+            iterator(const iterator& rhs) :
+                vec_(rhs.vec_),
+                index_(rhs.index_)
+            {
+            }
+
+            ~iterator(void) {}
+
+            static iterator begin(const cl::vector<T, N> &vec)
+            {
+                iterator i(vec, 0);
+
+                return i;
+            }
+
+            static iterator end(const cl::vector<T, N> &vec)
+            {
+                iterator i(vec, vec.size());
+
+                return i;
+            }
+
+            bool operator==(iterator i)
+            {
+                return ((vec_ == i.vec_) &&
+                    (index_ == i.index_));
+            }
+
+            bool operator!=(iterator i)
+            {
+                return (!(*this == i));
+            }
+
+            iterator& operator++()
+            {
+                ++index_;
+                return *this;
+            }
+
+            iterator operator++(int)
+            {
+                iterator retVal(*this);
+                ++index_;
+                return retVal;
+            }
+
+            iterator& operator--()
+            {
+                --index_;
+                return *this;
+            }
+
+            iterator operator--(int)
+            {
+                iterator retVal(*this);
+                --index_;
+                return retVal;
+            }
+
+            const T& operator *() const
+            {
+                return (*vec_)[index_];
+            }
+        };
+
+        iterator begin(void)
+        {
+            return iterator::begin(*this);
+        }
+
+        iterator begin(void) const
+        {
+            return iterator::begin(*this);
+        }
+
+        iterator end(void)
+        {
+            return iterator::end(*this);
+        }
+
+        iterator end(void) const
+        {
+            return iterator::end(*this);
+        }
+
+        T& front(void)
+        {
+            return data_[0];
+        }
+
+        T& back(void)
+        {
+            return data_[size_ - 1];
+        }
+
+        const T& front(void) const
+        {
+            return data_[0];
+        }
+
+        const T& back(void) const
+        {
+            return data_[size_ - 1];
+        }
+    };
+#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+
+
+
+
+
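[reviewer note] The deprecated cl::vector above is a fixed-capacity, stack-backed stand-in for std::vector; with __NO_STD_VECTOR defined, VECTOR_CLASS maps to it. A short usage sketch of mine:

    #include <cstdio>
    #define __NO_STD_VECTOR
    // #include <CL/cl.hpp>             // i.e. this vendored header

    int main()
    {
        cl::vector<int, 4> v;           // capacity fixed at N = 4, no heap allocation
        v.push_back(1);
        v.push_back(2);
        for (cl::vector<int, 4>::iterator it = v.begin(); it != v.end(); ++it)
            printf("%d\n", *it);        // iteration is read-only (const iterator)
        return 0;
    }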
+    namespace detail {
+#define __DEFAULT_NOT_INITIALIZED 1
+#define __DEFAULT_BEING_INITIALIZED 2
+#define __DEFAULT_INITIALIZED 4
+
+        /*
+         * Compare and exchange primitives are needed for handling of defaults
+         */
+        inline int compare_exchange(volatile int * dest, int exchange, int comparand)
+        {
+#ifdef _WIN32
+            return (int)(InterlockedCompareExchange(
+                (volatile long*)dest,
+                (long)exchange,
+                (long)comparand));
+#elif defined(__APPLE__) || defined(__MACOSX)
+            return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest);
+#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
+            return (__sync_val_compare_and_swap(
+                dest,
+                comparand,
+                exchange));
+#endif // !_WIN32
+        }
+
+        inline void fence() { _mm_mfence(); }
+    }; // namespace detail
+
+
+    /*! \brief class used to interface between C++ and
+     *  OpenCL C calls that require arrays of size_t values, whose
+     *  size is known statically.
+     */
+    template <int N>
+    class size_t
+    {
+    private:
+        ::size_t data_[N];
+
+    public:
+        //! \brief Initialize size_t to all 0s
+        size_t()
+        {
+            for (int i = 0; i < N; ++i) {
+                data_[i] = 0;
+            }
+        }
+
+        ::size_t& operator[](int index)
+        {
+            return data_[index];
+        }
+
+        const ::size_t& operator[](int index) const
+        {
+            return data_[index];
+        }
+
+        //! \brief Conversion operator to T*.
+        operator ::size_t* ()             { return data_; }
+
+        //! \brief Conversion operator to const T*.
+        operator const ::size_t* () const { return data_; }
+    };
+
+    namespace detail {
+
+        // Generic getInfoHelper. The final parameter is used to guide overload
+        // resolution: the actual parameter passed is an int, which makes this
+        // a worse conversion sequence than a specialization that declares the
+        // parameter as an int.
+        template<typename Functor, typename T>
+        inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+        {
+            return f(name, sizeof(T), param, NULL);
+        }
+
+        // Specialized getInfoHelper for VECTOR_CLASS params
+        template <typename Func, typename T>
+        inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
+        {
+            ::size_t required;
+            cl_int err = f(name, 0, NULL, &required);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            T* value = (T*)alloca(required);
+            err = f(name, required, value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            param->assign(&value[0], &value[required / sizeof(T)]);
+            return CL_SUCCESS;
+        }
+
+        /* Specialization for reference-counted types. This depends on the
+         * existence of Wrapper<T>::cl_type, and none of the other types having the
+         * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+         * does not work, because when using a derived type (e.g. Context) the generic
+         * template will provide a better match.
+         */
+        template <typename Func, typename T>
+        inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
+        {
+            ::size_t required;
+            cl_int err = f(name, 0, NULL, &required);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            typename T::cl_type * value = (typename T::cl_type *) alloca(required);
+            err = f(name, required, value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            ::size_t elements = required / sizeof(typename T::cl_type);
+            param->assign(&value[0], &value[elements]);
+            for (::size_t i = 0; i < elements; i++)
+            {
+                if (value[i] != NULL)
+                {
+                    err = (*param)[i].retain();
+                    if (err != CL_SUCCESS) {
+                        return err;
+                    }
+                }
+            }
+            return CL_SUCCESS;
+        }
+
+        // Specialized for getInfo<CL_PROGRAM_BINARIES>
+        template <typename Func>
+        inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
+        {
+            cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
+
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            return CL_SUCCESS;
+        }
+
+        // Specialized GetInfoHelper for STRING_CLASS params
+        template <typename Func>
+        inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
+        {
+            ::size_t required;
+            cl_int err = f(name, 0, NULL, &required);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            char* value = (char*)alloca(required);
+            err = f(name, required, value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            *param = value;
+            return CL_SUCCESS;
+        }
+
+        // Specialized GetInfoHelper for cl::size_t params
+        template <typename Func, ::size_t N>
+        inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
+        {
+            ::size_t required;
+            cl_int err = f(name, 0, NULL, &required);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            ::size_t* value = (::size_t*) alloca(required);
+            err = f(name, required, value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+
+            for (int i = 0; i < N; ++i) {
+                (*param)[i] = value[i];
+            }
+
+            return CL_SUCCESS;
+        }
+
+        template<typename T> struct ReferenceHandler;
+
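[reviewer note] These overloads are the plumbing behind the typed getInfo<> accessors defined further down in this 12k-line header: overload resolution picks the STRING_CLASS, vector, size_t<N> or reference-counted variant from the parameter type. Typical calls through the standard cl.hpp API look like:

    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    std::vector<cl::Device> devices;
    platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
    std::string name = devices[0].getInfo<CL_DEVICE_NAME>();                  // STRING_CLASS overload
    cl_uint cus = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();          // scalar overload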
+        /* Specialization for reference-counted types. This depends on the
+         * existence of Wrapper<T>::cl_type, and none of the other types having the
+         * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+         * does not work, because when using a derived type (e.g. Context) the generic
+         * template will provide a better match.
+         */
+        template <typename Func, typename T>
+        inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+        {
+            typename T::cl_type value;
+            cl_int err = f(name, sizeof(value), &value, NULL);
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+            *param = value;
+            if (value != NULL)
+            {
+                err = param->retain();
+                if (err != CL_SUCCESS) {
+                    return err;
+                }
+            }
+            return CL_SUCCESS;
+        }
+
+#define __PARAM_NAME_INFO_1_0(F) \
+    F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
+    F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
cl_platform_id) \ + F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \ + F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \ + \ + F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ + F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS) \ + F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS) \ + \ + F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ + F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ + F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ + F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \ + \ + F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ + \ + F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ + F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ + F(cl_mem_info, CL_MEM_SIZE, ::size_t) \ + F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ + F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ + \ + F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ + F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \ + F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \ + F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \ + F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \ + \ + F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ + F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ + F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \ + F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \ + F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \ + \ + F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ + F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ + F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ + F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS) \ + F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \ + F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \ + F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \ + \ + F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \ + F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ + F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \ + F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ + \ + F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ + F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ + F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) + +#if defined(CL_VERSION_1_1) +#define __PARAM_NAME_INFO_1_1(F) \ + F(cl_context_info, CL_CONTEXT_NUM_DEVICES, 
cl_uint)\ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \ + \ + F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ + F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ + \ + F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) +#endif // CL_VERSION_1_1 + + +#if defined(CL_VERSION_1_2) +#define __PARAM_NAME_INFO_1_2(F) \ + F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \ + \ + F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \ + F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ + \ + F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \ + \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \ + \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \ + F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ + F(cl_device_info, CL_DEVICE_TOPOLOGY_AMD, cl_device_topology_amd) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS) +#endif // #if defined(CL_VERSION_1_2) + +#if defined(USE_CL_DEVICE_FISSION) +#define __PARAM_NAME_DEVICE_FISSION(F) \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ + F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS) +#endif // USE_CL_DEVICE_FISSION + + template + struct param_traits {}; + +#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \ +struct token; \ +template<> \ +struct param_traits \ + { \ + enum { value = param_name }; \ + typedef T param_type; \ + }; + + __PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS) +#if defined(CL_VERSION_1_1) + __PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 +#if defined(CL_VERSION_1_2) + __PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 + +#if defined(USE_CL_DEVICE_FISSION) + __PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS); +#endif // USE_CL_DEVICE_FISSION + +#ifdef CL_PLATFORM_ICD_SUFFIX_KHR + 
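+ // The invocations below extend the same param_traits mapping to extension
+ // tokens; e.g. param_traits<cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR>::param_type
+ // becomes STRING_CLASS, which is what the typed getInfo<token>() wrappers
+ // later in this header use to pick their return type.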
__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS) +#endif + +#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) +#endif + +#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>) +#endif +#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_WIDTH_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) +#endif + +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint) +#endif +#ifdef CL_DEVICE_WARP_SIZE_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint) +#endif +#ifdef CL_DEVICE_GPU_OVERLAP_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool) +#endif +#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool) +#endif +#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV + __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) +#endif + + // Convenience functions + + template + inline cl_int + getInfo(Func f, cl_uint name, T* param) + { + return getInfoHelper(f, name, param, 0); + } + + template + struct GetInfoFunctor0 + { + Func f_; const Arg0& arg0_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { + return f_(arg0_, param, size, value, size_ret); + } + }; + + template + struct GetInfoFunctor1 + { + Func f_; const Arg0& arg0_; const Arg1& arg1_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { + return f_(arg0_, arg1_, param, size, value, size_ret); + } + }; + + template + inline cl_int + getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) + { + GetInfoFunctor0 f0 = { f, arg0 }; + return getInfoHelper(f0, name, param, 0); + } + + template + inline cl_int + getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) + { + GetInfoFunctor1 f0 = { f, arg0, arg1 }; + 
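+         // f0 binds the leading OpenCL object arguments, so every overload of
+         // getInfoHelper sees the same (name, size, value, size_ret) call shape.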
return getInfoHelper(f0, name, param, 0);
+ }
+
+ template <typename T>
+ struct ReferenceHandler
+ { };
+
+#if defined(CL_VERSION_1_2)
+ /**
+  * OpenCL 1.2 devices do have retain/release.
+  */
+ template <>
+ struct ReferenceHandler<cl_device_id>
+ {
+     /**
+      * Retain the device.
+      * \param device A valid device created using createSubDevices
+      * \return
+      *   CL_SUCCESS if the function executed successfully.
+      *   CL_INVALID_DEVICE if device was not a valid subdevice
+      *   CL_OUT_OF_RESOURCES
+      *   CL_OUT_OF_HOST_MEMORY
+      */
+     static cl_int retain(cl_device_id device)
+     {
+         return ::clRetainDevice(device);
+     }
+     /**
+      * Release the device.
+      * \param device A valid device created using createSubDevices
+      * \return
+      *   CL_SUCCESS if the function executed successfully.
+      *   CL_INVALID_DEVICE if device was not a valid subdevice
+      *   CL_OUT_OF_RESOURCES
+      *   CL_OUT_OF_HOST_MEMORY
+      */
+     static cl_int release(cl_device_id device)
+     {
+         return ::clReleaseDevice(device);
+     }
+ };
+#else // #if defined(CL_VERSION_1_2)
+ /**
+  * OpenCL 1.1 devices do not have retain/release.
+  */
+ template <>
+ struct ReferenceHandler<cl_device_id>
+ {
+     // cl_device_id does not have retain().
+     static cl_int retain(cl_device_id)
+     {
+         return CL_SUCCESS;
+     }
+     // cl_device_id does not have release().
+     static cl_int release(cl_device_id)
+     {
+         return CL_SUCCESS;
+     }
+ };
+#endif // #if defined(CL_VERSION_1_2)
+
+ template <>
+ struct ReferenceHandler<cl_platform_id>
+ {
+     // cl_platform_id does not have retain().
+     static cl_int retain(cl_platform_id)
+     {
+         return CL_SUCCESS;
+     }
+     // cl_platform_id does not have release().
+     static cl_int release(cl_platform_id)
+     {
+         return CL_SUCCESS;
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_context>
+ {
+     static cl_int retain(cl_context context)
+     {
+         return ::clRetainContext(context);
+     }
+     static cl_int release(cl_context context)
+     {
+         return ::clReleaseContext(context);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_command_queue>
+ {
+     static cl_int retain(cl_command_queue queue)
+     {
+         return ::clRetainCommandQueue(queue);
+     }
+     static cl_int release(cl_command_queue queue)
+     {
+         return ::clReleaseCommandQueue(queue);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_mem>
+ {
+     static cl_int retain(cl_mem memory)
+     {
+         return ::clRetainMemObject(memory);
+     }
+     static cl_int release(cl_mem memory)
+     {
+         return ::clReleaseMemObject(memory);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_sampler>
+ {
+     static cl_int retain(cl_sampler sampler)
+     {
+         return ::clRetainSampler(sampler);
+     }
+     static cl_int release(cl_sampler sampler)
+     {
+         return ::clReleaseSampler(sampler);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_program>
+ {
+     static cl_int retain(cl_program program)
+     {
+         return ::clRetainProgram(program);
+     }
+     static cl_int release(cl_program program)
+     {
+         return ::clReleaseProgram(program);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_kernel>
+ {
+     static cl_int retain(cl_kernel kernel)
+     {
+         return ::clRetainKernel(kernel);
+     }
+     static cl_int release(cl_kernel kernel)
+     {
+         return ::clReleaseKernel(kernel);
+     }
+ };
+
+ template <>
+ struct ReferenceHandler<cl_event>
+ {
+     static cl_int retain(cl_event event)
+     {
+         return ::clRetainEvent(event);
+     }
+     static cl_int release(cl_event event)
+     {
+         return ::clReleaseEvent(event);
+     }
+ };
+
+
+ // Extracts version number with major in the upper 16 bits, minor in the lower 16
+ static cl_uint getVersion(const char *versionInfo)
+ {
+     int highVersion = 0;
+     int lowVersion = 0;
+     int index = 7; // skip the "OpenCL " prefix of the version string
+     while (versionInfo[index] != '.') {
+         highVersion *= 10;
+         highVersion += versionInfo[index] - '0';
+         ++index;
+     }
+     ++index;
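+     // the minor version digits run from just past the '.' to the space
+     // before the vendor-specific part of the version string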
while (versionInfo[index] != ' ') { + lowVersion *= 10; + lowVersion += versionInfo[index] - '0'; + ++index; + } + return (highVersion << 16) | lowVersion; + } + + static cl_uint getPlatformVersion(cl_platform_id platform) + { + ::size_t size = 0; + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); + char *versionInfo = (char *)alloca(size); + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size); + return getVersion(versionInfo); + } + + static cl_uint getDevicePlatformVersion(cl_device_id device) + { + cl_platform_id platform; + clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); + return getPlatformVersion(platform); + } + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + static cl_uint getContextPlatformVersion(cl_context context) + { + // The platform cannot be queried directly, so we first have to grab a + // device and obtain its context + ::size_t size = 0; + clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); + if (size == 0) + return 0; + cl_device_id *devices = (cl_device_id *)alloca(size); + clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL); + return getDevicePlatformVersion(devices[0]); + } +#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + + template + class Wrapper + { + public: + typedef T cl_type; + + protected: + cl_type object_; + + public: + Wrapper() : object_(NULL) { } + + Wrapper(const cl_type &obj) : object_(obj) { } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper& operator = (const Wrapper& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + return *this; + } + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + + protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + cl_int retain() const + { + return ReferenceHandler::retain(object_); + } + + cl_int release() const + { + return ReferenceHandler::release(object_); + } + }; + + template <> + class Wrapper + { + public: + typedef cl_device_id cl_type; + + protected: + cl_type object_; + bool referenceCountable_; + + static bool isReferenceCountable(cl_device_id device) + { + bool retVal = false; + if (device != NULL) { + int version = getDevicePlatformVersion(device); + if (version > ((1 << 16) + 1)) { + retVal = true; + } + } + return retVal; + } + + public: + Wrapper() : object_(NULL), referenceCountable_(false) + { + } + + Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) + { + referenceCountable_ = isReferenceCountable(obj); + } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + referenceCountable_ = isReferenceCountable(object_); + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper& operator = (const Wrapper& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + referenceCountable_ = rhs.referenceCountable_; + if (object_ != NULL) { 
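+             // copying a non-NULL handle takes an extra reference on the
+             // underlying OpenCL object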
detail::errHandler(retain(), __RETAIN_ERR); } + return *this; + } + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + referenceCountable_ = isReferenceCountable(object_); + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + + protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + template + friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS*, int, typename U::cl_type); + + cl_int retain() const + { + if (referenceCountable_) { + return ReferenceHandler::retain(object_); + } + else { + return CL_SUCCESS; + } + } + + cl_int release() const + { + if (referenceCountable_) { + return ReferenceHandler::release(object_); + } + else { + return CL_SUCCESS; + } + } + }; + + } // namespace detail + //! \endcond + + /*! \stuct ImageFormat + * \brief Adds constructors and member functions for cl_image_format. + * + * \see cl_image_format + */ + struct ImageFormat : public cl_image_format + { + //! \brief Default constructor - performs no initialization. + ImageFormat(){} + + //! \brief Initializing constructor. + ImageFormat(cl_channel_order order, cl_channel_type type) + { + image_channel_order = order; + image_channel_data_type = type; + } + + //! \brief Assignment operator. + ImageFormat& operator = (const ImageFormat& rhs) + { + if (this != &rhs) { + this->image_channel_data_type = rhs.image_channel_data_type; + this->image_channel_order = rhs.image_channel_order; + } + return *this; + } + }; + + /*! \brief Class interface for cl_device_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_device_id + */ + class Device : public detail::Wrapper + { + public: + //! \brief Default constructor - initializes to NULL. + Device() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const Device& device) : detail::Wrapper(device) { } + + /*! \brief Constructor from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const cl_device_id &device) : detail::Wrapper(device) { } + + /*! \brief Returns the first device on the default context. + * + * \see Context::getDefault() + */ + static Device getDefault(cl_int * err = NULL); + + /*! \brief Assignment operator from Device. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const Device& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const cl_device_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetDeviceInfo(). + template + cl_int getInfo(cl_device_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetDeviceInfo, object_, name, param), + __GET_DEVICE_INFO_ERR); + } + + //! \brief Wrapper for clGetDeviceInfo() that returns by value. 
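+ //! A call sketch (illustrative): STRING_CLASS name = device.getInfo<CL_DEVICE_NAME>();
+ //! param_traits supplies the return type, and the error code, when wanted,
+ //! is reported through the optional out-parameter.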
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_device_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+     typename detail::param_traits<
+         detail::cl_device_info, name>::param_type param;
+     cl_int result = getInfo(name, &param);
+     if (err != NULL) {
+         *err = result;
+     }
+     return param;
+ }
+
+ /**
+  * CL 1.2 version
+  */
+#if defined(CL_VERSION_1_2)
+ //! \brief Wrapper for clCreateSubDevices().
+ cl_int createSubDevices(
+     const cl_device_partition_property * properties,
+     VECTOR_CLASS<Device>* devices)
+ {
+     cl_uint n = 0;
+     cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
+     if (err != CL_SUCCESS) {
+         return detail::errHandler(err, __CREATE_SUB_DEVICES);
+     }
+
+     cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id));
+     err = clCreateSubDevices(object_, properties, n, ids, NULL);
+     if (err != CL_SUCCESS) {
+         return detail::errHandler(err, __CREATE_SUB_DEVICES);
+     }
+
+     devices->assign(&ids[0], &ids[n]);
+     return CL_SUCCESS;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ /**
+  * CL 1.1 version that uses device fission.
+  */
+#if defined(CL_VERSION_1_1)
+#if defined(USE_CL_DEVICE_FISSION)
+ cl_int createSubDevices(
+     const cl_device_partition_property_ext * properties,
+     VECTOR_CLASS<Device>* devices)
+ {
+     typedef CL_API_ENTRY cl_int
+         (CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+             cl_device_id /*in_device*/,
+             const cl_device_partition_property_ext * /* properties */,
+             cl_uint /*num_entries*/,
+             cl_device_id * /*out_devices*/,
+             cl_uint * /*num_devices*/) CL_EXT_SUFFIX__VERSION_1_1;
+
+     static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+     __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
+
+     cl_uint n = 0;
+     cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
+     if (err != CL_SUCCESS) {
+         return detail::errHandler(err, __CREATE_SUB_DEVICES);
+     }
+
+     cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id));
+     err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
+     if (err != CL_SUCCESS) {
+         return detail::errHandler(err, __CREATE_SUB_DEVICES);
+     }
+
+     devices->assign(&ids[0], &ids[n]);
+     return CL_SUCCESS;
+ }
+#endif // #if defined(USE_CL_DEVICE_FISSION)
+#endif // #if defined(CL_VERSION_1_1)
+ };
+
+ /*! \brief Class interface for cl_platform_id.
+  *
+  * \note Copies of these objects are inexpensive, since they don't 'own'
+  * any underlying resources or data structures.
+  *
+  * \see cl_platform_id
+  */
+ class Platform : public detail::Wrapper<cl_platform_id>
+ {
+ public:
+     //! \brief Default constructor - initializes to NULL.
+     Platform() : detail::Wrapper<cl_type>() { }
+
+     /*! \brief Copy constructor.
+      *
+      * This simply copies the platform ID value, which is an inexpensive operation.
+      */
+     Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
+
+     /*! \brief Constructor from cl_platform_id.
+      *
+      * This simply copies the platform ID value, which is an inexpensive operation.
+      */
+     Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
+
+     /*! \brief Assignment operator from Platform.
+      *
+      * This simply copies the platform ID value, which is an inexpensive operation.
+      */
+     Platform& operator = (const Platform& rhs)
+     {
+         if (this != &rhs) {
+             detail::Wrapper<cl_type>::operator=(rhs);
+         }
+         return *this;
+     }
+
+     /*! \brief Assignment operator from cl_platform_id.
+      *
+      * This simply copies the platform ID value, which is an inexpensive operation.
+      */
+     Platform& operator = (const cl_platform_id& rhs)
+     {
+         detail::Wrapper<cl_type>::operator=(rhs);
+         return *this;
+     }
+
+     //! \brief Wrapper for clGetPlatformInfo().
+ cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetPlatformInfo, object_, name, param), + __GET_PLATFORM_INFO_ERR); + } + + //! \brief Wrapper for clGetPlatformInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_platform_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of devices for this platform. + * + * Wraps clGetDeviceIDs(). + */ + cl_int getDevices( + cl_device_type type, + VECTOR_CLASS* devices) const + { + cl_uint n = 0; + if (devices == NULL) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id)); + err = ::clGetDeviceIDs(object_, type, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + +#if defined(USE_DX_INTEROP) + /*! \brief Get the list of available D3D10 devices. + * + * \param d3d_device_source. + * + * \param d3d_object. + * + * \param d3d_device_set. + * + * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device + * values returned in devices can be used to identify a specific OpenCL + * device. If \a devices argument is NULL, this argument is ignored. + * + * \return One of the following values: + * - CL_SUCCESS if the function is executed successfully. + * + * The application can query specific capabilities of the OpenCL device(s) + * returned by cl::getDevices. This can be used by the application to + * determine which device(s) to use. + * + * \note In the case that exceptions are enabled and a return value + * other than CL_SUCCESS is generated, then cl::Error exception is + * generated. + */ + cl_int getDevices( + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + VECTOR_CLASS* devices) const + { + typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint* num_devices); + + if (devices == NULL) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + + static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; + __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR); + + cl_uint n = 0; + cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + 0, + NULL, + &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id)); + err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + n, + ids, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif + + /*! \brief Gets a list of available platforms. + * + * Wraps clGetPlatformIDs(). 
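+ *
+ * A discovery sketch (illustrative):
+ * \code
+ * VECTOR_CLASS<cl::Platform> platforms;
+ * if (cl::Platform::get(&platforms) == CL_SUCCESS && platforms.size() != 0) {
+ *     VECTOR_CLASS<cl::Device> devices;
+ *     platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
+ * }
+ * \endcode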
+ */ + static cl_int get( + VECTOR_CLASS* platforms) + { + cl_uint n = 0; + + if (platforms == NULL) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*)alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + platforms->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static cl_int get( + Platform * platform) + { + cl_uint n = 0; + + if (platform == NULL) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*)alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + *platform = ids[0]; + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform, returning it by value. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static Platform get( + cl_int * errResult = NULL) + { + Platform platform; + cl_uint n = 0; + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + if (errResult != NULL) { + *errResult = err; + } + } + + cl_platform_id* ids = (cl_platform_id*)alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + if (errResult != NULL) { + *errResult = err; + } + + return ids[0]; + } + + static Platform getDefault( + cl_int *errResult = NULL) + { + return get(errResult); + } + + +#if defined(CL_VERSION_1_2) + //! \brief Wrapper for clUnloadCompiler(). + cl_int + unloadCompiler() + { + return ::clUnloadPlatformCompiler(object_); + } +#endif // #if defined(CL_VERSION_1_2) + }; // class Platform + + /** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + /** + * Unload the OpenCL compiler. + * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. + */ + inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int + UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + inline cl_int + UnloadCompiler() + { + return ::clUnloadCompiler(); + } +#endif // #if defined(CL_VERSION_1_1) + + /*! \brief Class interface for cl_context. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_context as the original. For details, see + * clRetainContext() and clReleaseContext(). + * + * \see cl_context + */ + class Context + : public detail::Wrapper + { + private: + static volatile int default_initialized_; + static Context default_; + static volatile cl_int default_error_; + public: + /*! \brief Destructor. + * + * This calls clReleaseContext() on the value held by this instance. + */ + ~Context() { } + + /*! \brief Constructs a context including a list of specified devices. + * + * Wraps clCreateContext(). 
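+ *
+ * A construction sketch (illustrative; \a devices as obtained from
+ * Platform::getDevices()):
+ * \code
+ * cl_int err = CL_SUCCESS;
+ * cl::Context context(devices, NULL, NULL, NULL, &err);
+ * \endcode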
+ */ + Context( + const VECTOR_CLASS& devices, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id)); + for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateContext( + properties, (cl_uint)numDevices, + deviceIDs, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + Context( + const Device& device, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + cl_device_id deviceID = device(); + + object_ = ::clCreateContext( + properties, 1, + &deviceID, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a context including all devices of a specified type. + * + * Wraps clCreateContextFromType(). + */ + Context( + cl_device_type type, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + +#if !defined(__APPLE__) || !defined(__MACOS) + cl_context_properties prop[4] = { CL_CONTEXT_PLATFORM, 0, 0, 0 }; + if (properties == NULL) { + prop[1] = (cl_context_properties)Platform::get(&error)(); + if (error != CL_SUCCESS) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + return; + } + } + + properties = &prop[0]; + } +#endif + object_ = ::clCreateContextFromType( + properties, type, notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. + * + * \note All calls to this function return the same cl_context as the first. + */ + static Context getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... + while (default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + default_ = Context( + CL_DEVICE_TYPE_DEFAULT, + NULL, + NULL, + NULL, + &error); + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... + default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + //! \brief Default constructor - initializes to NULL. + Context() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This calls clRetainContext() on the parameter's cl_context. + */ + Context(const Context& context) : detail::Wrapper(context) { } + + /*! \brief Constructor from cl_context - takes ownership. 
+ * + * This effectively transfers ownership of a refcount on the cl_context + * into the new Context object. + */ + __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper(context) { } + + /*! \brief Assignment operator from Context. + * + * This calls clRetainContext() on the parameter and clReleaseContext() on + * the previous value held by this instance. + */ + Context& operator = (const Context& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_context - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseContext() on the value previously held by this instance. + */ + Context& operator = (const cl_context& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetContextInfo(). + template + cl_int getInfo(cl_context_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetContextInfo, object_, name, param), + __GET_CONTEXT_INFO_ERR); + } + + //! \brief Wrapper for clGetContextInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_context_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of supported image formats. + * + * Wraps clGetSupportedImageFormats(). + */ + cl_int getSupportedImageFormats( + cl_mem_flags flags, + cl_mem_object_type type, + VECTOR_CLASS* formats) const + { + cl_uint numEntries; + cl_int err = ::clGetSupportedImageFormats( + object_, + flags, + type, + 0, + NULL, + &numEntries); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + ImageFormat* value = (ImageFormat*) + alloca(numEntries * sizeof(ImageFormat)); + err = ::clGetSupportedImageFormats( + object_, + flags, + type, + numEntries, + (cl_image_format*)value, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + formats->assign(&value[0], &value[numEntries]); + return CL_SUCCESS; + } + }; + + inline Device Device::getDefault(cl_int * err) + { + cl_int error; + Device device; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + device = context.getInfo()[0]; + if (err != NULL) { + *err = CL_SUCCESS; + } + } + + return device; + } + + +#ifdef _WIN32 + __declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; + __declspec(selectany) Context Context::default_; + __declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS; +#else + __attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; + __attribute__((weak)) Context Context::default_; + __attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS; +#endif + + /*! \brief Class interface for cl_event. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_event as the original. For details, see + * clRetainEvent() and clReleaseEvent(). + * + * \see cl_event + */ + class Event : public detail::Wrapper + { + public: + /*! \brief Destructor. + * + * This calls clReleaseEvent() on the value held by this instance. 
+ */ + ~Event() { } + + //! \brief Default constructor - initializes to NULL. + Event() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This calls clRetainEvent() on the parameter's cl_event. + */ + Event(const Event& event) : detail::Wrapper(event) { } + + /*! \brief Constructor from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_event + * into the new Event object. + */ + Event(const cl_event& event) : detail::Wrapper(event) { } + + /*! \brief Assignment operator from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseEvent() on the value previously held by this instance. + */ + Event& operator = (const Event& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_event. + * + * This calls clRetainEvent() on the parameter and clReleaseEvent() on + * the previous value held by this instance. + */ + Event& operator = (const cl_event& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetEventInfo(). + template + cl_int getInfo(cl_event_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetEventInfo, object_, name, param), + __GET_EVENT_INFO_ERR); + } + + //! \brief Wrapper for clGetEventInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_event_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + //! \brief Wrapper for clGetEventProfilingInfo(). + template + cl_int getProfilingInfo(cl_profiling_info name, T* param) const + { + return detail::errHandler(detail::getInfo( + &::clGetEventProfilingInfo, object_, name, param), + __GET_EVENT_PROFILE_INFO_ERR); + } + + //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. + template typename + detail::param_traits::param_type + getProfilingInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_profiling_info, name>::param_type param; + cl_int result = getProfilingInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Blocks the calling thread until this event completes. + * + * Wraps clWaitForEvents(). + */ + cl_int wait() const + { + return detail::errHandler( + ::clWaitForEvents(1, &object_), + __WAIT_FOR_EVENTS_ERR); + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a user callback function for a specific command execution status. + * + * Wraps clSetEventCallback(). + */ + cl_int setCallback( + cl_int type, + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetEventCallback( + object_, + type, + pfn_notify, + user_data), + __SET_EVENT_CALLBACK_ERR); + } +#endif + + /*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ + static cl_int + waitForEvents(const VECTOR_CLASS& events) + { + return detail::errHandler( + ::clWaitForEvents( + (cl_uint)events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); + } + }; + +#if defined(CL_VERSION_1_1) + /*! \brief Class interface for user events (a subset of cl_event's). + * + * See Event for details about copy semantics, etc. + */ + class UserEvent : public Event + { + public: + /*! 
\brief Constructs a user event on a given context. + * + * Wraps clCreateUserEvent(). + */ + UserEvent( + const Context& context, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateUserEvent( + context(), + &error); + + detail::errHandler(error, __CREATE_USER_EVENT_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + UserEvent() : Event() { } + + //! \brief Copy constructor - performs shallow copy. + UserEvent(const UserEvent& event) : Event(event) { } + + //! \brief Assignment Operator - performs shallow copy. + UserEvent& operator = (const UserEvent& rhs) + { + if (this != &rhs) { + Event::operator=(rhs); + } + return *this; + } + + /*! \brief Sets the execution status of a user event object. + * + * Wraps clSetUserEventStatus(). + */ + cl_int setStatus(cl_int status) + { + return detail::errHandler( + ::clSetUserEventStatus(object_, status), + __SET_USER_EVENT_STATUS_ERR); + } + }; +#endif + + /*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ + inline static cl_int + WaitForEvents(const VECTOR_CLASS& events) + { + return detail::errHandler( + ::clWaitForEvents( + (cl_uint)events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); + } + + /*! \brief Class interface for cl_mem. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_mem as the original. For details, see + * clRetainMemObject() and clReleaseMemObject(). + * + * \see cl_mem + */ + class Memory : public detail::Wrapper + { + public: + + /*! \brief Destructor. + * + * This calls clReleaseMemObject() on the value held by this instance. + */ + ~Memory() {} + + //! \brief Default constructor - initializes to NULL. + Memory() : detail::Wrapper() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainMemObject() on the parameter's cl_mem. + */ + Memory(const Memory& memory) : detail::Wrapper(memory) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_mem + * into the new Memory object. + */ + __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper(memory) { } + + /*! \brief Assignment operator from Memory. + * + * This calls clRetainMemObject() on the parameter and clReleaseMemObject() + * on the previous value held by this instance. + */ + Memory& operator = (const Memory& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseMemObject() on the value previously held by this instance. + */ + Memory& operator = (const cl_mem& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetMemObjectInfo(). + template + cl_int getInfo(cl_mem_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetMemObjectInfo, object_, name, param), + __GET_MEM_OBJECT_INFO_ERR); + } + + //! \brief Wrapper for clGetMemObjectInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_mem_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_1) + /*! 
\brief Registers a callback function to be called when the memory object + * is no longer needed. + * + * Wraps clSetMemObjectDestructorCallback(). + * + * Repeated calls to this function, for a given cl_mem value, will append + * to the list of functions called (in reverse order) when memory object's + * resources are freed and the memory object is deleted. + * + * \note + * The registered callbacks are associated with the underlying cl_mem + * value - not the Memory class instance. + */ + cl_int setDestructorCallback( + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetMemObjectDestructorCallback( + object_, + pfn_notify, + user_data), + __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); + } +#endif + + }; + + // Pre-declare copy functions + class Buffer; + template< typename IteratorType > + cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer); + template< typename IteratorType > + cl_int copy(const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator); + + /*! \brief Class interface for Buffer Memory Objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Buffer : public Memory + { + public: + + /*! \brief Constructs a Buffer in a specified context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + */ + Buffer( + const Context& context, + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a Buffer in the default context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + * + * \see Context::getDefault() + */ + Buffer( + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(err); + + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators. + * If useHostPtr is specified iterators must be random access. 
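+ *
+ * A usage sketch (illustrative; the element type and sizes are placeholders):
+ * \code
+ * std::vector<float> host(1024, 0.0f);
+ * cl_int err = CL_SUCCESS;
+ * cl::Buffer buf(host.begin(), host.end(), true, false, &err);
+ * \endcode
+ * With useHostPtr == false the elements are copied into the new buffer via
+ * cl::copy(), so the host container may be released afterwards.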
+ */ + template< typename IteratorType > + Buffer( + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr = false, + cl_int* err = NULL) + { + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if (readOnly) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if (useHostPtr) { + flags |= CL_MEM_USE_HOST_PTR; + } + + ::size_t size = sizeof(DataType)*(endIterator - startIterator); + + Context context = Context::getDefault(err); + + if (useHostPtr) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } + else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + if (!useHostPtr) { + error = cl::copy(startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + } + + //! \brief Default constructor - initializes to NULL. + Buffer() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Buffer(const Buffer& buffer) : Memory(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { } + + /*! \brief Assignment from Buffer - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const Buffer& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + +#if defined(CL_VERSION_1_1) + /*! \brief Creates a new buffer object from this. + * + * Wraps clCreateSubBuffer(). + */ + Buffer createSubBuffer( + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * err = NULL) + { + Buffer result; + cl_int error; + result.object_ = ::clCreateSubBuffer( + object_, + flags, + buffer_create_type, + buffer_create_info, + &error); + + detail::errHandler(error, __CREATE_SUBBUFFER_ERR); + if (err != NULL) { + *err = error; + } + + return result; + } +#endif + }; + +#if defined (USE_DX_INTEROP) + /*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's. + * + * This is provided to facilitate interoperability with Direct3D. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class BufferD3D10 : public Buffer + { + public: + typedef CL_API_ENTRY cl_mem(CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, + cl_int* errcode_ret); + + /*! \brief Constructs a BufferD3D10, in a specified context, from a + * given ID3D10Buffer. + * + * Wraps clCreateFromD3D10BufferKHR(). 
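+ *
+ * The extension entry point is resolved lazily at run time through the
+ * __INIT_CL_EXT_FCN_PTR* machinery below, so the header introduces no
+ * link-time dependency on the D3D10 sharing extension.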
+ */ + BufferD3D10( + const Context& context, + cl_mem_flags flags, + ID3D10Buffer* bufobj, + cl_int * err = NULL) + { + static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL; + +#if defined(CL_VERSION_1_2) + vector props = context.getInfo(); + cl_platform platform = -1; + for (int i = 0; i < props.size(); ++i) { + if (props[i] == CL_CONTEXT_PLATFORM) { + platform = props[i + 1]; + } + } + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR); +#endif + + cl_int error; + object_ = pfn_clCreateFromD3D10BufferKHR( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferD3D10() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferD3D10 - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const BufferD3D10& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + }; +#endif + + /*! \brief Class interface for GL Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class BufferGL : public Buffer + { + public: + /*! \brief Constructs a BufferGL in a specified context, from a given + * GL buffer. + * + * Wraps clCreateFromGLBuffer(). + */ + BufferGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLBuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const BufferGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_, type, gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } + }; + + /*! \brief Class interface for GL Render Buffer Memory Objects. 
+ * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class BufferRenderGL : public Buffer + { + public: + /*! \brief Constructs a BufferRenderGL in a specified context, from a given + * GL Renderbuffer. + * + * Wraps clCreateFromGLRenderbuffer(). + */ + BufferRenderGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLRenderbuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferRenderGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const BufferRenderGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_, type, gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } + }; + + /*! \brief C++ base class for Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image : public Memory + { + protected: + //! \brief Default constructor - initializes to NULL. + Image() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image(const Image& image) : Memory(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { } + + /*! \brief Assignment from Image - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const Image& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + + public: + //! \brief Wrapper for clGetImageInfo(). + template + cl_int getImageInfo(cl_image_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetImageInfo, object_, name, param), + __GET_IMAGE_INFO_ERR); + } + + //! \brief Wrapper for clGetImageInfo() that returns by value. + template typename + detail::param_traits::param_type + getImageInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_image_info, name>::param_type param; + cl_int result = getImageInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + }; + +#if defined(CL_VERSION_1_2) + /*! \brief Class interface for 1D Image Memory objects. 
+ * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image1D : public Image + { + public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image1D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D; + desc.image_width = width; + desc.image_row_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image1D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image1D(const Image1D& image1D) : Image(image1D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { } + + /*! \brief Assignment from Image1D - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const Image1D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; + + /*! \class Image1DBuffer + * \brief Image interface for 1D buffer images. + */ + class Image1DBuffer : public Image + { + public: + Image1DBuffer( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + const Buffer &buffer, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + desc.image_width = width; + desc.image_row_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = buffer(); + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + NULL, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DBuffer() { } + + Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { } + + Image1DBuffer& operator = (const Image1DBuffer& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DBuffer& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; + + /*! \class Image1DArray + * \brief Image interface for arrays of 1D images. 
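+ *
+ * A minimal creation sketch (assumes a valid `context`; a row pitch of 0
+ * lets the runtime derive it from the width and format):
+ * \code
+ * cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8);
+ * cl_int err;
+ * cl::Image1DArray arr(context, CL_MEM_READ_WRITE, fmt,
+ *                      4, 512, 0, NULL, &err); // arraySize = 4, width = 512
+ * \endcode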
+ */ + class Image1DArray : public Image + { + public: + Image1DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t rowPitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; + desc.image_array_size = arraySize; + desc.image_width = width; + desc.image_row_pitch = rowPitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DArray() { } + + Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image1DArray& operator = (const Image1DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; +#endif // #if defined(CL_VERSION_1_2) + + + /*! \brief Class interface for 2D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image2D : public Image + { + public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image2D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t row_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage2D( + context(), flags, &format, width, height, row_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE2D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2D(const Image2D& image2D) : Image(image2D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { } + + /*! \brief Assignment from Image2D - performs shallow copy. + * + * See Memory for further details. 
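+ *
+ * The copy is shallow: after assignment both wrappers reference the same
+ * underlying cl_mem, with its reference count bumped accordingly. A sketch
+ * (assumes a valid `context` and an ImageFormat `fmt`):
+ * \code
+ * cl::Image2D a(context, CL_MEM_READ_WRITE, fmt, 64, 64); // owns one reference
+ * cl::Image2D b;
+ * b = a; // a and b now share the same cl_mem
+ * \endcode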
+ */ + Image2D& operator = (const Image2D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; + + +#if !defined(CL_VERSION_1_2) + /*! \brief Class interface for GL 2D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. + */ + class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D + { + public: + /*! \brief Constructs an Image2DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture2D(). + */ + Image2DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture2D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); + if (err != NULL) { + *err = error; + } + + } + + //! \brief Default constructor - initializes to NULL. + Image2DGL() : Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL(const Image2DGL& image) : Image2D(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { } + + /*! \brief Assignment from Image2DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const Image2DGL& rhs) + { + if (this != &rhs) { + Image2D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const cl_mem& rhs) + { + Image2D::operator=(rhs); + return *this; + } + }; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) + /*! \class Image2DArray + * \brief Image interface for arrays of 2D images. 
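+ *
+ * A minimal creation sketch (assumes a valid `context`; pitches of 0 let
+ * the runtime derive them):
+ * \code
+ * cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8);
+ * cl_int err;
+ * cl::Image2DArray arr(context, CL_MEM_READ_ONLY, fmt,
+ *                      6, 256, 256, 0, 0, NULL, &err); // six 256x256 layers
+ * \endcode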
+ */ + class Image2DArray : public Image + { + public: + Image2DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t height, + ::size_t rowPitch, + ::size_t slicePitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + desc.image_array_size = arraySize; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = rowPitch; + desc.image_slice_pitch = slicePitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image2DArray() { } + + Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image2DArray& operator = (const Image2DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image2DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; +#endif // #if defined(CL_VERSION_1_2) + + /*! \brief Class interface for 3D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image3D : public Image + { + public: + /*! \brief Constructs a 3D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image3D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t depth, + ::size_t row_pitch = 0, + ::size_t slice_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE3D; + desc.image_width = width; + desc.image_height = height; + desc.image_depth = depth; + desc.image_row_pitch = row_pitch; + desc.image_slice_pitch = slice_pitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage3D( + context(), flags, &format, width, height, depth, row_pitch, + slice_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE3D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3D(const Image3D& image3D) : Image(image3D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. 
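+ *
+ * Because ownership of the refcount is transferred, do not add an extra
+ * retain and do not release the raw handle separately. A sketch (assumes
+ * `raw` is a valid cl_mem for a 3D image):
+ * \code
+ * cl::Image3D img(raw); // img now owns raw's reference
+ * \endcode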
+ */ + __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { } + + /*! \brief Assignment from Image3D - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const Image3D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; + +#if !defined(CL_VERSION_1_2) + /*! \brief Class interface for GL 3D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ + class Image3DGL : public Image3D + { + public: + /*! \brief Constructs an Image3DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture3D(). + */ + Image3DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture3D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image3DGL() : Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL(const Image3DGL& image) : Image3D(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { } + + /*! \brief Assignment from Image3DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const Image3DGL& rhs) + { + if (this != &rhs) { + Image3D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const cl_mem& rhs) + { + Image3D::operator=(rhs); + return *this; + } + }; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) + /*! \class ImageGL + * \brief general image interface for GL interop. + * We abstract the 2D and 3D GL images into a single instance here + * that wraps all GL sourced images on the grounds that setup information + * was performed by OpenCL anyway. + */ + class ImageGL : public Image + { + public: + ImageGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); + if (err != NULL) { + *err = error; + } + } + + ImageGL() : Image() { } + + ImageGL(const ImageGL& image) : Image(image) { } + + __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { } + + ImageGL& operator = (const ImageGL& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + ImageGL& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + }; +#endif // #if defined(CL_VERSION_1_2) + + /*! \brief Class interface for cl_sampler. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_sampler as the original. 
For details, see
+ *       clRetainSampler() and clReleaseSampler().
+ *
+ * \see cl_sampler
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseSampler() on the value held by this instance.
+     */
+    ~Sampler() { }
+
+    //! \brief Default constructor - initializes to NULL.
+    Sampler() { }
+
+    /*! \brief Constructs a Sampler in a specified context.
+     *
+     *  Wraps clCreateSampler().
+     */
+    Sampler(
+        const Context& context,
+        cl_bool normalized_coords,
+        cl_addressing_mode addressing_mode,
+        cl_filter_mode filter_mode,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateSampler(
+            context(),
+            normalized_coords,
+            addressing_mode,
+            filter_mode,
+            &error);
+
+        detail::errHandler(error, __CREATE_SAMPLER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  This calls clRetainSampler() on the parameter's cl_sampler.
+     */
+    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+    /*! \brief Constructor from cl_sampler - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the cl_sampler
+     *  into the new Sampler object.
+     */
+    Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+    /*! \brief Assignment operator from Sampler.
+     *
+     *  This calls clRetainSampler() on the parameter and clReleaseSampler()
+     *  on the previous value held by this instance.
+     */
+    Sampler& operator = (const Sampler& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_sampler - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseSampler() on the value previously held by this instance.
+     */
+    Sampler& operator = (const cl_sampler& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetSamplerInfo().
+    template <typename T>
+    cl_int getInfo(cl_sampler_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetSamplerInfo, object_, name, param),
+            __GET_SAMPLER_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetSamplerInfo() that returns by value.
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_sampler_info, name>::param_type
+        getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_sampler_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+};
+
+class Program;
+class CommandQueue;
+class Kernel;
+
+//! \brief Class interface for specifying NDRange values.
+class NDRange
+{
+private:
+    size_t<3> sizes_;
+    cl_uint dimensions_;
+
+public:
+    //! \brief Default constructor - resulting range has zero dimensions.
+    NDRange()
+        : dimensions_(0)
+    { }
+
+    //! \brief Constructs one-dimensional range.
+    NDRange(::size_t size0)
+        : dimensions_(1)
+    {
+        sizes_[0] = size0;
+    }
+
+    //! \brief Constructs two-dimensional range.
+    NDRange(::size_t size0, ::size_t size1)
+        : dimensions_(2)
+    {
+        sizes_[0] = size0;
+        sizes_[1] = size1;
+    }
+
+    //! \brief Constructs three-dimensional range.
+    NDRange(::size_t size0, ::size_t size1, ::size_t size2)
+        : dimensions_(3)
+    {
+        sizes_[0] = size0;
+        sizes_[1] = size1;
+        sizes_[2] = size2;
+    }
+
+    /*! \brief Conversion operator to const ::size_t *.
+     *
+     *  \returns a pointer to the size of the first dimension.
+     */
+    operator const ::size_t*() const {
+        return (const ::size_t*) sizes_;
+    }
+
+    //! \brief Queries the number of dimensions in the range.
+    ::size_t dimensions() const { return dimensions_; }
+};
+
+//! \brief A zero-dimensional range.
+static const NDRange NullRange;
+
+//! \brief Local address wrapper for use with Kernel::setArg
+struct LocalSpaceArg
+{
+    ::size_t size_;
+};
+
+namespace detail {
+
+template <typename T>
+struct KernelArgumentHandler
+{
+    static ::size_t size(const T&) { return sizeof(T); }
+    static T* ptr(T& value) { return &value; }
+};
+
+template <>
+struct KernelArgumentHandler<LocalSpaceArg>
+{
+    static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
+    static void* ptr(LocalSpaceArg&) { return NULL; }
+};
+
+}
+//! \endcond
+
+/*! __local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ * Deprecated. Replaced with Local.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg
+__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline LocalSpaceArg
+__local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
+
+/*! Local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ */
+inline LocalSpaceArg
+Local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
+
+//class KernelFunctor;
+
+/*! \brief Class interface for cl_kernel.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ *       to the same underlying cl_kernel as the original. For details, see
+ *       clRetainKernel() and clReleaseKernel().
+ *
+ * \see cl_kernel
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
+{
+public:
+    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseKernel() on the value held by this instance.
+     */
+    ~Kernel() { }
+
+    //! \brief Default constructor - initializes to NULL.
+    Kernel() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  This calls clRetainKernel() on the parameter's cl_kernel.
+     */
+    Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    /*! \brief Constructor from cl_kernel - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the cl_kernel
+     *  into the new Kernel object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    /*! \brief Assignment operator from Kernel.
+     *
+     *  This calls clRetainKernel() on the parameter and clReleaseKernel()
+     *  on the previous value held by this instance.
+     */
+    Kernel& operator = (const Kernel& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_kernel - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseKernel() on the value previously held by this instance.
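+     *
+     *  A sketch of the transfer (illustrative; assumes `prog` is a built
+     *  cl::Program):
+     *  \code
+     *  cl_kernel raw = clCreateKernel(prog(), "my_kernel", NULL); // refcount 1
+     *  cl::Kernel k;
+     *  k = raw; // k now owns that reference; no extra clRetainKernel() needed
+     *  \endcode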
+     */
+    Kernel& operator = (const cl_kernel& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_kernel_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetKernelInfo, object_, name, param),
+            __GET_KERNEL_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_kernel_info, name>::param_type
+        getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+#if defined(CL_VERSION_1_2)
+    template <typename T>
+    cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
+            __GET_KERNEL_ARG_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
+        getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_arg_info, name>::param_type param;
+        cl_int result = getArgInfo(argIndex, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    template <typename T>
+    cl_int getWorkGroupInfo(
+        const Device& device, cl_kernel_work_group_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetKernelWorkGroupInfo, object_, device(), name, param),
+            __GET_KERNEL_WORK_GROUP_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+        getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_work_group_info, name>::param_type param;
+        cl_int result = getWorkGroupInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int setArg(cl_uint index, T value)
+    {
+        return detail::errHandler(
+            ::clSetKernelArg(
+                object_,
+                index,
+                detail::KernelArgumentHandler<T>::size(value),
+                detail::KernelArgumentHandler<T>::ptr(value)),
+            __SET_KERNEL_ARGS_ERR);
+    }
+
+    cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
+    {
+        return detail::errHandler(
+            ::clSetKernelArg(object_, index, size, argPtr),
+            __SET_KERNEL_ARGS_ERR);
+    }
+};
+
+/*! \class Program
+ * \brief Program interface that implements cl_program.
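+ *
+ * A typical create-build-use sketch (illustrative; assumes a valid `context`
+ * and a kernel source string `src`):
+ * \code
+ * cl_int err;
+ * cl::Program prog(context, src, true, &err); // create and build in one step
+ * cl::Kernel k(prog, "my_kernel", &err);
+ * \endcode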
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+    typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+    typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+    Program(
+        const STRING_CLASS& source,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length = source.size();
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const STRING_CLASS& source,
+        bool build,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length = source.size();
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const STRING_CLASS& source,
+        bool build = false,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length = source.size();
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const Sources& sources,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const ::size_t n = (::size_t)sources.size();
+        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+        const char** strings = (const char**) alloca(n * sizeof(const char*));
+
+        for (::size_t i = 0; i < n; ++i) {
+            strings[i] = sources[(int)i].first;
+            lengths[i] = sources[(int)i].second;
+        }
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)n, strings, lengths, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /**
+     * Construct a program object from a list of devices and a per-device list of binaries.
+     * \param context A valid OpenCL context in which to construct the program.
+     * \param devices A vector of OpenCL device objects for which the program will be created.
+     * \param binaries A vector of pairs of a pointer to a binary object and its length.
+     * \param binaryStatus An optional vector that on completion will be resized to
+     *   match the size of binaries and filled with values to specify if each binary
+     *   was successfully loaded.
+     *   Set to CL_SUCCESS if the binary was successfully loaded.
+     *   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+     *   Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
+     * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+     *   CL_INVALID_CONTEXT if context is not a valid context.
+     *   CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices;
+     *     or if any entry in binaries is NULL or has length 0.
+     *   CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
+     *   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
+     *   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const Binaries& binaries,
+        VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const ::size_t numDevices = devices.size();
+
+        // Catch size mismatch early and return
+        if (binaries.size() != numDevices) {
+            error = CL_INVALID_VALUE;
+            detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+            return;
+        }
+
+        ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
+        const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char*));
+
+        for (::size_t i = 0; i < numDevices; ++i) {
+            images[i] = (const unsigned char*)binaries[i].first;
+            lengths[i] = binaries[(int)i].second;
+        }
+
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        if (binaryStatus) {
+            binaryStatus->resize(numDevices);
+        }
+
+        object_ = ::clCreateProgramWithBinary(
+            context(), (cl_uint) devices.size(),
+            deviceIDs,
+            lengths, images, binaryStatus != NULL
+                ? &binaryStatus->front()
+                : NULL, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Create program using builtin kernels.
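+     *
+     * A sketch (illustrative only; built-in kernels are device-specific, so
+     * the semi-colon separated names below are placeholders):
+     * \code
+     * cl_int err;
+     * cl::Program prog(context, devices, "vendor_kernel_a;vendor_kernel_b", &err);
+     * \endcode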
+     * \param kernelNames Semi-colon separated list of builtin kernel names
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const STRING_CLASS& kernelNames,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        object_ = ::clCreateProgramWithBuiltInKernels(
+            context(),
+            (cl_uint) devices.size(),
+            deviceIDs,
+            kernelNames.c_str(),
+            &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    Program() { }
+
+    Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
+
+    Program& operator = (const Program& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Program& operator = (const cl_program& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    cl_int build(
+        const VECTOR_CLASS<Device>& devices,
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                (cl_uint) devices.size(),
+                deviceIDs,
+                options,
+                notifyFptr,
+                data),
+            __BUILD_PROGRAM_ERR);
+    }
+
+    cl_int build(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                options,
+                notifyFptr,
+                data),
+            __BUILD_PROGRAM_ERR);
+    }
+
+#if defined(CL_VERSION_1_2)
+    cl_int compile(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clCompileProgram(
+                object_,
+                0,
+                NULL,
+                options,
+                0,
+                NULL,
+                NULL,
+                notifyFptr,
+                data),
+            __COMPILE_PROGRAM_ERR);
+    }
+#endif
+
+    template <typename T>
+    cl_int getInfo(cl_program_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetProgramInfo, object_, name, param),
+            __GET_PROGRAM_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_program_info, name>::param_type
+        getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int getBuildInfo(
+        const Device& device, cl_program_build_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetProgramBuildInfo, object_, device(), name, param),
+            __GET_PROGRAM_BUILD_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+        detail::param_traits<detail::cl_program_build_info, name>::param_type
+        getBuildInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_build_info, name>::param_type param;
+        cl_int result = getBuildInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+    {
+        cl_uint numKernels;
+        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
+        err = ::clCreateKernelsInProgram(
+            object_, numKernels, (cl_kernel*) value, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        kernels->assign(&value[0], &value[numKernels]);
+        return CL_SUCCESS;
+    }
+};
+
+#if defined(CL_VERSION_1_2)
+inline Program linkProgram(
+    Program input1,
+    Program input2,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL)
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program programs[2] = { input1(), input2() };
+
+    Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();
+
+    cl_program prog = ::clLinkProgram(
+        ctx(),
+        0,
+        NULL,
+        options,
+        2,
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local, __COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
+
+inline Program linkProgram(
+    VECTOR_CLASS<Program> inputPrograms,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL)
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
+
+    if (programs != NULL) {
+        for (unsigned int i = 0; i < inputPrograms.size(); i++) {
+            programs[i] = inputPrograms[i]();
+        }
+    }
+
+    cl_program prog = ::clLinkProgram(
+        Context::getDefault()(),
+        0,
+        NULL,
+        options,
+        (cl_uint) inputPrograms.size(),
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local, __COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
+#endif
+
+template<>
+inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
+{
+    VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
+    VECTOR_CLASS<char *> binaries;
+    for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s)
+    {
+        char *ptr = NULL;
+        if (*s != 0)
+            ptr = new char[*s];
+        binaries.push_back(ptr);
+    }
+
+    cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
+    if (err != NULL) {
+        *err = result;
+    }
+    return binaries;
+}
+
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+    cl_int error;
+
+    object_ = ::clCreateKernel(program(), name, &error);
+    detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+    if (err != NULL) {
+        *err = error;
+    }
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
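+ *
+ * A minimal dispatch sketch (illustrative; assumes a valid `context`,
+ * `device`, a built `kernel`, and a one-dimensional global size of 1024):
+ * \code
+ * cl_int err;
+ * cl::CommandQueue q(context, device, 0, &err);
+ * err = q.enqueueNDRangeKernel(kernel, cl::NullRange,
+ *                              cl::NDRange(1024), cl::NullRange);
+ * err = q.finish();
+ * \endcode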
+ */ + class CommandQueue : public detail::Wrapper + { + private: + static volatile int default_initialized_; + static CommandQueue default_; + static volatile cl_int default_error_; + public: + CommandQueue( + cl_command_queue_properties properties, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + } + + CommandQueue( + const Context& context, + const Device& device, + cl_command_queue_properties properties = 0, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + + static CommandQueue getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... + while (default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + + default_ = CommandQueue(context, device, 0, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... 
+ default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + CommandQueue() { } + + CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper(commandQueue) { } + + CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper(commandQueue) { } + + CommandQueue& operator = (const CommandQueue& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + CommandQueue& operator = (const cl_command_queue& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + template + cl_int getInfo(cl_command_queue_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetCommandQueueInfo, object_, name, param), + __GET_COMMAND_QUEUE_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_command_queue_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + const void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + ::size_t src_offset, + ::size_t dst_offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBuffer( + object_, src(), dst(), src_offset, dst_offset, size, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQEUE_COPY_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + ::size_t src_row_pitch, + ::size_t src_slice_pitch, + ::size_t dst_row_pitch, + ::size_t dst_slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferRect( + object_, + src(), + dst(), + (const ::size_t *)src_origin, + (const ::size_t *)dst_origin, + (const ::size_t *)region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQEUE_COPY_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill a buffer object with a pattern + * of a given size. The pattern is specified a as vector. + * \tparam PatternType The datatype of the pattern field. + * The pattern type must be an accepted OpenCL data type. 
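+     *
+     * For example (illustrative; fills 256 floats with zero, assuming a
+     * valid `context` and `queue`):
+     * \code
+     * cl::Buffer buf(context, CL_MEM_READ_WRITE, 256 * sizeof(cl_float));
+     * queue.enqueueFillBuffer(buf, 0.0f, 0, 256 * sizeof(cl_float));
+     * \endcode
+     * Note that offset and size must each be a multiple of sizeof(PatternType).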
+ */ + template + cl_int enqueueFillBuffer( + const Buffer& buffer, + PatternType pattern, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillBuffer( + object_, + buffer(), + static_cast(&pattern), + sizeof(PatternType), + offset, + size, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImage( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *)dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA floating-point color value if + * the image channel data type is not an unnormalized signed or + * unsigned data type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_float4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? 
(cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA signed integer color value if + * the image channel data type is an unnormalized signed integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_int4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA unsigned integer color value if + * the image channel data type is an unnormalized unsigned integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_uint4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImageToBuffer( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *) region, dst_offset, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferToImage( + object_, src(), dst(), src_offset, + (const ::size_t *) dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapBuffer( + object_, buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (cl_event*)event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + void* enqueueMapImage( + const Image& buffer, + cl_bool blocking, + cl_map_flags flags, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t * row_pitch, + ::size_t * slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapImage( + object_, buffer(), blocking, flags, + (const ::size_t *) origin, (const ::size_t *) region, + row_pitch, slice_pitch, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (cl_event*)event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + object_, memory(), mapped_ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueues a marker command which waits for either a list of events to complete, + * or all previously enqueued commands to complete. + * + * Enqueues a marker command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command returns an event which can be waited on, + * i.e. this event can be waited on to insure that all events either in the event_wait_list + * or all previously enqueued commands, queued before this command to command_queue, + * have completed. + */ + cl_int enqueueMarkerWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueMarkerWithWaitList( + object_, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MARKER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * A synchronization point that enqueues a barrier operation. + * + * Enqueues a barrier command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command blocks command execution, that is, any + * following commands enqueued after it do not execute until it completes. 
This command + * returns an event which can be waited on, i.e. this event can be waited on to insure that + * all events either in the event_wait_list or all previously enqueued commands, queued + * before this command to command_queue, have completed. + */ + cl_int enqueueBarrierWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueBarrierWithWaitList( + object_, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_BARRIER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command to indicate with which device a set of memory objects + * should be associated. + */ + cl_int enqueueMigrateMemObjects( + const VECTOR_CLASS &memObjects, + cl_mem_migration_flags flags, + const VECTOR_CLASS* events = NULL, + Event* event = NULL + ) + { + cl_event tmp; + + cl_mem* localMemObjects = static_cast(alloca(memObjects.size() * sizeof(cl_mem))); + for (int i = 0; i < (int)memObjects.size(); ++i) { + localMemObjects[i] = memObjects[i](); + } + + + cl_int err = detail::errHandler( + ::clEnqueueMigrateMemObjects( + object_, + (cl_uint)memObjects.size(), + static_cast(localMemObjects), + flags, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueNDRangeKernel( + const Kernel& kernel, + const NDRange& offset, + const NDRange& global, + const NDRange& local = NullRange, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNDRangeKernel( + object_, kernel(), (cl_uint)global.dimensions(), + offset.dimensions() != 0 ? (const ::size_t*) offset : NULL, + (const ::size_t*) global, + local.dimensions() != 0 ? (const ::size_t*) local : NULL, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NDRANGE_KERNEL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueTask( + const Kernel& kernel, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueTask( + object_, kernel(), + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_TASK_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueNativeKernel( + void (CL_CALLBACK *userFptr)(void *), + std::pair args, + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* mem_locs = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) + ? 
+
+    cl_int enqueueNDRangeKernel(
+        const Kernel& kernel,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local = NullRange,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNDRangeKernel(
+                object_, kernel(), (cl_uint)global.dimensions(),
+                offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
+                (const ::size_t*) global,
+                local.dimensions() != 0 ? (const ::size_t*) local : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NDRANGE_KERNEL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueTask(
+        const Kernel& kernel,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueTask(
+                object_, kernel(),
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_TASK_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueNativeKernel(
+        void (CL_CALLBACK *userFptr)(void *),
+        std::pair<void*, ::size_t> args,
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<const void*>* mem_locs = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0)
+            ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
+            : NULL;
+
+        if (mems != NULL) {
+            for (unsigned int i = 0; i < mem_objects->size(); i++) {
+                mems[i] = ((*mem_objects)[i])();
+            }
+        }
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNativeKernel(
+                object_, userFptr, args.first, args.second,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                mems,
+                (mem_locs != NULL) ? (const void **)&mem_locs->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NATIVE_KERNEL);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * Deprecated APIs for 1.2
+     */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueMarker(object_, (cl_event*)event),
+            __ENQUEUE_MARKER_ERR);
+    }
+
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueWaitForEvents(
+                object_,
+                (cl_uint)events.size(),
+                (const cl_event*)&events.front()),
+            __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+    cl_int enqueueAcquireGLObjects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueAcquireGLObjects(
+                object_,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueReleaseGLObjects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReleaseGLObjects(
+                object_,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined (USE_DX_INTEROP)
+    typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+        cl_command_queue command_queue, cl_uint num_objects,
+        const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+        const cl_event* event_wait_list, cl_event* event);
+    typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+        cl_command_queue command_queue, cl_uint num_objects,
+        const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+        const cl_event* event_wait_list, cl_event* event);
+
+    cl_int enqueueAcquireD3D10Objects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            pfn_clEnqueueAcquireD3D10ObjectsKHR(
+                object_,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueReleaseD3D10Objects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_2)
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_1)
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            pfn_clEnqueueReleaseD3D10ObjectsKHR(
+                object_,
+                (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL,
+                (events != NULL) ? (cl_uint)events->size() : 0,
+                (events != NULL) ? (cl_event*)&events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+#endif
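+
+    /*
+     * Usage sketch of the shared-object handshake (`queue`, `kernel` and a
+     * GL-backed `sharedBuf` are assumed names; the D3D10 path above is
+     * analogous): acquire before the kernel touches the object, release after.
+     *
+     *     VECTOR_CLASS<Memory> shared;
+     *     shared.push_back(sharedBuf);
+     *     queue.enqueueAcquireGLObjects(&shared);   // GL must be finished with the object
+     *     queue.enqueueNDRangeKernel(kernel, NullRange, NDRange(4096));
+     *     queue.enqueueReleaseGLObjects(&shared);   // hand the object back to GL
+     *     queue.finish();
+     */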
+
+    /**
+     * Deprecated APIs for 1.2
+     */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueBarrier(object_),
+            __ENQUEUE_BARRIER_ERR);
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+    cl_int flush() const
+    {
+        return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
+    }
+
+    cl_int finish() const
+    {
+        return detail::errHandler(::clFinish(object_), __FINISH_ERR);
+    }
+};
+
+#ifdef _WIN32
+__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) CommandQueue CommandQueue::default_;
+__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) CommandQueue CommandQueue::default_;
+__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#endif
+
+inline cl_int enqueueReadBuffer(
+    const Buffer& buffer,
+    cl_bool blocking,
+    ::size_t offset,
+    ::size_t size,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline cl_int enqueueWriteBuffer(
+    const Buffer& buffer,
+    cl_bool blocking,
+    ::size_t offset,
+    ::size_t size,
+    const void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline void* enqueueMapBuffer(
+    const Buffer& buffer,
+    cl_bool blocking,
+    cl_map_flags flags,
+    ::size_t offset,
+    ::size_t size,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL,
+    cl_int* err = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+
+    void * result = ::clEnqueueMapBuffer(
+        queue(), buffer(), blocking, flags, offset, size,
+        (events != NULL) ? (cl_uint)events->size() : 0,
+        (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+        (cl_event*)event,
+        &error);
+
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+    return result;
+}
+
+inline cl_int enqueueUnmapMemObject(
+    const Memory& memory,
+    void* mapped_ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    cl_event tmp;
+    cl_int err = detail::errHandler(
+        ::clEnqueueUnmapMemObject(
+            queue(), memory(), mapped_ptr,
+            (events != NULL) ? (cl_uint)events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+            (event != NULL) ? &tmp : NULL),
+        __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+    if (event != NULL && err == CL_SUCCESS)
+        *event = tmp;
+
+    return err;
+}
+
+inline cl_int enqueueCopyBuffer(
+    const Buffer& src,
+    const Buffer& dst,
+    ::size_t src_offset,
+    ::size_t dst_offset,
+    ::size_t size,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
+}
+
+/**
+ * Blocking copy operation from iterators into a buffer.
+ */
+template< typename IteratorType >
+inline cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer)
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+
+    ::size_t length = endIterator - startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer =
+        static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+#if defined(_MSC_VER)
+    std::copy(
+        startIterator,
+        endIterator,
+        stdext::checked_array_iterator<DataType*>(
+            pointer, length));
+#else
+    std::copy(startIterator, endIterator, pointer);
+#endif
+    Event endEvent;
+    error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
+
+/**
+ * Blocking copy operation from a buffer into iterators.
+ */
+template< typename IteratorType >
+inline cl_int copy(const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator)
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+
+    ::size_t length = endIterator - startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer =
+        static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+    std::copy(pointer, pointer + length, startIterator);
+    Event endEvent;
+    error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
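+
+/*
+ * Usage sketch (`context` is an assumed name; the default command queue is
+ * assumed to be initialized): round-tripping a host vector through a buffer
+ * with the blocking copy() helpers defined above.
+ *
+ *     std::vector<int> src(256, 42), dst(256);
+ *     Buffer buf(context, CL_MEM_READ_WRITE, 256 * sizeof(int));
+ *     cl::copy(src.begin(), src.end(), buf);   // host -> device via map/unmap
+ *     cl::copy(buf, dst.begin(), dst.end());   // device -> host via map/unmap
+ */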
+
+#if defined(CL_VERSION_1_1)
+inline cl_int enqueueReadBufferRect(
+    const Buffer& buffer, cl_bool blocking,
+    const size_t<3>& buffer_offset, const size_t<3>& host_offset, const size_t<3>& region,
+    ::size_t buffer_row_pitch, ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch, ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBufferRect(
+        buffer, blocking, buffer_offset, host_offset, region,
+        buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+        ptr, events, event);
+}
+
+inline cl_int enqueueWriteBufferRect(
+    const Buffer& buffer, cl_bool blocking,
+    const size_t<3>& buffer_offset, const size_t<3>& host_offset, const size_t<3>& region,
+    ::size_t buffer_row_pitch, ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch, ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBufferRect(
+        buffer, blocking, buffer_offset, host_offset, region,
+        buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+        ptr, events, event);
+}
+
+inline cl_int enqueueCopyBufferRect(
+    const Buffer& src, const Buffer& dst,
+    const size_t<3>& src_origin, const size_t<3>& dst_origin, const size_t<3>& region,
+    ::size_t src_row_pitch, ::size_t src_slice_pitch,
+    ::size_t dst_row_pitch, ::size_t dst_slice_pitch,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferRect(
+        src, dst, src_origin, dst_origin, region,
+        src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
+        events, event);
+}
+#endif
+
+inline cl_int enqueueReadImage(
+    const Image& image, cl_bool blocking,
+    const size_t<3>& origin, const size_t<3>& region,
+    ::size_t row_pitch, ::size_t slice_pitch, void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadImage(image, blocking, origin, region, row_pitch, slice_pitch, ptr, events, event);
+}
+
+inline cl_int enqueueWriteImage(
+    const Image& image, cl_bool blocking,
+    const size_t<3>& origin, const size_t<3>& region,
+    ::size_t row_pitch, ::size_t slice_pitch, void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteImage(image, blocking, origin, region, row_pitch, slice_pitch, ptr, events, event);
+}
+
+inline cl_int enqueueCopyImage(
+    const Image& src, const Image& dst,
+    const size_t<3>& src_origin, const size_t<3>& dst_origin, const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImage(src, dst, src_origin, dst_origin, region, events, event);
+}
+
+inline cl_int enqueueCopyImageToBuffer(
+    const Image& src, const Buffer& dst,
+    const size_t<3>& src_origin, const size_t<3>& region, ::size_t dst_offset,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImageToBuffer(src, dst, src_origin, region, dst_offset, events, event);
+}
+
+inline cl_int enqueueCopyBufferToImage(
+    const Buffer& src, const Image& dst,
+    ::size_t src_offset, const size_t<3>& dst_origin, const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferToImage(src, dst, src_offset, dst_origin, region, events, event);
+}
+
+inline cl_int flush(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.flush();
+}
+
+inline cl_int finish(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.finish();
+}
+
+// Kernel Functor support
+// New interface as of September 2011
+// Requires std::function from C++11 (std::tr1::function is not supported)
+// Minimum compilers: Visual Studio 2010 and GCC 4.2
+
+struct EnqueueArgs
+{
+    CommandQueue queue_;
+    const NDRange offset_;
+    const NDRange global_;
+    const NDRange local_;
+    VECTOR_CLASS<Event> events_;
+
+    EnqueueArgs(NDRange global) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(NullRange)
+    { }
+
+    EnqueueArgs(NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(local)
+    { }
+
+    EnqueueArgs(NDRange offset, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(offset), global_(global), local_(local)
+    { }
+
+    EnqueueArgs(Event e, NDRange global) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(NullRange)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(offset), global_(global), local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(NullRange),
+        events_(events)
+    { }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(NullRange), global_(global), local_(local),
+        events_(events)
+    { }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
+        queue_(CommandQueue::getDefault()),
+        offset_(offset), global_(global), local_(local),
+        events_(events)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, NDRange global) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(NullRange)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(local)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(offset), global_(global), local_(local)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(NullRange)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(offset), global_(global), local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(NullRange),
+        events_(events)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(NullRange), global_(global), local_(local),
+        events_(events)
+    { }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
+        queue_(queue),
+        offset_(offset), global_(global), local_(local),
+        events_(events)
+    { }
+};
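+
+/*
+ * Usage sketch: EnqueueArgs bundles the launch configuration a kernel functor
+ * needs (`queue` and `evt` are assumed names).
+ *
+ *     EnqueueArgs simple(NDRange(1 << 20));                      // default queue, global size only
+ *     EnqueueArgs shaped(queue, NDRange(1 << 20), NDRange(256)); // explicit queue, global + local
+ *     EnqueueArgs gated(evt, NDRange(1 << 20));                  // wait on evt before launching
+ */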
+
+namespace detail {
+
+class NullType {};
+
+template<int index, typename T0>
+struct SetArg
+{
+    static void set(Kernel kernel, T0 arg)
+    {
+        kernel.setArg(index, arg);
+    }
+};
+
+template<int index>
+struct SetArg<index, NullType>
+{
+    static void set(Kernel, NullType)
+    {
+    }
+};
+
+template <
+    typename T0, typename T1, typename T2, typename T3,
+    typename T4, typename T5, typename T6, typename T7,
+    typename T8, typename T9, typename T10, typename T11,
+    typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19,
+    typename T20, typename T21, typename T22, typename T23,
+    typename T24, typename T25, typename T26, typename T27,
+    typename T28, typename T29, typename T30, typename T31
+>
+class KernelFunctorGlobal
+{
+private:
+    Kernel kernel_;
+
+public:
+    KernelFunctorGlobal(
+        Kernel kernel) :
+        kernel_(kernel)
+    {}
+
+    KernelFunctorGlobal(
+        const Program& program,
+        const STRING_CLASS name,
+        cl_int * err = NULL) :
+        kernel_(program, name.c_str(), err)
+    {}
+
+    Event operator() (
+        const EnqueueArgs& args,
+        T0 t0,
+        T1 t1 = NullType(), T2 t2 = NullType(), T3 t3 = NullType(),
+        T4 t4 = NullType(), T5 t5 = NullType(), T6 t6 = NullType(),
+        T7 t7 = NullType(), T8 t8 = NullType(), T9 t9 = NullType(),
+        T10 t10 = NullType(), T11 t11 = NullType(), T12 t12 = NullType(),
+        T13 t13 = NullType(), T14 t14 = NullType(), T15 t15 = NullType(),
+        T16 t16 = NullType(), T17 t17 = NullType(), T18 t18 = NullType(),
+        T19 t19 = NullType(), T20 t20 = NullType(), T21 t21 = NullType(),
+        T22 t22 = NullType(), T23 t23 = NullType(), T24 t24 = NullType(),
+        T25 t25 = NullType(), T26 t26 = NullType(), T27 t27 = NullType(),
+        T28 t28 = NullType(), T29 t29 = NullType(), T30 t30 = NullType(),
+        T31 t31 = NullType())
+    {
+        Event event;
+        SetArg<0, T0>::set(kernel_, t0);
+        SetArg<1, T1>::set(kernel_, t1);
+        SetArg<2, T2>::set(kernel_, t2);
+        SetArg<3, T3>::set(kernel_, t3);
+        SetArg<4, T4>::set(kernel_, t4);
+        SetArg<5, T5>::set(kernel_, t5);
+        SetArg<6, T6>::set(kernel_, t6);
+        SetArg<7, T7>::set(kernel_, t7);
+        SetArg<8, T8>::set(kernel_, t8);
+        SetArg<9, T9>::set(kernel_, t9);
+        SetArg<10, T10>::set(kernel_, t10);
+        SetArg<11, T11>::set(kernel_, t11);
+        SetArg<12, T12>::set(kernel_, t12);
+        SetArg<13, T13>::set(kernel_, t13);
+        SetArg<14, T14>::set(kernel_, t14);
+        SetArg<15, T15>::set(kernel_, t15);
+        SetArg<16, T16>::set(kernel_, t16);
+        SetArg<17, T17>::set(kernel_, t17);
+        SetArg<18, T18>::set(kernel_, t18);
+        SetArg<19, T19>::set(kernel_, t19);
+        SetArg<20, T20>::set(kernel_, t20);
+        SetArg<21, T21>::set(kernel_, t21);
+        SetArg<22, T22>::set(kernel_, t22);
+        SetArg<23, T23>::set(kernel_, t23);
+        SetArg<24, T24>::set(kernel_, t24);
+        SetArg<25, T25>::set(kernel_, t25);
+        SetArg<26, T26>::set(kernel_, t26);
+        SetArg<27, T27>::set(kernel_, t27);
+        SetArg<28, T28>::set(kernel_, t28);
+        SetArg<29, T29>::set(kernel_, t29);
+        SetArg<30, T30>::set(kernel_, t30);
+        SetArg<31, T31>::set(kernel_, t31);
+
+        args.queue_.enqueueNDRangeKernel(
+            kernel_,
+            args.offset_,
+            args.global_,
+            args.local_,
+            &args.events_,
+            &event);
+
+        return event;
+    }
+
+};
+
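+/*
+ * Sketch of how this machinery is meant to be driven (assuming the usual
+ * make_kernel-style front-end that the functionImplementation_ specializations
+ * below feed; `program` with a kernel "vadd", plus `aBuf`/`bBuf`, are assumed
+ * names): trailing parameters default to NullType, whose SetArg specialization
+ * is a no-op, so one 32-slot functor serves kernels of any smaller arity.
+ *
+ *     cl::make_kernel<Buffer, Buffer> vadd(program, "vadd");
+ *     Event done = vadd(EnqueueArgs(NDRange(1024)), aBuf, bBuf);
+ *     done.wait();
+ */
+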
//------------------------------------------------------------------------------------------------------ + + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30, + typename T31> + struct functionImplementation_ + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30, + T31 arg31) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30, + arg31); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, 
+ T29, + T30, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. 
If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! 
\brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4); + } + + + }; + + template< + typename T0, + typename T1, + typename T2, + typename T3> + struct functionImplementation_ + < T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3); + } + + + }; + + template< + typename T0, + typename T1, + typename T2> + struct functionImplementation_ + < T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! 
\brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2); + } + + + }; + + template< + typename T0, + typename T1> + struct functionImplementation_ + < T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1) + { + return functor_( + enqueueArgs, + arg0, + arg1); + } + + + }; + + template< + typename T0> + struct functionImplementation_ + < T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> + { + typedef detail::KernelFunctorGlobal< + T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + +#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); +#endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0) + { + return functor_( + enqueueArgs, + arg0); + } + + + }; + + + + + + } // namespace detail + + //---------------------------------------------------------------------------------------------- + + template < + typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType, + typename T3 = detail::NullType, typename T4 = detail::NullType, + typename T5 = detail::NullType, typename T6 = detail::NullType, + typename T7 = detail::NullType, typename T8 = detail::NullType, + typename T9 = detail::NullType, typename T10 = detail::NullType, + typename T11 = detail::NullType, typename T12 = detail::NullType, + typename T13 = detail::NullType, typename T14 = detail::NullType, + typename T15 = detail::NullType, typename T16 = detail::NullType, + typename T17 = detail::NullType, typename T18 = detail::NullType, + typename T19 = detail::NullType, typename T20 = detail::NullType, + typename T21 = detail::NullType, typename T22 = detail::NullType, + typename T23 = detail::NullType, typename T24 = detail::NullType, + typename T25 = detail::NullType, typename T26 = detail::NullType, + typename T27 = detail::NullType, typename T28 = detail::NullType, + typename T29 = detail::NullType, typename T30 = detail::NullType, + typename T31 = detail::NullType + > + struct make_kernel : + public detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + > + { + public: + typedef detail::KernelFunctorGlobal< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + > FunctorType; + + make_kernel( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(program, name, err)) + {} + + make_kernel( + const Kernel kernel) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(kernel)) + {} + }; + + + //---------------------------------------------------------------------------------------------------------------------- + +#undef __ERR_STR +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#undef __GET_DEVICE_INFO_ERR +#undef __GET_PLATFORM_INFO_ERR +#undef __GET_DEVICE_IDS_ERR +#undef __GET_CONTEXT_INFO_ERR +#undef __GET_EVENT_INFO_ERR +#undef __GET_EVENT_PROFILE_INFO_ERR +#undef __GET_MEM_OBJECT_INFO_ERR +#undef __GET_IMAGE_INFO_ERR +#undef __GET_SAMPLER_INFO_ERR +#undef __GET_KERNEL_INFO_ERR +#undef __GET_KERNEL_ARG_INFO_ERR +#undef __GET_KERNEL_WORK_GROUP_INFO_ERR +#undef __GET_PROGRAM_INFO_ERR +#undef __GET_PROGRAM_BUILD_INFO_ERR +#undef __GET_COMMAND_QUEUE_INFO_ERR + +#undef __CREATE_CONTEXT_ERR +#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR + +#undef __CREATE_BUFFER_ERR +#undef __CREATE_SUBBUFFER_ERR +#undef __CREATE_IMAGE2D_ERR +#undef __CREATE_IMAGE3D_ERR +#undef __CREATE_SAMPLER_ERR +#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR + +#undef __CREATE_USER_EVENT_ERR +#undef __SET_USER_EVENT_STATUS_ERR +#undef 
__SET_EVENT_CALLBACK_ERR +#undef __SET_PRINTF_CALLBACK_ERR + +#undef __WAIT_FOR_EVENTS_ERR + +#undef __CREATE_KERNEL_ERR +#undef __SET_KERNEL_ARGS_ERR +#undef __CREATE_PROGRAM_WITH_SOURCE_ERR +#undef __CREATE_PROGRAM_WITH_BINARY_ERR +#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR +#undef __BUILD_PROGRAM_ERR +#undef __CREATE_KERNELS_IN_PROGRAM_ERR + +#undef __CREATE_COMMAND_QUEUE_ERR +#undef __SET_COMMAND_QUEUE_PROPERTY_ERR +#undef __ENQUEUE_READ_BUFFER_ERR +#undef __ENQUEUE_WRITE_BUFFER_ERR +#undef __ENQUEUE_READ_BUFFER_RECT_ERR +#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR +#undef __ENQEUE_COPY_BUFFER_ERR +#undef __ENQEUE_COPY_BUFFER_RECT_ERR +#undef __ENQUEUE_READ_IMAGE_ERR +#undef __ENQUEUE_WRITE_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR +#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR +#undef __ENQUEUE_MAP_BUFFER_ERR +#undef __ENQUEUE_MAP_IMAGE_ERR +#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR +#undef __ENQUEUE_NDRANGE_KERNEL_ERR +#undef __ENQUEUE_TASK_ERR +#undef __ENQUEUE_NATIVE_KERNEL + +#undef __CL_EXPLICIT_CONSTRUCTORS + +#undef __UNLOAD_COMPILER_ERR +#endif //__CL_USER_OVERRIDE_ERROR_STRINGS + +#undef __CL_FUNCTION_TYPE + + // Extensions + /** + * Deprecated APIs for 1.2 + */ +#if defined(CL_VERSION_1_1) +#undef __INIT_CL_EXT_FCN_PTR +#endif // #if defined(CL_VERSION_1_1) +#undef __CREATE_SUB_DEVICES + +#if defined(USE_CL_DEVICE_FISSION) +#undef __PARAM_NAME_DEVICE_FISSION +#endif // USE_CL_DEVICE_FISSION + +#undef __DEFAULT_NOT_INITIALIZED +#undef __DEFAULT_BEING_INITIALIZED +#undef __DEFAULT_INITIALIZED + +} // namespace cl + +#ifdef _WIN32 +#pragma pop_macro("max") +#endif // _WIN32 + +#endif // CL_HPP_ diff --git a/contrib/ocl/crypto/blake.hpp b/contrib/ocl/crypto/blake.hpp new file mode 100644 index 000000000..230cad2ef --- /dev/null +++ b/contrib/ocl/crypto/blake.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include + +namespace ocl { +namespace crypto { + +typedef struct blake2b_state_s +{ + uint64_t h[8]; + uint64_t bytes; +} blake2b_state_t; + +inline void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k) { + using namespace crypto_detail; + + assert(n > k); + assert(hash_len <= 64); + st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); + for (uint32_t i = 1; i <= 5; i++) + st->h[i] = blake2b_iv[i]; + st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; + st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); + st->bytes = 0; +} + +inline void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, uint32_t msg_len, uint32_t is_final) { + using namespace crypto_detail; + + const uint64_t *m = (const uint64_t *)_msg; + uint64_t v[16]; + assert(msg_len <= 128); + assert(st->bytes <= UINT64_MAX - msg_len); + memcpy(v + 0, st->h, 8 * sizeof (*v)); + memcpy(v + 8, blake2b_iv, 8 * sizeof (*v)); + v[12] ^= (st->bytes += msg_len); + v[14] ^= is_final ? 
-1 : 0; + for (uint32_t round = 0; round < blake2b_rounds; round++) + { + const uint8_t *s = blake2b_sigma[round]; + mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]); + mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]); + mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]); + mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]); + mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]); + mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]); + mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]); + mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]); + } + for (uint32_t i = 0; i < 8; i++) + st->h[i] ^= v[i] ^ v[i + 8]; +} + +inline void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen) { + assert(outlen <= 64); + memcpy(out, st->h, outlen); +} + +} +} \ No newline at end of file diff --git a/contrib/ocl/crypto/detail/blake.hpp b/contrib/ocl/crypto/detail/blake.hpp new file mode 100644 index 000000000..cd21d4c01 --- /dev/null +++ b/contrib/ocl/crypto/detail/blake.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include + +namespace ocl { + namespace crypto { + namespace crypto_detail { + + inline uint64_t rotr64(uint64_t a, uint8_t bits) + { + return (a >> bits) | (a << (64 - bits)); + } + + inline void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd, + uint64_t x, uint64_t y) + { + *va = (*va + *vb + x); + *vd = rotr64(*vd ^ *va, 32); + *vc = (*vc + *vd); + *vb = rotr64(*vb ^ *vc, 24); + *va = (*va + *vb + y); + *vd = rotr64(*vd ^ *va, 16); + *vc = (*vc + *vd); + *vb = rotr64(*vb ^ *vc, 63); + } + + static const uint32_t blake2b_block_len = 128; + static const uint32_t blake2b_rounds = 12; + static const uint64_t blake2b_iv[8] = + { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, + }; + static const uint8_t blake2b_sigma[12][16] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + }; + + } + } +} \ No newline at end of file diff --git a/contrib/ocl/hex.hpp b/contrib/ocl/hex.hpp new file mode 100644 index 000000000..056286171 --- /dev/null +++ b/contrib/ocl/hex.hpp @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace ocl { + +inline void hexdump(uint8_t *a, uint32_t a_len) +{ + for (uint32_t i = 0; i < a_len; i++) + fprintf(stderr, "%02x", a[i]); +} + +inline char *s_hexdump(const void *_a, uint32_t a_len) +{ + const uint8_t *a = (uint8_t *)_a; + static char buf[1024]; + uint32_t i; + for (i = 0; i < a_len && i + 2 < sizeof(buf); i++) + sprintf(buf + i * 2, "%02x", a[i]); + buf[i * 2] = 0; + return buf; +} + +inline uint8_t hex2val(const char *base, size_t off) +{ + const char c = base[off]; + if (c >= '0' && c <= '9') return c - '0'; + else if (c >= 'a' && c <= 'f') return 10 + c - 'a'; + else if (c >= 'A' && c <= 'F') return 10 + c - 'A'; + printf("Invalid hex 
char at offset %zd: ...%c...\n", off, c); + return 0; +} + +} diff --git a/contrib/ocl/include/blake.hpp b/contrib/ocl/include/blake.hpp new file mode 100644 index 000000000..509500256 --- /dev/null +++ b/contrib/ocl/include/blake.hpp @@ -0,0 +1,103 @@ +#pragma once +#include +#include +#include + +namespace gg { + namespace impl { + static const uint32_t blake2b_block_len = 128; + static const uint32_t blake2b_rounds = 12; + static const uint64_t blake2b_iv[8] = + { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, + }; + static const uint8_t blake2b_sigma[12][16] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + }; + + inline uint64_t rotr64(uint64_t a, uint8_t bits) + { + return (a >> bits) | (a << (64 - bits)); + } + + inline void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd, + uint64_t x, uint64_t y) + { + *va = (*va + *vb + x); + *vd = rotr64(*vd ^ *va, 32); + *vc = (*vc + *vd); + *vb = rotr64(*vb ^ *vc, 24); + *va = (*va + *vb + y); + *vd = rotr64(*vd ^ *va, 16); + *vc = (*vc + *vd); + *vb = rotr64(*vb ^ *vc, 63); + } + } + + typedef struct blake2b_state_s + { + uint64_t h[8]; + uint64_t bytes; + } blake2b_state_t; + + inline void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k) + { + assert(n > k); + assert(hash_len <= 64); + st->h[0] = impl::blake2b_iv[0] ^ (0x01010000 | hash_len); + for (uint32_t i = 1; i <= 5; i++) + st->h[i] = impl::blake2b_iv[i]; + st->h[6] = impl::blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; + st->h[7] = impl::blake2b_iv[7] ^ (((uint64_t)k << 32) | n); + st->bytes = 0; + } + + inline void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, uint32_t msg_len, uint32_t is_final) + { + using namespace gg::impl; + + const uint64_t *m = (const uint64_t *)_msg; + uint64_t v[16]; + assert(msg_len <= 128); + assert(st->bytes <= UINT64_MAX - msg_len); + memcpy(v + 0, st->h, 8 * sizeof(*v)); + memcpy(v + 8, blake2b_iv, 8 * sizeof(*v)); + v[12] ^= (st->bytes += msg_len); + v[14] ^= is_final ? 
-1 : 0;
+		for (uint32_t round = 0; round < impl::blake2b_rounds; round++) {
+			const uint8_t *s = blake2b_sigma[round];
+			mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]);
+			mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]);
+			mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]);
+			mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]);
+			mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]);
+			mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]);
+			mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]);
+			mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]);
+		}
+		for (uint32_t i = 0; i < 8; i++) {
+			st->h[i] ^= v[i] ^ v[i + 8];
+		}
+	}
+
+	inline void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
+	{
+		assert(outlen <= 64);
+		memcpy(out, st->h, outlen);
+	}
+}
\ No newline at end of file
diff --git a/contrib/ocl/include/ocl_gatelessgate.hpp b/contrib/ocl/include/ocl_gatelessgate.hpp
new file mode 100644
index 000000000..69fcfdaab
--- /dev/null
+++ b/contrib/ocl/include/ocl_gatelessgate.hpp
@@ -0,0 +1,73 @@
+#pragma once
+
+#include "ocl_gg_context.hpp"
+#include <functional>
+
+namespace gg {
+namespace impl {
+
+}
+	struct ocl_gatelessgate
+	{
+		//int threadsperblock;
+		int blocks;
+		int device_id;
+		int platform_id;
+
+		ocl_gg_context* oclc;
+		// threads
+		unsigned threadsNum; // TMP
+		unsigned wokrsize;
+
+		bool is_init_success = false;
+
+		ocl_gatelessgate(int platf_id, int dev_id)
+			: blocks(0)
+			, device_id(dev_id)
+			, platform_id(platf_id)
+			, oclc(nullptr)
+			, threadsNum(8192U)
+			, wokrsize(128U)
+			, is_init_success(false)
+		{
+		}
+
+		std::string getdevinfo() {
+			static auto devices = GetAllDevices();
+			auto device = devices[device_id];
+			std::vector<char> name(256, 0);
+			size_t nActualSize = 0;
+			std::string gpu_name;
+
+			cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+			gpu_name.assign(&name[0], nActualSize);
+
+			return "GPU_ID( " + gpu_name + ")";
+		}
+
+		static int getcount();
+
+		static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version);
+
+		static void start(ocl_gatelessgate& device_context);
+
+		static void stop(ocl_gatelessgate& device_context);
+
+		static void solve(const char *tequihash_header,
+			unsigned int tequihash_header_len,
+			const char* nonce,
+			unsigned int nonce_len,
+			std::function<bool()> cancelf,
+			std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+			std::function<void(void)> hashdonef,
+			ocl_gatelessgate& device_context);
+
+		std::string getname() { return "OCL_GATELESSGATE"; }
+
+	private:
+		std::string m_gpu_name;
+		std::string m_version;
+	};
+
+}
\ No newline at end of file
diff --git a/contrib/ocl/include/ocl_gg_context.hpp b/contrib/ocl/include/ocl_gg_context.hpp
new file mode 100644
index 000000000..162d96196
--- /dev/null
+++ b/contrib/ocl/include/ocl_gg_context.hpp
@@ -0,0 +1,34 @@
+#pragma once
+#include "param.h"
+#include <CL/cl.h>
+
+struct ocl_gg_context {
+	cl_context _context;
+	cl_program _program;
+	cl_device_id _dev_id;
+
+	cl_platform_id platform_id = 0;
+
+	cl_command_queue queue;
+
+	cl_kernel k_init_ht;
+	cl_kernel k_rounds[PARAM_K];
+	cl_kernel k_sols;
+
+	cl_mem buf_ht[2], buf_sols, buf_dbg, rowCounters[2];
+	size_t global_ws;
+	size_t local_work_size = 64;
+
+	sols_t *sols;
+
+	bool init(cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock);
+
+	~ocl_gg_context() {
+		clReleaseMemObject(buf_dbg);
+		clReleaseMemObject(buf_ht[0]);
+		clReleaseMemObject(buf_ht[1]);
+		clReleaseMemObject(rowCounters[0]);
+		clReleaseMemObject(rowCounters[1]);
+		free(sols);
+	}
+};
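For orientation, the solver declared above is driven the same way as the other nheqminer backends: the stratum layer hands it a header and nonce plus three callbacks. A minimal sketch of such a call is shown here; the callback shapes come from the declaration above, while the wrapper function, buffers, and prior start()/context setup are assumed, not part of the patch.

    // Hypothetical driver for ocl_gatelessgate::solve (from the header above).
    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <vector>

    void example_solve_call(ocl_gatelessgate& ctx,
                            const char* header, unsigned header_len,
                            const char* nonce, unsigned nonce_len)
    {
        ocl_gatelessgate::solve(
            header, header_len, nonce, nonce_len,
            []() { return false; },                    // cancelf: never cancel
            [](const std::vector<uint32_t>& indices,   // solutionf: one solution
               size_t cbitlen, const unsigned char* compressed)
            {
                std::printf("solution with %zu indices\n", indices.size());
                (void)cbitlen; (void)compressed;
            },
            []() { /* hashdonef: one full Equihash run finished */ },
            ctx);
    }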
diff --git a/contrib/ocl/include/param.h b/contrib/ocl/include/param.h
new file mode 100644
index 000000000..fd08ba0e0
--- /dev/null
+++ b/contrib/ocl/include/param.h
@@ -0,0 +1,198 @@
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//#define ENABLE_DEBUG
+
+#define NR_ROWS_LOG 15
+#define NR_SLOTS 120
+#define LOCAL_WORK_SIZE 256
+#define THREADS_PER_ROW 256
+#define LOCAL_WORK_SIZE_SOLS 64
+#define THREADS_PER_ROW_SOLS 64
+#define GLOBAL_WORK_SIZE_RATIO 512
+#define SLOT_CACHE_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE/THREADS_PER_ROW) * 75 / 100)
+#define LDS_COLL_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE / THREADS_PER_ROW) * 120 / 100)
+
+#define SLOT_CACHE_INDEX_TYPE uchar
+
+#define PARAM_N 200
+#define PARAM_K 9
+#define PREFIX (PARAM_N / (PARAM_K + 1))
+#define NR_INPUTS (1 << PREFIX)
+// Approximate log base 2 of number of elements in hash tables
+#define APX_NR_ELMS_LOG (PREFIX + 1)
+
+// Setting this to 1 might make Gateless Gate faster, see TROUBLESHOOTING.md
+#define OPTIM_SIMPLIFY_ROUND 1
+
+// Ratio of time of sleeping before rechecking if task is done (0-1)
+#define SLEEP_RECHECK_RATIO 0.60
+// Ratio of time to busy wait for the solution (0-1)
+// The higher the value, the higher the CPU usage with Nvidia
+#define SLEEP_SKIP_RATIO 0.005
+
+// Make hash tables OVERHEAD times larger than necessary to store the average
+// number of elements per row. The ideal value is as small as possible to
+// reduce memory usage, but not too small or else elements are dropped from the
+// hash tables.
+//
+// The actual number of elements per row is closer to the theoretical average
+// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be
+// smaller.
+//
+// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
+// performance as they cause VRAM channel conflicts.
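For concreteness: with Zcash's n = 200, k = 9 these parameters give PREFIX = 20, roughly 2^21 hash-table elements per round (APX_NR_ELMS_LOG = 21), an average of 64 elements per 32768-row table, and a 120 MiB table. A quick host-side check of that arithmetic, mirroring the macros (SLOT_LEN = 32 is defined further down in this header); the #if chain that follows then selects OVERHEAD per NR_ROWS_LOG:

    // Sanity check of the geometry implied by param.h (values hard-coded
    // here to match the macros above; this is a sketch, not part of the patch).
    #include <cstdio>

    int main()
    {
        const long prefix   = 200 / (9 + 1);        // PREFIX = 20
        const long elements = 1L << (prefix + 1);   // ~2^21, cf. APX_NR_ELMS_LOG
        const long nr_rows  = 1L << 15;             // NR_ROWS_LOG = 15
        const long nr_slots = 120, slot_len = 32;

        std::printf("avg elements per row: %ld\n", elements / nr_rows);  // 64
        std::printf("one hash table: %ld MiB\n",
                    nr_rows * nr_slots * slot_len >> 20);                // 120 MiB
        return 0;
    }

NR_SLOTS = 120 against an average of 64 is where the roughly 2x OVERHEAD headroom described above comes from.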
+#if NR_ROWS_LOG <= 16 +#define OVERHEAD 2 +#elif NR_ROWS_LOG == 17 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 18 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 19 +#define OVERHEAD 5 +#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND +#define OVERHEAD 6 +#elif NR_ROWS_LOG == 20 +#define OVERHEAD 9 +#endif + +#define NR_ROWS (1 << NR_ROWS_LOG) +#ifndef NR_SLOTS +#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) +#endif +// Length of 1 element (slot) in byte +#define SLOT_LEN 32 +// Total size of hash table +#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) +// Length of Zcash block header, nonce (part of header) +#define ZCASH_BLOCK_HEADER_LEN 140 +// Offset of nTime in header +#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) +// Length of nonce +#define ZCASH_NONCE_LEN 32 +// Length of encoded representation of solution size +#define ZCASH_SOLSIZE_LEN 3 +// Solution size (1344 = 0x540) represented as a compact integer, in hex +#define ZCASH_SOLSIZE_HEX "fd4005" +// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) +#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) +// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization +#define N_ZERO_BYTES 12 +// Number of bytes Zcash needs out of Blake +#define ZCASH_HASH_LEN 50 +// Number of wavefronts per SIMD for the Blake kernel. +// Blake is ALU-bound (beside the atomic counter being incremented) so we need +// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer +// instructions. 10 is the max supported by the hw. +#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 10 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot. 
+*/
+#define xi_offset_for_round(round) 4
+
+// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
+#define SOL_SIZE ((1 << PARAM_K) * 4)
+typedef struct sols_s
+{
+	uint nr;
+	uint likely_invalids;
+	uchar valid[MAX_SOLS];
+	uint values[MAX_SOLS][(1 << PARAM_K)];
+} sols_t;
+
+#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+	((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
+#define DECODE_ROW(REF) (REF >> 16)
+#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
+#define DECODE_SLOT0(REF) (REF & 0xff)
+
+#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+	((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
+#define DECODE_ROW(REF) (REF >> 14)
+#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
+#define DECODE_SLOT0(REF) (REF & 0x7f)
+
+#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+	((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
+#define DECODE_ROW(REF) (REF >> 13)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+	((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
+#define DECODE_ROW(REF) (REF >> 12)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#else
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+// Windows only for now
+#define DEFAULT_NUM_MINING_MODE_THREADS 1
+#define MAX_NUM_MINING_MODE_THREADS 16
+
+#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN / 2)
+#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O1"
+#define OPENCL_BUILD_OPTIONS "-I.. -I."
\ No newline at end of file
diff --git a/contrib/ocl/kernels/gatelessgate.cl b/contrib/ocl/kernels/gatelessgate.cl
new file mode 100644
index 000000000..69d842039
--- /dev/null
+++ b/contrib/ocl/kernels/gatelessgate.cl
@@ -0,0 +1,993 @@
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
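Before the kernel source proper, it is worth seeing how the host consumes the sols_t that param.h declares above: the sols kernel bumps nr for every candidate it finds (possibly past MAX_SOLS), and in SILENTARMY-derived miners a host-side verification pass fills valid[]. A sketch of the host-side walk, assuming those conventions; the function name and output are illustrative only:

    // Hypothetical host-side walk over a sols_t read back from the device.
    #include <cstdint>
    #include <cstdio>

    static void report_solutions(const sols_t *sols)
    {
        uint32_t nr = sols->nr;
        if (nr > MAX_SOLS)
            nr = MAX_SOLS;              // extra candidates beyond MAX_SOLS are dropped
        for (uint32_t i = 0; i < nr; i++) {
            if (!sols->valid[i])
                continue;               // failed host-side verification
            // values[i][0 .. (1 << PARAM_K) - 1] are the 512 input indices
            std::printf("sol %u: first index %u\n", i, sols->values[i][0]);
        }
    }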
+
+//#define ENABLE_DEBUG
+
+#define NR_ROWS_LOG 15
+#define NR_SLOTS 120
+#define LOCAL_WORK_SIZE 256
+#define THREADS_PER_ROW 256
+#define LOCAL_WORK_SIZE_SOLS 64
+#define THREADS_PER_ROW_SOLS 64
+#define GLOBAL_WORK_SIZE_RATIO 512
+#define SLOT_CACHE_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE/THREADS_PER_ROW) * 75 / 100)
+#define LDS_COLL_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE / THREADS_PER_ROW) * 120 / 100)
+
+#define SLOT_CACHE_INDEX_TYPE uchar
+
+#define PARAM_N 200
+#define PARAM_K 9
+#define PREFIX (PARAM_N / (PARAM_K + 1))
+#define NR_INPUTS (1 << PREFIX)
+// Approximate log base 2 of number of elements in hash tables
+#define APX_NR_ELMS_LOG (PREFIX + 1)
+
+// Setting this to 1 might make Gateless Gate faster, see TROUBLESHOOTING.md
+#define OPTIM_SIMPLIFY_ROUND 1
+
+// Ratio of time of sleeping before rechecking if task is done (0-1)
+#define SLEEP_RECHECK_RATIO 0.60
+// Ratio of time to busy wait for the solution (0-1)
+// The higher the value, the higher the CPU usage with Nvidia
+#define SLEEP_SKIP_RATIO 0.005
+
+// Make hash tables OVERHEAD times larger than necessary to store the average
+// number of elements per row. The ideal value is as small as possible to
+// reduce memory usage, but not too small or else elements are dropped from the
+// hash tables.
+//
+// The actual number of elements per row is closer to the theoretical average
+// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be
+// smaller.
+//
+// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
+// performance as they cause VRAM channel conflicts.
+#if NR_ROWS_LOG <= 16
+#define OVERHEAD 2
+#elif NR_ROWS_LOG == 17
+#define OVERHEAD 3
+#elif NR_ROWS_LOG == 18
+#define OVERHEAD 3
+#elif NR_ROWS_LOG == 19
+#define OVERHEAD 5
+#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND
+#define OVERHEAD 6
+#elif NR_ROWS_LOG == 20
+#define OVERHEAD 9
+#endif
+
+#define NR_ROWS (1 << NR_ROWS_LOG)
+#ifndef NR_SLOTS
+#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD))
+#endif
+// Length of 1 element (slot) in byte
+#define SLOT_LEN 32
+// Total size of hash table
+#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
+// Length of Zcash block header, nonce (part of header)
+#define ZCASH_BLOCK_HEADER_LEN 140
+// Offset of nTime in header
+#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
+// Length of nonce
+#define ZCASH_NONCE_LEN 32
+// Length of encoded representation of solution size
+#define ZCASH_SOLSIZE_LEN 3
+// Solution size (1344 = 0x540) represented as a compact integer, in hex
+#define ZCASH_SOLSIZE_HEX "fd4005"
+// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
+#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
+// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
+#define N_ZERO_BYTES 12
+// Number of bytes Zcash needs out of Blake
+#define ZCASH_HASH_LEN 50
+// Number of wavefronts per SIMD for the Blake kernel.
+// Blake is ALU-bound (beside the atomic counter being incremented) so we need
+// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
+// instructions. 10 is the max supported by the hw.
+#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 10 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot. +*/ +#define xi_offset_for_round(round) 4 + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN / 2) +#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O1" +#define OPENCL_BUILD_OPTIONS "-I.. -I." 
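+
+// Worked example of the reference encoding above: with this file's
+// NR_ROWS_LOG = 15 and NR_SLOTS = 120 the first branch applies, so
+// ENCODE_INPUTS(0x2af3, 5, 9) = (0x2af3 << 16) | (9 << 8) | 5 = 0x2af30905,
+// and DECODE_ROW / DECODE_SLOT1 / DECODE_SLOT0 recover 0x2af3, 9 and 5 by
+// shifting and masking the packed reference.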
+ +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +typedef union { + struct { + uint i; + uint xi[6]; + uint padding; + } slot; + uint8 ui8; + uint4 ui4[2]; +} slot_t; + +/* +** The new hash table has this layout (length in bytes in parens): +** +** round 0, table 0: i(4) pad(0) Xi(24) pad(4) +** round 1, table 1: i(4) pad(3) Xi(20) pad(5) +** round 2, table 2: i(4) pad(0) Xi(19) pad(9) +** round 3, table 3: i(4) pad(3) Xi(15) pad(10) +** round 4, table 4: i(4) pad(0) Xi(14) pad(14) +** round 5, table 5: i(4) pad(3) Xi(10) pad(15) +** round 6, table 6: i(4) pad(0) Xi( 9) pad(19) +** round 7, table 7: i(4) pad(3) Xi( 5) pad(20) +** round 8, table 8: i(4) pad(0) Xi( 4) pad(24) +** +*/ + +__constant ulong blake_iv_const[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +/* +** Reset counters in hash table. +*/ +__kernel +void kernel_init_ht(__global char *ht, __global uint *rowCounters) +{ + rowCounters[get_global_id(0)] = 0; +} + +/* +** OBSOLETE +** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they +** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): +** aa aa ab bb bb cc cc cd dd... [round 0] +** -------------------- +** ...ab bb bb cc cc cd dd... [odd round] +** -------------- +** ...cc cc cd dd... [next even round] +** ----- +** Bytes underlined are going to be stored in the slot. Preceding bytes +** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are +** used to compute the row number. +** +** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) +** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble) +** TODO: update lines below with padding nibbles +** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) +** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) +** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) +** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) +** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) +** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) +** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) +** +** Return 0 if successfully stored, or 1 if the row overflowed. 
+*/ + +__global char *get_slot_ptr(__global char *ht, uint round, uint row, uint slot) +{ +#if 1 + return ht + (row * NR_SLOTS + slot) * ADJUSTED_SLOT_LEN(round); +#else + return ht + (slot * NR_ROWS + row) * ADJUSTED_SLOT_LEN(round); +#endif +} + +__global char *get_xi_ptr(__global char *ht, uint round, uint row, uint slot) +{ + return get_slot_ptr(ht, round, row, slot) + xi_offset_for_round(round); +} + +void get_row_counters_index(uint *rowIdx, uint *rowOffset, uint row) +{ + *rowIdx = row / ROWS_PER_UINT; + *rowOffset = BITS_PER_ROW * (row % ROWS_PER_UINT); +} + +uint get_row(uint round, uint xi0) +{ + uint row; +#if NR_ROWS_LOG == 14 + if (!(round % 2)) + row = (xi0 & 0x3fff); + else + row = ((xi0 & 0x3f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 15 + if (!(round % 2)) + row = (xi0 & 0x7fff); + else + row = ((xi0 & 0x7f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 16 + if (!(round % 2)) + row = (xi0 & 0xffff); + else + row = ((xi0 & 0xff0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#else +#error "unsupported NR_ROWS_LOG" +#endif + return row; +} + +uint inc_row_counter(__global uint *rowCounters, uint row) +{ + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, row); + uint cnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); + cnt = (cnt >> rowOffset) & ROW_MASK; + if (cnt >= NR_SLOTS) { + // avoid overflows + atomic_sub(rowCounters + rowIdx, 1 << rowOffset); + } + return cnt; +} + +uint ht_store(uint round, __global char *ht, uint i, + uint xi0, uint xi1, uint xi2, uint xi3, uint xi4, uint xi5, uint xi6, __global uint *rowCounters) +{ + uint row = get_row(round, xi0); + uint cnt = inc_row_counter(rowCounters, row); + if (cnt >= NR_SLOTS) + return 0; + __global char *p = get_slot_ptr(ht, round, row, cnt); + slot_t slot; + slot.slot.i = i; + slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8)); + slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8)); + slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8)); + slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8)); + slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8)); + slot.slot.xi[5] = ((xi6 << 24) | (xi5 >> 8)); + if (round <= 5) { + *(__global uint8 *)p = slot.ui8; + } + else { + *(__global uint4 *)p = slot.ui4[0]; + } + return 0; +} + +#define mix(va, vb, vc, vd, x, y) \ + va = (va + vb + x); \ +vd = rotate((vd ^ va), (ulong)64 - 32); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 24); \ +va = (va + vb + y); \ +vd = rotate((vd ^ va), (ulong)64 - 16); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 63); + +/* +** Execute round 0 (blake). 
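+** Each work item hashes NR_INPUTS / get_global_size(0) consecutive input
+** indexes, storing two Xi values per index via ht_store().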
+** +** Note: making the work group size less than or equal to the wavefront size +** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local +** Memory (LDS) Optimization 2-10" in: +** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ +*/ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) +void kernel_round0(__constant ulong *blake_state_const, __global char *ht, + __global uint *rowCounters, __global uint *debug) +{ + __local ulong blake_state[64]; + __local ulong blake_iv[8]; + uint tid = get_global_id(0); + ulong v[16]; + uint inputs_per_thread = NR_INPUTS / get_global_size(0); + uint input = tid * inputs_per_thread; + uint input_end = (tid + 1) * inputs_per_thread; + uint dropped = 0; + if (get_local_id(0) < 64) + blake_state[get_local_id(0)] = blake_state_const[get_local_id(0)]; + if (get_local_id(0) < 8) + blake_iv[get_local_id(0)] = blake_iv_const[get_local_id(0)]; + barrier(CLK_LOCAL_MEM_FENCE); + while (input < input_end) { + // shift "i" to occupy the high 32 bits of the second ulong word in the + // message block + ulong word1 = (ulong)input << 32; + // init vector v + v[0] = blake_state[0]; + v[1] = blake_state[1]; + v[2] = blake_state[2]; + v[3] = blake_state[3]; + v[4] = blake_state[4]; + v[5] = blake_state[5]; + v[6] = blake_state[6]; + v[7] = blake_state[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // mix in length of data + v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + + // round 1 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 2 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 3 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, word1); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 4 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, word1); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 5 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, word1); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 6 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + 
mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], word1, 0); + // round 7 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], word1, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 8 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, word1); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 9 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], word1, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 10 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], word1, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 11 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 12 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + + // compress v into the blake state; this produces the 50-byte hash + // (two Xi values) + ulong h[7]; + h[0] = blake_state[0] ^ v[0] ^ v[8]; + h[1] = blake_state[1] ^ v[1] ^ v[9]; + h[2] = blake_state[2] ^ v[2] ^ v[10]; + h[3] = blake_state[3] ^ v[3] ^ v[11]; + h[4] = blake_state[4] ^ v[4] ^ v[12]; + h[5] = blake_state[5] ^ v[5] ^ v[13]; + h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; + + // store the two Xi values in the hash table +#if ZCASH_HASH_LEN == 50 + dropped += ht_store(0, ht, input * 2, + h[0] & 0xffffffff, h[0] >> 32, + h[1] & 0xffffffff, h[1] >> 32, + h[2] & 0xffffffff, h[2] >> 32, + h[3] & 0xffffffff, + rowCounters); + dropped += ht_store(0, ht, input * 2 + 1, + ((h[3] >> 8) | (h[4] << (64 - 8))) & 0xffffffff, + ((h[3] >> 8) | (h[4] << (64 - 8))) >> 32, + ((h[4] >> 8) | (h[5] << (64 - 8))) & 0xffffffff, + ((h[4] >> 8) | (h[5] << (64 - 8))) >> 32, + ((h[5] >> 8) | (h[6] << (64 - 8))) & 0xffffffff, + ((h[5] >> 8) | (h[6] << (64 - 8))) >> 32, + (h[6] >> 8) & 0xffffffff, + rowCounters); +#else +#error "unsupported ZCASH_HASH_LEN" +#endif + + input++; + } +#ifdef ENABLE_DEBUG + debug[tid * 2] = 0; + debug[tid * 2 + 1] = dropped; +#endif +} + +/* +** XOR a pair of Xi values computed at "round - 1" and store the result in the +** hash table being built for "round". 
Note that when building the table for +** even rounds we need to skip 1 padding byte present in the "round - 1" table +** (the "0xAB" byte mentioned in the description at the top of this file.) But +** also note we can't load data directly past this byte because this would +** cause an unaligned memory access which is undefined per the OpenCL spec. +** +** Return 0 if successfully stored, or 1 if the row overflowed. +*/ +uint xor_and_store(uint round, __global char *ht_dst, uint row, + uint slot_a, uint slot_b, __local uint *ai, __local uint *bi, + __global uint *rowCounters) +{ + ulong xi0, xi1, xi2, xi3, xi4, xi5; +#if NR_ROWS_LOG >= 8 && NR_ROWS_LOG <= 20 + // xor 24 bytes + xi0 = *(ai++); + xi1 = *(ai++); + if (round <= 7) xi2 = *(ai++); + if (round <= 6) xi3 = *(ai++); + if (round <= 4) xi4 = *(ai++); + if (round <= 2) xi5 = *ai; + + xi0 ^= *(bi++); + xi1 ^= *(bi++); + if (round <= 7) xi2 ^= *(bi++); + if (round <= 6) xi3 ^= *(bi++); + if (round <= 4) xi4 ^= *(bi++); + if (round <= 2) xi5 ^= *bi; + + if (!(round & 0x1)) { + // skip padding bytes + xi0 = (xi0 >> 24) | (xi1 << (32 - 24)); + xi1 = (xi1 >> 24) | (xi2 << (32 - 24)); + if (round <= 7) xi2 = (xi2 >> 24) | (xi3 << (32 - 24)); + if (round <= 6) xi3 = (xi3 >> 24) | (xi4 << (32 - 24)); + if (round <= 4) xi4 = (xi4 >> 24) | (xi5 << (32 - 24)); + if (round <= 2) xi5 = (xi5 >> 24); + } + + // invalid solutions (which start happenning in round 5) have duplicate + // inputs and xor to zero, so discard them + if (!xi0 && !xi1) + return 0; +#else +#error "unsupported NR_ROWS_LOG" +#endif + return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), xi0, xi1, xi2, xi3, xi4, xi5, 0, rowCounters); +} + +/* +** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, +** store them in ht_dst. +*/ + +#define UINTS_IN_XI(round) (((round) == 0) ? 6 : \ + ((round) == 1) ? 6 : \ + ((round) == 2) ? 5 : \ + ((round) == 3) ? 5 : \ + ((round) == 4) ? 4 : \ + ((round) == 5) ? 4 : \ + ((round) == 6) ? 3 : \ + ((round) == 7) ? 2 : \ + 1) + +#define RESERVED_FOR_XI(round) (((round) == 0) ? 6 : \ + ((round) == 1) ? 6 : \ + ((round) == 2) ? 6 : \ + ((round) == 3) ? 6 : \ + ((round) == 4) ? 4 : \ + ((round) == 5) ? 4 : \ + ((round) == 6) ? 4 : \ + ((round) == 7) ? 2 : \ + 2) + +void equihash_round(uint round, + __global char *ht_src, + __global char *ht_dst, + __global uint *debug, + __local uint *slot_cache, + __local uint *slot_cache_counter, + __local SLOT_CACHE_INDEX_TYPE *slot_cache_indexes, + __local uint *collisionsData, + __local uint *collisionsNum, + __global uint *rowCountersSrc, + __global uint *rowCountersDst, + uint threadsPerRow, + __local uint *nr_slots_array, + __local uchar *bins_data, + __local uint *bin_counters_data) +{ + uint globalTid = get_global_id(0) / threadsPerRow; + uint localTid = get_local_id(0) / threadsPerRow; + uint localGroupId = get_local_id(0) % threadsPerRow; + + __global char *p; + uint cnt; + uint i, j; + uint dropped_coll = 0; + uint dropped_stor = 0; + __local uint *a, *b; + // the mask is also computed to read data from the previous round +#define BIN_MASK(round) ((((round) + 1) % 2) ? 0xf000 : 0xf0000) +#define BIN_MASK_OFFSET(round) ((((round) + 1) % 2) ? 3 * 4 : 4 * 4) +#if NR_ROWS_LOG == 14 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x00c0 : 0xc000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 2 : 10) +#elif NR_ROWS_LOG == 15 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x0080 : 0x8000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 
3 : 11) +#elif NR_ROWS_LOG == 16 +#define BIN_MASK2(round) 0 +#define BIN_MASK2_OFFSET(round) 0 +#else +#error "unsupported NR_ROWS_LOG" +#endif +#define NR_BINS (64 >> (NR_ROWS_LOG - 14)) + __local uchar *bins = &bins_data[localTid * NR_SLOTS * NR_BINS]; + __local uint *bin_counters = &bin_counters_data[localTid * NR_BINS]; + + uint rows_per_work_item = (NR_ROWS + get_global_size(0) / threadsPerRow - 1) / (get_global_size(0) / threadsPerRow); + uint rows_per_chunk = get_global_size(0) / threadsPerRow; + + for (uint chunk = 0; chunk < rows_per_work_item; chunk++) { + uint tid = globalTid + rows_per_chunk * chunk; + uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); + + if (tid < NR_ROWS) { + if (!get_local_id(0)) { + *collisionsNum = 0; + *slot_cache_counter = 0; + } + for (i = localGroupId; i < NR_BINS; i += threadsPerRow) + bin_counters[i] = 0; + if (localGroupId == 0) { + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, tid); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + nr_slots_array[localTid] = cnt; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (localGroupId) + cnt = nr_slots_array[localTid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Perform a radix sort as slots get loaded into LDS. + if (tid < NR_ROWS) { + for (i = localGroupId; i < cnt; i += threadsPerRow) { + uint xi_first_bytes = *(__global uint *)get_xi_ptr(ht_src, round - 1, tid, i); + + uint bin_to_use = + ((xi_first_bytes & BIN_MASK(round - 1)) >> BIN_MASK_OFFSET(round - 1)) + | ((xi_first_bytes & BIN_MASK2(round - 1)) >> BIN_MASK2_OFFSET(round - 1)); + uint bin_counter_copy = atomic_inc(&bin_counters[bin_to_use]); + bins[bin_to_use * NR_SLOTS + bin_counter_copy] = i; + + if (bin_counter_copy) { + uint slot_cache_counter_copy = atomic_inc(slot_cache_counter); + if (slot_cache_counter_copy >= SLOT_CACHE_SIZE) { + atomic_dec(slot_cache_counter); + ++dropped_coll; + slot_cache_indexes[localTid * NR_SLOTS + i] = SLOT_CACHE_SIZE; + } + else { + slot_cache[slot_cache_counter_copy * RESERVED_FOR_XI(round - 1)] = xi_first_bytes; + for (j = 1; j < UINTS_IN_XI(round - 1); ++j) + slot_cache[slot_cache_counter_copy * RESERVED_FOR_XI(round - 1) + j] = *((__global uint *)get_xi_ptr(ht_src, round - 1, tid, i) + j); + slot_cache_indexes[localTid * NR_SLOTS + i] = slot_cache_counter_copy; + } + + if (bin_counter_copy == 1) { + slot_cache_counter_copy = atomic_inc(slot_cache_counter); + uint first_slot_index = bins[bin_to_use * NR_SLOTS]; + if (slot_cache_counter_copy >= SLOT_CACHE_SIZE) { + atomic_dec(slot_cache_counter); + ++dropped_coll; + slot_cache_indexes[localTid * NR_SLOTS + first_slot_index] = SLOT_CACHE_SIZE; + } + else { + for (j = 0; j < UINTS_IN_XI(round - 1); ++j) + slot_cache[slot_cache_counter_copy * RESERVED_FOR_XI(round - 1) + j] = *((__global uint *)get_xi_ptr(ht_src, round - 1, tid, first_slot_index) + j); + slot_cache_indexes[localTid * NR_SLOTS + first_slot_index] = slot_cache_counter_copy; + } + } + } + + for (j = 0; j < bin_counter_copy; ++j) { + uint index = atomic_inc(collisionsNum); + if (index >= LDS_COLL_SIZE) { + atomic_dec(collisionsNum); + ++dropped_coll; + } + else { + collisionsData[index] = (localTid << 24) | (i << 12) | bins[bin_to_use * NR_SLOTS + j]; + } + } + } + } + + part2: + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + uint totalCollisions = *collisionsNum; + for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0)) { + 
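+ // each collision record packs three fields: bits 31..24 hold the row's
+ // local thread id, bits 23..12 the slot index i, and bits 11..0 the
+ // colliding slot index j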
uint collision = collisionsData[index]; + uint collisionLocalThreadId = collision >> 24; + uint collisionThreadId = gid + collisionLocalThreadId; + uint i = (collision >> 12) & 0xfff; + uint j = collision & 0xfff; + uint slot_cache_index_i = slot_cache_indexes[collisionLocalThreadId * NR_SLOTS + i]; + if (slot_cache_index_i >= SLOT_CACHE_SIZE) + continue; + uint slot_cache_index_j = slot_cache_indexes[collisionLocalThreadId * NR_SLOTS + j]; + if (slot_cache_index_j >= SLOT_CACHE_SIZE) + continue; + a = (__local uint *)&slot_cache[slot_cache_index_i * RESERVED_FOR_XI(round - 1)]; + b = (__local uint *)&slot_cache[slot_cache_index_j * RESERVED_FOR_XI(round - 1)]; + dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j, a, b, rowCountersDst); + } + } + } + +#ifdef ENABLE_DEBUG + uint tid = get_global_id(0); + debug[tid * 2] = dropped_coll; + debug[tid * 2 + 1] = dropped_stor; +#endif +} + +/* +** This defines kernel_round1, kernel_round2, ..., kernel_round7. +*/ +#define KERNEL_ROUND(N) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) \ +void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ + __global uint *rowCountersSrc, __global uint *rowCountersDst, \ + __global uint *debug) \ +{ \ + __local uint slot_cache[RESERVED_FOR_XI(N - 1) * SLOT_CACHE_SIZE]; \ + __local uint slot_cache_counter; \ + __local SLOT_CACHE_INDEX_TYPE slot_cache_indexes[NR_SLOTS * (LOCAL_WORK_SIZE/THREADS_PER_ROW)]; \ + __local uint collisionsData[LDS_COLL_SIZE]; \ + __local uint collisionsNum; \ + __local uint nr_slots_array[LOCAL_WORK_SIZE / THREADS_PER_ROW]; \ + __local uchar bins_data[(LOCAL_WORK_SIZE / THREADS_PER_ROW) * NR_SLOTS * NR_BINS]; \ + __local uint bin_counters_data[(LOCAL_WORK_SIZE / THREADS_PER_ROW) * NR_BINS]; \ + equihash_round(N, ht_src, ht_dst, debug, slot_cache, &slot_cache_counter, slot_cache_indexes, collisionsData, \ + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW, nr_slots_array, bins_data, bin_counters_data); \ +} +KERNEL_ROUND(1) +KERNEL_ROUND(2) +KERNEL_ROUND(3) +KERNEL_ROUND(4) +KERNEL_ROUND(5) +KERNEL_ROUND(6) +KERNEL_ROUND(7) +KERNEL_ROUND(8) + +uint expand_ref(__global char *ht, uint round, uint row, uint slot) +{ + return ((__global slot_t *)get_slot_ptr(ht, round, row, slot))->slot.i; +} + +/* +** Expand references to inputs. Return 1 if so far the solution appears valid, +** or 0 otherwise (an invalid solution would be a solution with duplicate +** inputs, which can be detected at the last step: round == 0). +*/ +uint expand_refs(__local uint *ins, uint nr_inputs, __global char **htabs, + uint round) +{ + __global char *ht = htabs[round]; + uint i = nr_inputs - 1; + uint j = nr_inputs * 2 - 1; + int dup_to_watch = -1; + do { + ins[j] = expand_ref(ht, round, + DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); + ins[j - 1] = expand_ref(ht, round, + DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); + if (!round) { + if (dup_to_watch == -1) + dup_to_watch = ins[j]; + else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) + return 0; + } + if (!i) + break; + i--; + j -= 2; + } while (1); + return 1; +} + +/* +** Verify if a potential solution is in fact valid. 
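+** Starting from the two final-round references, each expand_refs() pass
+** doubles nr_values, so after eight passes the candidate holds
+** 1 << PARAM_K = 512 leaf inputs; a duplicate input detected at round 0
+** invalidates it.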
+*/ +void potential_sol(__global char **htabs, __global sols_t *sols, + uint ref0, uint ref1, __local uint *values_tmp) +{ + uint nr_values; + uint sol_i; + uint i; + nr_values = 0; + values_tmp[nr_values++] = ref0; + values_tmp[nr_values++] = ref1; + uint round = PARAM_K - 1; + do { + round--; + if (!expand_refs(values_tmp, nr_values, htabs, round)) + return; + nr_values *= 2; + } while (round > 0); + // solution appears valid, copy it to sols + sol_i = atomic_inc(&sols->nr); + if (sol_i >= MAX_SOLS) + return; + for (i = 0; i < (1 << PARAM_K); i++) + sols->values[sol_i][i] = values_tmp[i]; + sols->valid[sol_i] = 1; +} + +/* +** Scan the hash tables to find Equihash solutions. +*/ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_SOLS, 1, 1))) +void kernel_sols(__global char *ht0, + __global char *ht1, + __global char *ht2, + __global char *ht3, + __global char *ht4, + __global char *ht5, + __global char *ht6, + __global char *ht7, + __global char *ht8, + __global sols_t *sols, + __global uint *rowCountersSrc) +{ + __local uint refs[NR_SLOTS*(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS)]; + __local uint data[NR_SLOTS*(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS)]; + __local uint values_tmp[(1 << PARAM_K)]; + __local uint semaphoe; + + uint globalTid = get_global_id(0) / THREADS_PER_ROW_SOLS; + uint localTid = get_local_id(0) / THREADS_PER_ROW_SOLS; + uint localGroupId = get_local_id(0) % THREADS_PER_ROW_SOLS; + __local uint *refsPtr = &refs[NR_SLOTS*localTid]; + __local uint *dataPtr = &data[NR_SLOTS*localTid]; + + __global char *htabs[] = { ht0, ht1, ht2, ht3, ht4, ht5, ht6, ht7, ht8 }; + uint ht_i = (PARAM_K - 1); // table filled at last round + uint cnt; + uint i, j; + __global char *p; + uint ref_i, ref_j; + __local uchar bins_data[(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) * NR_SLOTS * NR_BINS]; + __local uint bin_counters_data[(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) * NR_BINS]; + __local uchar *bins = &bins_data[localTid * NR_SLOTS * NR_BINS]; + __local uint *bin_counters = &bin_counters_data[localTid * NR_BINS]; + + if (!get_global_id(0)) + sols->nr = sols->likely_invalids = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + uint rows_per_work_item = (NR_ROWS + get_global_size(0) / THREADS_PER_ROW_SOLS - 1) / (get_global_size(0) / THREADS_PER_ROW_SOLS); + uint rows_per_chunk = get_global_size(0) / THREADS_PER_ROW_SOLS; + + for (uint chunk = 0; chunk < rows_per_work_item; chunk++) { + uint tid = globalTid + rows_per_chunk * chunk; + uint gid = tid & ~(get_local_size(0) / THREADS_PER_ROW_SOLS - 1); + + __local uint nr_slots_array[LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS]; + if (tid < NR_ROWS) { + if (!get_local_id(0)) + semaphoe = 0; + for (i = localGroupId; i < NR_BINS; i += THREADS_PER_ROW_SOLS) + bin_counters[i] = 0; + if (localGroupId == 0) { + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, tid); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + nr_slots_array[localTid] = cnt; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (localGroupId) + cnt = nr_slots_array[localTid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
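+ // (with only 4 bytes of Xi left after round 8, comparing the full first
+ // 32-bit word of each slot covers both groups of bits at once)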
+ __local ulong coll; + if (tid < NR_ROWS) { + for (i = localGroupId; i < cnt && !semaphoe; i += THREADS_PER_ROW_SOLS) { + p = get_slot_ptr(htabs[ht_i], PARAM_K - 1, tid, i); + refsPtr[i] = ((__global slot_t *)p)->slot.i; + uint xi_first_bytes = dataPtr[i] = ((__global slot_t *)p)->slot.xi[0]; + uint bin_to_use = + ((xi_first_bytes & BIN_MASK(PARAM_K - 1)) >> BIN_MASK_OFFSET(PARAM_K - 1)) + | ((xi_first_bytes & BIN_MASK2(PARAM_K - 1)) >> BIN_MASK2_OFFSET(PARAM_K - 1)); + uint bin_counter_copy = atomic_inc(&bin_counters[bin_to_use]); + bins[bin_to_use * NR_SLOTS + bin_counter_copy] = i; + if (bin_counter_copy) { + for (j = 0; j < bin_counter_copy && !semaphoe; ++j) { + uint slot_index_j = bins[bin_to_use * NR_SLOTS + j]; + if (xi_first_bytes == dataPtr[slot_index_j]) { + if (atomic_inc(&semaphoe) == 0) + coll = ((ulong)refsPtr[i] << 32) | refsPtr[slot_index_j]; + } + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (get_local_id(0) == 0 && semaphoe) + potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff, values_tmp); + } + } +} + diff --git a/contrib/ocl/kernels/silentarmy.cl b/contrib/ocl/kernels/silentarmy.cl new file mode 100644 index 000000000..8ba3d6283 --- /dev/null +++ b/contrib/ocl/kernels/silentarmy.cl @@ -0,0 +1,946 @@ +#define THRD 64 +#define PARAM_N 200 +#define PARAM_K 9 +#define PREFIX (PARAM_N / (PARAM_K + 1)) +#define NR_INPUTS (1 << PREFIX) +// Approximate log base 2 of number of elements in hash tables +#define APX_NR_ELMS_LOG (PREFIX + 1) +// Number of rows and slots is affected by this. 20 offers the best performance +// but occasionally misses ~1% of solutions. +#define NR_ROWS_LOG 20 + +// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md +#define OPTIM_SIMPLIFY_ROUND 1 + +// Number of collision items to track, per thread +#ifdef cl_nv_pragma_unroll // NVIDIA +#define THREADS_PER_ROW 16 +#define LDS_COLL_SIZE (NR_SLOTS * 24 * (THRD / THREADS_PER_ROW)) +#else +#define THREADS_PER_ROW 8 +#define LDS_COLL_SIZE (NR_SLOTS * 8 * (THRD / THREADS_PER_ROW)) +#endif + +// Ratio of time of sleeping before rechecking if task is done (0-1) +#define SLEEP_RECHECK_RATIO 0.60 +// Ratio of time to busy wait for the solution (0-1) +// The higher value the higher CPU usage with Nvidia +#define SLEEP_SKIP_RATIO 0.005 + +// Make hash tables OVERHEAD times larger than necessary to store the average +// number of elements per row. The ideal value is as small as possible to +// reduce memory usage, but not too small or else elements are dropped from the +// hash tables. +// +// The actual number of elements per row is closer to the theoretical average +// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be +// smaller. +// +// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease +// performance as they cause VRAM channel conflicts. 
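+//
+// Example with the defaults above (NR_ROWS_LOG = 20, OPTIM_SIMPLIFY_ROUND = 1,
+// hence OVERHEAD = 6 below): round 0 stores 2 * NR_INPUTS = 2^21 elements into
+// 2^20 rows, i.e. 2 per row on average, and NR_SLOTS becomes
+// (1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD = 2 * 6 = 12 slots per row.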
+#if NR_ROWS_LOG == 16
+// #error "NR_ROWS_LOG = 16 is currently broken - do not use"
+#define OVERHEAD 2
+#elif NR_ROWS_LOG == 18
+#define OVERHEAD 3
+#elif NR_ROWS_LOG == 19
+#define OVERHEAD 5
+#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND
+#define OVERHEAD 6
+#elif NR_ROWS_LOG == 20
+#define OVERHEAD 9
+#endif
+
+#define NR_ROWS (1 << NR_ROWS_LOG)
+#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD))
+// Length of 1 element (slot) in bytes
+#define SLOT_LEN 32
+// Total size of hash table
+#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
+// Length of Zcash block header, nonce (part of header)
+#define ZCASH_BLOCK_HEADER_LEN 140
+// Offset of nTime in header
+#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
+// Length of nonce
+#define ZCASH_NONCE_LEN 32
+// Length of encoded representation of solution size
+#define ZCASH_SOLSIZE_LEN 3
+// Solution size (1344 = 0x540) represented as a compact integer, in hex
+#define ZCASH_SOLSIZE_HEX "fd4005"
+// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
+#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
+// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
+#define N_ZERO_BYTES 12
+// Number of bytes Zcash needs out of Blake
+#define ZCASH_HASH_LEN 50
+// Number of wavefronts per SIMD for the Blake kernel.
+// Blake is ALU-bound (beside the atomic counter being incremented) so we need
+// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
+// instructions. 10 is the max supported by the hw.
+#define BLAKE_WPS 10
+// Maximum number of solutions reported by kernel to host
+#define MAX_SOLS 10
+// Length of SHA256 target
+#define SHA256_TARGET_LEN (256 / 8)
+
+#if (NR_SLOTS < 16)
+#define BITS_PER_ROW 4
+#define ROWS_PER_UINT 8
+#define ROW_MASK 0x0F
+#else
+#define BITS_PER_ROW 8
+#define ROWS_PER_UINT 4
+#define ROW_MASK 0xFF
+#endif
+
+// Optional features
+#undef ENABLE_DEBUG
+
+/*
+** Return the offset of Xi in bytes from the beginning of the slot.
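+** For example xi_offset_for_round(0) = 8, (2) = 12 and (8) = 24: the offset
+** grows by 4 bytes every two rounds, matching the extra input reference
+** prepended to the slot (see the layout table below).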
+*/ +#define xi_offset_for_round(round) (8 + ((round) / 2) * 4) + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +/* +** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in +** bytes in parens): +** +** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1) +** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3) +** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2) +** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4) +** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3) +** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5) +** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4) +** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6) +** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5) +** +** If the first byte of Xi is 0xAB then: +** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi +** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but +** 'A' is considered redundant padding as it was used to compute the row # +** +** - cnt is an atomic counter keeping track of the number of used slots. +** it is used in the first slot only; subsequent slots replace it with +** 4 padding bytes +** - i encodes either the 21-bit input value (round 0) or a reference to two +** inputs from the previous round +** +** Formula for Xi length and pad length above: +** > for i in range(9): +** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi +** +** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds +** is the 4 most significant bits of the last byte of Xi. +*/ + +__constant ulong blake_iv[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +/* +** Reset counters in hash table. +*/ +__kernel +void kernel_init_ht(__global char *ht, __global uint *rowCounters) +{ + rowCounters[get_global_id(0)] = 0; +} + +/* +** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they +** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): +** aa aa ab bb bb cc cc cd dd... [round 0] +** -------------------- +** ...ab bb bb cc cc cd dd... [odd round] +** -------------- +** ...cc cc cd dd... [next even round] +** ----- +** Bytes underlined are going to be stored in the slot. Preceding bytes +** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are +** used to compute the row number. +** +** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) +** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. 
the colliding PREFIX nibble) +** TODO: update lines below with padding nibbles +** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) +** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) +** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) +** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) +** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) +** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) +** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) +** +** Return 0 if successfully stored, or 1 if the row overflowed. +*/ +uint ht_store(uint round, __global char *ht, uint i, + ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters) +{ + uint row; + __global char *p; + uint cnt; +#if NR_ROWS_LOG == 16 + if (!(round % 2)) + row = (xi0 & 0xffff); + else + // if we have in hex: "ab cd ef..." (little endian xi0) then this + // formula computes the row as 0xdebc. it skips the 'a' nibble as it + // is part of the PREFIX. The Xi will be stored starting with "ef..."; + // 'e' will be considered padding and 'f' is part of the current PREFIX + row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 18 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6); + else + row = ((xi0 & 0xc0000) >> 2) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 19 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5); + else + row = ((xi0 & 0xe0000) >> 1) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 20 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); + else + row = ((xi0 & 0xf0000) >> 0) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#else +#error "unsupported NR_ROWS_LOG" +#endif + xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); + xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); + xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); + p = ht + row * NR_SLOTS * SLOT_LEN; + uint rowIdx = row / ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT); + uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); + xcnt = (xcnt >> rowOffset) & ROW_MASK; + cnt = xcnt; + if (cnt >= NR_SLOTS) + { + // avoid overflows + atomic_sub(rowCounters + rowIdx, 1 << rowOffset); + return 1; + } + p += cnt * SLOT_LEN + xi_offset_for_round(round); + // store "i" (always 4 bytes before Xi) + *(__global uint *)(p - 4) = i; + if (round == 0 || round == 1) + { + // store 24 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global ulong *)(p + 8) = xi1; + *(__global ulong *)(p + 16) = xi2; + } + else if (round == 2) + { + // store 20 bytes + *(__global uint *)(p + 0) = xi0; + *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); + *(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32); + } + else if (round == 3) + { + // store 16 bytes + *(__global uint *)(p + 0) = xi0; + *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); + *(__global uint *)(p + 12) = (xi1 >> 32); + } + else if (round == 4) + { + // store 16 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global ulong *)(p + 8) = xi1; + } + else if (round == 5) + { + // store 12 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global uint *)(p + 8) = xi1; + } + else if (round == 6 || round == 7) + { + // store 8 bytes + *(__global 
uint *)(p + 0) = xi0; + *(__global uint *)(p + 4) = (xi0 >> 32); + } + else if (round == 8) + { + // store 4 bytes + *(__global uint *)(p + 0) = xi0; + } + return 0; +} + +#define mix(va, vb, vc, vd, x, y) \ + va = (va + vb + x); \ +vd = rotate((vd ^ va), (ulong)64 - 32); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 24); \ +va = (va + vb + y); \ +vd = rotate((vd ^ va), (ulong)64 - 16); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 63); + +/* +** Execute round 0 (blake). +** +** Note: making the work group size less than or equal to the wavefront size +** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local +** Memory (LDS) Optimization 2-10" in: +** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ +*/ +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void kernel_round0(__global ulong *blake_state, __global char *ht, + __global uint *rowCounters, __global uint *debug) +{ + uint tid = get_global_id(0); + ulong v[16]; + uint inputs_per_thread = NR_INPUTS / get_global_size(0); + uint input = tid * inputs_per_thread; + uint input_end = (tid + 1) * inputs_per_thread; + uint dropped = 0; + while (input < input_end) + { + // shift "i" to occupy the high 32 bits of the second ulong word in the + // message block + ulong word1 = (ulong)input << 32; + // init vector v + v[0] = blake_state[0]; + v[1] = blake_state[1]; + v[2] = blake_state[2]; + v[3] = blake_state[3]; + v[4] = blake_state[4]; + v[5] = blake_state[5]; + v[6] = blake_state[6]; + v[7] = blake_state[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // mix in length of data + v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + + // round 1 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 2 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 3 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, word1); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 4 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, word1); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 5 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, word1); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 6 + 
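+ // (the only nonzero message word is word1 = (ulong)input << 32, so per
+ // the BLAKE2b sigma schedule it feeds exactly one mix() per round --
+ // here the x argument of the final mix -- and every other slot stays 0)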
mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], word1, 0); + // round 7 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], word1, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 8 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, word1); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 9 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], word1, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 10 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], word1, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 11 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 12 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + + // compress v into the blake state; this produces the 50-byte hash + // (two Xi values) + ulong h[7]; + h[0] = blake_state[0] ^ v[0] ^ v[8]; + h[1] = blake_state[1] ^ v[1] ^ v[9]; + h[2] = blake_state[2] ^ v[2] ^ v[10]; + h[3] = blake_state[3] ^ v[3] ^ v[11]; + h[4] = blake_state[4] ^ v[4] ^ v[12]; + h[5] = blake_state[5] ^ v[5] ^ v[13]; + h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; + + // store the two Xi values in the hash table +#if ZCASH_HASH_LEN == 50 + dropped += ht_store(0, ht, input * 2, + h[0], + h[1], + h[2], + h[3], rowCounters); + dropped += ht_store(0, ht, input * 2 + 1, + (h[3] >> 8) | (h[4] << (64 - 8)), + (h[4] >> 8) | (h[5] << (64 - 8)), + (h[5] >> 8) | (h[6] << (64 - 8)), + (h[6] >> 8), rowCounters); +#else +#error "unsupported ZCASH_HASH_LEN" +#endif + + input++; + } +#ifdef ENABLE_DEBUG + debug[tid * 2] = 0; + debug[tid * 2 + 1] = dropped; +#endif +} + +#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 
0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +/* +** Access a half-aligned long, that is a long aligned on a 4-byte boundary. +*/ +ulong half_aligned_long(__global ulong *p, uint offset) +{ + return + (((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) | + (((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32); +} + +/* +** Access a well-aligned int. +*/ +uint well_aligned_int(__global ulong *_p, uint offset) +{ + __global char *p = (__global char *)_p; + return *(__global uint *)(p + offset); +} + +/* +** XOR a pair of Xi values computed at "round - 1" and store the result in the +** hash table being built for "round". Note that when building the table for +** even rounds we need to skip 1 padding byte present in the "round - 1" table +** (the "0xAB" byte mentioned in the description at the top of this file.) But +** also note we can't load data directly past this byte because this would +** cause an unaligned memory access which is undefined per the OpenCL spec. +** +** Return 0 if successfully stored, or 1 if the row overflowed. 
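+** For example at round 2 the code below drops the padding byte with
+** xi0 = (xi0 >> 8) | (xi1 << 56), shifting the whole Xi right by one byte.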
+*/ +uint xor_and_store(uint round, __global char *ht_dst, uint row, + uint slot_a, uint slot_b, __global ulong *a, __global ulong *b, + __global uint *rowCounters) +{ + ulong xi0, xi1, xi2; +#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 + // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not + // storing the byte containing bits from the previous PREFIX block for + if (round == 1 || round == 2) + { + // xor 24 bytes + xi0 = *(a++) ^ *(b++); + xi1 = *(a++) ^ *(b++); + xi2 = *a ^ *b; + if (round == 2) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); + xi2 = (xi2 >> 8); + } + } + else if (round == 3) + { + // xor 20 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); + xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16); + } + else if (round == 4 || round == 5) + { + // xor 16 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); + xi2 = 0; + if (round == 4) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8); + } + } + else if (round == 6) + { + // xor 12 bytes + xi0 = *a++ ^ *b++; + xi1 = *(__global uint *)a ^ *(__global uint *)b; + xi2 = 0; + if (round == 6) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8); + } + } + else if (round == 7 || round == 8) + { + // xor 8 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = 0; + xi2 = 0; + if (round == 8) + { + // skip padding byte + xi0 = (xi0 >> 8); + } + } + // invalid solutions (which start happenning in round 5) have duplicate + // inputs and xor to zero, so discard them + if (!xi0 && !xi1) + return 0; +#else +#error "unsupported NR_ROWS_LOG" +#endif + return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), + xi0, xi1, xi2, 0, rowCounters); +} + +/* +** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, +** store them in ht_dst. +*/ +void equihash_round(uint round, + __global char *ht_src, + __global char *ht_dst, + __global uint *debug, + __local uchar *first_words_data, + __local uint *collisionsData, + __local uint *collisionsNum, + __global uint *rowCountersSrc, + __global uint *rowCountersDst, + uint threadsPerRow) +{ + uint globalTid = get_global_id(0) / threadsPerRow; + uint localTid = get_local_id(0) / threadsPerRow; + uint localGroupId = get_local_id(0) % threadsPerRow; + __local uchar *first_words = &first_words_data[NR_SLOTS*localTid]; + + __global char *p; + uint cnt; + uchar mask; + uint i, j; + // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to + // make it even larger + uint n; + uint dropped_coll = 0; + uint dropped_stor = 0; + __global ulong *a, *b; + uint xi_offset; + // read first words of Xi from the previous (round - 1) hash table + xi_offset = xi_offset_for_round(round - 1); + // the mask is also computed to read data from the previous round +#if NR_ROWS_LOG <= 16 + mask = ((!(round % 2)) ? 0x0f : 0xf0); +#elif NR_ROWS_LOG == 18 + mask = ((!(round % 2)) ? 0x03 : 0x30); +#elif NR_ROWS_LOG == 19 + mask = ((!(round % 2)) ? 
0x01 : 0x10); +#elif NR_ROWS_LOG == 20 + mask = 0; /* we can vastly simplify the code below */ +#else +#error "unsupported NR_ROWS_LOG" +#endif + + for (uint chunk = 0; chunk < threadsPerRow; chunk++) { + uint tid = globalTid + NR_ROWS / threadsPerRow*chunk; + uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); + // for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) { + + uint rowIdx = tid / ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round + + *collisionsNum = 0; + p = (ht_src + tid * NR_SLOTS * SLOT_LEN); + p += xi_offset; + p += SLOT_LEN*localGroupId; + for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow) + first_words[i] = (*(__global uchar *)p) & mask; + barrier(CLK_LOCAL_MEM_FENCE); + + if (cnt == 0) + // no elements in row, no collisions + goto part2; + // find collisions + for (i = 0; i < cnt - 1; i++) + { + uchar data_i = first_words[i]; + uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId); + for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow) + { + if (data_i == first_words[j]) + { + uint index = atomic_inc(collisionsNum); + if (index >= LDS_COLL_SIZE) { + atomic_dec(collisionsNum); + goto part2; + } + collisionsData[index] = collision; + } + collision += threadsPerRow; + } + } + + part2: + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = *collisionsNum; + for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0)) + { + uint collision = collisionsData[index]; + uint collisionThreadId = gid + (collision >> 24); + uint i = (collision >> 12) & 0xFFF; + uint j = collision & 0xFFF; + __global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN + + xi_offset; + a = (__global ulong *)(ptr + i * SLOT_LEN); + b = (__global ulong *)(ptr + j * SLOT_LEN); + dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j, + a, b, rowCountersDst); + } + } + +#ifdef ENABLE_DEBUG + debug[tid * 2] = dropped_coll; + debug[tid * 2 + 1] = dropped_stor; +#endif +} + +/* +** This defines kernel_round1, kernel_round2, ..., kernel_round7. 
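+** (kernel_round8 is defined separately below: it takes the extra "sols"
+** argument and resets the solution counters.)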
+*/ +#define KERNEL_ROUND(N) \ +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) \ +void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ + __global uint *rowCountersSrc, __global uint *rowCountersDst, \ + __global uint *debug) \ +{ \ + __local uchar first_words_data[NR_SLOTS*(64/THREADS_PER_ROW)]; \ + __local uint collisionsData[LDS_COLL_SIZE]; \ + __local uint collisionsNum; \ + equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \ + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \ +} +KERNEL_ROUND(1) +KERNEL_ROUND(2) +KERNEL_ROUND(3) +KERNEL_ROUND(4) +KERNEL_ROUND(5) +KERNEL_ROUND(6) +KERNEL_ROUND(7) + +// kernel_round8 takes an extra argument, "sols" +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void kernel_round8(__global char *ht_src, __global char *ht_dst, + __global uint *rowCountersSrc, __global uint *rowCountersDst, + __global uint *debug, __global sols_t *sols) +{ + uint tid = get_global_id(0); + __local uchar first_words_data[NR_SLOTS*(64 / THREADS_PER_ROW)]; + __local uint collisionsData[LDS_COLL_SIZE]; + __local uint collisionsNum; + equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData, + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); + if (!tid) + sols->nr = sols->likely_invalids = 0; +} + +uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) +{ + return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + + slot * SLOT_LEN + xi_offset - 4); +} + +/* +** Expand references to inputs. Return 1 if so far the solution appears valid, +** or 0 otherwise (an invalid solution would be a solution with duplicate +** inputs, which can be detected at the last step: round == 0). +*/ +uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs, + uint round) +{ + __global char *ht = htabs[round % 2]; + uint i = nr_inputs - 1; + uint j = nr_inputs * 2 - 1; + uint xi_offset = xi_offset_for_round(round); + int dup_to_watch = -1; + do + { + ins[j] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); + ins[j - 1] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); + if (!round) + { + if (dup_to_watch == -1) + dup_to_watch = ins[j]; + else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) + return 0; + } + if (!i) + break; + i--; + j -= 2; + } while (1); + return 1; +} + +/* +** Verify if a potential solution is in fact valid. +*/ +void potential_sol(__global char **htabs, __global sols_t *sols, + uint ref0, uint ref1) +{ + uint nr_values; + uint values_tmp[(1 << PARAM_K)]; + uint sol_i; + uint i; + nr_values = 0; + values_tmp[nr_values++] = ref0; + values_tmp[nr_values++] = ref1; + uint round = PARAM_K - 1; + do + { + round--; + if (!expand_refs(values_tmp, nr_values, htabs, round)) + return; + nr_values *= 2; + } while (round > 0); + // solution appears valid, copy it to sols + sol_i = atomic_inc(&sols->nr); + if (sol_i >= MAX_SOLS) + return; + for (i = 0; i < (1 << PARAM_K); i++) + sols->values[sol_i][i] = values_tmp[i]; + sols->valid[sol_i] = 1; +} + +/* +** Scan the hash tables to find Equihash solutions. 
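+** Colliding pairs found in the last table are handed to potential_sol(),
+** which walks back through the two alternating tables to round 0 via
+** expand_refs().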
+*/ +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols, + __global uint *rowCountersSrc, __global uint *rowCountersDst) +{ + __local uint counters[64 / THREADS_PER_ROW]; + __local uint refs[NR_SLOTS*(64 / THREADS_PER_ROW)]; + __local uint data[NR_SLOTS*(64 / THREADS_PER_ROW)]; + __local uint collisionsNum; + __local ulong collisions[64 * 4]; + + uint globalTid = get_global_id(0) / THREADS_PER_ROW; + uint localTid = get_local_id(0) / THREADS_PER_ROW; + uint localGroupId = get_local_id(0) % THREADS_PER_ROW; + __local uint *refsPtr = &refs[NR_SLOTS*localTid]; + __local uint *dataPtr = &data[NR_SLOTS*localTid]; + + __global char *htabs[2] = { ht0, ht1 }; + __global char *hcounters[2] = { rowCountersSrc, rowCountersDst }; + uint ht_i = (PARAM_K - 1) % 2; // table filled at last round + uint cnt; + uint xi_offset = xi_offset_for_round(PARAM_K - 1); + uint i, j; + __global char *p; + uint ref_i, ref_j; + // it's ok for the collisions array to be so small, as if it fills up + // the potential solutions are likely invalid (many duplicate inputs) + // ulong collisions; +#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. + uint mask = 0xffffff; +#else +#error "unsupported NR_ROWS_LOG" +#endif + + collisionsNum = 0; + + for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) { + uint tid = globalTid + NR_ROWS / THREADS_PER_ROW*chunk; + p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; + uint rowIdx = tid / ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + p += xi_offset; + p += SLOT_LEN*localGroupId; + + for (i = get_local_id(0); i < 64 / THREADS_PER_ROW; i += get_local_size(0)) + counters[i] = 0; + for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) { + refsPtr[i] = *(__global uint *)(p - 4); + dataPtr[i] = (*(__global uint *)p) & mask; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (i = 0; i < cnt; i++) + { + uint a_data = dataPtr[i]; + ref_i = refsPtr[i]; + for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW) + { + if (a_data == dataPtr[j]) + { + if (atomic_inc(&counters[localTid]) == 0) + collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j]; + goto part2; + } + } + } + + part2: + continue; + } + + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = collisionsNum; + if (get_local_id(0) < totalCollisions) { + ulong coll = collisions[get_local_id(0)]; + potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff); + } +} \ No newline at end of file diff --git a/contrib/ocl/opencl.hpp b/contrib/ocl/opencl.hpp new file mode 100644 index 000000000..dcb0e63e9 --- /dev/null +++ b/contrib/ocl/opencl.hpp @@ -0,0 +1,72 @@ +#pragma once + +#include + +#if defined(_MSC_VER) +#pragma comment (lib, "opencl.lib") +#endif + +#include +#include +#include + +#include + +namespace ocl { + +inline cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size, void *host_ptr) +{ + cl_int status; + cl_mem ret; + ret = clCreateBuffer(ctx, flags, size, host_ptr, &status); + if (status != CL_SUCCESS || !ret) + printf("clCreateBuffer (%d)\n", status); + return ret; +} + +inline void check_clSetKernelArg(cl_kernel k, cl_uint a_pos, cl_mem *a) +{ + cl_int status; + status = 
clSetKernelArg(k, a_pos, sizeof(*a), a);
+	if (status != CL_SUCCESS)
+		printf("clSetKernelArg (%d)\n", status);
+}
+
+inline void check_clEnqueueNDRangeKernel(cl_command_queue queue, cl_kernel k, cl_uint
+	work_dim, const size_t *global_work_offset, const size_t
+	*global_work_size, const size_t *local_work_size, cl_uint
+	num_events_in_wait_list, const cl_event *event_wait_list, cl_event
+	*event)
+{
+	cl_int status;
+	status = clEnqueueNDRangeKernel(queue, k, work_dim, global_work_offset,
+		global_work_size, local_work_size, num_events_in_wait_list,
+		event_wait_list, event);
+	if (status != CL_SUCCESS)
+		printf("clEnqueueNDRangeKernel (%d)\n", status);
+}
+
+inline void check_clEnqueueReadBuffer(cl_command_queue queue, cl_mem buffer, cl_bool
+	blocking_read, size_t offset, size_t size, void *ptr, cl_uint
+	num_events_in_wait_list, const cl_event *event_wait_list, cl_event
+	*event)
+{
+	cl_int status;
+	status = clEnqueueReadBuffer(queue, buffer, blocking_read, offset,
+		size, ptr, num_events_in_wait_list, event_wait_list, event);
+	if (status != CL_SUCCESS)
+		printf("clEnqueueReadBuffer (%d)\n", status);
+}
+
+
+inline unsigned nr_compute_units(cl_device_id device_id)
+{
+	cl_uint retval;
+	cl_int status = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &retval, nullptr);
+	if (status != CL_SUCCESS)
+		printf("nr_compute_units (%d)\n", status);
+	return retval;
+}
+
+
+}
diff --git a/contrib/ocl/sols.hpp b/contrib/ocl/sols.hpp
new file mode 100644
index 000000000..e337ebcef
--- /dev/null
+++ b/contrib/ocl/sols.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+namespace ocl {
+	typedef uint8_t uchar;
+	typedef uint32_t uint;
+	typedef uint64_t ulong;
+
+
+	template <int MAXSOLS, int PARAMK>
+	struct sols_s
+	{
+		uint nr;
+		uint likely_invalids;
+		uchar valid[MAXSOLS];
+		uint values[MAXSOLS][(1 << PARAMK)];
+	};
+}
+
+typedef ocl::sols_s<10, 9> sols_t;
+
diff --git a/contrib/ocl/utility/device_utils.hpp b/contrib/ocl/utility/device_utils.hpp
new file mode 100644
index 000000000..7544a294a
--- /dev/null
+++ b/contrib/ocl/utility/device_utils.hpp
@@ -0,0 +1,181 @@
+#pragma once
+#include
+#include
+#include //reinclude just in case..
+#include
+
+typedef std::vector<cl_device_id> ocl_devices;
+
+#define OCL(error) \
+	if(cl_int err = error){ \
+		printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
+		return; \
+	}
+
+#define OCLR(error, ret) \
+	if(cl_int err = error){ \
+		printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
+		return ret; \
+	}
+
+#define OCLE(error) \
+	if(cl_int err = error){ \
+		printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \
+		exit(err); \
+	}
+
+namespace ocl {
+namespace utility {
+	inline ocl_devices GetAllDevices() {
+		ocl_devices retval;
+		retval.reserve(8);
+
+		cl_platform_id platforms[64];
+		cl_uint numPlatforms;
+		cl_int rc = clGetPlatformIDs(sizeof(platforms) / sizeof(cl_platform_id), platforms, &numPlatforms);
+
+		for (cl_uint i = 0; i < numPlatforms; i++) {
+			cl_uint numDevices = 0;
+			cl_device_id devices[64];
+			rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
+			for (cl_uint n = 0; n < numDevices; n++) {
+				retval.push_back(devices[n]);
+			}
+		}
+
+		return retval;
+	}
+
+	inline void PrintDevices() {
+		using namespace ::std;
+
+		auto devices = GetAllDevices();
+		cout << "Number of OpenCL devices found: " << devices.size() << endl;
+		for (unsigned int i = 0; i < devices.size(); ++i) {
+			cl::Device device(devices[i]);
+			cl::Platform platform(device.getInfo<CL_DEVICE_PLATFORM>());
+			cout << "Device #" << i << " | " << platform.getInfo<CL_PLATFORM_NAME>() << " | " << device.getInfo<CL_DEVICE_NAME>();
+
+			switch (device.getInfo<CL_DEVICE_TYPE>()) {
+			case CL_DEVICE_TYPE_CPU:
+				cout << " | CPU";
+				break;
+			case CL_DEVICE_TYPE_GPU:
+				cout << " | GPU";
+				break;
+			case CL_DEVICE_TYPE_ACCELERATOR:
+				cout << " | ACCELERATOR";
+				break;
+			default:
+				cout << " | DEFAULT";
+				break;
+			}
+			cout << " | " << device.getInfo<CL_DEVICE_VERSION>();
+			cout << endl;
+		}
+	}
+
+	inline bool clCompileKernel(cl_context gContext,
+		cl_device_id gpu,
+		const char *binaryName,
+		const std::vector<std::string> &sources,
+		const char *arguments,
+		cl_int *binstatus,
+		cl_program *gProgram)
+	{
+		std::ifstream testfile(binaryName);
+
+		if (!testfile) {
+			printf(" compiling ...\n");
+
+			std::string sourceFile;
+			for (auto &i : sources) {
+				std::ifstream stream;
+				stream.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+				try {
+					stream.open(i);
+				}
+				catch (std::system_error& e) {
+					fprintf(stderr, " %s\n", e.code().message().c_str());
+					return false;
+				}
+				std::string str((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
+				sourceFile.append(str);
+			}
+
+			printf(" source: %u bytes\n", (unsigned)sourceFile.size());
+			if (sourceFile.size() < 1) {
+				fprintf(stderr, " source files not found or empty\n");
+				return false;
+			}
+
+			cl_int error;
+			const char *sources[] = { sourceFile.c_str(), 0 };
+			*gProgram = clCreateProgramWithSource(gContext, 1, sources, 0, &error);
+			OCLR(error, false);
+
+			if (clBuildProgram(*gProgram, 1, &gpu, arguments, 0, 0) != CL_SUCCESS) {
+				size_t logSize;
+				clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);
+
+				std::unique_ptr<char[]> log(new char[logSize]);
+				clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, logSize, log.get(), 0);
+				printf("%s\n", log.get());
+
+				return false;
+			}
+
+			size_t binsize;
+			OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binsize, 0), false);
+			// for (size_t i = 0; i < 1; i++) {
+			if (!binsize) {
+				printf(" no binary available!\n");
+				return false;
+			}
+			// }
+
+			printf(" binsize = %u bytes\n", (unsigned)binsize);
+			std::unique_ptr<unsigned char[]> binary(new unsigned
char[binsize + 1]); + + OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARIES, sizeof(void*), &binary, 0), false); + + { + std::ofstream bin(binaryName, std::ofstream::binary | std::ofstream::trunc); + bin.write((const char*)binary.get(), binsize); + bin.close(); + } + + OCLR(clReleaseProgram(*gProgram), false); + } + + std::ifstream bfile(binaryName, std::ifstream::binary); + if (!bfile) { + printf(" %s not found\n", binaryName); + return false; + } + + bfile.seekg(0, bfile.end); + size_t binsize = bfile.tellg(); + bfile.seekg(0, bfile.beg); + if (!binsize) { + printf(" %s empty\n", binaryName); + return false; + } + + std::vector binary(binsize + 1); + bfile.read(&binary[0], binsize); + bfile.close(); + + cl_int error; + // binstatus.resize(gpus.size(), 0); + // std::vector binsizes(gpus.size(), binsize); + // std::vector binaries(gpus.size(), (const unsigned char*)&binary[0]); + const unsigned char *binaryPtr = (const unsigned char*)&binary[0]; + + *gProgram = clCreateProgramWithBinary(gContext, 1, &gpu, &binsize, &binaryPtr, binstatus, &error); + OCLR(error, false); + OCLR(clBuildProgram(*gProgram, 1, &gpu, 0, 0, 0), false); + return true; + } +} +} \ No newline at end of file diff --git a/contrib/sha256/sha256.hpp b/contrib/sha256/sha256.hpp new file mode 100644 index 000000000..419a723df --- /dev/null +++ b/contrib/sha256/sha256.hpp @@ -0,0 +1,228 @@ +#pragma once + +/* Sha256.h -- SHA-256 Hash +2016-11-04 : Marc Bevand : A few changes to make it more self-contained +2010-06-11 : Igor Pavlov : Public domain */ + +#define SHA256_DIGEST_SIZE 32 + +#include +#include + +#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) +#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) +#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) +#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) + +#define blk0(i) (W[i] = data[i]) +#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15])) + +#define Ch(x,y,z) (z^(x&(y^z))) +#define Maj(x,y,z) ((x&y)|(z&(x|y))) + +#define a(i) T[(0-(i))&7] +#define b(i) T[(1-(i))&7] +#define c(i) T[(2-(i))&7] +#define d(i) T[(3-(i))&7] +#define e(i) T[(4-(i))&7] +#define f(i) T[(5-(i))&7] +#define g(i) T[(6-(i))&7] +#define h(i) T[(7-(i))&7] + +#ifdef _SHA256_UNROLL2 + +#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\ + d += h; h += S0(a) + Maj(a, b, c) + +#define RX_8(i) \ + R(a,b,c,d,e,f,g,h, i); \ + R(h,a,b,c,d,e,f,g, i+1); \ + R(g,h,a,b,c,d,e,f, i+2); \ + R(f,g,h,a,b,c,d,e, i+3); \ + R(e,f,g,h,a,b,c,d, i+4); \ + R(d,e,f,g,h,a,b,c, i+5); \ + R(c,d,e,f,g,h,a,b, i+6); \ + R(b,c,d,e,f,g,h,a, i+7) + +#else + +#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\ + d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i)) + +#ifdef _SHA256_UNROLL + +#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7); + +#endif + +#endif + + + +namespace sha256 { + +typedef struct +{ + uint32_t state[8]; + uint64_t count; + uint8_t buffer[64]; +} CSha256; + +namespace impl { + static const uint32_t K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + +inline void Sha256_Transform(uint32_t *state, const uint32_t *data) +{ + uint32_t W[16]; + unsigned j; + #ifdef _SHA256_UNROLL2 + uint32_t a,b,c,d,e,f,g,h; + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + #else + uint32_t T[8]; + for (j = 0; j < 8; j++) + T[j] = state[j]; + #endif + + for (j = 0; j < 64; j += 16) + { + #if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2) + RX_8(0); RX_8(8); + #else + unsigned i; + for (i = 0; i < 16; i++) { R(i); } + #endif + } + + #ifdef _SHA256_UNROLL2 + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + #else + for (j = 0; j < 8; j++) + state[j] += T[j]; + #endif + + /* Wipe variables */ + /* memset(W, 0, sizeof(W)); */ + /* memset(T, 0, sizeof(T)); */ +} + +#undef S0 +#undef S1 +#undef s0 +#undef s1 + +inline void Sha256_WriteByteBlock(CSha256 *p) +{ + uint32_t data32[16]; + unsigned i; + for (i = 0; i < 16; i++) + data32[i] = + ((uint32_t)(p->buffer[i * 4 ]) << 24) + + ((uint32_t)(p->buffer[i * 4 + 1]) << 16) + + ((uint32_t)(p->buffer[i * 4 + 2]) << 8) + + ((uint32_t)(p->buffer[i * 4 + 3])); + Sha256_Transform(p->state, data32); +} + +} + +inline void Sha256_Init(CSha256 *p) { + p->state[0] = 0x6a09e667; + p->state[1] = 0xbb67ae85; + p->state[2] = 0x3c6ef372; + p->state[3] = 0xa54ff53a; + p->state[4] = 0x510e527f; + p->state[5] = 0x9b05688c; + p->state[6] = 0x1f83d9ab; + p->state[7] = 0x5be0cd19; + p->count = 0; +} + +inline void Sha256_Update(CSha256 *p, const uint8_t *data, size_t size) +{ + uint32_t curBufferPos = (uint32_t)p->count & 0x3F; + while (size > 0) + { + p->buffer[curBufferPos++] = *data++; + p->count++; + size--; + if (curBufferPos == 64) + { + curBufferPos = 0; + impl::Sha256_WriteByteBlock(p); + } + } +} + +inline void Sha256_Final(CSha256 *p, uint8_t *digest) +{ + uint64_t lenInBits = (p->count << 3); + uint32_t curBufferPos = (uint32_t)p->count & 0x3F; + unsigned i; + p->buffer[curBufferPos++] = 0x80; + while (curBufferPos != (64 - 8)) + { + curBufferPos &= 0x3F; + if (curBufferPos == 0) + impl::Sha256_WriteByteBlock(p); + p->buffer[curBufferPos++] = 0; + } + for (i = 0; i < 8; i++) + { + p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56); + lenInBits <<= 8; + } + impl::Sha256_WriteByteBlock(p); + + for (i = 0; i < 8; i++) + { + *digest++ = (uint8_t)(p->state[i] >> 24); + *digest++ = (uint8_t)(p->state[i] >> 16); + *digest++ = (uint8_t)(p->state[i] >> 8); + *digest++ = (uint8_t)(p->state[i]); + } + Sha256_Init(p); +} + +inline void Sha256_Onestep(const uint8_t *data, size_t size, uint8_t *digest) +{ + CSha256 p; + Sha256_Init(&p); + Sha256_Update(&p, data, size); + Sha256_Final(&p, digest); +} + +} diff --git a/cpu_tromp/equi.h b/cpu_tromp/equi.h index b9237c359..6cb07bc74 100644 --- a/cpu_tromp/equi.h +++ b/cpu_tromp/equi.h @@ -49,7 +49,7 @@ typedef u32 proof[PROOFSIZE]; void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, 
const u32 nonceLen) { uint32_t le_N = WN; uint32_t le_K = WK; - uchar personal[] = "DeepWebCa01230123"; + uchar personal[] = "ZcashPoW01230123"; memcpy(personal+8, &le_N, 4); memcpy(personal+12, &le_K, 4); blake2b_param P[1]; diff --git a/cuda_silentarmy/cuda_silentarmy.vcxproj b/cuda_silentarmy/cuda_silentarmy.vcxproj index 76f286295..4f6bd8cf5 100644 --- a/cuda_silentarmy/cuda_silentarmy.vcxproj +++ b/cuda_silentarmy/cuda_silentarmy.vcxproj @@ -74,10 +74,14 @@ true + $(ProjectDir)../contrib/;$(IncludePath) $(SolutionDir)$(Platform)\$(Configuration)\ + + $(ProjectDir)../contrib/;$(IncludePath) + Level3 @@ -176,11 +180,9 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - diff --git a/cuda_silentarmy/kernel.cu b/cuda_silentarmy/kernel.cu index 5f773a9a9..1b6605395 100644 --- a/cuda_silentarmy/kernel.cu +++ b/cuda_silentarmy/kernel.cu @@ -8,7 +8,9 @@ #include "sa_cuda_context.hpp" #include "param.h" -#include "sa_blake.h" + +#include +using namespace blake; #define WN PARAM_N #define WK PARAM_K @@ -59,7 +61,7 @@ __device__ char rowCounter0[NR_ROWS]; __device__ char rowCounter1[NR_ROWS]; __device__ sols_t sols; -__device__ blake2b_state_t blake; +__device__ blake2b_state_t blake_obj; __constant__ ulong blake_iv[] = { @@ -431,14 +433,14 @@ void kernel_round0(char *ht, uint *debug) // message block ulong word1 = (ulong)input << 32; // init vector v - v[0] = blake.h[0]; - v[1] = blake.h[1]; - v[2] = blake.h[2]; - v[3] = blake.h[3]; - v[4] = blake.h[4]; - v[5] = blake.h[5]; - v[6] = blake.h[6]; - v[7] = blake.h[7]; + v[0] = blake_obj.h[0]; + v[1] = blake_obj.h[1]; + v[2] = blake_obj.h[2]; + v[3] = blake_obj.h[3]; + v[4] = blake_obj.h[4]; + v[5] = blake_obj.h[5]; + v[6] = blake_obj.h[6]; + v[7] = blake_obj.h[7]; v[8] = blake_iv[0]; v[9] = blake_iv[1]; v[10] = blake_iv[2]; @@ -564,13 +566,13 @@ void kernel_round0(char *ht, uint *debug) // compress v into the blake state; this produces the 50-byte hash // (two Xi values) ulong h[7]; - h[0] = blake.h[0] ^ v[0] ^ v[8]; - h[1] = blake.h[1] ^ v[1] ^ v[9]; - h[2] = blake.h[2] ^ v[2] ^ v[10]; - h[3] = blake.h[3] ^ v[3] ^ v[11]; - h[4] = blake.h[4] ^ v[4] ^ v[12]; - h[5] = blake.h[5] ^ v[5] ^ v[13]; - h[6] = (blake.h[6] ^ v[6] ^ v[14]) & 0xffff; + h[0] = blake_obj.h[0] ^ v[0] ^ v[8]; + h[1] = blake_obj.h[1] ^ v[1] ^ v[9]; + h[2] = blake_obj.h[2] ^ v[2] ^ v[10]; + h[3] = blake_obj.h[3] ^ v[3] ^ v[11]; + h[4] = blake_obj.h[4] ^ v[4] ^ v[12]; + h[5] = blake_obj.h[5] ^ v[5] ^ v[13]; + h[6] = (blake_obj.h[6] ^ v[6] ^ v[14]) & 0xffff; // store the two Xi values in the hash table #if ZCASH_HASH_LEN == 50 @@ -2474,7 +2476,7 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas zcash_blake2b_init(&initialCtx, ZCASH_HASH_LEN, PARAM_N, PARAM_K); zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0); - checkCudaErrors(cudaMemcpyToSymbol(blake, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(blake_obj, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice)); for (unsigned round = 0; round < PARAM_K; round++) { if (bUseOld) { diff --git a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj index 9efc78965..8c07b5f7d 100644 --- a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj +++ b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj @@ -36,7 +36,7 @@ DynamicLibrary true MultiByte - v140 + v120 Application @@ -50,7 +50,7 @@ false true MultiByte - v140 + v120 @@ -74,6 +74,10 @@ true + 
$(ProjectDir)../contrib/;$(IncludePath) + + + $(ProjectDir)../contrib/;$(IncludePath) diff --git a/cuda_silentarmy_sm30/kernel.cu b/cuda_silentarmy_sm30/kernel.cu index 459b94b3a..da0504291 100644 --- a/cuda_silentarmy_sm30/kernel.cu +++ b/cuda_silentarmy_sm30/kernel.cu @@ -13,118 +13,8 @@ #include #include - -//*blake header */ - -typedef struct blake2b_state_s -{ - uint64_t h[8]; - uint64_t bytes; -} blake2b_state_t; - -void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k); -void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, - uint32_t msg_len, uint32_t is_final); -void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen); - -/* blake.cpp **/ - -//static const uint32_t blake2b_block_len = 128; -static const uint32_t blake2b_rounds = 12; -static const uint64_t blake2b_iv[8] = -{ - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, -}; -static const uint8_t blake2b_sigma[12][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, -}; - -/* -** Init the state according to Zcash parameters. -*/ -void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, - uint32_t n, uint32_t k) -{ - st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); - for (uint32_t i = 1; i <= 5; i++) - st->h[i] = blake2b_iv[i]; - st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa"; - st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); - st->bytes = 0; -} - -static uint64_t rotr64(uint64_t a, uint8_t bits) -{ - return (a >> bits) | (a << (64 - bits)); -} - -static inline void mix64(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd, - uint64_t x, uint64_t y) -{ - *va = (*va + *vb + x); - *vd = rotr64(*vd ^ *va, 32); - *vc = (*vc + *vd); - *vb = rotr64(*vb ^ *vc, 24); - *va = (*va + *vb + y); - *vd = rotr64(*vd ^ *va, 16); - *vc = (*vc + *vd); - *vb = rotr64(*vb ^ *vc, 63); -} - -/* -** Process either a full message block or the final partial block. -** Note that v[13] is not XOR'd because st->bytes is assumed to never overflow. -** -** _msg pointer to message (must be zero-padded to 128 bytes if final block) -** msg_len must be 128 (<= 128 allowed only for final partial block) -** is_final indicate if this is the final block -*/ -void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, - uint32_t msg_len, uint32_t is_final) -{ - const uint64_t *m = (const uint64_t *)_msg; - uint64_t v[16]; - memcpy(v + 0, st->h, 8 * sizeof(*v)); - memcpy(v + 8, blake2b_iv, 8 * sizeof(*v)); - v[12] ^= (st->bytes += msg_len); - v[14] ^= is_final ? 
-1 : 0; - for (uint32_t round = 0; round < blake2b_rounds; round++) - { - const uint8_t *s = blake2b_sigma[round]; - mix64(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]); - mix64(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]); - mix64(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]); - mix64(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]); - mix64(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]); - mix64(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]); - mix64(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]); - mix64(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]); - } - for (uint32_t i = 0; i < 8; i++) - st->h[i] ^= v[i] ^ v[i + 8]; -} - -void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen) -{ - memcpy(out, st->h, outlen); -} - -/* end of blake cpp*/ +#include +using namespace blake; #define PARAM_N 200 #define PARAM_K 9 @@ -178,7 +68,7 @@ typedef struct __align__(64) sols_s __device__ uint32_t rowCounter0[1 << NR_ROWS_LOG]; __device__ uint32_t rowCounter1[1 << NR_ROWS_LOG]; __device__ uint32_t* rowCounters[2] = { rowCounter0 , rowCounter1 }; -__device__ blake2b_state_t blake; +__device__ blake2b_state_t blake_obj; __device__ sols_t sols; @@ -263,6 +153,7 @@ __device__ uint well_aligned_int(ulong *_p, uint offset) return *(uint *)(p + offset); } +#if 0 __device__ uint xor_and_store3(char* ht, uint tid, uint slot_a, uint slot_b, ulong* a, ulong* b, uint* rowCounters) { ulong xi0, xi1, xi2, xi3; @@ -294,9 +185,6 @@ __device__ uint xor_and_store3(char* ht, uint tid, uint slot_a, uint slot_b, ulo ulong test1 = half_aligned_long(a, 0); - printf("test1 %lX | %lX | %02X %02X %02X %02X\n", test1, test3, a0, a1, a2, a3); - - // xor 20 bytes xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); @@ -337,6 +225,8 @@ __device__ uint xor_and_store3(char* ht, uint tid, uint slot_a, uint slot_b, ulo return 0; } +#endif + __device__ uint ht_store(uint round, char *ht, uint i, ulong xi0, ulong xi1, ulong xi2, ulong xi3, uint *rowCounters) { @@ -475,14 +365,14 @@ void kernel_round0(char *ht, uint32_t inputs_per_thread, int offset) // message block ulong word1 = (ulong)input << 32; // init vector v - v[0] = blake.h[0]; - v[1] = blake.h[1]; - v[2] = blake.h[2]; - v[3] = blake.h[3]; - v[4] = blake.h[4]; - v[5] = blake.h[5]; - v[6] = blake.h[6]; - v[7] = blake.h[7]; + v[0] = blake_obj.h[0]; + v[1] = blake_obj.h[1]; + v[2] = blake_obj.h[2]; + v[3] = blake_obj.h[3]; + v[4] = blake_obj.h[4]; + v[5] = blake_obj.h[5]; + v[6] = blake_obj.h[6]; + v[7] = blake_obj.h[7]; v[8] = blake_iv[0]; v[9] = blake_iv[1]; v[10] = blake_iv[2]; @@ -608,13 +498,13 @@ void kernel_round0(char *ht, uint32_t inputs_per_thread, int offset) // compress v into the blake state; this produces the 50-byte hash // (two Xi values) ulong h[7]; - h[0] = blake.h[0] ^ v[0] ^ v[8]; - h[1] = blake.h[1] ^ v[1] ^ v[9]; - h[2] = blake.h[2] ^ v[2] ^ v[10]; - h[3] = blake.h[3] ^ v[3] ^ v[11]; - h[4] = blake.h[4] ^ v[4] ^ v[12]; - h[5] = blake.h[5] ^ v[5] ^ v[13]; - h[6] = (blake.h[6] ^ v[6] ^ v[14]) & 0xffff; + h[0] = blake_obj.h[0] ^ v[0] ^ v[8]; + h[1] = blake_obj.h[1] ^ v[1] ^ v[9]; + h[2] = blake_obj.h[2] ^ v[2] ^ v[10]; + h[3] = blake_obj.h[3] ^ v[3] ^ v[11]; + h[4] = blake_obj.h[4] ^ v[4] ^ v[12]; + h[5] = blake_obj.h[5] ^ v[5] ^ v[13]; + h[6] = (blake_obj.h[6] ^ v[6] ^ v[14]) & 0xffff; // store the two Xi values in the hash table #if ZCASH_HASH_LEN == 50 @@ -657,9 +547,12 @@ __device__ uint xor_and_store(uint round, char *ht_dst, uint row, uint slot_a, uint slot_b, 
ulong *a, ulong *b, uint *rowCounters) { + +#if 0 if (round == 3) { return xor_and_store3(ht_dst, row, slot_a, slot_b, a, b, rowCounters); } +#endif ulong xi0, xi1, xi2; #if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 @@ -1072,7 +965,7 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas zcash_blake2b_init(&initialCtx, ZCASH_HASH_LEN, PARAM_N, PARAM_K); zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0); - checkCudaErrors(cudaMemcpyToSymbol(blake, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(blake_obj, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice)); //const uint32_t THREAD_SHIFT = 8; //const uint32_t THREAD_COUNT = 1 << THREAD_SHIFT; diff --git a/nheqminer/libstratum/StratumClient.cpp b/nheqminer/libstratum/StratumClient.cpp index c8c00d338..df6d31f88 100644 --- a/nheqminer/libstratum/StratumClient.cpp +++ b/nheqminer/libstratum/StratumClient.cpp @@ -434,4 +434,11 @@ template class StratumClient; template class StratumClient; template class StratumClient; template class StratumClient; -template class StratumClient; \ No newline at end of file +template class StratumClient; +// Gatelessgate +template class StratumClient; +template class StratumClient; +template class StratumClient; +template class StratumClient; +template class StratumClient; +template class StratumClient; \ No newline at end of file diff --git a/nheqminer/libstratum/ZcashStratum.cpp b/nheqminer/libstratum/ZcashStratum.cpp index d7cab675d..c64c31cab 100644 --- a/nheqminer/libstratum/ZcashStratum.cpp +++ b/nheqminer/libstratum/ZcashStratum.cpp @@ -623,6 +623,16 @@ template class ZcashMiner; template class ZcashMiner; template class ZcashMiner; +// Gatelessgate +template class ZcashMiner; +template class ZcashMiner; +template class ZcashMiner; +template class ZcashMiner; +template class ZcashMiner; +template class ZcashMiner; + + + std::mutex benchmark_work; std::vector benchmark_nonces; std::atomic_int benchmark_solutions; @@ -896,4 +906,28 @@ void ZMinerSSE2CUDASA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_cou int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { ZMinerSSE2CUDASA80_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); } - +// ocl_gatelessgate +void ZMinerAVXCUDA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerAVXCUDA80_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerSSE2CUDA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerSSE2CUDA80_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerAVXCUDA75_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerAVXCUDA75_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerSSE2CUDA75_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int 
opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerSSE2CUDA75_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerAVXCUDASA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerAVXCUDASA80_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} +void ZMinerSSE2CUDASA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + ZMinerSSE2CUDASA80_GG::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); +} diff --git a/nheqminer/libstratum/ZcashStratum.h b/nheqminer/libstratum/ZcashStratum.h index b8d686d98..4ebe55404 100644 --- a/nheqminer/libstratum/ZcashStratum.h +++ b/nheqminer/libstratum/ZcashStratum.h @@ -50,6 +50,11 @@ CREATE_SOLVER_STUB(ocl_xmp, "ocl_xmp_STUB") #else CREATE_SOLVER_STUB(ocl_silentarmy, "ocl_silentarmy_STUB") #endif +#ifdef USE_OCL_GATELESSGATE +#include "../ocl_gatelessgate/ocl_gatelessgate.hpp" +#else +CREATE_SOLVER_STUB(ocl_gatelessgate, "ocl_gatelessgate_STUB") +#endif #include "../cuda_silentarmy/cuda_silentarmy.hpp" @@ -171,6 +176,15 @@ typedef ZcashMiner ZMinerSSE2CUDA75_SA typedef ZcashMiner ZMinerSSE2CUDASA80_SA; typedef ZcashMiner ZMinerAVXCUDASA80_SA; +//ocl_gatelessgate +typedef ZcashMiner ZMinerAVXCUDA80_GG; +typedef ZcashMiner ZMinerSSE2CUDA80_GG; +typedef ZcashMiner ZMinerAVXCUDA75_GG; +typedef ZcashMiner ZMinerSSE2CUDA75_GG; +typedef ZcashMiner ZMinerSSE2CUDASA80_GG; +typedef ZcashMiner ZMinerAVXCUDASA80_GG; + + // ocl_xmp // gcc static undefined reference workaround void ZMinerAVXCUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, @@ -197,4 +211,17 @@ void ZMinerSSE2CUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count void ZMinerAVXCUDASA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); void ZMinerSSE2CUDASA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); \ No newline at end of file + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +// ocl_gatelessgate +void ZMinerAVXCUDA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerSSE2CUDA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerAVXCUDA75_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerSSE2CUDA75_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerAVXCUDASA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int 
opencl_platf, int* opencl_en, int* opencl_t); +void ZMinerSSE2CUDASA80_GG_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); diff --git a/nheqminer/main.cpp b/nheqminer/main.cpp index ee46f69a0..45d3db2b4 100644 --- a/nheqminer/main.cpp +++ b/nheqminer/main.cpp @@ -7,7 +7,7 @@ #include "libstratum/StratumClient.h" -#if defined(USE_OCL_XMP) || defined(USE_OCL_SILENTARMY) +#if defined(USE_OCL_XMP) || defined(USE_OCL_SILENTARMY) || defined(USE_OCL_GATELESSGATE) #include "../ocl_device_utils/ocl_device_utils.h" #define PRINT_OCL_INFO #endif @@ -29,8 +29,6 @@ #include #include -#include - namespace logging = boost::log; namespace sinks = boost::log::sinks; namespace src = boost::log::sources; @@ -271,6 +269,7 @@ int main(int argc, char* argv[]) int opencl_device_count = 0; int force_cpu_ext = -1; int opencl_t = 0; + int use_gg = 1; for (int i = 1; i < argc; ++i) { @@ -348,7 +347,52 @@ int main(int argc, char* argv[]) return 0; case 'v': use_old_xmp = atoi(argv[++i]); + use_gg = 0; + break; + case 'd': + while (opencl_device_count < 8 && i + 1 < argc) + { + try + { + opencl_enabled[opencl_device_count] = std::stol(argv[++i]); + ++opencl_device_count; + } + catch (...) + { + --i; + break; + } + } + break; + case 'p': + opencl_platform = atoi(argv[++i]); + break; + case 't': + while (opencl_t < 8 && i + 1 < argc) + { + try + { + opencl_threads[opencl_t] = std::stol(argv[++i]); + ++opencl_t; + } + catch (...) + { + --i; + break; + } + } break; + // TODO extra parameters for OpenCL + } + break; + } + case 'g': + { + switch (argv[i][2]) + { + case 'i': + print_opencl_info(); + return 0; case 'd': while (opencl_device_count < 8 && i + 1 < argc) { @@ -363,6 +407,7 @@ int main(int argc, char* argv[]) break; } } + use_gg = 1; break; case 'p': opencl_platform = atoi(argv[++i]); @@ -537,6 +582,30 @@ int main(int argc, char* argv[]) ZMinerSSE2CUDA80_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); } } + } + else if (use_gg == 1) { + if (use_avx) { + if (use_cuda_sa) { + ZMinerAVXCUDASA80_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + else if (use_old_cuda) { + ZMinerAVXCUDA75_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + else { + ZMinerAVXCUDA80_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + } + else { + if (use_cuda_sa) { + ZMinerSSE2CUDASA80_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + else if (use_old_cuda) { + ZMinerSSE2CUDA75_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + else { + ZMinerSSE2CUDA80_GG_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); + } + } } else { // sarmy if (use_avx) { if (use_cuda_sa) { diff --git 
a/nheqminer/nheqminer.sln b/nheqminer/nheqminer.sln index 67e3a8b88..cb2e2c463 100644 --- a/nheqminer/nheqminer.sln +++ b/nheqminer/nheqminer.sln @@ -30,6 +30,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_silentarmy", "..\cuda_ EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_silentarmy_sm30", "..\cuda_silentarmy_sm30\cuda_silentarmy_sm30.vcxproj", "{53E62B3D-3FA6-4B53-8175-2B93753D98C4}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_gatelessgate", "..\ocl_gatelessgate\ocl_gatelessgate.vcxproj", "{6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Mixed Platforms = Debug|Mixed Platforms @@ -184,6 +186,24 @@ Global {53E62B3D-3FA6-4B53-8175-2B93753D98C4}.ReleaseSlow|Win32.Build.0 = Release|Win32 {53E62B3D-3FA6-4B53-8175-2B93753D98C4}.ReleaseSlow|x64.ActiveCfg = Release|x64 {53E62B3D-3FA6-4B53-8175-2B93753D98C4}.ReleaseSlow|x64.Build.0 = Release|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|Mixed Platforms.Build.0 = Debug|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|Win32.ActiveCfg = Debug|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|Win32.Build.0 = Debug|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|x64.ActiveCfg = Debug|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Debug|x64.Build.0 = Debug|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|Mixed Platforms.ActiveCfg = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|Mixed Platforms.Build.0 = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|Win32.ActiveCfg = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|Win32.Build.0 = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|x64.ActiveCfg = Release|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.Release|x64.Build.0 = Release|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|Mixed Platforms.Build.0 = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|Win32.ActiveCfg = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|Win32.Build.0 = Release|Win32 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|x64.ActiveCfg = Release|x64 + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612}.ReleaseSlow|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/nheqminer/nheqminer.vcxproj b/nheqminer/nheqminer.vcxproj index 47972a117..fb685dfca 100644 --- a/nheqminer/nheqminer.vcxproj +++ b/nheqminer/nheqminer.vcxproj @@ -85,7 +85,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_OCL_XMP;USE_OCL_SILENTARMY;CONSOLE_COLORS;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_OCL_XMP;USE_OCL_SILENTARMY;USE_OCL_GATELESSGATE;CONSOLE_COLORS;%(PreprocessorDefinitions) NotSet -D_WIN32_WINNT=0x0601 %(AdditionalOptions) 4068;4996;4503;4267;4180;4290;4244;4800;4334;4251 @@ -97,7 +97,7 @@ true true true - cuda_tromp.lib;cuda_tromp_75.lib;cpu_xenoncat.lib;cpu_tromp_SSE2.lib;cpu_tromp_AVX.lib;ocl_device_utils.lib;ocl_xpm.lib;ocl_silentarmy.lib;cuda_silentarmy.lib;OpenCL.lib + 
cuda_tromp.lib;cuda_tromp_75.lib;cpu_xenoncat.lib;cpu_tromp_SSE2.lib;cpu_tromp_AVX.lib;ocl_device_utils.lib;ocl_xpm.lib;ocl_silentarmy.lib;ocl_gatelessgate.lib;cuda_silentarmy.lib;OpenCL.lib
 .\trompequihash\pthreads\x64;..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\;%(AdditionalLibraryDirectories)
diff --git a/ocl_gatelessgate/gatelessgate.cl b/ocl_gatelessgate/gatelessgate.cl
new file mode 100644
index 000000000..9257e8f6d
--- /dev/null
+++ b/ocl_gatelessgate/gatelessgate.cl
@@ -0,0 +1,1245 @@
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//#define ENABLE_DEBUG
+
+//
+// Parameters for Hash Tables
+//
+
+// There are PARAM_K - 1 hash tables, and each hash table has NR_ROWS rows.
+// Each row contains NR_SLOTS slots.
+
+#define NR_ROWS_LOG 14 // 12, 13, 14, 15, or 16. 12 and 13 are not practically usable.
+#define NR_SLOTS 199 // Prime numbers are preferable.
+#define LOCAL_WORK_SIZE 64
+#define THREADS_PER_ROW 64
+#define LOCAL_WORK_SIZE_SOLS 64
+#define THREADS_PER_ROW_SOLS 64
+#define GLOBAL_WORK_SIZE_RATIO 512 // global_work_size = GLOBAL_WORK_SIZE_RATIO * nr_compute_units * LOCAL_WORK_SIZE
+#define THREADS_PER_WRITE 1 // 1, 2, 4, or 8
+#define SLOT_CACHE_SIZE (NR_SLOTS * ROWS_IN_WORK_ITEM)
+#define LDS_COLL_SIZE (NR_SLOTS * ROWS_IN_WORK_ITEM * 140 / 100)
+#define BIN_SIZE (NR_SLOTS * 6 / 100)
+#define EXTRA_BITS_FOR_BINS_SOLS 1
+#define BIN_SIZE_SOLS ((BIN_SIZE >> EXTRA_BITS_FOR_BINS_SOLS) * 250 / 100)
+
+#define PARAM_N 200
+#define PARAM_K 9
+#define PREFIX (PARAM_N / (PARAM_K + 1))
+#define NR_INPUTS (1 << PREFIX)
+#define NR_ROWS (1 << NR_ROWS_LOG)
+// Length of 1 element (slot) in bytes
+#define SLOT_LEN 32
+// Total size of hash table
+#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
+// Length of Zcash block header, nonce (part of header)
+#define ZCASH_BLOCK_HEADER_LEN 140
+// Offset of nTime in header
+#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
+// Length of nonce
+#define ZCASH_NONCE_LEN 32
+// Length of encoded representation of solution size
+#define ZCASH_SOLSIZE_LEN 3
+// Solution size (1344 = 0x540) represented as a compact integer, in hex
+#define ZCASH_SOLSIZE_HEX "fd4005"
+// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
+#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
+// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
+#define N_ZERO_BYTES 12
+// Number of bytes Zcash needs out of Blake
+#define ZCASH_HASH_LEN 50
+// Number of wavefronts per SIMD for the Blake kernel.
+// Blake is ALU-bound (besides the atomic counter being incremented) so we need
+// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
+// instructions. 10 is the max supported by the hw.
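+// A worked example with the defaults above (the figures follow directly from
+// the defines): one hash table occupies HT_SIZE = NR_ROWS * NR_SLOTS *
+// SLOT_LEN = 16384 * 199 * 32 = 104,333,312 bytes, i.e. roughly 99.5 MiB of
+// device memory per table.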
+#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 11 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot. +*/ +#define xi_offset_for_round(round) 4 + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +#if NR_ROWS_LOG <= 12 && NR_SLOTS <= (1 << 10) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 20) | ((slot1 & 0x3ff) << 10) | (slot0 & 0x3ff)) +#define DECODE_ROW(REF) (REF >> 20) +#define DECODE_SLOT1(REF) ((REF >> 10) & 0x3ff) +#define DECODE_SLOT0(REF) (REF & 0x3ff) + +#elif NR_ROWS_LOG <= 14 && NR_SLOTS <= (1 << 9) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 18) | ((slot1 & 0x1ff) << 9) | (slot0 & 0x1ff)) +#define DECODE_ROW(REF) (REF >> 18) +#define DECODE_SLOT1(REF) ((REF >> 9) & 0x1ff) +#define DECODE_SLOT0(REF) (REF & 0x1ff) + +#elif NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#if THREADS_PER_WRITE == 1 +#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN - 16) +#else +#define ADJUSTED_SLOT_LEN(round) SLOT_LEN +#endif + +#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O5" +#define OPENCL_BUILD_OPTIONS "-I.. -I." + +#define NEXT_PRIME_NO(n) \ + (((n) <= 2) ? 2 : \ + ((n) <= 3) ? 3 : \ + ((n) <= 5) ? 5 : \ + ((n) <= 7) ? 7 : \ + ((n) <= 11) ? 11 : \ + ((n) <= 13) ? 13 : \ + ((n) <= 17) ? 17 : \ + ((n) <= 19) ? 
19 : \ + ((n) <= 23) ? 23 : \ + ((n) <= 29) ? 29 : \ + ((n) <= 31) ? 31 : \ + ((n) <= 37) ? 37 : \ + ((n) <= 41) ? 41 : \ + ((n) <= 43) ? 43 : \ + ((n) <= 47) ? 47 : \ + ((n) <= 53) ? 53 : \ + ((n) <= 59) ? 59 : \ + ((n) <= 61) ? 61 : \ + ((n) <= 67) ? 67 : \ + ((n) <= 71) ? 71 : \ + ((n) <= 73) ? 73 : \ + ((n) <= 79) ? 79 : \ + ((n) <= 83) ? 83 : \ + ((n) <= 89) ? 89 : \ + ((n) <= 97) ? 97 : \ + ((n) <= 101) ? 101 : \ + ((n) <= 103) ? 103 : \ + ((n) <= 107) ? 107 : \ + ((n) <= 109) ? 109 : \ + ((n) <= 113) ? 113 : \ + ((n) <= 127) ? 127 : \ + ((n) <= 131) ? 131 : \ + ((n) <= 137) ? 137 : \ + ((n) <= 139) ? 139 : \ + ((n) <= 149) ? 149 : \ + ((n) <= 151) ? 151 : \ + ((n) <= 157) ? 157 : \ + ((n) <= 163) ? 163 : \ + ((n) <= 167) ? 167 : \ + ((n) <= 173) ? 173 : \ + ((n) <= 179) ? 179 : \ + ((n) <= 181) ? 181 : \ + ((n) <= 191) ? 191 : \ + ((n) <= 193) ? 193 : \ + ((n) <= 197) ? 197 : \ + ((n) <= 199) ? 199 : \ + ((n) <= 211) ? 211 : \ + ((n) <= 223) ? 223 : \ + ((n) <= 227) ? 227 : \ + ((n) <= 229) ? 229 : \ + ((n) <= 233) ? 233 : \ + ((n) <= 239) ? 239 : \ + ((n) <= 241) ? 241 : \ + ((n) <= 251) ? 251 : \ + ((n) <= 257) ? 257 : \ + ((n) <= 263) ? 263 : \ + ((n) <= 269) ? 269 : \ + ((n) <= 271) ? 271 : \ + ((n) <= 277) ? 277 : \ + ((n) <= 281) ? 281 : \ + ((n) <= 283) ? 283 : \ + ((n) <= 293) ? 293 : \ + ((n) <= 307) ? 307 : \ + ((n) <= 311) ? 311 : \ + ((n) <= 313) ? 313 : \ + ((n) <= 317) ? 317 : \ + ((n) <= 331) ? 331 : \ + ((n) <= 337) ? 337 : \ + ((n) <= 347) ? 347 : \ + ((n) <= 349) ? 349 : \ + ((n) <= 353) ? 353 : \ + ((n) <= 359) ? 359 : \ + ((n) <= 367) ? 367 : \ + ((n) <= 373) ? 373 : \ + ((n) <= 379) ? 379 : \ + ((n) <= 383) ? 383 : \ + ((n) <= 389) ? 389 : \ + ((n) <= 397) ? 397 : \ + ((n) <= 401) ? 401 : \ + ((n) <= 409) ? 409 : \ + ((n) <= 419) ? 419 : \ + ((n) <= 421) ? 421 : \ + ((n) <= 431) ? 431 : \ + ((n) <= 433) ? 433 : \ + ((n) <= 439) ? 439 : \ + ((n) <= 443) ? 443 : \ + ((n) <= 449) ? 449 : \ + ((n) <= 457) ? 457 : \ + ((n) <= 461) ? 461 : \ + ((n) <= 463) ? 463 : \ + ((n) <= 467) ? 467 : \ + ((n) <= 479) ? 479 : \ + ((n) <= 487) ? 487 : \ + ((n) <= 491) ? 491 : \ + ((n) <= 499) ? 499 : \ + ((n) <= 503) ? 503 : \ + ((n) <= 509) ? 509 : \ + ((n) <= 521) ? 521 : \ + ((n) <= 523) ? 523 : \ + ((n) <= 541) ? 541 : \ + ((n) <= 547) ? 547 : \ + ((n) <= 557) ? 557 : \ + ((n) <= 563) ? 563 : \ + ((n) <= 569) ? 569 : \ + ((n) <= 571) ? 571 : \ + ((n) <= 577) ? 577 : \ + ((n) <= 587) ? 587 : \ + ((n) <= 593) ? 593 : \ + ((n) <= 599) ? 599 : \ + ((n) <= 601) ? 601 : \ + ((n) <= 607) ? 607 : \ + ((n) <= 613) ? 613 : \ + ((n) <= 617) ? 617 : \ + ((n) <= 619) ? 619 : \ + ((n) <= 631) ? 631 : \ + ((n) <= 641) ? 641 : \ + ((n) <= 643) ? 643 : \ + ((n) <= 647) ? 647 : \ + ((n) <= 653) ? 653 : \ + ((n) <= 659) ? 659 : \ + ((n) <= 661) ? 661 : \ + ((n) <= 673) ? 673 : \ + ((n) <= 677) ? 677 : \ + ((n) <= 683) ? 683 : \ + ((n) <= 691) ? 691 : \ + ((n) <= 701) ? 701 : \ + ((n) <= 709) ? 709 : \ + ((n) <= 719) ? 719 : \ + ((n) <= 727) ? 727 : \ + ((n) <= 733) ? 733 : \ + ((n) <= 739) ? 739 : \ + ((n) <= 743) ? 743 : \ + ((n) <= 751) ? 751 : \ + ((n) <= 757) ? 757 : \ + ((n) <= 761) ? 761 : \ + ((n) <= 769) ? 769 : \ + ((n) <= 773) ? 773 : \ + ((n) <= 787) ? 787 : \ + ((n) <= 797) ? 797 : \ + ((n) <= 809) ? 809 : \ + ((n) <= 811) ? 811 : \ + ((n) <= 821) ? 821 : \ + ((n) <= 823) ? 823 : \ + ((n) <= 827) ? 827 : \ + ((n) <= 829) ? 829 : \ + ((n) <= 839) ? 839 : \ + ((n) <= 853) ? 853 : \ + ((n) <= 857) ? 857 : \ + ((n) <= 859) ? 859 : \ + ((n) <= 863) ? 863 : \ + ((n) <= 877) ? 
877 : \ + ((n) <= 881) ? 881 : \ + ((n) <= 883) ? 883 : \ + ((n) <= 887) ? 887 : \ + ((n) <= 907) ? 907 : \ + ((n) <= 911) ? 911 : \ + ((n) <= 919) ? 919 : \ + ((n) <= 929) ? 929 : \ + ((n) <= 937) ? 937 : \ + ((n) <= 941) ? 941 : \ + ((n) <= 947) ? 947 : \ + ((n) <= 953) ? 953 : \ + ((n) <= 967) ? 967 : \ + ((n) <= 971) ? 971 : \ + ((n) <= 977) ? 977 : \ + ((n) <= 983) ? 983 : \ + ((n) <= 991) ? 991 : \ + ((n) <= 997) ? 997 : \ + ((n) <= 1009) ? 1009 : \ + (n)) + +#define ROWS_IN_WORK_ITEM (LOCAL_WORK_SIZE / THREADS_PER_ROW ) +#define ROWS_IN_WORK_ITEM_SOLS (LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +typedef union { + struct { + uint i; + uint xi[6]; + uint padding; + } slot; + uint8 ui8; + uint4 ui4[2]; + uint2 ui2[4]; + uint ui[8]; +} slot_t; + +#if THREADS_PER_WRITE != 1 +typedef __global slot_t *global_pointer_to_slot_t; +#endif + + +/* +** The new hash table has this layout (length in bytes in parens): +** +** round 0, table 0: i(4) pad(0) Xi(24) pad(4) +** round 1, table 1: i(4) pad(3) Xi(20) pad(5) +** round 2, table 2: i(4) pad(0) Xi(19) pad(9) +** round 3, table 3: i(4) pad(3) Xi(15) pad(10) +** round 4, table 4: i(4) pad(0) Xi(14) pad(14) +** round 5, table 5: i(4) pad(3) Xi(10) pad(15) +** round 6, table 6: i(4) pad(0) Xi( 9) pad(19) +** round 7, table 7: i(4) pad(3) Xi( 5) pad(20) +** round 8, table 8: i(4) pad(0) Xi( 4) pad(24) +** +*/ + +__constant ulong blake_iv_const[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +/* +** Reset counters in hash table. +*/ +__kernel +void kernel_init_ht(__global char *ht, __global uint *rowCounters) +{ + rowCounters[get_global_id(0)] = 0; +} + +/* +** OBSOLETE +** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they +** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): +** aa aa ab bb bb cc cc cd dd... [round 0] +** -------------------- +** ...ab bb bb cc cc cd dd... [odd round] +** -------------- +** ...cc cc cd dd... [next even round] +** ----- +** Bytes underlined are going to be stored in the slot. Preceding bytes +** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are +** used to compute the row number. +** +** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) +** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble) +** TODO: update lines below with padding nibbles +** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) +** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) +** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) +** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) +** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) +** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) +** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) +** +** Return 0 if successfully stored, or 1 if the row overflowed. 
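+**
+** Callers accumulate this return value to count dropped slots; e.g.
+** kernel_round0 below does, for each stored Xi pair,
+**
+**     dropped += ht_store(0, ht, input * 2, ..., rowCounters);
+**
+** and reports the total in debug[] when ENABLE_DEBUG is defined.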
+*/ + +__global char *get_slot_ptr(__global char *ht, uint round, uint row, uint slot) +{ +#if 1 + return ht + (row * NR_SLOTS + slot) * ADJUSTED_SLOT_LEN(round); +#else + return ht + (slot * NR_ROWS + row) * ADJUSTED_SLOT_LEN(round); +#endif +} + +__global char *get_xi_ptr(__global char *ht, uint round, uint row, uint slot) +{ + return get_slot_ptr(ht, round, row, slot) + xi_offset_for_round(round); +} + +void get_row_counters_index(uint *rowIdx, uint *rowOffset, uint row) +{ + *rowIdx = row / ROWS_PER_UINT; + *rowOffset = BITS_PER_ROW * (row % ROWS_PER_UINT); +} + +uint get_row(uint round, uint xi0) +{ + uint row; +#if NR_ROWS_LOG == 12 + if (!(round % 2)) + row = (xi0 & 0xfff); + else + row = ((xi0 & 0x0f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 13 + if (!(round % 2)) + row = (xi0 & 0x1fff); + else + row = ((xi0 & 0x1f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 14 + if (!(round % 2)) + row = (xi0 & 0x3fff); + else + row = ((xi0 & 0x3f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 15 + if (!(round % 2)) + row = (xi0 & 0x7fff); + else + row = ((xi0 & 0x7f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#elif NR_ROWS_LOG == 16 + if (!(round % 2)) + row = (xi0 & 0xffff); + else + row = ((xi0 & 0xff0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); +#else +#error "unsupported NR_ROWS_LOG" +#endif + return row; +} + +uint inc_row_counter(__global uint *rowCounters, uint row) +{ + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, row); + uint cnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); + cnt = (cnt >> rowOffset) & ROW_MASK; + if (cnt >= NR_SLOTS) { + // avoid overflows + atomic_sub(rowCounters + rowIdx, 1 << rowOffset); + } + return cnt; +} + +uint ht_store(uint round, __global char *ht, uint i, + uint xi0, uint xi1, uint xi2, uint xi3, uint xi4, uint xi5, uint xi6, __global uint *rowCounters) +{ + uint row = get_row(round, xi0); + uint cnt = inc_row_counter(rowCounters, row); + if (cnt >= NR_SLOTS) + return 1; + __global char *p = get_slot_ptr(ht, round, row, cnt); + slot_t slot; + slot.slot.i = i; + slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8)); + slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8)); + slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8)); + slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8)); + slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8)); + slot.slot.xi[5] = ((xi6 << 24) | (xi5 >> 8)); + if (round <= 5) { + *(__global uint8 *)p = slot.ui8; + } + else { + *(__global uint4 *)p = slot.ui4[0]; + } + return 0; +} + +#define mix(va, vb, vc, vd, x, y) \ + va = (va + vb + x); \ +vd = rotate((vd ^ va), (ulong)64 - 32); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 24); \ +va = (va + vb + y); \ +vd = rotate((vd ^ va), (ulong)64 - 16); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 63); + +/* +** Execute round 0 (blake). 
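+**
+** Each input index i contributes ZCASH_HASH_LEN = 50 bytes of BLAKE2b
+** output, i.e. two 25-byte Xi values, which the two ht_store() calls at the
+** end of this kernel store under inputs 2*i and 2*i+1.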
+** +** Note: making the work group size less than or equal to the wavefront size +** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local +** Memory (LDS) Optimization 2-10" in: +** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ +*/ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) +void kernel_round0(__constant ulong *blake_state_const, __global char *ht, + __global uint *rowCounters, __global uint *debug) +{ + __local ulong blake_state[64]; + __local ulong blake_iv[8]; + uint tid = get_global_id(0); + ulong v[16]; + uint inputs_per_thread = NR_INPUTS / get_global_size(0); + uint input = tid * inputs_per_thread; + uint input_end = (tid + 1) * inputs_per_thread; + uint dropped = 0; + if (get_local_id(0) < 64) + blake_state[get_local_id(0)] = blake_state_const[get_local_id(0)]; + if (get_local_id(0) < 8) + blake_iv[get_local_id(0)] = blake_iv_const[get_local_id(0)]; + barrier(CLK_LOCAL_MEM_FENCE); + while (input < input_end) { + // shift "i" to occupy the high 32 bits of the second ulong word in the + // message block + ulong word1 = (ulong)input << 32; + // init vector v + v[0] = blake_state[0]; + v[1] = blake_state[1]; + v[2] = blake_state[2]; + v[3] = blake_state[3]; + v[4] = blake_state[4]; + v[5] = blake_state[5]; + v[6] = blake_state[6]; + v[7] = blake_state[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // mix in length of data + v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + + // round 1 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 2 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 3 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, word1); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 4 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, word1); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 5 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, word1); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 6 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + 
mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], word1, 0); + // round 7 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], word1, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 8 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, word1); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 9 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], word1, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 10 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], word1, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 11 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 12 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + + // compress v into the blake state; this produces the 50-byte hash + // (two Xi values) + ulong h[7]; + h[0] = blake_state[0] ^ v[0] ^ v[8]; + h[1] = blake_state[1] ^ v[1] ^ v[9]; + h[2] = blake_state[2] ^ v[2] ^ v[10]; + h[3] = blake_state[3] ^ v[3] ^ v[11]; + h[4] = blake_state[4] ^ v[4] ^ v[12]; + h[5] = blake_state[5] ^ v[5] ^ v[13]; + h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; + + // store the two Xi values in the hash table +#if ZCASH_HASH_LEN == 50 + dropped += ht_store(0, ht, input * 2, + h[0] & 0xffffffff, h[0] >> 32, + h[1] & 0xffffffff, h[1] >> 32, + h[2] & 0xffffffff, h[2] >> 32, + h[3] & 0xffffffff, + rowCounters); + dropped += ht_store(0, ht, input * 2 + 1, + ((h[3] >> 8) | (h[4] << (64 - 8))) & 0xffffffff, + ((h[3] >> 8) | (h[4] << (64 - 8))) >> 32, + ((h[4] >> 8) | (h[5] << (64 - 8))) & 0xffffffff, + ((h[4] >> 8) | (h[5] << (64 - 8))) >> 32, + ((h[5] >> 8) | (h[6] << (64 - 8))) & 0xffffffff, + ((h[5] >> 8) | (h[6] << (64 - 8))) >> 32, + (h[6] >> 8) & 0xffffffff, + rowCounters); +#else +#error "unsupported ZCASH_HASH_LEN" +#endif + + input++; + } +#ifdef ENABLE_DEBUG + debug[tid * 2] = 0; + debug[tid * 2 + 1] = dropped; +#endif +} + +/* +** XOR a pair of Xi values computed at "round - 1" and store the result in the +** hash table being built for "round". 
Note that when building the table for +** even rounds we need to skip 1 padding byte present in the "round - 1" table +** (the "0xAB" byte mentioned in the description at the top of this file.) But +** also note we can't load data directly past this byte because this would +** cause an unaligned memory access which is undefined per the OpenCL spec. +** +** Return 0 if successfully stored, or 1 if the row overflowed. +*/ +// single-thread reads, parallel writes +uint xor_and_store(uint round, __global char *ht_dst, uint row, + uint slot_a, uint slot_b, __local uint *ai, __local uint *bi, + __global uint *rowCounters +#if THREADS_PER_WRITE != 1 + , __local slot_t *slot_write_cache, + __local global_pointer_to_slot_t *slot_ptrs +#endif +) { + uint ret = 0; + uint xi0, xi1, xi2, xi3, xi4, xi5; + uint thread_index = get_local_id(0) % THREADS_PER_WRITE; + +#if NR_ROWS_LOG < 8 && NR_ROWS_LOG > 20 +#error "unsupported NR_ROWS_LOG" +#endif + slot_t slot; + __global slot_t *p = 0; +#if THREADS_PER_WRITE != 1 + slot_ptrs[get_local_id(0)] = 0; + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (ai && bi) { + xi0 = *(ai++); + xi1 = *(ai++); + if (round <= 7) xi2 = *(ai++); + if (round <= 6) xi3 = *(ai++); + if (round <= 4) xi4 = *(ai++); + if (round <= 2) xi5 = *ai; + + xi0 ^= *(bi++); + xi1 ^= *(bi++); + if (round <= 7) xi2 ^= *(bi++); + if (round <= 6) xi3 ^= *(bi++); + if (round <= 4) xi4 ^= *(bi++); + if (round <= 2) xi5 ^= *bi; + + if (!(round & 0x1)) { + // skip padding bytes + xi0 = (xi0 >> 24) | (xi1 << (32 - 24)); + xi1 = (xi1 >> 24) | (xi2 << (32 - 24)); + if (round <= 7) xi2 = (xi2 >> 24) | (xi3 << (32 - 24)); + if (round <= 6) xi3 = (xi3 >> 24) | (xi4 << (32 - 24)); + if (round <= 4) xi4 = (xi4 >> 24) | (xi5 << (32 - 24)); + if (round <= 2) xi5 = (xi5 >> 24); + } + + // invalid solutions (which start happenning in round 5) have duplicate + // inputs and xor to zero, so discard them + if (xi0 || xi1) { + uint new_row = get_row(round, xi0); + uint new_slot_index = inc_row_counter(rowCounters, new_row); + if (new_slot_index >= NR_SLOTS) { + ret = 1; + } + else { +#if THREADS_PER_WRITE == 1 + p +#else + slot_ptrs[get_local_id(0)] +#endif + = (__global slot_t *)get_slot_ptr(ht_dst, round, new_row, new_slot_index); + } + } + } + +#if THREADS_PER_WRITE == 1 + if (p) { + slot.slot.i = ENCODE_INPUTS(row, slot_a, slot_b); + slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8)); + slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8)); + slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8)); + slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8)); + slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8)); + slot.slot.xi[5] = ((xi5 >> 8)); + if (round <= 5) + *(__global uint8 *)p = slot.ui8; + else + *(__global uint4 *)p = slot.ui4[0]; + } +#else + barrier(CLK_LOCAL_MEM_FENCE); + if (slot_ptrs[get_local_id(0)]) { + slot_write_cache[get_local_id(0)].slot.i = ENCODE_INPUTS(row, slot_a, slot_b); + slot_write_cache[get_local_id(0)].slot.xi[0] = ((xi1 << 24) | (xi0 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[1] = ((xi2 << 24) | (xi1 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[2] = ((xi3 << 24) | (xi2 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[3] = ((xi4 << 24) | (xi3 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[4] = ((xi5 << 24) | (xi4 >> 8)); + slot_write_cache[get_local_id(0)].slot.xi[5] = ((xi5 >> 8)); + } + barrier(CLK_LOCAL_MEM_FENCE); + uint local_id_base = get_local_id(0) - get_local_id(0) % THREADS_PER_WRITE; + for (uint write_index = local_id_base; write_index < local_id_base + THREADS_PER_WRITE; ++write_index) { + if 
(slot_ptrs[write_index]) { +#if THREADS_PER_WRITE == 2 + * ((__global uint4 *)slot_ptrs[write_index] + thread_index) = slot_write_cache[write_index].ui4[thread_index]; +#elif THREADS_PER_WRITE == 4 + * ((__global uint2 *)slot_ptrs[write_index] + thread_index) = slot_write_cache[write_index].ui2[thread_index]; +#elif THREADS_PER_WRITE == 8 + * ((__global uint *)slot_ptrs[write_index] + thread_index) = slot_write_cache[write_index].ui[thread_index]; +#else +#error "unsupported THREADS_PER_WRITE" +#endif + } + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + return ret; +} + +/* +** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, +** store them in ht_dst. +*/ + +#define UINTS_IN_XI(round) (((round) == 0) ? 6 : \ + ((round) == 1) ? 6 : \ + ((round) == 2) ? 5 : \ + ((round) == 3) ? 5 : \ + ((round) == 4) ? 4 : \ + ((round) == 5) ? 4 : \ + ((round) == 6) ? 3 : \ + ((round) == 7) ? 2 : \ + 1) + +#define RESERVED_FOR_XI(round) (((round) == 0) ? 6 : \ + ((round) == 1) ? 6 : \ + ((round) == 2) ? 6 : \ + ((round) == 3) ? 6 : \ + ((round) == 4) ? 4 : \ + ((round) == 5) ? 4 : \ + ((round) == 6) ? 4 : \ + ((round) == 7) ? 2 : \ + 2) + +void equihash_round(uint round, + __global char *ht_src, + __global char *ht_dst, + __global uint *debug, + __local uint *slot_cache, + __local uint *collisionsData, + __local uint *collisionsNum, + __global uint *rowCountersSrc, + __global uint *rowCountersDst, + uint threadsPerRow, + __local uint *nr_slots_array, + __local uchar *bins_data, + __local uint *bin_counters_data) +{ + uint globalTid = get_global_id(0) / threadsPerRow; + uint localTid = get_local_id(0) / threadsPerRow; + uint localGroupId = get_local_id(0) % threadsPerRow; + __global char *p; + uint i, j; + uint dropped_coll = 0; + uint dropped_stor = 0; + __local uint *a, *b; + // the mask is also computed to read data from the previous round +#define BIN_MASK(round) ((((round) + 1) % 2) ? 0xf000 : 0xf0000) +#define BIN_MASK_OFFSET(round) ((((round) + 1) % 2) ? 3 * 4 : 4 * 4) +#if NR_ROWS_LOG == 12 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x00f0 : 0xf000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 0 : 8) +#elif NR_ROWS_LOG == 13 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x00e0 : 0xe000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 1 : 9) +#elif NR_ROWS_LOG == 14 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x00c0 : 0xc000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 2 : 10) +#elif NR_ROWS_LOG == 15 +#define BIN_MASK2(round) ((((round) + 1) % 2) ? 0x0080 : 0x8000) +#define BIN_MASK2_OFFSET(round) ((((round) + 1) % 2) ? 
3 : 11) +#elif NR_ROWS_LOG == 16 +#define BIN_MASK2(round) 0 +#define BIN_MASK2_OFFSET(round) 0 +#else +#error "unsupported NR_ROWS_LOG" +#endif +#define NR_BINS (256 >> (NR_ROWS_LOG - 12)) + __local uchar *bins = &bins_data[localTid * BIN_SIZE * NR_BINS]; + __local uint *bin_counters = &bin_counters_data[localTid * NR_BINS]; +#if THREADS_PER_WRITE != 1 + __local slot_t slot_write_cache[LOCAL_WORK_SIZE]; + __local global_pointer_to_slot_t slot_ptrs[LOCAL_WORK_SIZE]; +#endif + + uint rows_per_work_item = (NR_ROWS + get_global_size(0) / threadsPerRow - 1) / (get_global_size(0) / threadsPerRow); + uint rows_per_chunk = get_global_size(0) / threadsPerRow; + + for (uint chunk = 0; chunk < rows_per_work_item; chunk++) { + uint cnt = 0; + uint tid = globalTid + rows_per_chunk * chunk; + uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); + + if (!get_local_id(0)) + *collisionsNum = 0; + for (i = localGroupId; i < NR_BINS; i += threadsPerRow) + bin_counters[i] = 0; + if (tid < NR_ROWS && localGroupId == 0) { + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, tid); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + nr_slots_array[localTid] = cnt; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS && localGroupId) { + cnt = nr_slots_array[localTid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Perform a radix sort as slots get loaded into LDS. + uint xi_first_bytes; + uint bin_to_use; + uint bin_counter_copy; + // Make sure all the work items in the work group enter the loop. + uint i_max = cnt + (get_local_size(0) - cnt % get_local_size(0)) - 1; + for (i = localGroupId; i < i_max; i += threadsPerRow) { + if (tid < NR_ROWS && i < cnt) { + xi_first_bytes = *(__global uint *)get_xi_ptr(ht_src, round - 1, tid, i); + slot_cache[(localTid * NR_SLOTS + i) * RESERVED_FOR_XI(round - 1)] = xi_first_bytes; + for (j = 1; j < UINTS_IN_XI(round - 1); ++j) + slot_cache[(localTid * NR_SLOTS + i) * RESERVED_FOR_XI(round - 1) + j] = *((__global uint *)get_xi_ptr(ht_src, round - 1, tid, i) + j); + + bin_to_use = + ((xi_first_bytes & BIN_MASK(round - 1)) >> BIN_MASK_OFFSET(round - 1)) + | ((xi_first_bytes & BIN_MASK2(round - 1)) >> BIN_MASK2_OFFSET(round - 1)); + bin_counter_copy = atomic_inc(&bin_counters[bin_to_use]); + if (bin_counter_copy >= BIN_SIZE) { + atomic_dec(&bin_counters[bin_to_use]); + ++dropped_coll; + } + else { + bins[bin_to_use * BIN_SIZE + bin_counter_copy] = i; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < NR_ROWS && i < cnt) { + for (j = 0; j < bin_counter_copy; ++j) { + uint index = atomic_inc(collisionsNum); + if (index >= LDS_COLL_SIZE) { + atomic_dec(collisionsNum); + ++dropped_coll; + } + else { + collisionsData[index] = (localTid << 24) | (i << 12) | bins[bin_to_use * BIN_SIZE + j]; + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = *collisionsNum; + // Make sure all the work items in the work group enter and leave the loop at the same time. 
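+		// Illustration of the round-up: with LOCAL_WORK_SIZE = 64 and, say,
+		// totalCollisions = 5, max_index = 5 + (64 - 5 % 64) - 1 = 63, so every
+		// work item performs exactly one (mostly idle) pass and all of them
+		// reach the barrier inside the loop.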
+ uint max_index = totalCollisions + (LOCAL_WORK_SIZE - totalCollisions % LOCAL_WORK_SIZE) - 1; + for (uint index = get_local_id(0); index <= max_index; index += LOCAL_WORK_SIZE) { + uint collision, collisionLocalThreadId, collisionThreadId; + uint i, j, slot_cache_index_i, slot_cache_index_j; + a = 0; + b = 0; + + if (tid < NR_ROWS && index < totalCollisions) { + collision = collisionsData[index]; + collisionLocalThreadId = collision >> 24; + collisionThreadId = gid + collisionLocalThreadId; + i = (collision >> 12) & 0xfff; + j = collision & 0xfff; + a = (__local uint *)&slot_cache[(collisionLocalThreadId * NR_SLOTS + i) * RESERVED_FOR_XI(round - 1)]; + b = (__local uint *)&slot_cache[(collisionLocalThreadId * NR_SLOTS + j) * RESERVED_FOR_XI(round - 1)]; + } + + dropped_stor += xor_and_store( + round, + ht_dst, + collisionThreadId, + i, j, + a, b, + rowCountersDst +#if THREADS_PER_WRITE == 1 + ); +#else + , slot_write_cache, slot_ptrs); +#endif + barrier(CLK_LOCAL_MEM_FENCE); + } + } + +#ifdef ENABLE_DEBUG + uint tid = get_global_id(0); + debug[tid * 2] = dropped_coll; + debug[tid * 2 + 1] = dropped_stor; +#endif +} + +/* +** This defines kernel_round1, kernel_round2, ..., kernel_round7. +*/ +#define KERNEL_ROUND(N) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) \ +void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ + __global uint *rowCountersSrc, __global uint *rowCountersDst, \ + __global uint *debug) \ +{ \ + __local uint slot_cache[NEXT_PRIME_NO(RESERVED_FOR_XI(N - 1) * SLOT_CACHE_SIZE)]; \ + __local uint collisionsData[NEXT_PRIME_NO(LDS_COLL_SIZE)]; \ + __local uint collisionsNum; \ + __local uint nr_slots_array[NEXT_PRIME_NO(LOCAL_WORK_SIZE / THREADS_PER_ROW)]; \ + __local uchar bins_data[NEXT_PRIME_NO((LOCAL_WORK_SIZE / THREADS_PER_ROW) * BIN_SIZE * NR_BINS)]; \ + __local uint bin_counters_data[NEXT_PRIME_NO((LOCAL_WORK_SIZE / THREADS_PER_ROW) * NR_BINS)]; \ + equihash_round(N, ht_src, ht_dst, debug, slot_cache, collisionsData, \ + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW, nr_slots_array, bins_data, bin_counters_data); \ +} +KERNEL_ROUND(1) +KERNEL_ROUND(2) +KERNEL_ROUND(3) +KERNEL_ROUND(4) +KERNEL_ROUND(5) +KERNEL_ROUND(6) +KERNEL_ROUND(7) +KERNEL_ROUND(8) + +uint expand_ref(__global char *ht, uint round, uint row, uint slot) +{ + return ((__global slot_t *)get_slot_ptr(ht, round, row, slot))->slot.i; +} + +/* +** Expand references to inputs. Return 1 if so far the solution appears valid, +** or 0 otherwise (an invalid solution would be a solution with duplicate +** inputs, which can be detected at the last step: round == 0). +*/ +uint expand_refs(__local uint *ins, uint nr_inputs, __global char **htabs, + uint round) +{ + __global char *ht = htabs[round]; + uint i = nr_inputs - 1; + uint j = nr_inputs * 2 - 1; + int dup_to_watch = -1; + do { + ins[j] = expand_ref(ht, round, + DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); + ins[j - 1] = expand_ref(ht, round, + DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); + if (!round) { + if (dup_to_watch == -1) + dup_to_watch = ins[j]; + else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) + return 0; + } + if (!i) + break; + i--; + j -= 2; + } while (1); + return 1; +} + +/* +** Verify if a potential solution is in fact valid. 
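+** Starting from the two references found at the last round, expand_refs()
+** is applied PARAM_K - 1 times, doubling nr_values each time:
+** 2 -> 4 -> 8 -> ... -> 1 << PARAM_K = 512 leaf inputs for the standard
+** (200,9) parameters.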
+*/ +void potential_sol(__global char **htabs, __global sols_t *sols, + uint ref0, uint ref1, __local uint *values_tmp) +{ + uint nr_values; + uint sol_i; + uint i; + nr_values = 0; + values_tmp[nr_values++] = ref0; + values_tmp[nr_values++] = ref1; + uint round = PARAM_K - 1; + do { + round--; + if (!expand_refs(values_tmp, nr_values, htabs, round)) + return; + nr_values *= 2; + } while (round > 0); + // solution appears valid, copy it to sols + sol_i = atomic_inc(&sols->nr); + if (sol_i >= MAX_SOLS) + return; + for (i = 0; i < (1 << PARAM_K); i++) + sols->values[sol_i][i] = values_tmp[i]; + sols->valid[sol_i] = 1; +} + +/* +** Scan the hash tables to find Equihash solutions. +*/ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_SOLS, 1, 1))) +void kernel_sols(__global char *ht0, + __global char *ht1, + __global char *ht2, + __global char *ht3, + __global char *ht4, + __global char *ht5, + __global char *ht6, + __global char *ht7, + __global char *ht8, + __global sols_t *sols, + __global uint *rowCountersSrc) +{ + __local uint refs[NEXT_PRIME_NO(NR_SLOTS*(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS))]; + __local uint data[NEXT_PRIME_NO(NR_SLOTS*(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS))]; + __local uint values_tmp[NEXT_PRIME_NO(1 << PARAM_K)]; + __local uint semaphoe; + + uint globalTid = get_global_id(0) / THREADS_PER_ROW_SOLS; + uint localTid = get_local_id(0) / THREADS_PER_ROW_SOLS; + uint localGroupId = get_local_id(0) % THREADS_PER_ROW_SOLS; + __local uint *refsPtr = &refs[NR_SLOTS*localTid]; + __local uint *dataPtr = &data[NR_SLOTS*localTid]; + + __global char *htabs[] = { ht0, ht1, ht2, ht3, ht4, ht5, ht6, ht7, ht8 }; + uint ht_i = (PARAM_K - 1); // table filled at last round + uint cnt; + uint i, j; + __global char *p; + uint ref_i, ref_j; + __local uchar bins_data[(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) * NR_SLOTS * NR_BINS]; + __local uint bin_counters_data[(LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) * NR_BINS]; + __local uchar *bins = &bins_data[localTid * NR_SLOTS * NR_BINS]; + __local uint *bin_counters = &bin_counters_data[localTid * NR_BINS]; + + if (!get_global_id(0)) + sols->nr = sols->likely_invalids = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + uint rows_per_work_item = (NR_ROWS + get_global_size(0) / THREADS_PER_ROW_SOLS - 1) / (get_global_size(0) / THREADS_PER_ROW_SOLS); + uint rows_per_chunk = get_global_size(0) / THREADS_PER_ROW_SOLS; + + for (uint chunk = 0; chunk < rows_per_work_item; chunk++) { + uint tid = globalTid + rows_per_chunk * chunk; + uint gid = tid & ~(get_local_size(0) / THREADS_PER_ROW_SOLS - 1); + + __local uint nr_slots_array[LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS]; + if (tid < NR_ROWS) { + if (!get_local_id(0)) + semaphoe = 0; + for (i = localGroupId; i < NR_BINS; i += THREADS_PER_ROW_SOLS) + bin_counters[i] = 0; + if (localGroupId == 0) { + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, tid); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + nr_slots_array[localTid] = cnt; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (localGroupId) + cnt = nr_slots_array[localTid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
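+		// A single 32-bit compare of the stored Xi word is what selects
+		// candidate pairs below; occasional false positives are acceptable
+		// because every candidate is re-checked afterwards (duplicate-input
+		// test in expand_refs() and host-side verify_sol()).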
+ __local ulong coll; + if (tid < NR_ROWS) { + for (i = localGroupId; i < cnt && !semaphoe; i += THREADS_PER_ROW_SOLS) { + p = get_slot_ptr(htabs[ht_i], PARAM_K - 1, tid, i); + refsPtr[i] = ((__global slot_t *)p)->slot.i; + uint xi_first_bytes = dataPtr[i] = ((__global slot_t *)p)->slot.xi[0]; + uint bin_to_use = + ((xi_first_bytes & BIN_MASK(PARAM_K - 1)) >> BIN_MASK_OFFSET(PARAM_K - 1)) + | ((xi_first_bytes & BIN_MASK2(PARAM_K - 1)) >> BIN_MASK2_OFFSET(PARAM_K - 1)); + uint bin_counter_copy = atomic_inc(&bin_counters[bin_to_use]); + bins[bin_to_use * NR_SLOTS + bin_counter_copy] = i; + if (bin_counter_copy) { + for (j = 0; j < bin_counter_copy && !semaphoe; ++j) { + uint slot_index_j = bins[bin_to_use * NR_SLOTS + j]; + if (xi_first_bytes == dataPtr[slot_index_j]) { + if (atomic_inc(&semaphoe) == 0) + coll = ((ulong)refsPtr[i] << 32) | refsPtr[slot_index_j]; + } + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < NR_ROWS) { + if (get_local_id(0) == 0 && semaphoe) + potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff, values_tmp); + } + } +} \ No newline at end of file diff --git a/ocl_gatelessgate/gettimeofday.h b/ocl_gatelessgate/gettimeofday.h new file mode 100644 index 000000000..0af4feffb --- /dev/null +++ b/ocl_gatelessgate/gettimeofday.h @@ -0,0 +1,43 @@ +// Gateless Gate, a Zcash miner +// Copyright 2016 zawawa @ bitcointalk.org +// +// The initial version of this software was based on: +// SILENTARMY v5 +// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil +// +// This program is free software : you can redistribute it and / or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program.If not, see . 
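+
+// Background for the EPOCH constant used below: Windows FILETIME counts
+// 100-nanosecond ticks since 1601-01-01, while the Unix epoch starts at
+// 1970-01-01. The offset between the two is 11644473600 seconds, i.e.
+// 11644473600 * 10^7 = 116444736000000000 ticks, and dividing ticks by
+// 10,000,000 converts them back to whole seconds.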
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <time.h>
+#include <stdint.h> // portable: uint64_t   MSVC: __int64
+
+inline int gettimeofday(struct timeval * tp, struct timezone * tzp)
+{
+    // Note: some broken versions only have 8 trailing zeros; the correct epoch has 9 trailing zeros
+    static const uint64_t EPOCH = ((uint64_t)116444736000000000ULL);
+
+    SYSTEMTIME system_time;
+    FILETIME file_time;
+    uint64_t time;
+
+    GetSystemTime(&system_time);
+    SystemTimeToFileTime(&system_time, &file_time);
+    time = ((uint64_t)file_time.dwLowDateTime);
+    time += ((uint64_t)file_time.dwHighDateTime) << 32;
+
+    tp->tv_sec = (long)((time - EPOCH) / 10000000L);
+    tp->tv_usec = (long)(system_time.wMilliseconds * 1000);
+    return 0;
+}
diff --git a/ocl_gatelessgate/ocl_gatelessgate.cpp b/ocl_gatelessgate/ocl_gatelessgate.cpp
new file mode 100644
index 000000000..9830ebfdb
--- /dev/null
+++ b/ocl_gatelessgate/ocl_gatelessgate.cpp
@@ -0,0 +1,912 @@
+#include "ocl_gatelessgate.hpp"
+
+#pragma comment(lib, "winmm.lib")
+#define _CRT_RAND_S
+
+
+//#define _CRT_SECURE_NO_WARNINGS
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+//#include
+#include
+#include
+//#include
+//#include
+#include
+
+#include
+
+#include
+
+#include "gettimeofday.h"
+#include
+
+#include
+using namespace blake;
+#include
+
+#include
+
+typedef uint8_t uchar;
+typedef uint32_t uint;
+typedef uint64_t ulong;
+#include "param.h"
+
+#define MIN(A, B) (((A) < (B)) ? (A) : (B))
+#define MAX(A, B) (((A) > (B)) ? (A) : (B))
+
+#define WN PARAM_N
+#define WK PARAM_K
+
+#define COLLISION_BIT_LENGTH (WN / (WK+1))
+#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8)
+#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK)))
+
+#define NDIGITS (WK+1)
+#define DIGITBITS (WN/(NDIGITS))
+#define PROOFSIZE (1u<<WK)
+#define COMPRESSED_PROOFSIZE ((COLLISION_BIT_LENGTH+1)*PROOFSIZE*4/(8*sizeof(uint32_t)))
+
+int verbose = 0;
+uint32_t show_encoded = 0;
+static struct timeval kern_avg_run_time;
+
+static char *s_hexdump(const void *_a, uint32_t a_len)
+{
+    const uint8_t *a = (const uint8_t *)_a;
+    static char buf[1024];
+    uint32_t i;
+    for (i = 0; i < a_len && i + 2 < sizeof(buf); i++)
+        sprintf(buf + i * 2, "%02x", a[i]);
+    buf[i * 2] = 0;
+    return buf;
+}
+
+static uint8_t hex2val(const char *base, size_t off)
+{
+    const char c = base[off];
+    if (c >= '0' && c <= '9') return c - '0';
+    else if (c >= 'a' && c <= 'f') return 10 + c - 'a';
+    else if (c >= 'A' && c <= 'F') return 10 + c - 'A';
+    printf("Invalid hex char at offset %zd: ...%c...\n", off, c);
+    return 0;
+}
+
+static unsigned nr_compute_units(const char *gpu)
+{
+    if (!strcmp(gpu, "rx480")) return 36;
+    fprintf(stderr, "Unknown GPU: %s\n", gpu);
+    return 0;
+}
+
+static void compress(uint8_t *out, uint32_t *inputs, uint32_t n)
+{
+    uint32_t byte_pos = 0;
+    int32_t bits_left = PREFIX + 1;
+    uint8_t x = 0;
+    uint8_t x_bits_used = 0;
+    uint8_t *pOut = out;
+    while (byte_pos < n)
+    {
+        if (bits_left >= 8 - x_bits_used)
+        {
+            x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
+            bits_left -= 8 - x_bits_used;
+            x_bits_used = 8;
+        }
+        else if (bits_left > 0)
+        {
+            uint32_t mask = ~(-1 << (8 - x_bits_used));
+            mask = ((~mask) >> bits_left) & mask;
+            x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
+            x_bits_used += bits_left;
+            bits_left = 0;
+        }
+        else if (bits_left <= 0)
+        {
+            assert(!bits_left);
+            byte_pos++;
+            bits_left = PREFIX + 1;
+        }
+        if (x_bits_used == 8)
+        {
+            *pOut++ = x;
+            x = x_bits_used = 0;
+        }
+    }
+}
+
+static void get_program_build_log(cl_program program, cl_device_id device)
+{
+    cl_int status;
+    char val[2 * 1024 * 1024];
+    size_t ret = 0;
+    status = clGetProgramBuildInfo(program, device,
+        CL_PROGRAM_BUILD_LOG,
+        sizeof(val), // size_t param_value_size
+        &val,        // void *param_value
+        &ret);       // size_t *param_value_size_ret
+    if (status != CL_SUCCESS)
+        printf("clGetProgramBuildInfo (%d)\n", status);
+    fprintf(stderr, "%s\n", val);
+}
+
+static size_t select_work_size_blake(void)
+{
+    size_t work_size =
+        64 * /* thread per wavefront */
+        BLAKE_WPS * /* wavefront
per simd */ + 4 * /* simd per compute unit */ + nr_compute_units("rx480"); + // Make the work group size a multiple of the nr of wavefronts, while + // dividing the number of inputs. This results in the worksize being a + // power of 2. + while (NR_INPUTS % work_size) + work_size += 64; + //debug("Blake: work size %zd\n", work_size); + return work_size; +} + +static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht, cl_mem rowCounters) +{ + size_t global_ws = RC_SIZE / sizeof(cl_uint); + size_t local_ws = 256; + cl_int status; +#if 0 + uint32_t pat = -1; + status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0, + NR_ROWS * NR_SLOTS * SLOT_LEN, + 0, // cl_uint num_events_in_wait_list + NULL, // cl_event *event_wait_list + NULL); // cl_event *event + if (status != CL_SUCCESS) + fatal("clEnqueueFillBuffer (%d)\n", status); +#endif + status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht); + status = clSetKernelArg(k_init_ht, 1, sizeof(rowCounters), &rowCounters); + if (status != CL_SUCCESS) + printf("clSetKernelArg (%d)\n", status); + OCL(clEnqueueNDRangeKernel(queue, k_init_ht, + 1, // cl_uint work_dim + NULL, // size_t *global_work_offset + &global_ws, // size_t *global_work_size + &local_ws, // size_t *local_work_size + 0, // cl_uint num_events_in_wait_list + NULL, // cl_event *event_wait_list + NULL)); // cl_event *event +} + + +/* +** Sort a pair of binary blobs (a, b) which are consecutive in memory and +** occupy a total of 2*len 32-bit words. +** +** a points to the pair +** len number of 32-bit words in each pair +*/ +static void sort_pair(uint32_t *a, uint32_t len) +{ + uint32_t *b = a + len; + uint32_t tmp, need_sorting = 0; + for (uint32_t i = 0; i < len; i++) + if (need_sorting || a[i] > b[i]) + { + need_sorting = 1; + tmp = a[i]; + a[i] = b[i]; + b[i] = tmp; + } + else if (a[i] < b[i]) + return; +} + + +#define SEEN_LEN (1 << (PREFIX + 1)) / 8 + +static uint32_t verify_sol(sols_t *sols, unsigned sol_i) +{ + uint32_t *inputs = sols->values[sol_i]; + //uint32_t seen_len = (1 << (PREFIX + 1)) / 8; + //uint8_t seen[(1 << (PREFIX + 1)) / 8]; + uint8_t seen[SEEN_LEN]; + uint32_t i; + uint8_t tmp; + // look for duplicate inputs + memset(seen, 0, SEEN_LEN); + for (i = 0; i < (1 << PARAM_K); i++) + { + if (inputs[i] / 8 >= SEEN_LEN) + { + printf("Invalid input retrieved from device: %d\n", inputs[i]); + sols->valid[sol_i] = 0; + return 0; + } + tmp = seen[inputs[i] / 8]; + seen[inputs[i] / 8] |= 1 << (inputs[i] & 7); + if (tmp == seen[inputs[i] / 8]) + { + // at least one input value is a duplicate + sols->valid[sol_i] = 0; + return 0; + } + } + // the valid flag is already set by the GPU, but set it again because + // I plan to change the GPU code to not set it + sols->valid[sol_i] = 1; + // sort the pairs in place + for (uint32_t level = 0; level < PARAM_K; level++) + for (i = 0; i < (1 << PARAM_K); i += (2 << level)) + sort_pair(&inputs[i], 1 << level); + return 1; +} + + +static struct timeval time_diff(struct timeval start, struct timeval end) +{ + struct timeval temp; + if ((end.tv_usec - start.tv_usec)<0) { + temp.tv_sec = end.tv_sec - start.tv_sec - 1; + temp.tv_usec = 1000000 + end.tv_usec - start.tv_usec; + } + else { + temp.tv_sec = end.tv_sec - start.tv_sec; + temp.tv_usec = end.tv_usec - start.tv_usec; + } + return temp; +} + +/* +** Write ZCASH_SOL_LEN bytes representing the encoded solution as per the +** Zcash protocol specs (512 x 21-bit inputs). 
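+**
+** (As a sanity check on the sizes involved: 1 << PARAM_K = 512 inputs of
+** PREFIX + 1 = 21 bits each is 10752 bits, i.e. exactly ZCASH_SOL_LEN =
+** 1344 bytes.)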
+**
+** out     ZCASH_SOL_LEN-byte buffer where the solution will be stored
+** inputs  array of 32-bit inputs
+** n       number of elements in array
+*/
+static void store_encoded_sol(uint8_t *out, uint32_t *inputs, uint32_t n)
+{
+    uint32_t byte_pos = 0;
+    int32_t bits_left = PREFIX + 1;
+    uint8_t x = 0;
+    uint8_t x_bits_used = 0;
+    while (byte_pos < n)
+    {
+        if (bits_left >= 8 - x_bits_used)
+        {
+            x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
+            bits_left -= 8 - x_bits_used;
+            x_bits_used = 8;
+        }
+        else if (bits_left > 0)
+        {
+            uint32_t mask = ~(-1 << (8 - x_bits_used));
+            mask = ((~mask) >> bits_left) & mask;
+            x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
+            x_bits_used += bits_left;
+            bits_left = 0;
+        }
+        else if (bits_left <= 0)
+        {
+            assert(!bits_left);
+            byte_pos++;
+            bits_left = PREFIX + 1;
+        }
+        if (x_bits_used == 8)
+        {
+            *out++ = x;
+            x = x_bits_used = 0;
+        }
+    }
+}
+
+/*
+** Compare two 256-bit values interpreted as little-endian 256-bit integers.
+*/
+static int32_t cmp_target_256(void *_a, void *_b)
+{
+    uint8_t *a = static_cast<uint8_t *>(_a);
+    uint8_t *b = static_cast<uint8_t *>(_b);
+    int32_t i;
+    for (i = SHA256_TARGET_LEN - 1; i >= 0; i--)
+        if (a[i] != b[i])
+            return (int32_t)a[i] - b[i];
+    return 0;
+}
+
+/*
+** Verify if the solution's block hash is under the target, and if yes print
+** it formatted as:
+** "sol: <job_id> <ntime> <nonce_rightpart> <solSize+sol>"
+**
+** Return 1 iff the block hash is under the target.
+*/
+uint32_t print_solver_line(uint32_t *values, uint8_t *header,
+    size_t fixed_nonce_bytes, uint8_t *target, char *job_id)
+{
+    uint8_t buffer[ZCASH_BLOCK_HEADER_LEN + ZCASH_SOLSIZE_LEN +
+        ZCASH_SOL_LEN];
+    uint8_t hash0[SHA256_DIGEST_SIZE];
+    uint8_t hash1[SHA256_DIGEST_SIZE];
+    uint8_t *p;
+    p = buffer;
+    memcpy(p, header, ZCASH_BLOCK_HEADER_LEN);
+    p += ZCASH_BLOCK_HEADER_LEN;
+    memcpy(p, "\xfd\x40\x05", ZCASH_SOLSIZE_LEN);
+    p += ZCASH_SOLSIZE_LEN;
+    store_encoded_sol(p, values, 1 << PARAM_K);
+    sha256::Sha256_Onestep(buffer, sizeof(buffer), hash0);
+    sha256::Sha256_Onestep(hash0, sizeof(hash0), hash1);
+    // compare the double SHA256 hash with the target
+    if (cmp_target_256(target, hash1) < 0)
+    {
+        printf("Hash is above target\n");
+        return 0;
+    }
+    printf("Hash is under target\n");
+    printf("sol: %s ", job_id);
+    p = header + ZCASH_BLOCK_OFFSET_NTIME;
+    printf("%02x%02x%02x%02x ", p[0], p[1], p[2], p[3]);
+    printf("%s ", s_hexdump(header + ZCASH_BLOCK_HEADER_LEN - ZCASH_NONCE_LEN +
+        fixed_nonce_bytes, ZCASH_NONCE_LEN - fixed_nonce_bytes));
+    printf("%s%s\n", ZCASH_SOLSIZE_HEX,
+        s_hexdump(buffer + ZCASH_BLOCK_HEADER_LEN + ZCASH_SOLSIZE_LEN,
+            ZCASH_SOL_LEN));
+    fflush(stdout);
+    return 1;
+}
+
+int sol_cmp(const void *_a, const void *_b)
+{
+    const uint32_t *a = static_cast<const uint32_t *>(_a);
+    const uint32_t *b = static_cast<const uint32_t *>(_b);
+    for (uint32_t i = 0; i < (1 << PARAM_K); i++)
+    {
+        if (*a != *b)
+            return (*a < *b) ? -1 : 1;
+        a++;
+        b++;
+    }
+    return 0;
+}
+
+/*
+** Print on stdout a hex representation of the encoded solution as per the
+** zcash protocol specs (512 x 21-bit inputs).
+**
+** inputs  array of 32-bit inputs
+** n       number of elements in array
+*/
+static void print_encoded_sol(uint32_t *inputs, uint32_t n)
+{
+    uint8_t sol[ZCASH_SOL_LEN];
+    uint32_t i;
+    store_encoded_sol(sol, inputs, n);
+    for (i = 0; i < sizeof(sol); i++)
+        printf("%02x", sol[i]);
+    printf("\n");
+    fflush(stdout);
+}
+
+static void print_sol(uint32_t *values, uint64_t *nonce)
+{
+    uint32_t show_n_sols;
+    show_n_sols = (1 << PARAM_K);
+    if (verbose < 2)
+        show_n_sols = MIN(10, show_n_sols);
+    fprintf(stderr, "Soln:");
+    // for brevity, only print "small" nonces
+    if (*nonce < (1ULL << 32))
+        fprintf(stderr, " 0x%" PRIx64 ":", *nonce);
+    for (unsigned i = 0; i < show_n_sols; i++)
+        fprintf(stderr, " %x", values[i]);
+    fprintf(stderr, "%s\n", (show_n_sols != (1 << PARAM_K) ? "..." : ""));
+}
+
+/*
+** Print all solutions.
+**
+** In mining mode, return the number of shares, that is the number of
+** solutions that were under the target.
+*/
+static uint32_t print_sols(sols_t *all_sols, uint64_t *nonce, uint32_t nr_valid_sols,
+    uint8_t *header, size_t fixed_nonce_bytes, uint8_t *target,
+    char *job_id)
+{
+    uint8_t *valid_sols;
+    uint32_t counted;
+    uint32_t shares = 0;
+    valid_sols = static_cast<uint8_t *>(malloc(nr_valid_sols * SOL_SIZE));
+    if (!valid_sols)
+        printf("malloc: %s\n", strerror(errno));
+    counted = 0;
+    for (uint32_t i = 0; i < all_sols->nr; i++)
+        if (all_sols->valid[i])
+        {
+            if (counted >= nr_valid_sols)
+                printf("Bug: more than %d solutions\n", nr_valid_sols);
+            memcpy(valid_sols + counted * SOL_SIZE, all_sols->values[i],
+                SOL_SIZE);
+            counted++;
+        }
+    assert(counted == nr_valid_sols);
+    // sort the solutions amongst each other, to make the solver's output
+    // deterministic and testable
+    qsort(valid_sols, nr_valid_sols, SOL_SIZE, sol_cmp);
+    for (uint32_t i = 0; i < nr_valid_sols; i++)
+    {
+        uint32_t *inputs = (uint32_t *)(valid_sols + i * SOL_SIZE);
+        if (show_encoded)
+            print_encoded_sol(inputs, 1 << PARAM_K);
+        if (verbose)
+            print_sol(inputs, nonce);
+        shares += print_solver_line(inputs, header, fixed_nonce_bytes,
+            target, job_id);
+    }
+    free(valid_sols);
+    return shares;
+}
+
+/*
+** Return the number of valid solutions.
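+**
+** As a side effect this also updates kern_avg_run_time, an exponential
+** moving average of the wall-clock time per solver iteration, computed as
+** 0.70 * old + 0.28 * new (deliberately ~2% below the true average), so the
+** pre-read Sleep() used on non-AMD platforms can shrink again when the
+** kernels speed up.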
+*/
+static uint32_t verify_sols(cl_command_queue queue, cl_mem buf_sols, uint64_t *nonce,
+    uint8_t *header, size_t fixed_nonce_bytes, uint8_t *target,
+    char *job_id, uint32_t *shares, struct timeval *start_time, bool is_amd)
+{
+    sols_t *sols;
+    uint32_t nr_valid_sols;
+    sols = (sols_t *)malloc(sizeof(*sols));
+    if (!sols)
+        printf("malloc: %s\n", strerror(errno));
+#ifdef WIN32
+    timeBeginPeriod(1);
+    DWORD duration = (DWORD)kern_avg_run_time.tv_sec * 1000 + (DWORD)kern_avg_run_time.tv_usec / 1000;
+    if (!is_amd && duration < 1000)
+        Sleep(duration);
+#endif
+    check_clEnqueueReadBuffer(queue, buf_sols,
+        CL_TRUE,        // cl_bool blocking_read
+        0,              // size_t offset
+        sizeof(*sols),  // size_t size
+        sols,           // void *ptr
+        0,              // cl_uint num_events_in_wait_list
+        NULL,           // cl_event *event_wait_list
+        NULL);          // cl_event *event
+    struct timeval curr_time;
+    gettimeofday(&curr_time, NULL);
+
+    struct timeval t_diff = time_diff(*start_time, curr_time);
+
+    double a_diff = t_diff.tv_sec * 1e6 + t_diff.tv_usec;
+    double kern_avg = kern_avg_run_time.tv_sec * 1e6 + kern_avg_run_time.tv_usec;
+    if (kern_avg == 0)
+        kern_avg = a_diff;
+    else
+        kern_avg = kern_avg * 70 / 100 + a_diff * 28 / 100; // kept ~2% below the true
+        // average, so the estimate can drift back down over time
+
+    kern_avg_run_time.tv_sec = (time_t)(kern_avg / 1e6);
+    kern_avg_run_time.tv_usec = ((long)kern_avg) % 1000000;
+
+    if (sols->nr > MAX_SOLS)
+    {
+        fprintf(stderr, "%d (probably invalid) solutions were dropped!\n",
+            sols->nr - MAX_SOLS);
+        sols->nr = MAX_SOLS;
+    }
+    printf("Retrieved %d potential solutions\n", sols->nr);
+    nr_valid_sols = 0;
+    for (unsigned sol_i = 0; sol_i < sols->nr; sol_i++)
+        nr_valid_sols += verify_sol(sols, sol_i);
+    uint32_t sh = print_sols(sols, nonce, nr_valid_sols, header, fixed_nonce_bytes, target, job_id);
+    if (shares)
+        *shares = sh;
+    printf("Stats: %d likely invalids\n", sols->likely_invalids);
+    free(sols);
+    return nr_valid_sols;
+}
+
+
+ocl_gatelessgate::ocl_gatelessgate(int platf_id, int dev_id) {
+    platform_id = platf_id;
+    device_id = dev_id;
+    // TODO
+    threadsNum = 8192;
+    wokrsize = 128; // 256;
+}
+
+std::string ocl_gatelessgate::getdevinfo() {
+    static auto devices = GetAllDevices(platform_id);
+    auto device = devices[device_id];
+    std::vector<char> name(256, 0);
+    size_t nActualSize = 0;
+    std::string gpu_name;
+
+    cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+    gpu_name.assign(&name[0], nActualSize);
+
+    return "GPU_ID( " + gpu_name + ")";
+}
+
+// STATICS START
+int ocl_gatelessgate::getcount() {
+    static auto devices = GetAllDevices();
+    return devices.size();
+}
+
+void ocl_gatelessgate::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) {
+    static auto devices = GetAllDevices(platf_id);
+
+    if (devices.size() <= (size_t)d_id) {
+        return;
+    }
+    auto device = devices[d_id];
+
+    std::vector<char> name(256, 0);
+    cl_uint compute_units = 0;
+
+    size_t nActualSize = 0;
+    cl_int rc = clGetDeviceInfo(device, CL_DEVICE_NAME, name.size(), &name[0], &nActualSize);
+
+    if (rc == CL_SUCCESS) {
+        gpu_name.assign(&name[0], nActualSize);
+    }
+
+    rc = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, &nActualSize);
+    if (rc == CL_SUCCESS) {
+        sm_count = (int)compute_units;
+    }
+
+    memset(&name[0], 0, name.size());
+    rc = clGetDeviceInfo(device, CL_DEVICE_VERSION, name.size(), &name[0], &nActualSize);
+    if (rc == CL_SUCCESS) {
+        version.assign(&name[0], nActualSize);
+    }
+}
+
+
+static bool is_platform_amd(cl_platform_id
platform_id)
+{
+    char name[1024];
+    size_t len = 0;
+    int status;
+    status = clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(name), &name,
+        &len);
+    if (status != CL_SUCCESS)
+        printf("clGetPlatformInfo (%d)\n", status);
+    return strncmp(name, "AMD Accelerated Parallel Processing", len) == 0;
+}
+
+
+void ocl_gatelessgate::start(ocl_gatelessgate& device_context) {
+    /*TODO*/
+    device_context.is_init_success = false;
+    device_context.oclc = new OclGGContext;
+    auto devices = GetAllDevices(device_context.platform_id);
+
+    printf("pid %i, size %zu\n", device_context.platform_id, devices.size());
+    auto device = devices[device_context.device_id];
+
+    size_t nActualSize = 0;
+    cl_platform_id platform_id = nullptr;
+    cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, nullptr);
+
+
+    device_context.oclc->_dev_id = device;
+    device_context.oclc->platform_id = platform_id;
+
+    // context create
+    cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)device_context.oclc->platform_id, 0 };
+    cl_int error;
+    device_context.oclc->_context = clCreateContext(props, 1, &device, 0, 0, &error);
+    //OCLR(error, false);
+    if (cl_int err = error) {
+        printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
+        return;
+    }
+
+    cl_int binstatus;
+
+    device_context.is_amd = is_platform_amd(platform_id);
+
+    char kernelName[64];
+    sprintf(kernelName, "gatelessgate_gpu_%u.bin", (unsigned)device_context.device_id);
+    if (!clCompileKernel(device_context.oclc->_context,
+        device,
+        kernelName,
+        { "zcash/gpu/gatelessgate.cl" },
+        device_context.is_amd ? OPENCL_BUILD_OPTIONS_AMD : OPENCL_BUILD_OPTIONS,
+        &binstatus,
+        &device_context.oclc->_program)) {
+        return;
+    }
+
+    if (binstatus == CL_SUCCESS) {
+        if (!device_context.oclc->init(device, device_context.threadsNum, device_context.wokrsize)) {
+            printf("Init failed\n");
+            return;
+        }
+    }
+    else {
+        printf("GPU %d: failed to load kernel\n", device_context.device_id);
+        return;
+    }
+
+    device_context.is_init_success = true;
+}
+
+#include
+
+void ocl_gatelessgate::stop(ocl_gatelessgate& device_context) {
+    if (device_context.oclc != nullptr) delete device_context.oclc;
+}
+
+void ocl_gatelessgate::solve(const char *tequihash_header,
+    unsigned int tequihash_header_len,
+    const char* nonce,
+    unsigned int nonce_len,
+    std::function<bool()> cancelf,
+    std::function<void(const std::vector<uint32_t>&, size_t, const unsigned char*)> solutionf,
+    std::function<void(void)> hashdonef,
+    ocl_gatelessgate& device_context) {
+
+    uint64_t *nonce_ptr;
+
+    unsigned char context[140];
+    memset(context, 0, 140);
+    memcpy(context, tequihash_header, tequihash_header_len);
+    memcpy(context + tequihash_header_len, nonce, nonce_len);
+
+    OclGGContext *miner = device_context.oclc;
+    clFlush(miner->queue);
+
+    blake2b_state_t initialCtx;
+    zcash_blake2b_init(&initialCtx, ZCASH_HASH_LEN, PARAM_N, PARAM_K);
+    zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0);
+
+    cl_mem buf_blake_st;
+    buf_blake_st = check_clCreateBuffer(miner->_context, CL_MEM_READ_ONLY |
+        CL_MEM_COPY_HOST_PTR, sizeof(blake2b_state_s), &initialCtx);
+
+    cl_uint compute_units;
+    cl_int status = clGetDeviceInfo(miner->_dev_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
+    if (status != CL_SUCCESS)
+        printf("clGetDeviceInfo (%d)\n", status);
+
+    miner->local_work_size = LOCAL_WORK_SIZE;
+
+    for (unsigned round = 0; round < PARAM_K; round++)
+    {
+        init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round & 1], miner->rowCounters[round & 1]);
+        if (!round)
+        {
+            check_clSetKernelArg(miner->k_rounds[round], 0, &buf_blake_st);
+            check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round]);
+            check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[round % 2]);
+            miner->global_ws = select_work_size_blake();
+        }
+        else
+        {
+            check_clSetKernelArg(miner->k_rounds[round], 0, &miner->buf_ht[round - 1]);
+            check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round]);
+            check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[(round - 1) % 2]);
+            check_clSetKernelArg(miner->k_rounds[round], 3, &miner->rowCounters[round % 2]);
+            miner->global_ws = GLOBAL_WORK_SIZE_RATIO * compute_units * LOCAL_WORK_SIZE;
+            if (miner->global_ws > NR_ROWS * THREADS_PER_ROW)
+                miner->global_ws = NR_ROWS * THREADS_PER_ROW;
+        }
+        check_clSetKernelArg(miner->k_rounds[round], round == 0 ? 3 : 4, &miner->buf_dbg);
+        OCL(clEnqueueNDRangeKernel(miner->queue, miner->k_rounds[round], 1, NULL,
+            &miner->global_ws, &miner->local_work_size, 0, NULL, NULL));
+        // cancel function
+        if (cancelf()) return;
+    }
+
+    check_clSetKernelArg(miner->k_sols, 0, &miner->buf_ht[0]);
+    check_clSetKernelArg(miner->k_sols, 1, &miner->buf_ht[1]);
+    check_clSetKernelArg(miner->k_sols, 2, &miner->buf_ht[2]);
+    check_clSetKernelArg(miner->k_sols, 3, &miner->buf_ht[3]);
+    check_clSetKernelArg(miner->k_sols, 4, &miner->buf_ht[4]);
+    check_clSetKernelArg(miner->k_sols, 5, &miner->buf_ht[5]);
+    check_clSetKernelArg(miner->k_sols, 6, &miner->buf_ht[6]);
+    check_clSetKernelArg(miner->k_sols, 7, &miner->buf_ht[7]);
+    check_clSetKernelArg(miner->k_sols, 8, &miner->buf_ht[8]);
+    check_clSetKernelArg(miner->k_sols, 9, &miner->buf_sols);
+    check_clSetKernelArg(miner->k_sols, 10, &miner->rowCounters[0]);
+    miner->global_ws = GLOBAL_WORK_SIZE_RATIO * compute_units * LOCAL_WORK_SIZE_SOLS;
+    if (miner->global_ws > NR_ROWS * THREADS_PER_ROW_SOLS)
+        miner->global_ws = NR_ROWS * THREADS_PER_ROW_SOLS;
+    miner->local_work_size = LOCAL_WORK_SIZE_SOLS;
+    struct timeval start_time;
+    gettimeofday(&start_time, NULL);
+    OCL(clEnqueueNDRangeKernel(miner->queue, miner->k_sols, 1, NULL,
+        &miner->global_ws, &miner->local_work_size, 0, NULL, NULL));
+
+    OCL(clEnqueueReadBuffer(miner->queue, miner->buf_sols,
+        CL_TRUE,                // cl_bool blocking_read
+        0,                      // size_t offset
+        sizeof(*miner->sols),   // size_t size
+        miner->sols,            // void *ptr
+        0,                      // cl_uint num_events_in_wait_list
+        NULL,                   // cl_event *event_wait_list
+        NULL));                 // cl_event *event
+
+    if (miner->sols->nr > MAX_SOLS)
+        miner->sols->nr = MAX_SOLS;
+
+    clReleaseMemObject(buf_blake_st);
+
+    for (unsigned sol_i = 0; sol_i < miner->sols->nr; sol_i++) {
+        verify_sol(miner->sols, sol_i);
+    }
+
+    uint8_t proof[COMPRESSED_PROOFSIZE * 2];
+    for (uint32_t i = 0; i < miner->sols->nr; i++) {
+        if (miner->sols->valid[i]) {
+            compress(proof, (uint32_t *)(miner->sols->values[i]), 1 << PARAM_K);
+            solutionf(std::vector<uint32_t>(0), 1344, proof);
+        }
+    }
+    hashdonef();
+}
+
+// STATICS END
+
diff --git a/ocl_gatelessgate/ocl_gatelessgate.hpp b/ocl_gatelessgate/ocl_gatelessgate.hpp
new file mode 100644
index 000000000..41cf3ca8f
--- /dev/null
+++ b/ocl_gatelessgate/ocl_gatelessgate.hpp
@@ -0,0 +1,58 @@
+#pragma once
+
+#ifdef _LIB
+#define DLL_OCL_GATELESSGATE __declspec(dllexport)
+#else
+#define DLL_OCL_GATELESSGATE
+#endif
+
+// remove after
+#include <cstdint>
+#include <string>
+#include <vector>
+#include <functional>
+
+struct OclGGContext;
+
+struct DLL_OCL_GATELESSGATE ocl_gatelessgate
+{
+    //int threadsperblock;
+    int blocks;
+    int device_id;
+    int platform_id;
+
+    OclGGContext* oclc;
+    // threads
+ unsigned threadsNum; // TMP + unsigned wokrsize; + + bool is_init_success = false; + bool is_amd = false; + + ocl_gatelessgate(int platf_id, int dev_id); + + std::string getdevinfo(); + + static int getcount(); + + static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version); + + static void start(ocl_gatelessgate& device_context); + + static void stop(ocl_gatelessgate& device_context); + + static void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef, + ocl_gatelessgate& device_context); + + std::string getname() { return "OCL_GATELESSGATE"; } + +private: + std::string m_gpu_name; + std::string m_version; +}; diff --git a/ocl_gatelessgate/ocl_gatelessgate.vcxproj b/ocl_gatelessgate/ocl_gatelessgate.vcxproj new file mode 100644 index 000000000..7db9870b9 --- /dev/null +++ b/ocl_gatelessgate/ocl_gatelessgate.vcxproj @@ -0,0 +1,133 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {6ECDB41E-F2BC-4FCF-9411-1DD9CA2A2612} + ocl_gatelessgate + 8.1 + + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + StaticLibrary + true + v120 + MultiByte + + + StaticLibrary + false + v120 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + $(ProjectDir)../contrib/;$(IncludePath) + + + $(ProjectDir)../contrib;$(ProjectDir)../ocl_device_utils;$(AMDAPPSDKROOT)include;$(IncludePath) + + + + Level3 + Disabled + true + + + + + Level3 + Disabled + true + _LIB;%(PreprocessorDefinitions) + + + + + Level3 + MaxSpeed + true + true + true + + + true + true + + + + + Level3 + MaxSpeed + true + true + true + _LIB;%(PreprocessorDefinitions) + + + true + true + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ocl_gatelessgate/ocl_gatelessgate.vcxproj.filters b/ocl_gatelessgate/ocl_gatelessgate.vcxproj.filters new file mode 100644 index 000000000..c9e0a8491 --- /dev/null +++ b/ocl_gatelessgate/ocl_gatelessgate.vcxproj.filters @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ocl_gatelessgate/param.h b/ocl_gatelessgate/param.h new file mode 100644 index 000000000..7f476d982 --- /dev/null +++ b/ocl_gatelessgate/param.h @@ -0,0 +1,373 @@ +// Gateless Gate, a Zcash miner +// Copyright 2016 zawawa @ bitcointalk.org +// +// The initial version of this software was based on: +// SILENTARMY v5 +// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil +// +// This program is free software : you can redistribute it and / or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program.If not, see . + +#pragma once + +#define PARAM_H + +// When you tweak parameters in this file, make sure to uncomment the next line, +// rebuild sa-solver, and run it to see how many slots get dropped +// at each round. Note that performance suffers if too many slots get dropped. 
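+//
+// For orientation: round 0 stores two Xi values per input, i.e.
+// 2 * NR_INPUTS = 2^21 entries spread over NR_ROWS = 2^NR_ROWS_LOG rows.
+// With the defaults below (NR_ROWS_LOG = 14) that is 128 entries per row
+// on average, so NR_SLOTS = 199 leaves roughly 55% headroom before slots
+// start being dropped.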
+ +//#define ENABLE_DEBUG + +// +// Parameters for Hash Tables +// + +// There are PARAM_K - 1 hash tables, and each hash table has NR_ROWS rows. +// Each row contains NR_SLOTS slots. + +#define NR_ROWS_LOG 14 // 12, 13, 14, 15, or 16. 12 and 13 are not practically usable. +#define NR_SLOTS 199 // Prime numbers are preferable. +#define LOCAL_WORK_SIZE 64 +#define THREADS_PER_ROW 64 +#define LOCAL_WORK_SIZE_SOLS 64 +#define THREADS_PER_ROW_SOLS 64 +#define GLOBAL_WORK_SIZE_RATIO 512 // global_work_size = GLOBAL_WORK_SIZE_RATIO * nr_compute_units * LOCAL_WORK_SIZE +#define THREADS_PER_WRITE 1 // 1, 2, 4, or 8 +#define SLOT_CACHE_SIZE (NR_SLOTS * ROWS_IN_WORK_ITEM) +#define LDS_COLL_SIZE (NR_SLOTS * ROWS_IN_WORK_ITEM * 140 / 100) +#define BIN_SIZE (NR_SLOTS * 6 / 100) +#define EXTRA_BITS_FOR_BINS_SOLS 1 +#define BIN_SIZE_SOLS ((BIN_SIZE >> EXTRA_BITS_FOR_BINS_SOLS) * 250 / 100) + + + +#define PARAM_N 200 +#define PARAM_K 9 +#define PREFIX (PARAM_N / (PARAM_K + 1)) +#define NR_INPUTS (1 << PREFIX) +#define NR_ROWS (1 << NR_ROWS_LOG) +// Length of 1 element (slot) in byte +#define SLOT_LEN 32 +// Total size of hash table +#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) +// Length of Zcash block header, nonce (part of header) +#define ZCASH_BLOCK_HEADER_LEN 140 +// Offset of nTime in header +#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) +// Length of nonce +#define ZCASH_NONCE_LEN 32 +// Length of encoded representation of solution size +#define ZCASH_SOLSIZE_LEN 3 +// Solution size (1344 = 0x540) represented as a compact integer, in hex +#define ZCASH_SOLSIZE_HEX "fd4005" +// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) +#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) +// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization +#define N_ZERO_BYTES 12 +// Number of bytes Zcash needs out of Blake +#define ZCASH_HASH_LEN 50 +// Number of wavefronts per SIMD for the Blake kernel. +// Blake is ALU-bound (beside the atomic counter being incremented) so we need +// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer +// instructions. 10 is the max supported by the hw. +#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 11 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot. 
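+**
+** With xi_offset_for_round() fixed at 4, a slot is laid out as a 4-byte
+** reference followed by the (progressively shrinking) Xi. Note that when
+** THREADS_PER_WRITE == 1, ADJUSTED_SLOT_LEN (defined further down) drops
+** from 32 to 16 bytes for rounds 6..8, where Xi fits in 12 bytes or less.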
+*/ +#define xi_offset_for_round(round) 4 + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +#if NR_ROWS_LOG <= 12 && NR_SLOTS <= (1 << 10) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 20) | ((slot1 & 0x3ff) << 10) | (slot0 & 0x3ff)) +#define DECODE_ROW(REF) (REF >> 20) +#define DECODE_SLOT1(REF) ((REF >> 10) & 0x3ff) +#define DECODE_SLOT0(REF) (REF & 0x3ff) + +#elif NR_ROWS_LOG <= 14 && NR_SLOTS <= (1 << 9) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 18) | ((slot1 & 0x1ff) << 9) | (slot0 & 0x1ff)) +#define DECODE_ROW(REF) (REF >> 18) +#define DECODE_SLOT1(REF) ((REF >> 9) & 0x1ff) +#define DECODE_SLOT0(REF) (REF & 0x1ff) + +#elif NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#if THREADS_PER_WRITE == 1 +#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN - 16) +#else +#define ADJUSTED_SLOT_LEN(round) SLOT_LEN +#endif + +#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O5" +#define OPENCL_BUILD_OPTIONS "-I.. -I." + +#define NEXT_PRIME_NO(n) \ + (((n) <= 2) ? 2 : \ + ((n) <= 3) ? 3 : \ + ((n) <= 5) ? 5 : \ + ((n) <= 7) ? 7 : \ + ((n) <= 11) ? 11 : \ + ((n) <= 13) ? 13 : \ + ((n) <= 17) ? 17 : \ + ((n) <= 19) ? 19 : \ + ((n) <= 23) ? 23 : \ + ((n) <= 29) ? 29 : \ + ((n) <= 31) ? 31 : \ + ((n) <= 37) ? 37 : \ + ((n) <= 41) ? 41 : \ + ((n) <= 43) ? 43 : \ + ((n) <= 47) ? 47 : \ + ((n) <= 53) ? 53 : \ + ((n) <= 59) ? 59 : \ + ((n) <= 61) ? 61 : \ + ((n) <= 67) ? 67 : \ + ((n) <= 71) ? 71 : \ + ((n) <= 73) ? 73 : \ + ((n) <= 79) ? 79 : \ + ((n) <= 83) ? 83 : \ + ((n) <= 89) ? 89 : \ + ((n) <= 97) ? 97 : \ + ((n) <= 101) ? 101 : \ + ((n) <= 103) ? 103 : \ + ((n) <= 107) ? 107 : \ + ((n) <= 109) ? 109 : \ + ((n) <= 113) ? 113 : \ + ((n) <= 127) ? 127 : \ + ((n) <= 131) ? 131 : \ + ((n) <= 137) ? 137 : \ + ((n) <= 139) ? 139 : \ + ((n) <= 149) ? 149 : \ + ((n) <= 151) ? 151 : \ + ((n) <= 157) ? 157 : \ + ((n) <= 163) ? 163 : \ + ((n) <= 167) ? 167 : \ + ((n) <= 173) ? 173 : \ + ((n) <= 179) ? 179 : \ + ((n) <= 181) ? 181 : \ + ((n) <= 191) ? 191 : \ + ((n) <= 193) ? 193 : \ + ((n) <= 197) ? 197 : \ + ((n) <= 199) ? 199 : \ + ((n) <= 211) ? 
211 : \ + ((n) <= 223) ? 223 : \ + ((n) <= 227) ? 227 : \ + ((n) <= 229) ? 229 : \ + ((n) <= 233) ? 233 : \ + ((n) <= 239) ? 239 : \ + ((n) <= 241) ? 241 : \ + ((n) <= 251) ? 251 : \ + ((n) <= 257) ? 257 : \ + ((n) <= 263) ? 263 : \ + ((n) <= 269) ? 269 : \ + ((n) <= 271) ? 271 : \ + ((n) <= 277) ? 277 : \ + ((n) <= 281) ? 281 : \ + ((n) <= 283) ? 283 : \ + ((n) <= 293) ? 293 : \ + ((n) <= 307) ? 307 : \ + ((n) <= 311) ? 311 : \ + ((n) <= 313) ? 313 : \ + ((n) <= 317) ? 317 : \ + ((n) <= 331) ? 331 : \ + ((n) <= 337) ? 337 : \ + ((n) <= 347) ? 347 : \ + ((n) <= 349) ? 349 : \ + ((n) <= 353) ? 353 : \ + ((n) <= 359) ? 359 : \ + ((n) <= 367) ? 367 : \ + ((n) <= 373) ? 373 : \ + ((n) <= 379) ? 379 : \ + ((n) <= 383) ? 383 : \ + ((n) <= 389) ? 389 : \ + ((n) <= 397) ? 397 : \ + ((n) <= 401) ? 401 : \ + ((n) <= 409) ? 409 : \ + ((n) <= 419) ? 419 : \ + ((n) <= 421) ? 421 : \ + ((n) <= 431) ? 431 : \ + ((n) <= 433) ? 433 : \ + ((n) <= 439) ? 439 : \ + ((n) <= 443) ? 443 : \ + ((n) <= 449) ? 449 : \ + ((n) <= 457) ? 457 : \ + ((n) <= 461) ? 461 : \ + ((n) <= 463) ? 463 : \ + ((n) <= 467) ? 467 : \ + ((n) <= 479) ? 479 : \ + ((n) <= 487) ? 487 : \ + ((n) <= 491) ? 491 : \ + ((n) <= 499) ? 499 : \ + ((n) <= 503) ? 503 : \ + ((n) <= 509) ? 509 : \ + ((n) <= 521) ? 521 : \ + ((n) <= 523) ? 523 : \ + ((n) <= 541) ? 541 : \ + ((n) <= 547) ? 547 : \ + ((n) <= 557) ? 557 : \ + ((n) <= 563) ? 563 : \ + ((n) <= 569) ? 569 : \ + ((n) <= 571) ? 571 : \ + ((n) <= 577) ? 577 : \ + ((n) <= 587) ? 587 : \ + ((n) <= 593) ? 593 : \ + ((n) <= 599) ? 599 : \ + ((n) <= 601) ? 601 : \ + ((n) <= 607) ? 607 : \ + ((n) <= 613) ? 613 : \ + ((n) <= 617) ? 617 : \ + ((n) <= 619) ? 619 : \ + ((n) <= 631) ? 631 : \ + ((n) <= 641) ? 641 : \ + ((n) <= 643) ? 643 : \ + ((n) <= 647) ? 647 : \ + ((n) <= 653) ? 653 : \ + ((n) <= 659) ? 659 : \ + ((n) <= 661) ? 661 : \ + ((n) <= 673) ? 673 : \ + ((n) <= 677) ? 677 : \ + ((n) <= 683) ? 683 : \ + ((n) <= 691) ? 691 : \ + ((n) <= 701) ? 701 : \ + ((n) <= 709) ? 709 : \ + ((n) <= 719) ? 719 : \ + ((n) <= 727) ? 727 : \ + ((n) <= 733) ? 733 : \ + ((n) <= 739) ? 739 : \ + ((n) <= 743) ? 743 : \ + ((n) <= 751) ? 751 : \ + ((n) <= 757) ? 757 : \ + ((n) <= 761) ? 761 : \ + ((n) <= 769) ? 769 : \ + ((n) <= 773) ? 773 : \ + ((n) <= 787) ? 787 : \ + ((n) <= 797) ? 797 : \ + ((n) <= 809) ? 809 : \ + ((n) <= 811) ? 811 : \ + ((n) <= 821) ? 821 : \ + ((n) <= 823) ? 823 : \ + ((n) <= 827) ? 827 : \ + ((n) <= 829) ? 829 : \ + ((n) <= 839) ? 839 : \ + ((n) <= 853) ? 853 : \ + ((n) <= 857) ? 857 : \ + ((n) <= 859) ? 859 : \ + ((n) <= 863) ? 863 : \ + ((n) <= 877) ? 877 : \ + ((n) <= 881) ? 881 : \ + ((n) <= 883) ? 883 : \ + ((n) <= 887) ? 887 : \ + ((n) <= 907) ? 907 : \ + ((n) <= 911) ? 911 : \ + ((n) <= 919) ? 919 : \ + ((n) <= 929) ? 929 : \ + ((n) <= 937) ? 937 : \ + ((n) <= 941) ? 941 : \ + ((n) <= 947) ? 947 : \ + ((n) <= 953) ? 953 : \ + ((n) <= 967) ? 967 : \ + ((n) <= 971) ? 971 : \ + ((n) <= 977) ? 977 : \ + ((n) <= 983) ? 983 : \ + ((n) <= 991) ? 991 : \ + ((n) <= 997) ? 997 : \ + ((n) <= 1009) ? 
1009 : \ + (n)) + +#define ROWS_IN_WORK_ITEM (LOCAL_WORK_SIZE / THREADS_PER_ROW) +#define ROWS_IN_WORK_ITEM_SOLS (LOCAL_WORK_SIZE_SOLS / THREADS_PER_ROW_SOLS) diff --git a/ocl_gatelessgate/param_nr15.h b/ocl_gatelessgate/param_nr15.h new file mode 100644 index 000000000..fd08ba0e0 --- /dev/null +++ b/ocl_gatelessgate/param_nr15.h @@ -0,0 +1,198 @@ +// Gateless Gate, a Zcash miner +// Copyright 2016 zawawa @ bitcointalk.org +// +// The initial version of this software was based on: +// SILENTARMY v5 +// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +//#define ENABLE_DEBUG + +#define NR_ROWS_LOG 15 +#define NR_SLOTS 120 +#define LOCAL_WORK_SIZE 256 +#define THREADS_PER_ROW 256 +#define LOCAL_WORK_SIZE_SOLS 64 +#define THREADS_PER_ROW_SOLS 64 +#define GLOBAL_WORK_SIZE_RATIO 512 +#define SLOT_CACHE_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE / THREADS_PER_ROW) * 75 / 100) +#define LDS_COLL_SIZE (NR_SLOTS * (LOCAL_WORK_SIZE / THREADS_PER_ROW) * 120 / 100) + +#define SLOT_CACHE_INDEX_TYPE uchar + +#define PARAM_N 200 +#define PARAM_K 9 +#define PREFIX (PARAM_N / (PARAM_K + 1)) +#define NR_INPUTS (1 << PREFIX) +// Approximate log base 2 of the number of elements in the hash tables +#define APX_NR_ELMS_LOG (PREFIX + 1) + +// Setting this to 1 might make Gateless Gate faster, see TROUBLESHOOTING.md +#define OPTIM_SIMPLIFY_ROUND 1 + +// Fraction of time to sleep before rechecking whether the task is done (0-1) +#define SLEEP_RECHECK_RATIO 0.60 +// Fraction of time to busy-wait for the solution (0-1) +// The higher the value, the higher the CPU usage with Nvidia +#define SLEEP_SKIP_RATIO 0.005 + +// Make hash tables OVERHEAD times larger than necessary to store the average +// number of elements per row. The ideal value is as small as possible to +// reduce memory usage, but not too small or else elements are dropped from the +// hash tables. +// +// The actual number of elements per row is closer to the theoretical average +// (less variance) when NR_ROWS_LOG is small, so OVERHEAD can accordingly be +// smaller. +// +// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease +// performance as they cause VRAM channel conflicts.
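To put these sizing rules in perspective, here is a minimal host-side C sketch (illustrative only; it assumes nothing beyond the constants defined above and simply evaluates the footprint the way HT_SIZE does):

    #include <stdint.h>
    #include <stdio.h>

    /* Re-evaluates HT_SIZE = NR_ROWS * NR_SLOTS * SLOT_LEN for param_nr15.h:
    ** NR_ROWS_LOG 15 gives 32768 rows, each holding 120 slots of 32 bytes. */
    int main(void)
    {
        const uint64_t nr_rows  = 1ULL << 15; /* NR_ROWS */
        const uint64_t nr_slots = 120;        /* NR_SLOTS */
        const uint64_t slot_len = 32;         /* SLOT_LEN */
        const uint64_t ht_size  = nr_rows * nr_slots * slot_len;
        printf("HT_SIZE = %llu bytes (%.1f MiB)\n",
               (unsigned long long)ht_size, ht_size / (1024.0 * 1024.0));
        return 0;
    }

This reports 125,829,120 bytes (120.0 MiB) per hash table; param.h's values (16384 rows, 199 slots) give 104,333,312 bytes, roughly 99.5 MiB.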
+#if NR_ROWS_LOG <= 16 +#define OVERHEAD 2 +#elif NR_ROWS_LOG == 17 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 18 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 19 +#define OVERHEAD 5 +#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND +#define OVERHEAD 6 +#elif NR_ROWS_LOG == 20 +#define OVERHEAD 9 +#endif + +#define NR_ROWS (1 << NR_ROWS_LOG) +#ifndef NR_SLOTS +#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) +#endif +// Length of 1 element (slot) in bytes +#define SLOT_LEN 32 +// Total size of hash table +#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) +// Length of Zcash block header, nonce (part of header) +#define ZCASH_BLOCK_HEADER_LEN 140 +// Offset of nTime in header +#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) +// Length of nonce +#define ZCASH_NONCE_LEN 32 +// Length of encoded representation of solution size +#define ZCASH_SOLSIZE_LEN 3 +// Solution size (1344 = 0x540) represented as a compact integer, in hex +#define ZCASH_SOLSIZE_HEX "fd4005" +// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) +#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) +// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization +#define N_ZERO_BYTES 12 +// Number of bytes Zcash needs out of Blake +#define ZCASH_HASH_LEN 50 +// Number of wavefronts per SIMD for the Blake kernel. +// Blake is ALU-bound (besides the atomic counter being incremented) so we need +// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer +// instructions. 10 is the max supported by the hardware. +#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 10 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 3) +#define BITS_PER_ROW 2 +#define ROWS_PER_UINT 16 +#define ROW_MASK 0x03 +#elif (NR_SLOTS < 7) +#define BITS_PER_ROW 3 +#define ROWS_PER_UINT 10 +#define ROW_MASK 0x07 +#elif (NR_SLOTS < 15) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#elif (NR_SLOTS < 31) +#define BITS_PER_ROW 5 +#define ROWS_PER_UINT 6 +#define ROW_MASK 0x1F +#elif (NR_SLOTS < 63) +#define BITS_PER_ROW 6 +#define ROWS_PER_UINT 5 +#define ROW_MASK 0x3F +#elif (NR_SLOTS < 255) +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#else +#define BITS_PER_ROW 16 +#define ROWS_PER_UINT 2 +#define ROW_MASK 0xFFFF +#endif +#define RC_SIZE (NR_ROWS * 4 / ROWS_PER_UINT) + +/* +** Return the offset of Xi in bytes from the beginning of the slot.
+*/ +#define xi_offset_for_round(round) 4 + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +// Windows only for now +#define DEFAULT_NUM_MINING_MODE_THREADS 1 +#define MAX_NUM_MINING_MODE_THREADS 16 + +#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN / 2) +#define OPENCL_BUILD_OPTIONS_AMD "-I.. -I. -O1" +#define OPENCL_BUILD_OPTIONS "-I.. -I." \ No newline at end of file diff --git a/ocl_silentarmy/ocl_silentarmy.cpp b/ocl_silentarmy/ocl_silentarmy.cpp index c4a2cc7b0..a820de8f5 100644 --- a/ocl_silentarmy/ocl_silentarmy.cpp +++ b/ocl_silentarmy/ocl_silentarmy.cpp @@ -16,12 +16,12 @@ //#include #include - #include "opencl.h" #include -#include "sa_blake.h" +#include +using namespace blake; typedef uint8_t uchar; typedef uint32_t uint; diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj b/ocl_silentarmy/ocl_silentarmy.vcxproj index 77771fcb6..251d3b605 100644 --- a/ocl_silentarmy/ocl_silentarmy.vcxproj +++ b/ocl_silentarmy/ocl_silentarmy.vcxproj @@ -12,12 +12,10 @@ - - @@ -56,10 +54,12 @@ $(SolutionDir)$(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ + $(ProjectDir)../contrib/;$(IncludePath) $(Platform)\$(Configuration)\ $(SolutionDir)$(Platform)\$(Configuration)\ + $(ProjectDir)../contrib/;$(IncludePath) diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj.filters b/ocl_silentarmy/ocl_silentarmy.vcxproj.filters index 9659f2c07..432f9c4b1 100644 --- a/ocl_silentarmy/ocl_silentarmy.vcxproj.filters +++ b/ocl_silentarmy/ocl_silentarmy.vcxproj.filters @@ -2,12 +2,10 @@ - - diff --git a/ocl_xpm/ocl_xmp.cpp b/ocl_xpm/ocl_xmp.cpp index 4064e3626..d0a96a2a8 100644 --- a/ocl_xpm/ocl_xmp.cpp +++ b/ocl_xpm/ocl_xmp.cpp @@ -105,7 +105,7 @@ static void setheader(blake2b_state *ctx, const char *header, const uint32_t hea { uint32_t le_N = WN; uint32_t le_K = WK; - char personal[] = "DeepWebCa01230123"; + char personal[] = "ZcashPoW01230123"; memcpy(personal + 8, &le_N, 4); memcpy(personal + 12, &le_K, 4); blake2b_param P[1];
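The ENCODE_INPUTS/DECODE_ROW/DECODE_SLOT0/DECODE_SLOT1 macros defined in both headers pack one row index and two slot indices into a single 32-bit reference. A small self-contained C round-trip of the NR_ROWS_LOG <= 16 variant (function name is illustrative, not from the patch):

    #include <assert.h>
    #include <stdint.h>

    /* NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8): row in bits 31..16,
    ** slot1 in bits 15..8, slot0 in bits 7..0. */
    static uint32_t encode_inputs(uint32_t row, uint32_t slot0, uint32_t slot1)
    {
        return (row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff);
    }

    int main(void)
    {
        uint32_t ref = encode_inputs(12345, 37, 119);
        assert((ref >> 16) == 12345);       /* DECODE_ROW */
        assert(((ref >> 8) & 0xff) == 119); /* DECODE_SLOT1 */
        assert((ref & 0xff) == 37);         /* DECODE_SLOT0 */
        return 0;
    }

This is why each #if branch pairs a row-count limit with a slot-count limit: the row field and the two slot fields must together fit in 32 bits, so larger NR_ROWS_LOG values force narrower slot indices.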
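The BITS_PER_ROW/ROWS_PER_UINT/ROW_MASK ladder packs per-row slot counters several to a 32-bit word, RC_SIZE bytes in total. A sketch of the access pattern for the NR_SLOTS < 255 case (8 bits per counter, 4 counters per uint); the kernels themselves would perform the increment with an OpenCL atomic:

    #include <stdint.h>

    /* The counter for row r lives in word r / ROWS_PER_UINT at bit offset
    ** BITS_PER_ROW * (r % ROWS_PER_UINT); here 8 and 4 respectively. */
    static uint32_t row_counter_get(const uint32_t *rc, uint32_t row)
    {
        return (rc[row / 4] >> (8 * (row % 4))) & 0xFF; /* ROW_MASK */
    }

    static void row_counter_inc(uint32_t *rc, uint32_t row)
    {
        rc[row / 4] += 1u << (8 * (row % 4)); /* an atomic add on the GPU */
    }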
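Finally, the ocl_xmp.cpp hunk restores the BLAKE2b personalization that Zcash's Equihash requires: the 16-byte personal field is "ZcashPoW" followed by N and K as 32-bit little-endian integers (the "01230123" tail of the string literal is just placeholder bytes that the two memcpy calls overwrite). A minimal sketch of building that field:

    #include <stdint.h>
    #include <string.h>

    /* "ZcashPoW" || le32(N) || le32(K); with N = 200, K = 9 this matches the
    ** setheader() logic above. Assumes a little-endian host, as the original
    ** code does. */
    static void zcash_personal(uint8_t out[16], uint32_t n, uint32_t k)
    {
        memcpy(out, "ZcashPoW", 8);
        memcpy(out + 8, &n, 4);
        memcpy(out + 12, &k, 4);
    }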