diff --git a/3rdparty/amd_bins_linux/equiw200k9.bin b/3rdparty/amd_bins_linux/equiw200k9.bin deleted file mode 100644 index 45785dc4e..000000000 Binary files a/3rdparty/amd_bins_linux/equiw200k9.bin and /dev/null differ diff --git a/3rdparty/amd_bins_linux/zcash/gpu/blake2bcl.h b/3rdparty/amd_bins_linux/zcash/gpu/blake2bcl.h deleted file mode 100644 index 13cad965c..000000000 --- a/3rdparty/amd_bins_linux/zcash/gpu/blake2bcl.h +++ /dev/null @@ -1,150 +0,0 @@ -// Blake2-B CUDA Implementation -// tpruvot@github July 2016 -// permission granted to use under MIT license -// modified for use in Zcash by John Tromp September 2016 - -/** - * uint2 direct ops by c++ operator definitions - */ - -// static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { -// return make_uint2(a.x ^ b.x, a.y ^ b.y); -// } - -// uint2 ROR/ROL methods -uint2 ROR2(const uint2 a, const int offset) { - uint2 result; - if (!offset) - result = a; - else if (offset < 32) { - result.y = ((a.y >> offset) | (a.x << (32 - offset))); - result.x = ((a.x >> offset) | (a.y << (32 - offset))); - } else if (offset == 32) { - result.y = a.x; - result.x = a.y; - } else { - result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); - result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); - } - return result; -} - -uint2 SWAPUINT2(uint2 value) { - uint2 result; - result.x = value.y; - result.y = value.x; - return result; -// return make_uint2(value.y, value.x); -} - -#define ROR24(u) ROR2(u,24) -#define ROR16(u) ROR2(u,16) - -__constant int8_t blake2b_sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - 
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; - -void G(const int32_t r, const int32_t i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, uint64_t const m[16]) { - *a += *b + m[ blake2b_sigma[r][2*i] ]; - ((uint2*)d)[0] = SWAPUINT2( ((uint2*)d)[0] ^ ((uint2*)a)[0] ); - *c += *d; - ((uint2*)b)[0] = ROR24( ((uint2*)b)[0] ^ ((uint2*)c)[0] ); - *a += *b + m[ blake2b_sigma[r][2*i+1] ]; - ((uint2*)d)[0] = ROR16( ((uint2*)d)[0] ^ ((uint2*)a)[0] ); - *c += *d; - ((uint2*)b)[0] = ROR2( ((uint2*)b)[0] ^ ((uint2*)c)[0], 63U); -} - -#define ROUND(r) \ - G(r, 0, &v[0], &v[4], &v[ 8], &v[12], m); \ - G(r, 1, &v[1], &v[5], &v[ 9], &v[13], m); \ - G(r, 2, &v[2], &v[6], &v[10], &v[14], m); \ - G(r, 3, &v[3], &v[7], &v[11], &v[15], m); \ - G(r, 4, &v[0], &v[5], &v[10], &v[15], m); \ - G(r, 5, &v[1], &v[6], &v[11], &v[12], m); \ - G(r, 6, &v[2], &v[7], &v[ 8], &v[13], m); \ - G(r, 7, &v[3], &v[4], &v[ 9], &v[14], m); - -void blake2b_gpu_hash(blake2b_state *state, uint32_t idx, uint8_t *hash, uint32_t outlen) { - const uint32_t leb = idx; - *(uint32_t*)(state->buf + state->buflen) = leb; - state->buflen += 4; - state->counter += state->buflen; - for (unsigned i = 0; i < BLAKE2B_BLOCKBYTES - state->buflen; i++) - state->buf[i+state->buflen] = 0; - - uint64_t *d_data = (uint64_t *)state->buf; - uint64_t m[16]; - - m[0] = d_data[0]; - m[1] = d_data[1]; - m[2] = d_data[2]; - m[3] = d_data[3]; - m[4] = d_data[4]; - m[5] = d_data[5]; - m[6] = d_data[6]; - m[7] = d_data[7]; - m[8] = d_data[8]; - m[9] = d_data[9]; - m[10] = d_data[10]; - m[11] = d_data[11]; - m[12] = d_data[12]; - m[13] = d_data[13]; - m[14] = d_data[14]; - m[15] = d_data[15]; - - uint64_t v[16]; - - v[0] = state->h[0]; - v[1] = state->h[1]; - v[2] = state->h[2]; - 
v[3] = state->h[3]; - v[4] = state->h[4]; - v[5] = state->h[5]; - v[6] = state->h[6]; - v[7] = state->h[7]; - v[8] = 0x6a09e667f3bcc908; - v[9] = 0xbb67ae8584caa73b; - v[10] = 0x3c6ef372fe94f82b; - v[11] = 0xa54ff53a5f1d36f1; - v[12] = 0x510e527fade682d1 ^ state->counter; - v[13] = 0x9b05688c2b3e6c1f; - v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff; - v[15] = 0x5be0cd19137e2179; - - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - - state->h[0] ^= v[0] ^ v[ 8]; - state->h[1] ^= v[1] ^ v[ 9]; - state->h[2] ^= v[2] ^ v[10]; - state->h[3] ^= v[3] ^ v[11]; - state->h[4] ^= v[4] ^ v[12]; - state->h[5] ^= v[5] ^ v[13]; - state->h[6] ^= v[6] ^ v[14]; - state->h[7] ^= v[7] ^ v[15]; - - for (unsigned i = 0; i < outlen; i++) - hash[i] = ((uint8_t*)state->h)[i]; -} diff --git a/3rdparty/amd_bins_linux/zcash/gpu/common.h b/3rdparty/amd_bins_linux/zcash/gpu/common.h deleted file mode 100644 index 22ba9548e..000000000 --- a/3rdparty/amd_bins_linux/zcash/gpu/common.h +++ /dev/null @@ -1,156 +0,0 @@ -#if defined(__OPENCL_HOST__) -#define __global -#include "../blake2.h" -#else -typedef char int8_t; -typedef uchar uint8_t; -typedef short int16_t; -typedef ushort uint16_t; -typedef int int32_t; -typedef uint uint32_t; -typedef long int64_t; -typedef ulong uint64_t; - -#if defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#else -#define ALIGN(x) __attribute__ ((__aligned__(x))) -#endif - -enum blake2b_constant -{ - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 -}; - -#pragma pack(push, 1) -ALIGN( 64 ) typedef struct __blake2b_state { - uint64_t h[8]; - uint8_t buf[BLAKE2B_BLOCKBYTES]; - uint16_t counter; - uint8_t buflen; - uint8_t lastblock; -} blake2b_state; -#pragma pack(pop) -#endif - -#define COLLISION_BIT_LENGTH (WN / (WK+1)) -#define COLLISION_BYTE_LENGTH 
((COLLISION_BIT_LENGTH+7)/8) -#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK))) - - -#define NDIGITS (WK+1) -#define DIGITBITS (WN/(NDIGITS)) -#define PROOFSIZE (1u< 64 -#error cant use XBITMAP with more than 64 slots -#endif - uint64_t xhashmap[NRESTS]; - uint64_t xmap; -#else - xslot nxhashslots[NRESTS]; - xslot xhashslots[NRESTS][XFULL]; - xslot *xx; - uint32_t n0; - uint32_t n1; -#endif - uint32_t s0; -} collisiondata; - - -typedef struct equi { - blake2b_state blake_ctx; - htalloc hta; - __global bsizes *nslots; - __global proof *sols; - uint32_t nsols; - uint32_t nthreads; -} equi; diff --git a/3rdparty/amd_bins_linux/zcash/gpu/equihash.cl b/3rdparty/amd_bins_linux/zcash/gpu/equihash.cl deleted file mode 100644 index 213a8e4d6..000000000 --- a/3rdparty/amd_bins_linux/zcash/gpu/equihash.cl +++ /dev/null @@ -1,1038 +0,0 @@ -#include "common.h" - -#include "blake2bcl.h" - -#define tree0_ptr(heap, r) ((__global bucket0 *)(heap + r)) -#define tree1_ptr(heap, r) ((__global bucket1 *)(heap + r)) - -uint32_t tree_bucket(tree t) -{ - const uint32_t bucketMask = ((1u<> BUCKBITS) & SLOTMASK; -} - -uint32_t tree_slotid1(tree t) -{ - const uint32_t slotMask = ((1u<> (BUCKBITS+SLOTBITS)) & SLOTMASK; -} - -uint32_t tree_xhash(tree t) -{ - return t >> (2*SLOTBITS + BUCKBITS); -} - -uint32_t tree_getindex(const tree t) -{ - const uint32_t bucketMask = ((1u<> BUCKBITS); -} - -void tree_setindex(tree *t, uint32_t idx) -{ - const uint32_t bucketMask = ((1u<> SLOTBITS); - (*t) |= ((idx & slotMask) << BUCKBITS); -} - -void tree_setxhash(tree *t, uint32_t xhash) -{ - const uint32_t xhashMask = ((1u << RESTBITS)-1); - (*t) &= ~(xhashMask << (2*SLOTBITS + BUCKBITS)); - (*t) |= (xhash << (2*SLOTBITS + BUCKBITS)); -} - -tree tree_create3(uint32_t bucketId, uint32_t s0, uint32_t s1) -{ - return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)); -} - -tree tree_create4(uint32_t bucketId, uint32_t s0, uint32_t s1, uint32_t xhash) -{ - return bucketId 
| (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)) | (xhash << (2*SLOTBITS+BUCKBITS));; -} - -// size (in bytes) of hash in round 0 <= r < WK -uint32_t hashsize(const uint32_t r) -{ -#ifdef XINTREE - const uint32_t hashbits = WN - (r+1) * DIGITBITS; -#else - const uint32_t hashbits = WN - (r+1) * DIGITBITS + RESTBITS; -#endif - return (hashbits + 7) / 8; -} - -uint32_t hashwords(uint32_t bytes) -{ - return (bytes + 3) / 4; -} - -htlayout htlayout_create_2(uint32_t r) -{ - htlayout R; - R.prevhashunits = 0; - R.dunits = 0; - - uint32_t nexthashbytes = hashsize(r); - R.nexthashunits = hashwords(nexthashbytes); - - R.prevbo = 0; - R.nextbo = R.nexthashunits * sizeof(hashunit) - nexthashbytes; // 0-3 - if (r) { - uint32_t prevhashbytes = hashsize(r-1); - R.prevhashunits = hashwords(prevhashbytes); - R.prevbo = R.prevhashunits * sizeof(hashunit) - prevhashbytes; // 0-3 - R.dunits = R.prevhashunits - R.nexthashunits; - } - - return R; -} - -uint32_t htlayout_getxhash0(uint32_t prevbo, __global const slot0 *pslot) -{ -#ifdef XINTREE - return tree_xhash(pslot->attr); -#elif WN == 200 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] >> 4; -#elif WN == 200 && RESTBITS == 8 - return (pslot->hash->bytes[prevbo] & 0xf) << 4 | pslot->hash->bytes[prevbo+1] >> 4; -#elif WN == 144 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 6 - return (pslot->hash->bytes[prevbo] & 0x3) << 4 | pslot->hash->bytes[prevbo+1] >> 4; -#else -#error non implemented -#endif -} - -uint32_t htlayout_getxhash1(uint32_t prevbo, __global const slot1 *pslot) -{ -#ifdef XINTREE - return tree_xhash(pslot->attr); -#elif WN == 200 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 8 - return pslot->hash->bytes[prevbo]; -#elif WN == 144 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 6 - return pslot->hash->bytes[prevbo] & 0x3f; -#else -#error non implemented -#endif -} - -bool 
htlayout_equal(uint32_t prevhashunits, __global const hashunit *hash0, __global const hashunit *hash1) -{ - return hash0[prevhashunits-1].word == hash1[prevhashunits-1].word; -} - -void collisiondata_clear(collisiondata *data) -{ -#ifdef XBITMAP - // memset(xhashmap, 0, NRESTS * sizeof(u64)); - for (unsigned i = 0; i < NRESTS; i++) - data->xhashmap[i] = 0; -#else - // memset(nxhashslots, 0, NRESTS * sizeof(xslot)); - for (unsigned i = 0; i < NRESTS; i++) - data->nxhashslots[i] = 0; -#endif -} - -bool collisiondata_addslot(collisiondata *data, uint32_t s1, uint32_t xh) -{ -#ifdef XBITMAP - data->xmap = data->xhashmap[xh]; - data->xhashmap[xh] |= (uint64_t)1 << s1; - data->s0 = ~0; - return true; -#else - data->n1 = (uint32_t)data->nxhashslots[xh]++; - if (data->n1 >= XFULL) - return false; - data->xx = data->xhashslots[xh]; - data->xx[data->n1] = s1; - data->n0 = 0; - return true; -#endif -} - -bool collisiondata_nextcollision(collisiondata *data) -{ -#ifdef XBITMAP - return data->xmap != 0; -#else - return data->n0 < data->n1; -#endif -} - -uint64_t __ffsll(uint64_t x) -{ - return x ? 
(64 - clz(x & -x)) : 0; -} - -uint32_t collisiondata_slot(collisiondata *data) { -#ifdef XBITMAP - const uint32_t ffs = __ffsll(xmap); - data->s0 += ffs; - data->xmap >>= ffs; - return data->s0; -#else - return (uint32_t)data->xx[data->n0++]; -#endif -} - -uint32_t equi_getnslots(__global bsizes *nslots, const uint32_t r, const uint32_t bid) -{ - __global uint32_t *nslot = &nslots[r&1][bid]; - const uint32_t n = min(*nslot, NSLOTS); - *nslot = 0; - return n; -} - -void equi_orderindices(__global uint32_t *indices, uint32_t size) -{ - if (indices[0] > indices[size]) { - for (uint32_t i = 0; i < size; i++) { - const uint32_t tmp = indices[i]; - indices[i] = indices[size+i]; - indices[size+i] = tmp; - } - } -} - -void local_orderindices(uint32_t *indices, uint32_t size) -{ - if (indices[0] > indices[size]) { - for (uint32_t i = 0; i < size; i++) { - const uint32_t tmp = indices[i]; - indices[i] = indices[size+i]; - indices[size+i] = tmp; - } - } -} - - -void equi_listindices1(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 0; - indices[0] = tree_getindex((*buck)[tree_slotid0(t)].attr); - indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr); - equi_orderindices(indices, size); -} - -void equi_listindices2(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 1; - equi_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices3(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)]; - const 
uint32_t size = 1 << 2; - equi_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices4(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 3; - equi_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices5(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 4; - equi_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices6(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 5; - equi_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices7(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 6; - equi_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices8(__global uint32_t *heap0, - __global uint32_t 
*heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 7; - equi_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices9(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)]; - const uint32_t size = 1 << 8; - equi_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void local_listindices1(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 0; - indices[0] = tree_getindex((*buck)[tree_slotid0(t)].attr); - indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr); - local_orderindices(indices, size); -} - -void local_listindices2(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 1; - local_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices3(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 2; - local_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - 
local_orderindices(indices, size); -} - -void local_listindices4(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 3; - local_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices5(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 4; - local_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices6(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 5; - local_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices7(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 6; - local_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices8(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 7; - local_listindices7(heap0, heap1, 
(*buck)[tree_slotid0(t)].attr, indices); - local_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices9(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)]; - const uint32_t size = 1 << 8; - local_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -// proper dupe test is a little costly on GPU, so allow false negatives -bool equi_probdupe(uint32_t *prf) { - unsigned short susp[PROOFSIZE]; - for (unsigned i = 0; i < PROOFSIZE; i++) - susp[i] = 0xFFFF; - - for (unsigned i = 0; i < PROOFSIZE; i++) { - uint32_t bin = prf[i] & (PROOFSIZE-1); - unsigned short msb = prf[i] >> WK; - if (msb == susp[bin]) - return true; - susp[bin] = msb; - } - - return false; -} - -void equi_candidate(__global uint32_t *heap0, - __global uint32_t *heap1, - __global proof *sols, - __global uint32_t *nsols, - const tree t) -{ - proof prf; -#if WK==9 - local_listindices9(heap0, heap1, t, (uint32_t*)&prf); -#elif WK==5 - local_listindices5(heap0, heap1, t, (uint32_t*)&prf); -#else -#error not implemented -#endif - if (equi_probdupe(prf)) - return; - uint32_t soli = atomic_inc(nsols); - if (soli < MAXSOLS) -#if WK==9 - equi_listindices9(heap0, heap1, t, sols[soli]); -#elif WK==5 - equi_listindices5(heap0, heap1, t, sols[soli]); -#else -#error not implemented -#endif -} - - -__kernel void digitH(__global blake2b_state *blake2bState, - __global const uint32_t *heap0, - __global bsizes *nslots) -{ - uint8_t hash[HASHOUT]; - blake2b_state state; - // equi::htlayout htl(eq, 0); - htlayout htl = htlayout_create_2(0); - const uint32_t hashbytes = hashsize(0); - // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x; - const uint32_t id = get_global_id(0); - for 
(uint32_t block = id; block < NBLOCKS; block += get_global_size(0)) { - state = *blake2bState; - blake2b_gpu_hash(&state, block, hash, HASHOUT); - for (uint32_t i = 0; i < HASHESPERBLAKE; i++) { - const uint8_t *ph = hash + i * WN/8; -#if BUCKBITS == 16 && RESTBITS == 4 - const uint32_t bucketid = ((uint32_t)ph[0] << 8) | ph[1]; -#ifdef XINTREE - const uint32_t xhash = ph[2] >> 4; -#endif -#elif BUCKBITS == 14 && RESTBITS == 6 - const uint32_t bucketid = ((uint32_t)ph[0] << 6) | ph[1] >> 2; -#elif BUCKBITS == 12 && RESTBITS == 8 - const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4; -#elif BUCKBITS == 20 && RESTBITS == 4 - const uint32_t bucketid = ((((uint32_t)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4; -#ifdef XINTREE - const uint32_t xhash = ph[2] & 0xf; -#endif -#elif BUCKBITS == 12 && RESTBITS == 4 - const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4; - const uint32_t xhash = ph[1] & 0xf; -#else -#error not implemented -#endif - const uint32_t slot = atomic_inc(&nslots[0][bucketid]); - if (slot >= NSLOTS) - continue; - tree leaf; - tree_setindex(&leaf, block*HASHESPERBLAKE+i); -#ifdef XINTREE - tree_setxhash(&leaf, xhash); -#endif - __global slot0 *s = &tree0_ptr(heap0, 0)[bucketid][slot]; - s->attr = leaf; - - // memcpy(s.hash->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes); - for (unsigned i = 0; i < hashbytes; i++) - ((__global uint8_t*)s->hash->bytes+htl.nextbo)[i] = ((uint8_t*)(ph+WN/8-hashbytes))[i]; - } - } -} - -__kernel void digitOdd(const uint32_t r, - __global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) -{ - // equi::htlayout htl(eq, r); -// htlayout htl = htlayout_create(eq, r); - htlayout htl = htlayout_create_2(r); - collisiondata cd; - - // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x; - const uint32_t id = get_global_id(0); - - for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - // cd.clear(); - collisiondata_clear(&cd); -// __global slot0 *buck = 
htl.hta.trees0[(r-1)/2][bucketid]; // optimize by updating previous buck?! - __global slot0 *buck = tree0_ptr(heap0, (r-1)/2)[bucketid]; // optimize by updating previous buck?! - uint32_t bsize = equi_getnslots(nslots, r-1, bucketid); // optimize by putting bucketsize with block?! - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; // optimize by updating previous pslot1?! - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; -#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE) - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; -#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4 - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4) - | (xhash = bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4; - xhash &= 0xf; -#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4 - xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4) - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; -#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6 - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 2 - | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 6; -#else -#error not implemented -#endif - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= 
NSLOTS) - continue; -#ifdef XINTREE - tree xort = tree_create4(bucketid, s0, s1, xhash); -#else - tree xort = tree_create3(bucketid, s0, s1); -#endif -// __global slot1 *xs = &htl.hta.trees1[r/2][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, r/2)[xorbucketid][xorslot]; - xs->attr = xort; - for (uint32_t i = htl.dunits; i < htl.prevhashunits; i++) - xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word; - } - } - } -} - - -__kernel void digitEven(const uint32_t r, - __global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) -{ - // equi::htlayout htl(eq, r); -// htlayout htl = htlayout_create(eq, r); - htlayout htl = htlayout_create_2(r); - collisiondata cd; - - // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x; - const uint32_t id = get_global_id(0); - - for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - // cd.clear(); - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[(r-1)/2][bucketid]; // OPTIMIZE BY UPDATING PREVIOUS - __global slot1 *buck = tree1_ptr(heap1, (r-1)/2)[bucketid]; // OPTIMIZE BY UPDATING PREVIOUS - uint32_t bsize = equi_getnslots(nslots, r-1, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; // OPTIMIZE BY UPDATING PREVIOUS - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; -#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE) - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = 
(bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; -#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4 - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4) - | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4; -#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4 - xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; -#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6 - xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 6) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 2; -#else -#error not implemented -#endif - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; -#ifdef XINTREE - tree xort = tree_create4(bucketid, s0, s1, xhash); -#else - tree xort = tree_create3(bucketid, s0, s1); -#endif -// __global slot0 *xs = &htl.hta.trees0[r/2][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, r/2)[xorbucketid][xorslot]; - xs->attr = xort; - for (uint32_t i=htl.dunits; i < htl.prevhashunits; i++) - xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word; - } - } - } -} - - -#ifdef UNROLL - -__kernel void digit_1(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) -{ - htlayout htl = htlayout_create_2(1); - collisiondata cd; - const uint32_t id = get_global_id(0); - - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); - __global slot0 *buck = tree0_ptr(heap0, 0)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 0, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 
*pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[0][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 0)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word; - xs->hash[4].word = pslot0->hash[5].word ^ pslot1->hash[5].word; - } - } - } -} -__kernel void digit_2(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(2); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[0][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 0)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 1, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t 
xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); - // __global slot0 *xs = &htl.hta.trees0[1][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 1)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word; - xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - xs->hash[4].word = pslot0->hash[4].word ^ pslot1->hash[4].word; - } - } - } -} -__kernel void digit_3(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(3); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[1][bucketid]; - __global slot0 *buck = tree0_ptr(heap0, 1)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 2, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = 
((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[1][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 1)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word; - } - } - } -} -__kernel void digit_4(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(4); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[1][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 1)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 3, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const 
uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot0 *xs = &htl.hta.trees0[2][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 2)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word; - xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - } - } - } -} -__kernel void digit_5(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(5); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[2][bucketid]; - __global slot0 *buck = tree0_ptr(heap0, 2)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 4, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = 
&htl.hta.trees1[2][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 2)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - } - } - } -} -__kernel void digit_6(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(6); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[2][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 2)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 5, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot0 *xs = &htl.hta.trees0[3][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 3)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - } - } - } -} 
-__kernel void digit_7(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(7); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[3][bucketid]; - __global slot0 *buck = tree0_ptr(heap0, 3)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 6, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[3][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 3)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word; - xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - } - } - } -} -__kernel void digit_8(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(8); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - 
collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[3][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 3)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 7, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; // OPTIMIZE BY UPDATING PREVIOUS - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot0 *xs = &htl.hta.trees0[4][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 4)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - } - } - } -} -#endif //UNROLL - -__kernel void digitK(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots, - __global proof *sols, - __global uint32_t *nsols) { - collisiondata cd; - htlayout htl = htlayout_create_2(WK); - const uint32_t id = get_global_id(0); - for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); - __global slot0 *buck = tree0_ptr(heap0, (WK-1)/2)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, WK-1, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, 
pslot1))) // assume WK odd - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) { - tree xort = tree_create3(bucketid, s0, s1); - equi_candidate(heap0, heap1, sols, nsols, xort); - } - } - } - } -} diff --git a/3rdparty/amd_bins_windows/equiw200k9.bin b/3rdparty/amd_bins_windows/equiw200k9.bin deleted file mode 100644 index 868842f93..000000000 Binary files a/3rdparty/amd_bins_windows/equiw200k9.bin and /dev/null differ diff --git a/3rdparty/amd_bins_windows/zcash/gpu/blake2bcl.h b/3rdparty/amd_bins_windows/zcash/gpu/blake2bcl.h deleted file mode 100644 index 13cad965c..000000000 --- a/3rdparty/amd_bins_windows/zcash/gpu/blake2bcl.h +++ /dev/null @@ -1,150 +0,0 @@ -// Blake2-B CUDA Implementation -// tpruvot@github July 2016 -// permission granted to use under MIT license -// modified for use in Zcash by John Tromp September 2016 - -/** - * uint2 direct ops by c++ operator definitions - */ - -// static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { -// return make_uint2(a.x ^ b.x, a.y ^ b.y); -// } - -// uint2 ROR/ROL methods -uint2 ROR2(const uint2 a, const int offset) { - uint2 result; - if (!offset) - result = a; - else if (offset < 32) { - result.y = ((a.y >> offset) | (a.x << (32 - offset))); - result.x = ((a.x >> offset) | (a.y << (32 - offset))); - } else if (offset == 32) { - result.y = a.x; - result.x = a.y; - } else { - result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); - result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); - } - return result; -} - -uint2 SWAPUINT2(uint2 value) { - uint2 result; - result.x = value.y; - result.y = value.x; - return result; -// return make_uint2(value.y, value.x); -} - -#define ROR24(u) ROR2(u,24) -#define ROR16(u) ROR2(u,16) - -__constant int8_t blake2b_sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; - -void G(const int32_t r, const int32_t i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, uint64_t const m[16]) { - *a += *b + m[ blake2b_sigma[r][2*i] ]; - ((uint2*)d)[0] = SWAPUINT2( ((uint2*)d)[0] ^ ((uint2*)a)[0] ); - *c += *d; - ((uint2*)b)[0] = ROR24( ((uint2*)b)[0] ^ ((uint2*)c)[0] ); - *a += *b + m[ blake2b_sigma[r][2*i+1] ]; - ((uint2*)d)[0] = ROR16( ((uint2*)d)[0] ^ ((uint2*)a)[0] ); - *c += *d; - ((uint2*)b)[0] = ROR2( ((uint2*)b)[0] ^ ((uint2*)c)[0], 63U); -} - -#define ROUND(r) \ - G(r, 0, &v[0], &v[4], &v[ 8], &v[12], m); \ - G(r, 1, &v[1], &v[5], &v[ 9], &v[13], m); \ - G(r, 2, &v[2], &v[6], &v[10], &v[14], m); \ - G(r, 3, &v[3], &v[7], &v[11], &v[15], m); \ - G(r, 4, &v[0], &v[5], &v[10], &v[15], m); \ - G(r, 5, &v[1], &v[6], &v[11], &v[12], m); \ - G(r, 6, &v[2], &v[7], &v[ 8], &v[13], m); \ - G(r, 7, &v[3], &v[4], &v[ 9], &v[14], m); - -void blake2b_gpu_hash(blake2b_state *state, uint32_t idx, uint8_t *hash, uint32_t outlen) { - const uint32_t leb = idx; - *(uint32_t*)(state->buf + state->buflen) = leb; - state->buflen += 4; - state->counter += state->buflen; - for (unsigned i = 0; i < BLAKE2B_BLOCKBYTES - state->buflen; i++) - state->buf[i+state->buflen] = 0; - - uint64_t *d_data = (uint64_t *)state->buf; - uint64_t m[16]; - - m[0] = d_data[0]; - m[1] = d_data[1]; - 
m[2] = d_data[2]; - m[3] = d_data[3]; - m[4] = d_data[4]; - m[5] = d_data[5]; - m[6] = d_data[6]; - m[7] = d_data[7]; - m[8] = d_data[8]; - m[9] = d_data[9]; - m[10] = d_data[10]; - m[11] = d_data[11]; - m[12] = d_data[12]; - m[13] = d_data[13]; - m[14] = d_data[14]; - m[15] = d_data[15]; - - uint64_t v[16]; - - v[0] = state->h[0]; - v[1] = state->h[1]; - v[2] = state->h[2]; - v[3] = state->h[3]; - v[4] = state->h[4]; - v[5] = state->h[5]; - v[6] = state->h[6]; - v[7] = state->h[7]; - v[8] = 0x6a09e667f3bcc908; - v[9] = 0xbb67ae8584caa73b; - v[10] = 0x3c6ef372fe94f82b; - v[11] = 0xa54ff53a5f1d36f1; - v[12] = 0x510e527fade682d1 ^ state->counter; - v[13] = 0x9b05688c2b3e6c1f; - v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff; - v[15] = 0x5be0cd19137e2179; - - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - - state->h[0] ^= v[0] ^ v[ 8]; - state->h[1] ^= v[1] ^ v[ 9]; - state->h[2] ^= v[2] ^ v[10]; - state->h[3] ^= v[3] ^ v[11]; - state->h[4] ^= v[4] ^ v[12]; - state->h[5] ^= v[5] ^ v[13]; - state->h[6] ^= v[6] ^ v[14]; - state->h[7] ^= v[7] ^ v[15]; - - for (unsigned i = 0; i < outlen; i++) - hash[i] = ((uint8_t*)state->h)[i]; -} diff --git a/3rdparty/amd_bins_windows/zcash/gpu/common.h b/3rdparty/amd_bins_windows/zcash/gpu/common.h deleted file mode 100644 index 22ba9548e..000000000 --- a/3rdparty/amd_bins_windows/zcash/gpu/common.h +++ /dev/null @@ -1,156 +0,0 @@ -#if defined(__OPENCL_HOST__) -#define __global -#include "../blake2.h" -#else -typedef char int8_t; -typedef uchar uint8_t; -typedef short int16_t; -typedef ushort uint16_t; -typedef int int32_t; -typedef uint uint32_t; -typedef long int64_t; -typedef ulong uint64_t; - -#if defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#else -#define ALIGN(x) __attribute__ ((__aligned__(x))) -#endif - -enum blake2b_constant -{ - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - 
BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 -}; - -#pragma pack(push, 1) -ALIGN( 64 ) typedef struct __blake2b_state { - uint64_t h[8]; - uint8_t buf[BLAKE2B_BLOCKBYTES]; - uint16_t counter; - uint8_t buflen; - uint8_t lastblock; -} blake2b_state; -#pragma pack(pop) -#endif - -#define COLLISION_BIT_LENGTH (WN / (WK+1)) -#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8) -#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK))) - - -#define NDIGITS (WK+1) -#define DIGITBITS (WN/(NDIGITS)) -#define PROOFSIZE (1u< 64 -#error cant use XBITMAP with more than 64 slots -#endif - uint64_t xhashmap[NRESTS]; - uint64_t xmap; -#else - xslot nxhashslots[NRESTS]; - xslot xhashslots[NRESTS][XFULL]; - xslot *xx; - uint32_t n0; - uint32_t n1; -#endif - uint32_t s0; -} collisiondata; - - -typedef struct equi { - blake2b_state blake_ctx; - htalloc hta; - __global bsizes *nslots; - __global proof *sols; - uint32_t nsols; - uint32_t nthreads; -} equi; diff --git a/3rdparty/amd_bins_windows/zcash/gpu/equihash.cl b/3rdparty/amd_bins_windows/zcash/gpu/equihash.cl deleted file mode 100644 index 213a8e4d6..000000000 --- a/3rdparty/amd_bins_windows/zcash/gpu/equihash.cl +++ /dev/null @@ -1,1038 +0,0 @@ -#include "common.h" - -#include "blake2bcl.h" - -#define tree0_ptr(heap, r) ((__global bucket0 *)(heap + r)) -#define tree1_ptr(heap, r) ((__global bucket1 *)(heap + r)) - -uint32_t tree_bucket(tree t) -{ - const uint32_t bucketMask = ((1u<> BUCKBITS) & SLOTMASK; -} - -uint32_t tree_slotid1(tree t) -{ - const uint32_t slotMask = ((1u<> (BUCKBITS+SLOTBITS)) & SLOTMASK; -} - -uint32_t tree_xhash(tree t) -{ - return t >> (2*SLOTBITS + BUCKBITS); -} - -uint32_t tree_getindex(const tree t) -{ - const uint32_t bucketMask = ((1u<> BUCKBITS); -} - -void tree_setindex(tree *t, uint32_t idx) -{ - const uint32_t bucketMask = ((1u<> SLOTBITS); - (*t) |= ((idx & slotMask) << BUCKBITS); -} - -void tree_setxhash(tree *t, uint32_t xhash) -{ - 
const uint32_t xhashMask = ((1u << RESTBITS)-1); - (*t) &= ~(xhashMask << (2*SLOTBITS + BUCKBITS)); - (*t) |= (xhash << (2*SLOTBITS + BUCKBITS)); -} - -tree tree_create3(uint32_t bucketId, uint32_t s0, uint32_t s1) -{ - return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)); -} - -tree tree_create4(uint32_t bucketId, uint32_t s0, uint32_t s1, uint32_t xhash) -{ - return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)) | (xhash << (2*SLOTBITS+BUCKBITS));; -} - -// size (in bytes) of hash in round 0 <= r < WK -uint32_t hashsize(const uint32_t r) -{ -#ifdef XINTREE - const uint32_t hashbits = WN - (r+1) * DIGITBITS; -#else - const uint32_t hashbits = WN - (r+1) * DIGITBITS + RESTBITS; -#endif - return (hashbits + 7) / 8; -} - -uint32_t hashwords(uint32_t bytes) -{ - return (bytes + 3) / 4; -} - -htlayout htlayout_create_2(uint32_t r) -{ - htlayout R; - R.prevhashunits = 0; - R.dunits = 0; - - uint32_t nexthashbytes = hashsize(r); - R.nexthashunits = hashwords(nexthashbytes); - - R.prevbo = 0; - R.nextbo = R.nexthashunits * sizeof(hashunit) - nexthashbytes; // 0-3 - if (r) { - uint32_t prevhashbytes = hashsize(r-1); - R.prevhashunits = hashwords(prevhashbytes); - R.prevbo = R.prevhashunits * sizeof(hashunit) - prevhashbytes; // 0-3 - R.dunits = R.prevhashunits - R.nexthashunits; - } - - return R; -} - -uint32_t htlayout_getxhash0(uint32_t prevbo, __global const slot0 *pslot) -{ -#ifdef XINTREE - return tree_xhash(pslot->attr); -#elif WN == 200 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] >> 4; -#elif WN == 200 && RESTBITS == 8 - return (pslot->hash->bytes[prevbo] & 0xf) << 4 | pslot->hash->bytes[prevbo+1] >> 4; -#elif WN == 144 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 6 - return (pslot->hash->bytes[prevbo] & 0x3) << 4 | pslot->hash->bytes[prevbo+1] >> 4; -#else -#error non implemented -#endif -} - -uint32_t htlayout_getxhash1(uint32_t prevbo, __global const slot1 *pslot) -{ -#ifdef XINTREE - 
return tree_xhash(pslot->attr); -#elif WN == 200 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 8 - return pslot->hash->bytes[prevbo]; -#elif WN == 144 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 6 - return pslot->hash->bytes[prevbo] & 0x3f; -#else -#error non implemented -#endif -} - -bool htlayout_equal(uint32_t prevhashunits, __global const hashunit *hash0, __global const hashunit *hash1) -{ - return hash0[prevhashunits-1].word == hash1[prevhashunits-1].word; -} - -void collisiondata_clear(collisiondata *data) -{ -#ifdef XBITMAP - // memset(xhashmap, 0, NRESTS * sizeof(u64)); - for (unsigned i = 0; i < NRESTS; i++) - data->xhashmap[i] = 0; -#else - // memset(nxhashslots, 0, NRESTS * sizeof(xslot)); - for (unsigned i = 0; i < NRESTS; i++) - data->nxhashslots[i] = 0; -#endif -} - -bool collisiondata_addslot(collisiondata *data, uint32_t s1, uint32_t xh) -{ -#ifdef XBITMAP - data->xmap = data->xhashmap[xh]; - data->xhashmap[xh] |= (uint64_t)1 << s1; - data->s0 = ~0; - return true; -#else - data->n1 = (uint32_t)data->nxhashslots[xh]++; - if (data->n1 >= XFULL) - return false; - data->xx = data->xhashslots[xh]; - data->xx[data->n1] = s1; - data->n0 = 0; - return true; -#endif -} - -bool collisiondata_nextcollision(collisiondata *data) -{ -#ifdef XBITMAP - return data->xmap != 0; -#else - return data->n0 < data->n1; -#endif -} - -uint64_t __ffsll(uint64_t x) -{ - return x ? 
(64 - clz(x & -x)) : 0; -} - -uint32_t collisiondata_slot(collisiondata *data) { -#ifdef XBITMAP - const uint32_t ffs = __ffsll(xmap); - data->s0 += ffs; - data->xmap >>= ffs; - return data->s0; -#else - return (uint32_t)data->xx[data->n0++]; -#endif -} - -uint32_t equi_getnslots(__global bsizes *nslots, const uint32_t r, const uint32_t bid) -{ - __global uint32_t *nslot = &nslots[r&1][bid]; - const uint32_t n = min(*nslot, NSLOTS); - *nslot = 0; - return n; -} - -void equi_orderindices(__global uint32_t *indices, uint32_t size) -{ - if (indices[0] > indices[size]) { - for (uint32_t i = 0; i < size; i++) { - const uint32_t tmp = indices[i]; - indices[i] = indices[size+i]; - indices[size+i] = tmp; - } - } -} - -void local_orderindices(uint32_t *indices, uint32_t size) -{ - if (indices[0] > indices[size]) { - for (uint32_t i = 0; i < size; i++) { - const uint32_t tmp = indices[i]; - indices[i] = indices[size+i]; - indices[size+i] = tmp; - } - } -} - - -void equi_listindices1(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 0; - indices[0] = tree_getindex((*buck)[tree_slotid0(t)].attr); - indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr); - equi_orderindices(indices, size); -} - -void equi_listindices2(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 1; - equi_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices3(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)]; - const 
uint32_t size = 1 << 2; - equi_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices4(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 3; - equi_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices5(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 4; - equi_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices6(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 5; - equi_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices7(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 6; - equi_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices8(__global uint32_t *heap0, - __global uint32_t 
*heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 7; - equi_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices9(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)]; - const uint32_t size = 1 << 8; - equi_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void local_listindices1(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 0; - indices[0] = tree_getindex((*buck)[tree_slotid0(t)].attr); - indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr); - local_orderindices(indices, size); -} - -void local_listindices2(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 1; - local_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices3(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 2; - local_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - 
local_orderindices(indices, size); -} - -void local_listindices4(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 3; - local_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices5(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 4; - local_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices6(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 5; - local_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices7(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 6; - local_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices8(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 7; - local_listindices7(heap0, heap1, 
(*buck)[tree_slotid0(t)].attr, indices); - local_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices9(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)]; - const uint32_t size = 1 << 8; - local_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -// proper dupe test is a little costly on GPU, so allow false negatives -bool equi_probdupe(uint32_t *prf) { - unsigned short susp[PROOFSIZE]; - for (unsigned i = 0; i < PROOFSIZE; i++) - susp[i] = 0xFFFF; - - for (unsigned i = 0; i < PROOFSIZE; i++) { - uint32_t bin = prf[i] & (PROOFSIZE-1); - unsigned short msb = prf[i] >> WK; - if (msb == susp[bin]) - return true; - susp[bin] = msb; - } - - return false; -} - -void equi_candidate(__global uint32_t *heap0, - __global uint32_t *heap1, - __global proof *sols, - __global uint32_t *nsols, - const tree t) -{ - proof prf; -#if WK==9 - local_listindices9(heap0, heap1, t, (uint32_t*)&prf); -#elif WK==5 - local_listindices5(heap0, heap1, t, (uint32_t*)&prf); -#else -#error not implemented -#endif - if (equi_probdupe(prf)) - return; - uint32_t soli = atomic_inc(nsols); - if (soli < MAXSOLS) -#if WK==9 - equi_listindices9(heap0, heap1, t, sols[soli]); -#elif WK==5 - equi_listindices5(heap0, heap1, t, sols[soli]); -#else -#error not implemented -#endif -} - - -__kernel void digitH(__global blake2b_state *blake2bState, - __global const uint32_t *heap0, - __global bsizes *nslots) -{ - uint8_t hash[HASHOUT]; - blake2b_state state; - // equi::htlayout htl(eq, 0); - htlayout htl = htlayout_create_2(0); - const uint32_t hashbytes = hashsize(0); - // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x; - const uint32_t id = get_global_id(0); - for 
(uint32_t block = id; block < NBLOCKS; block += get_global_size(0)) { - state = *blake2bState; - blake2b_gpu_hash(&state, block, hash, HASHOUT); - for (uint32_t i = 0; i < HASHESPERBLAKE; i++) { - const uint8_t *ph = hash + i * WN/8; -#if BUCKBITS == 16 && RESTBITS == 4 - const uint32_t bucketid = ((uint32_t)ph[0] << 8) | ph[1]; -#ifdef XINTREE - const uint32_t xhash = ph[2] >> 4; -#endif -#elif BUCKBITS == 14 && RESTBITS == 6 - const uint32_t bucketid = ((uint32_t)ph[0] << 6) | ph[1] >> 2; -#elif BUCKBITS == 12 && RESTBITS == 8 - const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4; -#elif BUCKBITS == 20 && RESTBITS == 4 - const uint32_t bucketid = ((((uint32_t)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4; -#ifdef XINTREE - const uint32_t xhash = ph[2] & 0xf; -#endif -#elif BUCKBITS == 12 && RESTBITS == 4 - const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4; - const uint32_t xhash = ph[1] & 0xf; -#else -#error not implemented -#endif - const uint32_t slot = atomic_inc(&nslots[0][bucketid]); - if (slot >= NSLOTS) - continue; - tree leaf; - tree_setindex(&leaf, block*HASHESPERBLAKE+i); -#ifdef XINTREE - tree_setxhash(&leaf, xhash); -#endif - __global slot0 *s = &tree0_ptr(heap0, 0)[bucketid][slot]; - s->attr = leaf; - - // memcpy(s.hash->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes); - for (unsigned i = 0; i < hashbytes; i++) - ((__global uint8_t*)s->hash->bytes+htl.nextbo)[i] = ((uint8_t*)(ph+WN/8-hashbytes))[i]; - } - } -} - -__kernel void digitOdd(const uint32_t r, - __global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) -{ - // equi::htlayout htl(eq, r); -// htlayout htl = htlayout_create(eq, r); - htlayout htl = htlayout_create_2(r); - collisiondata cd; - - // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x; - const uint32_t id = get_global_id(0); - - for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - // cd.clear(); - collisiondata_clear(&cd); -// __global slot0 *buck = 
htl.hta.trees0[(r-1)/2][bucketid]; // optimize by updating previous buck?! - __global slot0 *buck = tree0_ptr(heap0, (r-1)/2)[bucketid]; // optimize by updating previous buck?! - uint32_t bsize = equi_getnslots(nslots, r-1, bucketid); // optimize by putting bucketsize with block?! - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; // optimize by updating previous pslot1?! - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; -#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE) - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; -#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4 - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4) - | (xhash = bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4; - xhash &= 0xf; -#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4 - xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4) - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; -#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6 - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 2 - | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 6; -#else -#error not implemented -#endif - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= 
NSLOTS) - continue; -#ifdef XINTREE - tree xort = tree_create4(bucketid, s0, s1, xhash); -#else - tree xort = tree_create3(bucketid, s0, s1); -#endif -// __global slot1 *xs = &htl.hta.trees1[r/2][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, r/2)[xorbucketid][xorslot]; - xs->attr = xort; - for (uint32_t i = htl.dunits; i < htl.prevhashunits; i++) - xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word; - } - } - } -} - - -__kernel void digitEven(const uint32_t r, - __global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) -{ - // equi::htlayout htl(eq, r); -// htlayout htl = htlayout_create(eq, r); - htlayout htl = htlayout_create_2(r); - collisiondata cd; - - // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x; - const uint32_t id = get_global_id(0); - - for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - // cd.clear(); - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[(r-1)/2][bucketid]; // OPTIMIZE BY UPDATING PREVIOUS - __global slot1 *buck = tree1_ptr(heap1, (r-1)/2)[bucketid]; // OPTIMIZE BY UPDATING PREVIOUS - uint32_t bsize = equi_getnslots(nslots, r-1, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; // OPTIMIZE BY UPDATING PREVIOUS - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; -#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE) - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = 
(bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; -#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4 - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4) - | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4; -#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4 - xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; -#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6 - xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 6) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 2; -#else -#error not implemented -#endif - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; -#ifdef XINTREE - tree xort = tree_create4(bucketid, s0, s1, xhash); -#else - tree xort = tree_create3(bucketid, s0, s1); -#endif -// __global slot0 *xs = &htl.hta.trees0[r/2][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, r/2)[xorbucketid][xorslot]; - xs->attr = xort; - for (uint32_t i=htl.dunits; i < htl.prevhashunits; i++) - xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word; - } - } - } -} - - -#ifdef UNROLL - -__kernel void digit_1(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) -{ - htlayout htl = htlayout_create_2(1); - collisiondata cd; - const uint32_t id = get_global_id(0); - - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); - __global slot0 *buck = tree0_ptr(heap0, 0)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 0, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 
*pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[0][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 0)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word; - xs->hash[4].word = pslot0->hash[5].word ^ pslot1->hash[5].word; - } - } - } -} -__kernel void digit_2(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(2); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[0][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 0)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 1, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t 
xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); - // __global slot0 *xs = &htl.hta.trees0[1][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 1)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word; - xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - xs->hash[4].word = pslot0->hash[4].word ^ pslot1->hash[4].word; - } - } - } -} -__kernel void digit_3(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(3); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[1][bucketid]; - __global slot0 *buck = tree0_ptr(heap0, 1)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 2, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = 
((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[1][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 1)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word; - } - } - } -} -__kernel void digit_4(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(4); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[1][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 1)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 3, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const 
uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot0 *xs = &htl.hta.trees0[2][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 2)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word; - xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - } - } - } -} -__kernel void digit_5(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(5); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[2][bucketid]; - __global slot0 *buck = tree0_ptr(heap0, 2)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 4, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = 
&htl.hta.trees1[2][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 2)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - } - } - } -} -__kernel void digit_6(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(6); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[2][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 2)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 5, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot0 *xs = &htl.hta.trees0[3][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 3)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - } - } - } -} 
-__kernel void digit_7(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(7); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[3][bucketid]; - __global slot0 *buck = tree0_ptr(heap0, 3)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 6, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[3][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 3)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word; - xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - } - } - } -} -__kernel void digit_8(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(8); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - 
collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[3][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 3)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 7, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; // OPTIMIZE BY UPDATING PREVIOUS - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot0 *xs = &htl.hta.trees0[4][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 4)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - } - } - } -} -#endif //UNROLL - -__kernel void digitK(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots, - __global proof *sols, - __global uint32_t *nsols) { - collisiondata cd; - htlayout htl = htlayout_create_2(WK); - const uint32_t id = get_global_id(0); - for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); - __global slot0 *buck = tree0_ptr(heap0, (WK-1)/2)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, WK-1, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, 
pslot1))) // assume WK odd - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) { - tree xort = tree_create3(bucketid, s0, s1); - equi_candidate(heap0, heap1, sols, nsols, xort); - } - } - } - } -} diff --git a/3rdparty/amd_silentarmy_kernels/zcash/gpu/kernel.cl b/3rdparty/amd_silentarmy_kernels/zcash/gpu/kernel.cl deleted file mode 100644 index 0fdc74d83..000000000 --- a/3rdparty/amd_silentarmy_kernels/zcash/gpu/kernel.cl +++ /dev/null @@ -1,555 +0,0 @@ -# 1 "input.cl" -# 1 "" -# 1 "" -# 1 "/usr/include/stdc-predef.h" 1 3 4 -# 1 "" 2 -# 1 "input.cl" -# 1 "param.h" 1 -# 60 "param.h" -typedef struct sols_s -{ - uint nr; - uint likely_invalids; - uchar valid[2000]; - uint values[2000][(1 << 9)]; -} sols_t; -# 2 "input.cl" 2 -# 36 "input.cl" -__constant ulong blake_iv[] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, -}; - - - - -__kernel -void kernel_init_ht(__global char *ht) -{ - uint tid = get_global_id(0); - *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0; -} -# 80 "input.cl" -uint ht_store(uint round, __global char *ht, uint i, - ulong xi0, ulong xi1, ulong xi2, ulong xi3) -{ - uint row; - __global char *p; - uint cnt; -# 111 "input.cl" - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); - else - row = ((xi0 & 0xf0000) >> 0) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); - - - - xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); - xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); - xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); - p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32; - cnt = atomic_inc((__global uint *)p); - if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)) - return 1; - 
p += cnt * 32 + (8 + ((round) / 2) * 4); - - *(__global uint *)(p - 4) = i; - if (round == 0 || round == 1) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global ulong *)(p + 16) = xi2; - } - else if (round == 2) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global uint *)(p + 16) = xi2; - } - else if (round == 3 || round == 4) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - - } - else if (round == 5) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global uint *)(p + 8) = xi1; - } - else if (round == 6 || round == 7) - { - - *(__global ulong *)(p + 0) = xi0; - } - else if (round == 8) - { - - *(__global uint *)(p + 0) = xi0; - } - return 0; -} -# 188 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -void kernel_round0(__global ulong *blake_state, __global char *ht, - __global uint *debug) -{ - uint tid = get_global_id(0); - ulong v[16]; - uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0); - uint input = tid * inputs_per_thread; - uint input_end = (tid + 1) * inputs_per_thread; - uint dropped = 0; - while (input < input_end) - { - - - ulong word1 = (ulong)input << 32; - - v[0] = blake_state[0]; - v[1] = blake_state[1]; - v[2] = blake_state[2]; - v[3] = blake_state[3]; - v[4] = blake_state[4]; - v[5] = blake_state[5]; - v[6] = blake_state[6]; - v[7] = blake_state[7]; - v[8] = blake_iv[0]; - v[9] = blake_iv[1]; - v[10] = blake_iv[2]; - v[11] = blake_iv[3]; - v[12] = blake_iv[4]; - v[13] = blake_iv[5]; - v[14] = blake_iv[6]; - v[15] = blake_iv[7]; - - v[12] ^= 140 + 4 ; - - v[14] ^= -1; - - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = 
rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), 
(ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 
16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 
0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = 
rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), 
(ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = 
(v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); 
v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = 
rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ 
v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); 
v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 
0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ 
v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); 
v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - - - ulong h[7]; - h[0] = blake_state[0] ^ v[0] ^ v[8]; - h[1] = blake_state[1] ^ v[1] ^ v[9]; - h[2] = blake_state[2] ^ v[2] ^ v[10]; - h[3] = blake_state[3] ^ 
v[3] ^ v[11]; - h[4] = blake_state[4] ^ v[4] ^ v[12]; - h[5] = blake_state[5] ^ v[5] ^ v[13]; - h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; - - - - dropped += ht_store(0, ht, input * 2, - h[0], - h[1], - h[2], - h[3]); - dropped += ht_store(0, ht, input * 2 + 1, - (h[3] >> 8) | (h[4] << (64 - 8)), - (h[4] >> 8) | (h[5] << (64 - 8)), - (h[5] >> 8) | (h[6] << (64 - 8)), - (h[6] >> 8)); - - - - - input++; - } - - - - -} -# 415 "input.cl" -uint xor_and_store(uint round, __global char *ht_dst, uint row, - uint slot_a, uint slot_b, __global ulong *a, __global ulong *b) -{ - ulong xi0, xi1, xi2; - - - - if (round == 1 || round == 2) - { - - xi0 = *(a++) ^ *(b++); - xi1 = *(a++) ^ *(b++); - xi2 = *a ^ *b; - if (round == 2) - { - - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); - xi2 = (xi2 >> 8); - } - } - else if (round == 3) - { - - xi0 = *a++ ^ *b++; - xi1 = *a++ ^ *b++; - xi2 = *(__global uint *)a ^ *(__global uint *)b; - } - else if (round == 4 || round == 5) - { - - xi0 = *a++ ^ *b++; - xi1 = *a ^ *b; - xi2 = 0; - if (round == 4) - { - - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 6) - { - - xi0 = *a++ ^ *b++; - xi1 = *(__global uint *)a ^ *(__global uint *)b; - xi2 = 0; - if (round == 6) - { - - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 7 || round == 8) - { - - xi0 = *a ^ *b; - xi1 = 0; - xi2 = 0; - if (round == 8) - { - - xi0 = (xi0 >> 8); - } - } - - - if (!xi0 && !xi1) - return 0; - - - - return ht_store(round, ht_dst, ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)), - xi0, xi1, xi2, 0); -} - - - - - -void equihash_round(uint round, __global char *ht_src, __global char *ht_dst, - __global uint *debug) -{ - uint tid = get_global_id(0); - uint tlid = get_local_id(0); - __global char *p; - uint cnt; - uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)]; - uchar mask; - uint i, j; - - - ushort collisions[((1 << (((200 / (9 + 1)) 
+ 1) - 20)) * 9) * 3]; - uint nr_coll = 0; - uint n; - uint dropped_coll, dropped_stor; - __global ulong *a, *b; - uint xi_offset; - - xi_offset = (8 + ((round - 1) / 2) * 4); -# 524 "input.cl" - mask = 0; - - - - p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32); - cnt = *(__global uint *)p; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)); - p += xi_offset; - for (i = 0; i < cnt; i++, p += 32) - first_words[i] = *(__global uchar *)p; - - nr_coll = 0; - dropped_coll = 0; - for (i = 0; i < cnt; i++) - for (j = i + 1; j < cnt; j++) - if ((first_words[i] & mask) == - (first_words[j] & mask)) - { - - if (nr_coll >= sizeof (collisions) / sizeof (*collisions)) - dropped_coll++; - else - - - collisions[nr_coll++] = - ((ushort)j << 8) | ((ushort)i & 0xff); - - - - } - - dropped_stor = 0; - for (n = 0; n < nr_coll; n++) - { - i = collisions[n] & 0xff; - j = collisions[n] >> 8; - a = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + i * 32 + xi_offset); - b = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + j * 32 + xi_offset); - dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b); - } - if (round < 8) - - *(__global uint *)(ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0; - - - - -} -# 585 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void 
kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); } - - -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -void kernel_round8(__global char *ht_src, __global char *ht_dst, - __global uint *debug, __global sols_t *sols) -{ - uint tid = get_global_id(0); - equihash_round(8, ht_src, ht_dst, debug); - if (!tid) - sols->nr = sols->likely_invalids = 0; -} - -uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) -{ - return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + - slot * 32 + xi_offset - 4); -} - -void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs, - uint round) -{ - __global char *ht = htabs[round % 2]; - uint i = nr_inputs - 1; - uint j = nr_inputs * 2 - 1; - uint xi_offset = (8 + ((round) / 2) * 4); - do - { - ins[j] = expand_ref(ht, xi_offset, - (ins[i] >> 12), ((ins[i] >> 6) & 0x3f)); - ins[j - 1] = expand_ref(ht, xi_offset, - (ins[i] >> 12), (ins[i] & 0x3f)); - if (!i) - break ; - i--; - j -= 2; - } - while (1); -} - - - - -void potential_sol(__global char **htabs, __global sols_t *sols, - uint ref0, uint ref1) -{ - uint sol_i; - uint nr_values; - sol_i = atomic_inc(&sols->nr); - if (sol_i >= 2000) - return ; - sols->valid[sol_i] = 0; - nr_values = 0; - sols->values[sol_i][nr_values++] = ref0; - sols->values[sol_i][nr_values++] = ref1; - 
uint round = 9 - 1; - do - { - round--; - expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round); - nr_values *= 2; - } - while (round > 0); - sols->valid[sol_i] = 1; -} - - - - -__kernel -void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols) -{ - uint tid = get_global_id(0); - __global char *htabs[2] = { ht0, ht1 }; - uint ht_i = (9 - 1) % 2; - uint cnt; - uint xi_offset = (8 + ((9 - 1) / 2) * 4); - uint i, j; - __global char *a, *b; - uint ref_i, ref_j; - - - ulong collisions[5]; - uint coll; - - - - uint mask = 0xffffff; - - - - a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32; - cnt = *(__global uint *)a; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)); - coll = 0; - a += xi_offset; - for (i = 0; i < cnt; i++, a += 32) - for (j = i + 1, b = a + 32; j < cnt; j++, b += 32) - if (((*(__global uint *)a) & mask) == - ((*(__global uint *)b) & mask)) - { - ref_i = *(__global uint *)(a - 4); - ref_j = *(__global uint *)(b - 4); - if (coll < sizeof (collisions) / sizeof (*collisions)) - collisions[coll++] = ((ulong)ref_i << 32) | ref_j; - else - atomic_inc(&sols->likely_invalids); - } - if (!coll) - return ; - for (i = 0; i < coll; i++) - potential_sol(htabs, sols, collisions[i] >> 32, - collisions[i] & 0xffffffff); -} diff --git a/3rdparty/kernel_R92xx_linux_14.4/equiw200k9.bin b/3rdparty/kernel_R92xx_linux_14.4/equiw200k9.bin deleted file mode 100644 index 45785dc4e..000000000 Binary files a/3rdparty/kernel_R92xx_linux_14.4/equiw200k9.bin and /dev/null differ diff --git a/3rdparty/kernel_RX4xx_windows_16.7/equiw200k9.bin b/3rdparty/kernel_RX4xx_windows_16.7/equiw200k9.bin deleted file mode 100644 index 868842f93..000000000 Binary files a/3rdparty/kernel_RX4xx_windows_16.7/equiw200k9.bin and /dev/null differ diff --git a/3rdparty/silentarmy/16_kernel.cl b/3rdparty/silentarmy/16_kernel.cl deleted file mode 100644 index b7d23e4bd..000000000 --- a/3rdparty/silentarmy/16_kernel.cl +++ /dev/null @@ 
-1,526 +0,0 @@ -# 1 "input.cl" -# 1 "" -# 1 "" -# 1 "/usr/include/stdc-predef.h" 1 3 4 -# 1 "" 2 -# 1 "input.cl" -# 1 "param.h" 1 -# 60 "param.h" -typedef struct sols_s -{ - uint nr; - uint likely_invalidss; - uchar valid[2000]; - uint values[2000][(1 << 9)]; -} sols_t; -# 2 "input.cl" 2 -# 35 "input.cl" -__constant ulong blake_iv[] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, -}; - - - - -__kernel -void kernel_init_ht(__global char *ht) -{ - uint tid = get_global_id(0); - *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32) = 0; -} -# 79 "input.cl" -uint ht_store(uint round, __global char *ht, uint i, - ulong xi0, ulong xi1, ulong xi2, ulong xi3) -{ - uint row; - __global char *p; - uint cnt; - - if (!(round % 2)) - row = (xi0 & 0xffff); - else - - - - - row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -# 119 "input.cl" - xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); - xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); - xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); - p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32; - cnt = atomic_inc((__global uint *)p); - if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3)) - return 1; - p += cnt * 32 + (8 + ((round) / 2) * 4); - - *(__global uint *)(p - 4) = i; - if (round == 0 || round == 1) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global ulong *)(p + 16) = xi2; - } - else if (round == 2) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global uint *)(p + 16) = xi2; - } - else if (round == 3 || round == 4) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - - } - else if (round == 5) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global uint *)(p + 8) = xi1; - } - else if (round == 6 || round == 7) - { - - *(__global ulong 
*)(p + 0) = xi0; - } - else if (round == 8) - { - - *(__global uint *)(p + 0) = xi0; - } - return 0; -} -# 187 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -void kernel_round0(__global ulong *blake_state, __global char *ht, - __global uint *debug) -{ - uint tid = get_global_id(0); - ulong v[16]; - uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0); - uint input = tid * inputs_per_thread; - uint input_end = (tid + 1) * inputs_per_thread; - uint dropped = 0; - while (input < input_end) - { - - - ulong word1 = (ulong)input << 32; - - v[0] = blake_state[0]; - v[1] = blake_state[1]; - v[2] = blake_state[2]; - v[3] = blake_state[3]; - v[4] = blake_state[4]; - v[5] = blake_state[5]; - v[6] = blake_state[6]; - v[7] = blake_state[7]; - v[8] = blake_iv[0]; - v[9] = blake_iv[1]; - v[10] = blake_iv[2]; - v[11] = blake_iv[3]; - v[12] = blake_iv[4]; - v[13] = blake_iv[5]; - v[14] = blake_iv[6]; - v[15] = blake_iv[7]; - - v[12] ^= 140 + 4 ; - - v[14] ^= -1; - - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = 
rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), 
(ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - 
v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + 
v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = 
rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), 
(ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = 
(v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); 
v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), 
(ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 
16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + 
v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = 
rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), 
(ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] 
+ v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - - - ulong h[7]; - h[0] = blake_state[0] ^ v[0] ^ v[8]; - h[1] = blake_state[1] ^ v[1] ^ v[9]; - h[2] = blake_state[2] ^ v[2] ^ v[10]; - h[3] = blake_state[3] ^ v[3] ^ v[11]; - h[4] = blake_state[4] ^ v[4] ^ v[12]; - h[5] = blake_state[5] ^ v[5] ^ v[13]; - h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; - - - - dropped += ht_store(0, ht, input * 2, - h[0], - h[1], - h[2], - h[3]); - dropped += ht_store(0, ht, input * 2 + 1, - (h[3] >> 8) | (h[4] << (64 - 8)), - (h[4] >> 8) | (h[5] << (64 - 8)), - (h[5] >> 8) | (h[6] << (64 - 8)), - (h[6] >> 8)); - - - - - input++; - } - - - - -} -# 409 "input.cl" -uint xor_and_store(uint round, __global char *ht_dst, uint row, - uint slot_a, uint slot_b, __global ulong *a, __global ulong *b) -{ - ulong xi0, xi1, xi2; - - - - if (round == 1 || round == 2) - { - - - xi0 = *(a++) ^ *(b++); - xi1 
= *(a++) ^ *(b++); - xi2 = *a ^ *b; - } - else if (round == 3) - { - - xi0 = *a++ ^ *b++; - xi1 = *a++ ^ *b++; - xi2 = *(__global uint *)a ^ *(__global uint *)b; - } - else if (round == 4 || round == 5) - { - - xi0 = *a++ ^ *b++; - xi1 = *a ^ *b; - xi2 = 0; - } - else if (round == 6) - { - - xi0 = *a++ ^ *b++; - xi1 = *(__global uint *)a ^ *(__global uint *)b; - xi2 = 0; - } - else if (round == 7 || round == 8) - { - - xi0 = *a ^ *b; - xi1 = 0; - xi2 = 0; - } - - - if (!xi0 && !xi1) - return 0; - - - - return ht_store(round, ht_dst, ((row << 16) | ((slot_b & 0xff) << 8) | (slot_a & 0xff)), - xi0, xi1, xi2, 0); -} - - - - - -void equihash_round(uint round, __global char *ht_src, __global char *ht_dst, - __global uint *debug) -{ - uint tid = get_global_id(0); - uint tlid = get_local_id(0); - __global char *p; - uint cnt; - uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 16)) * 3)]; - uchar mask; - uint i, j; - - - ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 2]; - uint nr_coll = 0; - uint n; - uint dropped_coll, dropped_stor; - __global ulong *a, *b; - uint xi_offset; - - xi_offset = (8 + ((round - 1) / 2) * 4); - - - mask = ((!(round % 2)) ? 0x0f : 0xf0); -# 499 "input.cl" - p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32); - cnt = *(__global uint *)p; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 16)) * 3)); - p += xi_offset; - for (i = 0; i < cnt; i++, p += 32) - first_words[i] = *(__global uchar *)p; - - nr_coll = 0; - dropped_coll = 0; - for (i = 0; i < cnt; i++) - for (j = i + 1; j < cnt; j++) - if ((first_words[i] & mask) == - (first_words[j] & mask)) - { - - if (nr_coll >= sizeof (collisions) / sizeof (*collisions)) - dropped_coll++; - else - - - collisions[nr_coll++] = - ((ushort)j << 8) | ((ushort)i & 0xff); - - - - } - - uint adj = (!(round % 2)) ? 
1 : 0; - - dropped_stor = 0; - for (n = 0; n < nr_coll; n++) - { - i = collisions[n] & 0xff; - j = collisions[n] >> 8; - a = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32 + i * 32 + xi_offset - + adj); - b = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32 + j * 32 + xi_offset - + adj); - dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b); - } - - - - -} -# 557 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round8(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(8, ht_src, ht_dst, debug); } - -uint 
expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) -{ - return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32 + - slot * 32 + xi_offset - 4); -} - -void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs, - uint round) -{ - __global char *ht = htabs[round % 2]; - uint i = nr_inputs - 1; - uint j = nr_inputs * 2 - 1; - uint xi_offset = (8 + ((round) / 2) * 4); - do - { - ins[j] = expand_ref(ht, xi_offset, - (ins[i] >> 16), ((ins[i] >> 8) & 0xff)); - ins[j - 1] = expand_ref(ht, xi_offset, - (ins[i] >> 16), (ins[i] & 0xff)); - if (!i) - break ; - i--; - j -= 2; - } - while (1); -} - - - - -void potential_sol(__global char **htabs, __global sols_t *sols, - uint ref0, uint ref1) -{ - uint sol_i; - uint nr_values; - sol_i = atomic_inc(&sols->nr); - if (sol_i >= 2000) - return ; - sols->valid[sol_i] = 0; - nr_values = 0; - sols->values[sol_i][nr_values++] = ref0; - sols->values[sol_i][nr_values++] = ref1; - uint round = 9 - 1; - do - { - round--; - expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round); - nr_values *= 2; - } - while (round > 0); - sols->valid[sol_i] = 1; -} - - - - -__kernel -void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols) -{ - uint tid = get_global_id(0); - __global char *htabs[2] = { ht0, ht1 }; - uint ht_i = (9 - 1) % 2; - uint cnt; - uint xi_offset = (8 + ((9 - 1) / 2) * 4); - uint i, j; - __global char *a, *b; - uint ref_i, ref_j; - - - ulong collisions[5]; - uint coll; - - - - uint mask = 0xffffff; - - - - if (tid == 0) - sols->nr = sols->likely_invalidss = 0; - mem_fence(CLK_GLOBAL_MEM_FENCE); - a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 16)) * 3) * 32; - cnt = *(__global uint *)a; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 16)) * 3)); - coll = 0; - a += xi_offset; - for (i = 0; i < cnt; i++, a += 32) - for (j = i + 1, b = a + 32; j < cnt; j++, b += 32) - if (((*(__global uint *)a) & mask) == - ((*(__global uint *)b) & 
mask)) - { - ref_i = *(__global uint *)(a - 4); - ref_j = *(__global uint *)(b - 4); - if (coll < sizeof (collisions) / sizeof (*collisions)) - collisions[coll++] = ((ulong)ref_i << 32) | ref_j; - else - atomic_inc(&sols->likely_invalidss); - } - if (!coll) - return ; - for (i = 0; i < coll; i++) - potential_sol(htabs, sols, collisions[i] >> 32, - collisions[i] & 0xffffffff); -} diff --git a/3rdparty/silentarmy/19_kernel.cl b/3rdparty/silentarmy/19_kernel.cl deleted file mode 100644 index fd0f29a7a..000000000 --- a/3rdparty/silentarmy/19_kernel.cl +++ /dev/null @@ -1,531 +0,0 @@ -# 1 "input.cl" -# 1 "" -# 1 "" -# 1 "/usr/include/stdc-predef.h" 1 3 4 -# 1 "" 2 -# 1 "input.cl" -# 1 "param.h" 1 -# 60 "param.h" -typedef struct sols_s -{ - uint nr; - uint likely_invalidss; - uchar valid[2000]; - uint values[2000][(1 << 9)]; -} sols_t; -# 2 "input.cl" 2 -# 35 "input.cl" -__constant ulong blake_iv[] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, -}; - - - - -__kernel -void kernel_init_ht(__global char *ht) -{ - uint tid = get_global_id(0); - *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32) = 0; -} -# 79 "input.cl" -uint ht_store(uint round, __global char *ht, uint i, - ulong xi0, ulong xi1, ulong xi2, ulong xi3) -{ - uint row; - __global char *p; - uint cnt; -# 103 "input.cl" - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5); - else - row = ((xi0 & 0xe0000) >> 1) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -# 119 "input.cl" - xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); - xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); - xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); - p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32; - cnt = atomic_inc((__global uint *)p); - if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9)) - return 1; - p += cnt * 32 + (8 + 
((round) / 2) * 4); - - *(__global uint *)(p - 4) = i; - if (round == 0 || round == 1) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global ulong *)(p + 16) = xi2; - } - else if (round == 2) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global uint *)(p + 16) = xi2; - } - else if (round == 3 || round == 4) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - - } - else if (round == 5) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global uint *)(p + 8) = xi1; - } - else if (round == 6 || round == 7) - { - - *(__global ulong *)(p + 0) = xi0; - } - else if (round == 8) - { - - *(__global uint *)(p + 0) = xi0; - } - return 0; -} -# 187 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -void kernel_round0(__global ulong *blake_state, __global char *ht, - __global uint *debug) -{ - uint tid = get_global_id(0); - ulong v[16]; - uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0); - uint input = tid * inputs_per_thread; - uint input_end = (tid + 1) * inputs_per_thread; - uint dropped = 0; - while (input < input_end) - { - - - ulong word1 = (ulong)input << 32; - - v[0] = blake_state[0]; - v[1] = blake_state[1]; - v[2] = blake_state[2]; - v[3] = blake_state[3]; - v[4] = blake_state[4]; - v[5] = blake_state[5]; - v[6] = blake_state[6]; - v[7] = blake_state[7]; - v[8] = blake_iv[0]; - v[9] = blake_iv[1]; - v[10] = blake_iv[2]; - v[11] = blake_iv[3]; - v[12] = blake_iv[4]; - v[13] = blake_iv[5]; - v[14] = blake_iv[6]; - v[15] = blake_iv[7]; - - v[12] ^= 140 + 4 ; - - v[14] ^= -1; - - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 
- 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + 
v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = 
rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), 
(ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] 
= (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = 
rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), 
(ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - 
v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + 
v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = 
rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), 
(ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); 
v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); 
v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), 
(ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - - - ulong h[7]; - h[0] = blake_state[0] ^ v[0] ^ v[8]; - h[1] = blake_state[1] ^ v[1] ^ v[9]; - h[2] = blake_state[2] ^ v[2] ^ v[10]; - h[3] = blake_state[3] ^ v[3] ^ v[11]; - h[4] = blake_state[4] ^ v[4] ^ v[12]; - 
h[5] = blake_state[5] ^ v[5] ^ v[13]; - h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; - - - - dropped += ht_store(0, ht, input * 2, - h[0], - h[1], - h[2], - h[3]); - dropped += ht_store(0, ht, input * 2 + 1, - (h[3] >> 8) | (h[4] << (64 - 8)), - (h[4] >> 8) | (h[5] << (64 - 8)), - (h[5] >> 8) | (h[6] << (64 - 8)), - (h[6] >> 8)); - - - - - input++; - } - - - - -} -# 409 "input.cl" -uint xor_and_store(uint round, __global char *ht_dst, uint row, - uint slot_a, uint slot_b, __global ulong *a, __global ulong *b) -{ - ulong xi0, xi1, xi2; - - - - if (round == 1 || round == 2) - { - - - xi0 = *(a++) ^ *(b++); - xi1 = *(a++) ^ *(b++); - xi2 = *a ^ *b; - } - else if (round == 3) - { - - xi0 = *a++ ^ *b++; - xi1 = *a++ ^ *b++; - xi2 = *(__global uint *)a ^ *(__global uint *)b; - } - else if (round == 4 || round == 5) - { - - xi0 = *a++ ^ *b++; - xi1 = *a ^ *b; - xi2 = 0; - } - else if (round == 6) - { - - xi0 = *a++ ^ *b++; - xi1 = *(__global uint *)a ^ *(__global uint *)b; - xi2 = 0; - } - else if (round == 7 || round == 8) - { - - xi0 = *a ^ *b; - xi1 = 0; - xi2 = 0; - } - - - if (!xi0 && !xi1) - return 0; - - - - return ht_store(round, ht_dst, ((row << 13) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)), - xi0, xi1, xi2, 0); -} - - - - - -void equihash_round(uint round, __global char *ht_src, __global char *ht_dst, - __global uint *debug) -{ - uint tid = get_global_id(0); - uint tlid = get_local_id(0); - __global char *p; - uint cnt; - uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 19)) * 9)]; - uchar mask; - uint i, j; - - - ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 2]; - uint nr_coll = 0; - uint n; - uint dropped_coll, dropped_stor; - __global ulong *a, *b; - uint xi_offset; - - xi_offset = (8 + ((round - 1) / 2) * 4); - - - - - - - mask = ((!(round % 2)) ? 
0x01 : 0x10); - - - - - - p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32); - cnt = *(__global uint *)p; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 19)) * 9)); - p += xi_offset; - for (i = 0; i < cnt; i++, p += 32) - first_words[i] = *(__global uchar *)p; - - nr_coll = 0; - dropped_coll = 0; - for (i = 0; i < cnt; i++) - for (j = i + 1; j < cnt; j++) - if ((first_words[i] & mask) == - (first_words[j] & mask)) - { - - if (nr_coll >= sizeof (collisions) / sizeof (*collisions)) - dropped_coll++; - else - - - collisions[nr_coll++] = - ((ushort)j << 8) | ((ushort)i & 0xff); - - - - } - - uint adj = (!(round % 2)) ? 1 : 0; - - dropped_stor = 0; - for (n = 0; n < nr_coll; n++) - { - i = collisions[n] & 0xff; - j = collisions[n] >> 8; - a = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32 + i * 32 + xi_offset - + adj); - b = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32 + j * 32 + xi_offset - + adj); - dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b); - } - - - - -} -# 557 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, 
__global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round8(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(8, ht_src, ht_dst, debug); } - -uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) -{ - return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32 + - slot * 32 + xi_offset - 4); -} - -void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs, - uint round) -{ - __global char *ht = htabs[round % 2]; - uint i = nr_inputs - 1; - uint j = nr_inputs * 2 - 1; - uint xi_offset = (8 + ((round) / 2) * 4); - do - { - ins[j] = expand_ref(ht, xi_offset, - (ins[i] >> 13), ((ins[i] >> 6) & 0x3f)); - ins[j - 1] = expand_ref(ht, xi_offset, - (ins[i] >> 13), (ins[i] & 0x3f)); - if (!i) - break ; - i--; - j -= 2; - } - while (1); -} - - - - -void potential_sol(__global char **htabs, __global sols_t *sols, - uint ref0, uint ref1) -{ - uint sol_i; - uint nr_values; - sol_i = atomic_inc(&sols->nr); - if (sol_i >= 2000) - return ; - sols->valid[sol_i] = 0; - nr_values = 0; - sols->values[sol_i][nr_values++] = ref0; - sols->values[sol_i][nr_values++] = ref1; - uint round = 9 - 1; - do - { - round--; - expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round); - nr_values *= 2; - } - while (round > 0); - sols->valid[sol_i] = 1; -} - - - - -__kernel -void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols) -{ - uint tid = get_global_id(0); - __global char *htabs[2] = { ht0, ht1 }; - uint ht_i = (9 
- 1) % 2; - uint cnt; - uint xi_offset = (8 + ((9 - 1) / 2) * 4); - uint i, j; - __global char *a, *b; - uint ref_i, ref_j; - - - ulong collisions[5]; - uint coll; - - - - uint mask = 0xffffff; - - - - if (tid == 0) - sols->nr = sols->likely_invalidss = 0; - mem_fence(CLK_GLOBAL_MEM_FENCE); - a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 19)) * 9) * 32; - cnt = *(__global uint *)a; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 19)) * 9)); - coll = 0; - a += xi_offset; - for (i = 0; i < cnt; i++, a += 32) - for (j = i + 1, b = a + 32; j < cnt; j++, b += 32) - if (((*(__global uint *)a) & mask) == - ((*(__global uint *)b) & mask)) - { - ref_i = *(__global uint *)(a - 4); - ref_j = *(__global uint *)(b - 4); - if (coll < sizeof (collisions) / sizeof (*collisions)) - collisions[coll++] = ((ulong)ref_i << 32) | ref_j; - else - atomic_inc(&sols->likely_invalidss); - } - if (!coll) - return ; - for (i = 0; i < coll; i++) - potential_sol(htabs, sols, collisions[i] >> 32, - collisions[i] & 0xffffffff); -} diff --git a/3rdparty/silentarmy/kernel.cl b/3rdparty/silentarmy/kernel.cl deleted file mode 100644 index 2099bd049..000000000 --- a/3rdparty/silentarmy/kernel.cl +++ /dev/null @@ -1,526 +0,0 @@ -# 1 "input.cl" -# 1 "" -# 1 "" -# 1 "/usr/include/stdc-predef.h" 1 3 4 -# 1 "" 2 -# 1 "input.cl" -# 1 "param.h" 1 -# 60 "param.h" -typedef struct sols_s -{ - uint nr; - uint likely_invalidss; - uchar valid[2000]; - uint values[2000][(1 << 9)]; -} sols_t; -# 2 "input.cl" 2 -# 35 "input.cl" -__constant ulong blake_iv[] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, -}; - - - - -__kernel -void kernel_init_ht(__global char *ht) -{ - uint tid = get_global_id(0); - *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32) = 0; -} -# 79 "input.cl" -uint ht_store(uint round, __global char *ht, uint i, - ulong xi0, ulong xi1, 
ulong xi2, ulong xi3) -{ - uint row; - __global char *p; - uint cnt; -# 110 "input.cl" - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); - else - row = ((xi0 & 0xf0000) >> 0) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); - - - - xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); - xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); - xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); - p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32; - cnt = atomic_inc((__global uint *)p); - if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13)) - return 1; - p += cnt * 32 + (8 + ((round) / 2) * 4); - - *(__global uint *)(p - 4) = i; - if (round == 0 || round == 1) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global ulong *)(p + 16) = xi2; - } - else if (round == 2) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global uint *)(p + 16) = xi2; - } - else if (round == 3 || round == 4) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - - } - else if (round == 5) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global uint *)(p + 8) = xi1; - } - else if (round == 6 || round == 7) - { - - *(__global ulong *)(p + 0) = xi0; - } - else if (round == 8) - { - - *(__global uint *)(p + 0) = xi0; - } - return 0; -} -# 187 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -void kernel_round0(__global ulong *blake_state, __global char *ht, - __global uint *debug) -{ - uint tid = get_global_id(0); - ulong v[16]; - uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0); - uint input = tid * inputs_per_thread; - uint input_end = (tid + 1) * inputs_per_thread; - uint dropped = 0; - while (input < input_end) - { - - - ulong word1 = (ulong)input << 32; - - v[0] = blake_state[0]; - v[1] = blake_state[1]; - v[2] = blake_state[2]; - v[3] = blake_state[3]; - v[4] = blake_state[4]; - v[5] = blake_state[5]; - v[6] = 
blake_state[6]; - v[7] = blake_state[7]; - v[8] = blake_iv[0]; - v[9] = blake_iv[1]; - v[10] = blake_iv[2]; - v[11] = blake_iv[3]; - v[12] = blake_iv[4]; - v[13] = blake_iv[5]; - v[14] = blake_iv[6]; - v[15] = blake_iv[7]; - - v[12] ^= 140 + 4 ; - - v[14] ^= -1; - - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = 
rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), 
(ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] 
+ v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = 
rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), 
(ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - 
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); 
v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = 
rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), 
(ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = 
(v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = 
rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ 
v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 
16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + 
v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - - - ulong h[7]; - h[0] = blake_state[0] ^ v[0] ^ v[8]; - h[1] = blake_state[1] ^ v[1] ^ v[9]; - h[2] = blake_state[2] ^ v[2] ^ v[10]; - h[3] = blake_state[3] ^ v[3] ^ v[11]; - h[4] = blake_state[4] ^ v[4] ^ v[12]; - h[5] = blake_state[5] ^ v[5] ^ v[13]; - h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; - - - - dropped += ht_store(0, ht, input * 2, - h[0], - h[1], - h[2], - h[3]); - dropped += ht_store(0, ht, input * 2 + 1, - (h[3] >> 8) | (h[4] << (64 - 8)), - (h[4] >> 8) | (h[5] << (64 - 8)), - (h[5] >> 8) | (h[6] << (64 - 8)), - (h[6] >> 8)); - - - - - input++; - } - - - - -} -# 409 "input.cl" -uint xor_and_store(uint round, __global char *ht_dst, uint row, - uint slot_a, uint slot_b, __global ulong *a, __global ulong *b) -{ - ulong xi0, xi1, xi2; - - - - if (round == 1 || round == 2) - { - - - xi0 = *(a++) ^ *(b++); - xi1 = *(a++) ^ *(b++); - xi2 = *a ^ *b; - } - else if (round == 3) - { - - xi0 = *a++ ^ *b++; - xi1 = *a++ ^ *b++; - xi2 = *(__global uint *)a ^ *(__global uint *)b; - } - else if (round == 4 || round == 5) - { - - xi0 = *a++ ^ *b++; - xi1 = *a ^ *b; - xi2 = 0; - } - else if (round == 6) - { - - xi0 = *a++ ^ *b++; - xi1 = *(__global uint *)a ^ *(__global uint *)b; - xi2 = 0; - } - else if (round == 7 || round == 8) - { - - xi0 = *a ^ *b; - xi1 = 0; - xi2 = 0; - } - - - if (!xi0 && !xi1) - return 0; - - - - return ht_store(round, ht_dst, ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)), - xi0, xi1, xi2, 0); -} - - - - - -void equihash_round(uint round, __global char *ht_src, __global char *ht_dst, - __global uint *debug) -{ - 
uint tid = get_global_id(0); - uint tlid = get_local_id(0); - __global char *p; - uint cnt; - uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 20)) * 13)]; - uchar mask; - uint i, j; - - - ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 2]; - uint nr_coll = 0; - uint n; - uint dropped_coll, dropped_stor; - __global ulong *a, *b; - uint xi_offset; - - xi_offset = (8 + ((round - 1) / 2) * 4); -# 495 "input.cl" - mask = 0; - - - - p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32); - cnt = *(__global uint *)p; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 13)); - p += xi_offset; - for (i = 0; i < cnt; i++, p += 32) - first_words[i] = *(__global uchar *)p; - - nr_coll = 0; - dropped_coll = 0; - for (i = 0; i < cnt; i++) - for (j = i + 1; j < cnt; j++) - if ((first_words[i] & mask) == - (first_words[j] & mask)) - { - - if (nr_coll >= sizeof (collisions) / sizeof (*collisions)) - dropped_coll++; - else - - - collisions[nr_coll++] = - ((ushort)j << 8) | ((ushort)i & 0xff); - - - - } - - uint adj = (!(round % 2)) ? 
1 : 0; - - dropped_stor = 0; - for (n = 0; n < nr_coll; n++) - { - i = collisions[n] & 0xff; - j = collisions[n] >> 8; - a = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32 + i * 32 + xi_offset - + adj); - b = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32 + j * 32 + xi_offset - + adj); - dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b); - } - - - - -} -# 557 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round8(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(8, ht_src, ht_dst, debug); } - -uint 
expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) -{ - return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32 + - slot * 32 + xi_offset - 4); -} - -void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs, - uint round) -{ - __global char *ht = htabs[round % 2]; - uint i = nr_inputs - 1; - uint j = nr_inputs * 2 - 1; - uint xi_offset = (8 + ((round) / 2) * 4); - do - { - ins[j] = expand_ref(ht, xi_offset, - (ins[i] >> 12), ((ins[i] >> 6) & 0x3f)); - ins[j - 1] = expand_ref(ht, xi_offset, - (ins[i] >> 12), (ins[i] & 0x3f)); - if (!i) - break ; - i--; - j -= 2; - } - while (1); -} - - - - -void potential_sol(__global char **htabs, __global sols_t *sols, - uint ref0, uint ref1) -{ - uint sol_i; - uint nr_values; - sol_i = atomic_inc(&sols->nr); - if (sol_i >= 2000) - return ; - sols->valid[sol_i] = 0; - nr_values = 0; - sols->values[sol_i][nr_values++] = ref0; - sols->values[sol_i][nr_values++] = ref1; - uint round = 9 - 1; - do - { - round--; - expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round); - nr_values *= 2; - } - while (round > 0); - sols->valid[sol_i] = 1; -} - - - - -__kernel -void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols) -{ - uint tid = get_global_id(0); - __global char *htabs[2] = { ht0, ht1 }; - uint ht_i = (9 - 1) % 2; - uint cnt; - uint xi_offset = (8 + ((9 - 1) / 2) * 4); - uint i, j; - __global char *a, *b; - uint ref_i, ref_j; - - - ulong collisions[5]; - uint coll; - - - - uint mask = 0xffffff; - - - - if (tid == 0) - sols->nr = sols->likely_invalidss = 0; - mem_fence(CLK_GLOBAL_MEM_FENCE); - a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 13) * 32; - cnt = *(__global uint *)a; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 13)); - coll = 0; - a += xi_offset; - for (i = 0; i < cnt; i++, a += 32) - for (j = i + 1, b = a + 32; j < cnt; j++, b += 32) - if (((*(__global uint *)a) & mask) == - ((*(__global uint *)b) 
& mask)) - { - ref_i = *(__global uint *)(a - 4); - ref_j = *(__global uint *)(b - 4); - if (coll < sizeof (collisions) / sizeof (*collisions)) - collisions[coll++] = ((ulong)ref_i << 32) | ref_j; - else - atomic_inc(&sols->likely_invalidss); - } - if (!coll) - return ; - for (i = 0; i < coll; i++) - potential_sol(htabs, sols, collisions[i] >> 32, - collisions[i] & 0xffffffff); -} diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..f9066756a --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,198 @@ +project(nheqminer) +cmake_minimum_required(VERSION 3.5) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # -Wall + +## Enable solvers here +#### older slower +option(USE_CPU_TROMP "USE CPU_TROMP" OFF) +option(USE_CUDA_TROMP "USE CUDA_TROMP" OFF) +#### faster +option(USE_CPU_XENONCAT "USE CPU_XENONCAT" ON) +option(USE_CUDA_DJEZO "USE CUDA_DJEZO" ON) + +## Add solvers here +if (USE_CPU_TROMP) + add_definitions(-DUSE_CPU_TROMP) + message("-- USE_CPU_TROMP DEFINED") +endif() +if (USE_CPU_XENONCAT) + add_definitions(-DUSE_CPU_XENONCAT) + message("-- USE_CPU_XENONCAT DEFINED") +endif() +if (USE_CUDA_TROMP) + add_definitions(-DUSE_CUDA_TROMP) + message("-- USE_CUDA_TROMP DEFINED") +endif() +if (USE_CUDA_DJEZO) + add_definitions(-DUSE_CUDA_DJEZO) + message("-- USE_CUDA_DJEZO DEFINED") +endif() + + +######## +# LINUX +if(CMAKE_COMPILER_IS_GNUCXX) +# # use native cpu features +# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -fPIC") +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -fPIC") + +# # optimizations +# add_definitions(-O3) + + # use + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -msse2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse2") + # optimizations + add_definitions(-O2) +endif() + +# Common +include_directories(${nheqminer_SOURCE_DIR}/nheqminer) + +# BOOST +#find_package(Threads REQUIRED COMPONENTS) +# compile boost statically +set(Boost_USE_STATIC_LIBS ON) +set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") 
+#set(BUILD_SHARED_LIBRARIES OFF) +#set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++ -static") +find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread) + +if (Boost_FOUND) + # From the official documentation: + # "Add include directories to the build. [...] If the SYSTEM option is given, + # the compiler will be told the directories are meant as system include + # directories on some platforms (signalling this setting might achieve effects + # such as the compiler skipping warnings [...])." + include_directories (SYSTEM ${Boost_INCLUDE_DIR}) + + # From the official documentation: + # "Specify directories in which the linker will look for libraries. [...] Note + # that this command is rarely necessary. Library locations returned by + # find_package() and find_library() are absolute paths. Pass these absolute + # library file paths directly to the target_link_libraries() command. CMake + # will ensure the linker finds them." + link_directories (${Boost_LIBRARY_DIRS}) +else() + message("Boost_FOUND NOT FOUND") +endif () + +include_directories(${CMAKE_CURRENT_BINARY_DIR}/../) + +set(SOURCE_FILES + # sources + nheqminer/amount.cpp + nheqminer/api.cpp + nheqminer/arith_uint256.cpp + nheqminer/crypto/sha256.cpp + nheqminer/json/json_spirit_reader.cpp + nheqminer/json/json_spirit_value.cpp + nheqminer/json/json_spirit_writer.cpp + nheqminer/libstratum/ZcashStratum.cpp + nheqminer/main.cpp + nheqminer/primitives/block.cpp + nheqminer/speed.cpp + nheqminer/uint256.cpp + nheqminer/utilstrencodings.cpp + # headers + nheqminer/amount.h + nheqminer/api.hpp + nheqminer/arith_uint256.h + nheqminer/crypto/sha256.h + nheqminer/hash.h + nheqminer/json/json_spirit.h + nheqminer/json/json_spirit_error_position.h + nheqminer/json/json_spirit_reader.h + nheqminer/json/json_spirit_reader_template.h + nheqminer/json/json_spirit_stream_reader.h + nheqminer/json/json_spirit_utils.h + nheqminer/json/json_spirit_value.h + 
nheqminer/json/json_spirit_writer.h + nheqminer/json/json_spirit_writer_template.h + nheqminer/libstratum/StratumClient.cpp + nheqminer/libstratum/StratumClient.h + nheqminer/libstratum/ZcashStratum.cpp + nheqminer/libstratum/ZcashStratum.h + nheqminer/primitives/block.h + nheqminer/primitives/transaction.h + nheqminer/script/script.h + nheqminer/serialize.h + nheqminer/speed.hpp + nheqminer/streams.h + nheqminer/support/allocators/zeroafterfree.h + nheqminer/tinyformat.h + nheqminer/uint252.h + nheqminer/uint256.h + nheqminer/utilstrencodings.h + nheqminer/version.h + nheqminer/zcash/JoinSplit.hpp + nheqminer/zcash/NoteEncryption.hpp + nheqminer/zcash/Proof.hpp + nheqminer/zcash/Zcash.h + nheqminer/SolverStub.h # just a stub + + nheqminer/AvailableSolvers.h + nheqminer/ISolver.h + nheqminer/Solver.h + nheqminer/MinerFactory.h + nheqminer/MinerFactory.cpp + + # make same path on windows + #blake shared + # src + blake2/blake2bx.cpp + # headers + blake2/blake2.h + blake2/blake2b-load-sse2.h + blake2/blake2b-load-sse41.h + blake2/blake2b-round.h + blake2/blake2-config.h + blake2/blake2-impl.h + blake2/blake2-round.h + ) + +#set(LIBS ${LIBS} ${Threads_LIBRARIES} ${Boost_LIBRARIES}) +set(LIBS ${LIBS} ${Boost_LIBRARIES}) + +message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}") +message("-- LIBS: ${LIBS}") + +if (USE_CPU_TROMP) + add_subdirectory(cpu_tromp) +endif() +if (USE_CPU_XENONCAT) + add_subdirectory(cpu_xenoncat) +endif() +if (USE_CUDA_TROMP) + add_subdirectory(cuda_tromp) +endif() +if (USE_CUDA_DJEZO) + add_subdirectory(cuda_djezo) +endif() + +#add_subdirectory(cpu_xenoncat) + +ADD_EXECUTABLE(${PROJECT_NAME} ${SOURCE_FILES}) + +#target_link_libraries(${PROJECT_NAME} ${LIBS} ${CUDA_LIBRARIES} ) +target_link_libraries(${PROJECT_NAME} ${CMAKE_THREAD_LIBS_INIT} ${LIBS} ) + +# link libs +if (USE_CPU_TROMP) + target_link_libraries(${PROJECT_NAME} cpu_tromp) +endif() +if (USE_CPU_XENONCAT) + add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL ) + set_target_properties ( 
xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx1.o" ) + add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL ) + set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../nheqminer/cpu_xenoncat/asm_linux/equihash_avx2.o" ) + target_link_libraries(${PROJECT_NAME} cpu_xenoncat xenoncat_avx1 xenoncat_avx2) +endif() +if (USE_CUDA_TROMP) + target_link_libraries(${PROJECT_NAME} cuda_tromp) +endif() +if (USE_CUDA_DJEZO) + target_link_libraries(${PROJECT_NAME} cuda_djezo) +endif() + diff --git a/Linux_cmake/nheqminer_AMD/CMakeLists.txt b/Linux_cmake/nheqminer_AMD/CMakeLists.txt deleted file mode 100644 index 40c1dabb2..000000000 --- a/Linux_cmake/nheqminer_AMD/CMakeLists.txt +++ /dev/null @@ -1,163 +0,0 @@ -project(nheqminer_AMD) -cmake_minimum_required(VERSION 2.8) - -#aux_source_directory(. SRC_LIST) -#add_executable(${PROJECT_NAME} ${SRC_LIST}) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -# LINUX -if(CMAKE_COMPILER_IS_GNUCXX) - # use native cpu features - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") - # optimizations - add_definitions(-O3) -endif() - -# Common -include_directories(${nheqminer_SOURCE_DIR}) - -add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK) - -find_package(Threads REQUIRED COMPONENTS) -find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread) - -if (Boost_FOUND) - # From the offical documentation: - # Add include directories to the build. [...] If the SYSTEM option is given, - # the compiler will be told the directories are meant as system include - # directories on some platforms (signalling this setting might achieve effects - # such as the compiler skipping warnings [...])." - include_directories (SYSTEM ${Boost_INCLUDE_DIR}) - - # From the offical documentation: - # "Specify directories in which the linker will look for libraries. [...] 
Note - # that this command is rarely necessary. Library locations returned by - # find_package() and find_library() are absolute paths. Pass these absolute - # library file paths directly to the target_link_libraries() command. CMake - # will ensure the linker finds them." - link_directories (${Boost_LIBRARY_DIRS}) -else() - message("Boost_FOUND NOT FOUND") -endif () - -## Add solvers here -#add_definitions(-DUSE_CPU_XENONCAT) -#add_definitions(-DUSE_CPU_TROMP) -add_definitions(-DUSE_OCL_XMP) -add_definitions(-DUSE_OCL_SILENTARMY) - -#add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL ) -#set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx1.o" ) - -#add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL ) -#set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx2.o" ) - -include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/) -include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../ocl_device_utils/) - -# OCL INC DIR -include_directories(${OPENCL_INCLUDE_DIRECTORY}) - - -set(SOURCE_FILES - # sources - ../../nheqminer/amount.cpp - ../../nheqminer/api.cpp - ../../nheqminer/arith_uint256.cpp - ../../nheqminer/crypto/sha256.cpp - ../../nheqminer/json/json_spirit_reader.cpp - ../../nheqminer/json/json_spirit_value.cpp - ../../nheqminer/json/json_spirit_writer.cpp - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/main.cpp - ../../nheqminer/primitives/block.cpp - ../../nheqminer/speed.cpp - ../../nheqminer/uint256.cpp - ../../nheqminer/utilstrencodings.cpp - # headers - ../../nheqminer/amount.h - ../../nheqminer/api.hpp - ../../nheqminer/arith_uint256.h - ../../nheqminer/crypto/sha256.h - ../../nheqminer/hash.h - ../../nheqminer/json/json_spirit.h - ../../nheqminer/json/json_spirit_error_position.h - ../../nheqminer/json/json_spirit_reader.h - ../../nheqminer/json/json_spirit_reader_template.h - ../../nheqminer/json/json_spirit_stream_reader.h 
- ../../nheqminer/json/json_spirit_utils.h - ../../nheqminer/json/json_spirit_value.h - ../../nheqminer/json/json_spirit_writer.h - ../../nheqminer/json/json_spirit_writer_template.h - ../../nheqminer/libstratum/StratumClient.cpp - ../../nheqminer/libstratum/StratumClient.h - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/libstratum/ZcashStratum.h - ../../nheqminer/primitives/block.h - ../../nheqminer/primitives/transaction.h - ../../nheqminer/script/script.h - ../../nheqminer/serialize.h - ../../nheqminer/speed.hpp - ../../nheqminer/streams.h - ../../nheqminer/support/allocators/zeroafterfree.h - ../../nheqminer/tinyformat.h - ../../nheqminer/uint252.h - ../../nheqminer/uint256.h - ../../nheqminer/utilstrencodings.h - ../../nheqminer/version.h - ../../nheqminer/zcash/JoinSplit.hpp - ../../nheqminer/zcash/NoteEncryption.hpp - ../../nheqminer/zcash/Proof.hpp - ../../nheqminer/zcash/Zcash.h - ../../nheqminer/SolverStub.h # just a stub - - ## cpu tromp - #../../cpu_tromp/blake2/blake2bx.cpp - #../../cpu_tromp/cpu_tromp.cpp - #../../cpu_tromp/blake2/blake2-config.h - #../../cpu_tromp/blake2/blake2-impl.h - #../../cpu_tromp/blake2/blake2-round.h - #../../cpu_tromp/blake2/blake2.h - #../../cpu_tromp/blake2/blake2b-load-sse2.h - #../../cpu_tromp/blake2/blake2b-load-sse41.h - #../../cpu_tromp/blake2/blake2b-round.h - #../../cpu_tromp/cpu_tromp.hpp - #../../cpu_tromp/equi.h - #../../cpu_tromp/equi_miner.h -# - ## cpu xenocat - #../../cpu_xenoncat/cpu_xenoncat.hpp - #../../cpu_xenoncat/xenoncat.cpp -# - # AMD ocl_device_utils - ../../ocl_device_utils/cl_ext.hpp - ../../ocl_device_utils/ocl_device_utils.h - ../../ocl_device_utils/ocl_device_utils.cpp - ../../ocl_device_utils/OpenCLDevice.h - ../../ocl_device_utils/opencl.h - ../../ocl_device_utils/opencl.cpp - # AMD ocl_xpm - ../../ocl_xpm/ocl_xmp.hpp - ../../ocl_xpm/ocl_xmp.cpp - ../../ocl_xpm/zcash/gpu/common.h - ../../cpu_tromp/blake2/blake2bx.cpp - # AMD ocl_silentarmy - 
../../ocl_silentarmy/ocl_silentarmy.hpp - ../../ocl_silentarmy/param.h - ../../ocl_silentarmy/sa_blake.h - ../../ocl_silentarmy/ocl_silentarmy.cpp - ../../ocl_silentarmy/sa_blake.cpp - ) - -#add_executable(${PROJECT_NAME} ${SRC_LIST}) - -set(LIBS ${LIBS} ${Boost_LIBRARIES} ${OPENCL_LIBRARY}) - -#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}") -#message("-- LIBS: ${LIBS}") - -add_executable(${PROJECT_NAME} ${SOURCE_FILES}) -#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES}) -target_link_libraries(${PROJECT_NAME} ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) diff --git a/Linux_cmake/nheqminer_cpu/CMakeLists.txt b/Linux_cmake/nheqminer_cpu/CMakeLists.txt deleted file mode 100644 index d4269b444..000000000 --- a/Linux_cmake/nheqminer_cpu/CMakeLists.txt +++ /dev/null @@ -1,137 +0,0 @@ -project(nheqminer_cpu) -cmake_minimum_required(VERSION 2.8) - -#aux_source_directory(. SRC_LIST) -#add_executable(${PROJECT_NAME} ${SRC_LIST}) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -# LINUX -if(CMAKE_COMPILER_IS_GNUCXX) - # use native cpu features - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") - # optimizations - add_definitions(-O3) -endif() - -# Common -include_directories(${nheqminer_SOURCE_DIR}) - -add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK) - -find_package(Threads REQUIRED COMPONENTS) -find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread) - -if (Boost_FOUND) - # From the offical documentation: - # Add include directories to the build. [...] If the SYSTEM option is given, - # the compiler will be told the directories are meant as system include - # directories on some platforms (signalling this setting might achieve effects - # such as the compiler skipping warnings [...])." - include_directories (SYSTEM ${Boost_INCLUDE_DIR}) - - # From the offical documentation: - # "Specify directories in which the linker will look for libraries. [...] 
Note - # that this command is rarely necessary. Library locations returned by - # find_package() and find_library() are absolute paths. Pass these absolute - # library file paths directly to the target_link_libraries() command. CMake - # will ensure the linker finds them." - link_directories (${Boost_LIBRARY_DIRS}) -else() - message("Boost_FOUND NOT FOUND") -endif () - -## Add solvers here -add_definitions(-DUSE_CPU_XENONCAT) -add_definitions(-DUSE_CPU_TROMP) - -add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL ) -set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx1.o" ) - -add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL ) -set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx2.o" ) - -include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/) - -set(SOURCE_FILES - # sources - ../../nheqminer/amount.cpp - ../../nheqminer/api.cpp - ../../nheqminer/arith_uint256.cpp - ../../nheqminer/crypto/sha256.cpp - ../../nheqminer/json/json_spirit_reader.cpp - ../../nheqminer/json/json_spirit_value.cpp - ../../nheqminer/json/json_spirit_writer.cpp - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/main.cpp - ../../nheqminer/primitives/block.cpp - ../../nheqminer/speed.cpp - ../../nheqminer/uint256.cpp - ../../nheqminer/utilstrencodings.cpp - # headers - ../../nheqminer/amount.h - ../../nheqminer/api.hpp - ../../nheqminer/arith_uint256.h - ../../nheqminer/crypto/sha256.h - ../../nheqminer/hash.h - ../../nheqminer/json/json_spirit.h - ../../nheqminer/json/json_spirit_error_position.h - ../../nheqminer/json/json_spirit_reader.h - ../../nheqminer/json/json_spirit_reader_template.h - ../../nheqminer/json/json_spirit_stream_reader.h - ../../nheqminer/json/json_spirit_utils.h - ../../nheqminer/json/json_spirit_value.h - ../../nheqminer/json/json_spirit_writer.h - ../../nheqminer/json/json_spirit_writer_template.h - 
../../nheqminer/libstratum/StratumClient.cpp - ../../nheqminer/libstratum/StratumClient.h - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/libstratum/ZcashStratum.h - ../../nheqminer/primitives/block.h - ../../nheqminer/primitives/transaction.h - ../../nheqminer/script/script.h - ../../nheqminer/serialize.h - ../../nheqminer/speed.hpp - ../../nheqminer/streams.h - ../../nheqminer/support/allocators/zeroafterfree.h - ../../nheqminer/tinyformat.h - ../../nheqminer/uint252.h - ../../nheqminer/uint256.h - ../../nheqminer/utilstrencodings.h - ../../nheqminer/version.h - ../../nheqminer/zcash/JoinSplit.hpp - ../../nheqminer/zcash/NoteEncryption.hpp - ../../nheqminer/zcash/Proof.hpp - ../../nheqminer/zcash/Zcash.h - ../../nheqminer/SolverStub.h # just a stub - - # cpu tromp - ../../cpu_tromp/blake2/blake2bx.cpp - ../../cpu_tromp/cpu_tromp.cpp - ../../cpu_tromp/blake2/blake2-config.h - ../../cpu_tromp/blake2/blake2-impl.h - ../../cpu_tromp/blake2/blake2-round.h - ../../cpu_tromp/blake2/blake2.h - ../../cpu_tromp/blake2/blake2b-load-sse2.h - ../../cpu_tromp/blake2/blake2b-load-sse41.h - ../../cpu_tromp/blake2/blake2b-round.h - ../../cpu_tromp/cpu_tromp.hpp - ../../cpu_tromp/equi.h - ../../cpu_tromp/equi_miner.h - - # cpu xenocat - ../../cpu_xenoncat/cpu_xenoncat.hpp - ../../cpu_xenoncat/xenoncat.cpp - ) - -#add_executable(${PROJECT_NAME} ${SRC_LIST}) - -set(LIBS ${LIBS} ${Boost_LIBRARIES}) - -#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}") -#message("-- LIBS: ${LIBS}") - -add_executable(${PROJECT_NAME} ${SOURCE_FILES}) -#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES}) -target_link_libraries(${PROJECT_NAME} ${LIBS} ${CMAKE_THREAD_LIBS_INIT} xenoncat_avx1 xenoncat_avx2 ) diff --git a/Linux_cmake/nheqminer_cpu_tromp/CMakeLists.txt b/Linux_cmake/nheqminer_cpu_tromp/CMakeLists.txt deleted file mode 100644 index 1fe33e5e6..000000000 --- a/Linux_cmake/nheqminer_cpu_tromp/CMakeLists.txt +++ /dev/null @@ -1,143 +0,0 @@ -project(nheqminer_cpu_tromp) 
-cmake_minimum_required(VERSION 2.8) - -#aux_source_directory(. SRC_LIST) -#add_executable(${PROJECT_NAME} ${SRC_LIST}) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -# LINUX -if(CMAKE_COMPILER_IS_GNUCXX) - # use native cpu features - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") - # optimizations - add_definitions(-O3) -endif() - -# Common -include_directories(${nheqminer_SOURCE_DIR}) - -add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK) - -find_package(Threads REQUIRED COMPONENTS) -find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread) - -if (Boost_FOUND) - # From the offical documentation: - # Add include directories to the build. [...] If the SYSTEM option is given, - # the compiler will be told the directories are meant as system include - # directories on some platforms (signalling this setting might achieve effects - # such as the compiler skipping warnings [...])." - include_directories (SYSTEM ${Boost_INCLUDE_DIR}) - - # From the offical documentation: - # "Specify directories in which the linker will look for libraries. [...] Note - # that this command is rarely necessary. Library locations returned by - # find_package() and find_library() are absolute paths. Pass these absolute - # library file paths directly to the target_link_libraries() command. CMake - # will ensure the linker finds them." 
- link_directories (${Boost_LIBRARY_DIRS}) -else() - message("Boost_FOUND NOT FOUND") -endif () - -## Add solvers here -add_definitions(-DUSE_CPU_TROMP) - -include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/) - -set(SOURCE_FILES - # sources - ../../nheqminer/amount.cpp - ../../nheqminer/api.cpp - ../../nheqminer/arith_uint256.cpp - ../../nheqminer/crypto/sha256.cpp - ../../nheqminer/json/json_spirit_reader.cpp - ../../nheqminer/json/json_spirit_value.cpp - ../../nheqminer/json/json_spirit_writer.cpp - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/main.cpp - ../../nheqminer/primitives/block.cpp - ../../nheqminer/speed.cpp - ../../nheqminer/uint256.cpp - ../../nheqminer/utilstrencodings.cpp - # headers - ../../nheqminer/amount.h - ../../nheqminer/api.hpp - ../../nheqminer/arith_uint256.h - ../../nheqminer/crypto/sha256.h - ../../nheqminer/hash.h - ../../nheqminer/json/json_spirit.h - ../../nheqminer/json/json_spirit_error_position.h - ../../nheqminer/json/json_spirit_reader.h - ../../nheqminer/json/json_spirit_reader_template.h - ../../nheqminer/json/json_spirit_stream_reader.h - ../../nheqminer/json/json_spirit_utils.h - ../../nheqminer/json/json_spirit_value.h - ../../nheqminer/json/json_spirit_writer.h - ../../nheqminer/json/json_spirit_writer_template.h - ../../nheqminer/libstratum/StratumClient.cpp - ../../nheqminer/libstratum/StratumClient.h - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/libstratum/ZcashStratum.h - ../../nheqminer/primitives/block.h - ../../nheqminer/primitives/transaction.h - ../../nheqminer/script/script.h - ../../nheqminer/serialize.h - ../../nheqminer/speed.hpp - ../../nheqminer/streams.h - ../../nheqminer/support/allocators/zeroafterfree.h - ../../nheqminer/tinyformat.h - ../../nheqminer/uint252.h - ../../nheqminer/uint256.h - ../../nheqminer/utilstrencodings.h - ../../nheqminer/version.h - ../../nheqminer/zcash/JoinSplit.hpp - ../../nheqminer/zcash/NoteEncryption.hpp - 
../../nheqminer/zcash/Proof.hpp - ../../nheqminer/zcash/Zcash.h - ../../nheqminer/SolverStub.h # just a stub - - # cpu tromp - ../../cpu_tromp/blake2/blake2bx.cpp - ../../cpu_tromp/cpu_tromp.cpp - ../../cpu_tromp/blake2/blake2-config.h - ../../cpu_tromp/blake2/blake2-impl.h - ../../cpu_tromp/blake2/blake2-round.h - ../../cpu_tromp/blake2/blake2.h - ../../cpu_tromp/blake2/blake2b-load-sse2.h - ../../cpu_tromp/blake2/blake2b-load-sse41.h - ../../cpu_tromp/blake2/blake2b-round.h - ../../cpu_tromp/cpu_tromp.hpp - ../../cpu_tromp/equi.h - ../../cpu_tromp/equi_miner.h - ) - -#if(USE_CPU_TROMP) -# set(SOURCE_FILES ${SOURCE_FILES} -# ../../cpu_tromp/blake2/blake2bx.cpp -# ../../cpu_tromp/cpu_tromp.cpp -# ../../cpu_tromp/blake2/blake2-config.h -# ../../cpu_tromp/blake2/blake2-impl.h -# ../../cpu_tromp/blake2/blake2-round.h -# ../../cpu_tromp/blake2/blake2.h -# ../../cpu_tromp/blake2/blake2b-load-sse2.h -# ../../cpu_tromp/blake2/blake2b-load-sse41.h -# ../../cpu_tromp/blake2/blake2b-round.h -# ../../cpu_tromp/cpu_tromp.hpp -# ../../cpu_tromp/equi.h -# ../../cpu_tromp/equi_miner.h -# ) -#endif() - -#add_executable(${PROJECT_NAME} ${SRC_LIST}) - -set(LIBS ${LIBS} ${Boost_LIBRARIES}) - -#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}") -#message("-- LIBS: ${LIBS}") - -add_executable(${PROJECT_NAME} ${SOURCE_FILES}) -#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES}) -target_link_libraries(${PROJECT_NAME} ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) diff --git a/Linux_cmake/nheqminer_cpu_xenoncat/CMakeLists.txt b/Linux_cmake/nheqminer_cpu_xenoncat/CMakeLists.txt deleted file mode 100644 index 25d58c633..000000000 --- a/Linux_cmake/nheqminer_cpu_xenoncat/CMakeLists.txt +++ /dev/null @@ -1,122 +0,0 @@ -project(nheqminer_cpu_xenoncat) -cmake_minimum_required(VERSION 2.8) - -#aux_source_directory(. 
SRC_LIST) -#add_executable(${PROJECT_NAME} ${SRC_LIST}) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -# LINUX -if(CMAKE_COMPILER_IS_GNUCXX) - # use native cpu features - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") - # optimizations - add_definitions(-O3) -endif() - -# Common -include_directories(${nheqminer_SOURCE_DIR}) - -add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK) - -find_package(Threads REQUIRED COMPONENTS) -find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread) - -if (Boost_FOUND) - # From the offical documentation: - # Add include directories to the build. [...] If the SYSTEM option is given, - # the compiler will be told the directories are meant as system include - # directories on some platforms (signalling this setting might achieve effects - # such as the compiler skipping warnings [...])." - include_directories (SYSTEM ${Boost_INCLUDE_DIR}) - - # From the offical documentation: - # "Specify directories in which the linker will look for libraries. [...] Note - # that this command is rarely necessary. Library locations returned by - # find_package() and find_library() are absolute paths. Pass these absolute - # library file paths directly to the target_link_libraries() command. CMake - # will ensure the linker finds them." 
- link_directories (${Boost_LIBRARY_DIRS}) -else() - message("Boost_FOUND NOT FOUND") -endif () - -## Add solvers here -add_definitions(-DUSE_CPU_XENONCAT) - -add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL ) -set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx1.o" ) - -add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL ) -set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx2.o" ) - -include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/) - -set(SOURCE_FILES - # sources - ../../nheqminer/amount.cpp - ../../nheqminer/api.cpp - ../../nheqminer/arith_uint256.cpp - ../../nheqminer/crypto/sha256.cpp - ../../nheqminer/json/json_spirit_reader.cpp - ../../nheqminer/json/json_spirit_value.cpp - ../../nheqminer/json/json_spirit_writer.cpp - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/main.cpp - ../../nheqminer/primitives/block.cpp - ../../nheqminer/speed.cpp - ../../nheqminer/uint256.cpp - ../../nheqminer/utilstrencodings.cpp - # headers - ../../nheqminer/amount.h - ../../nheqminer/api.hpp - ../../nheqminer/arith_uint256.h - ../../nheqminer/crypto/sha256.h - ../../nheqminer/hash.h - ../../nheqminer/json/json_spirit.h - ../../nheqminer/json/json_spirit_error_position.h - ../../nheqminer/json/json_spirit_reader.h - ../../nheqminer/json/json_spirit_reader_template.h - ../../nheqminer/json/json_spirit_stream_reader.h - ../../nheqminer/json/json_spirit_utils.h - ../../nheqminer/json/json_spirit_value.h - ../../nheqminer/json/json_spirit_writer.h - ../../nheqminer/json/json_spirit_writer_template.h - ../../nheqminer/libstratum/StratumClient.cpp - ../../nheqminer/libstratum/StratumClient.h - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/libstratum/ZcashStratum.h - ../../nheqminer/primitives/block.h - ../../nheqminer/primitives/transaction.h - ../../nheqminer/script/script.h - ../../nheqminer/serialize.h - 
../../nheqminer/speed.hpp - ../../nheqminer/streams.h - ../../nheqminer/support/allocators/zeroafterfree.h - ../../nheqminer/tinyformat.h - ../../nheqminer/uint252.h - ../../nheqminer/uint256.h - ../../nheqminer/utilstrencodings.h - ../../nheqminer/version.h - ../../nheqminer/zcash/JoinSplit.hpp - ../../nheqminer/zcash/NoteEncryption.hpp - ../../nheqminer/zcash/Proof.hpp - ../../nheqminer/zcash/Zcash.h - ../../nheqminer/SolverStub.h # just a stub - - # cpu xenocat - ../../cpu_xenoncat/cpu_xenoncat.hpp - ../../cpu_xenoncat/xenoncat.cpp - ) - -#add_executable(${PROJECT_NAME} ${SRC_LIST}) - -set(LIBS ${LIBS} ${Boost_LIBRARIES}) - -#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}") -#message("-- LIBS: ${LIBS}") - -add_executable(${PROJECT_NAME} ${SOURCE_FILES}) -#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES}) -target_link_libraries(${PROJECT_NAME} ${LIBS} ${CMAKE_THREAD_LIBS_INIT} xenoncat_avx1 xenoncat_avx2 ) diff --git a/Linux_cmake/nheqminer_cuda_tromp/CMakeLists.txt b/Linux_cmake/nheqminer_cuda_tromp/CMakeLists.txt deleted file mode 100644 index 1853b22de..000000000 --- a/Linux_cmake/nheqminer_cuda_tromp/CMakeLists.txt +++ /dev/null @@ -1,170 +0,0 @@ -project(nheqminer_cuda_tromp) -cmake_minimum_required(VERSION 2.8) - -option(ENABLE_CUDA "Enable the cuda build" ON) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -# LINUX -if(CMAKE_COMPILER_IS_GNUCXX) - # use native cpu features - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") - # optimizations - add_definitions(-O3) -endif() - -# Common -include_directories(${nheqminer_SOURCE_DIR}) - -add_definitions(-DBOOST_ALL_NO_LIB -DBOOST_ALL_DYN_LINK -DBOOST_LOG_DYN_LINK) - -#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo) - -set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo) - -add_definitions(-DHIST) 
-#add_definitions(-DXINTREE) -#add_definitions(-DUNROLL) - -list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3) - - -FIND_PACKAGE(CUDA REQUIRED) -if(COMPUTE AND (COMPUTE GREATER 0)) - LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}) -else(COMPUTE AND (COMPUTE GREATER 0)) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};; -gencode arch=compute_20,code=sm_21; -gencode arch=compute_30,code=sm_30; -gencode arch=compute_35,code=sm_35; -gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_61,code=sm_61 ) -endif(COMPUTE AND (COMPUTE GREATER 0)) - -include_directories(${CUDA_INCLUDE_DIRS}) - -find_package(Threads REQUIRED COMPONENTS) -find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread) - -if(CUDA_FOUND) -message("CUDA FOUND") -else() -message("CUDA NOT FOUND") -endif() - -if (Boost_FOUND) - # From the offical documentation: - # Add include directories to the build. [...] If the SYSTEM option is given, - # the compiler will be told the directories are meant as system include - # directories on some platforms (signalling this setting might achieve effects - # such as the compiler skipping warnings [...])." - include_directories (SYSTEM ${Boost_INCLUDE_DIR}) - - # From the offical documentation: - # "Specify directories in which the linker will look for libraries. [...] Note - # that this command is rarely necessary. Library locations returned by - # find_package() and find_library() are absolute paths. Pass these absolute - # library file paths directly to the target_link_libraries() command. CMake - # will ensure the linker finds them." 
- link_directories (${Boost_LIBRARY_DIRS}) -else() - message("Boost_FOUND NOT FOUND") -endif () - -## Add solvers here -#add_definitions(-DUSE_CPU_XENONCAT) -#add_definitions(-DUSE_CPU_TROMP) -add_definitions(-DUSE_CUDA_TROMP) - -#add_library ( xenoncat_avx1 SHARED IMPORTED GLOBAL ) -#set_target_properties ( xenoncat_avx1 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx1.o" ) - -#add_library ( xenoncat_avx2 SHARED IMPORTED GLOBAL ) -#set_target_properties ( xenoncat_avx2 PROPERTIES IMPORTED_LOCATION "../../cpu_xenoncat/Linux/asm/equihash_avx2.o" ) - -include_directories(${CMAKE_CURRENT_BINARY_DIR}/../../nheqminer/) - -set(SOURCE_FILES - # sources - ../../nheqminer/amount.cpp - ../../nheqminer/api.cpp - ../../nheqminer/arith_uint256.cpp - ../../nheqminer/crypto/sha256.cpp - ../../nheqminer/json/json_spirit_reader.cpp - ../../nheqminer/json/json_spirit_value.cpp - ../../nheqminer/json/json_spirit_writer.cpp - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/main.cpp - ../../nheqminer/primitives/block.cpp - ../../nheqminer/speed.cpp - ../../nheqminer/uint256.cpp - ../../nheqminer/utilstrencodings.cpp - # headers - ../../nheqminer/amount.h - ../../nheqminer/api.hpp - ../../nheqminer/arith_uint256.h - ../../nheqminer/crypto/sha256.h - ../../nheqminer/hash.h - ../../nheqminer/json/json_spirit.h - ../../nheqminer/json/json_spirit_error_position.h - ../../nheqminer/json/json_spirit_reader.h - ../../nheqminer/json/json_spirit_reader_template.h - ../../nheqminer/json/json_spirit_stream_reader.h - ../../nheqminer/json/json_spirit_utils.h - ../../nheqminer/json/json_spirit_value.h - ../../nheqminer/json/json_spirit_writer.h - ../../nheqminer/json/json_spirit_writer_template.h - ../../nheqminer/libstratum/StratumClient.cpp - ../../nheqminer/libstratum/StratumClient.h - ../../nheqminer/libstratum/ZcashStratum.cpp - ../../nheqminer/libstratum/ZcashStratum.h - ../../nheqminer/primitives/block.h - ../../nheqminer/primitives/transaction.h - 
../../nheqminer/script/script.h - ../../nheqminer/serialize.h - ../../nheqminer/speed.hpp - ../../nheqminer/streams.h - ../../nheqminer/support/allocators/zeroafterfree.h - ../../nheqminer/tinyformat.h - ../../nheqminer/uint252.h - ../../nheqminer/uint256.h - ../../nheqminer/utilstrencodings.h - ../../nheqminer/version.h - ../../nheqminer/zcash/JoinSplit.hpp - ../../nheqminer/zcash/NoteEncryption.hpp - ../../nheqminer/zcash/Proof.hpp - ../../nheqminer/zcash/Zcash.h - ../../nheqminer/SolverStub.h # just a stub - -# # cpu tromp -# ../../cpu_tromp/blake2/blake2bx.cpp -# ../../cpu_tromp/cpu_tromp.cpp -# ../../cpu_tromp/blake2/blake2-config.h -# ../../cpu_tromp/blake2/blake2-impl.h -# ../../cpu_tromp/blake2/blake2-round.h -# ../../cpu_tromp/blake2/blake2.h -# ../../cpu_tromp/blake2/blake2b-load-sse2.h -# ../../cpu_tromp/blake2/blake2b-load-sse41.h -# ../../cpu_tromp/blake2/blake2b-round.h -# ../../cpu_tromp/cpu_tromp.hpp -# ../../cpu_tromp/equi.h -# ../../cpu_tromp/equi_miner.h - -# # cpu xenocat -# ../../cpu_xenoncat/cpu_xenoncat.hpp -# ../../cpu_xenoncat/xenoncat.cpp - - # cuda tromp - ../../cuda_tromp/cuda_tromp.hpp - ../../cuda_tromp/cuda_tromp.cpp - ../../cuda_tromp/eqcuda.hpp - ../../cuda_tromp/equi_miner.cu - ../../cpu_tromp/blake2/blake2bx.cpp - ) - -#add_executable(${PROJECT_NAME} ${SRC_LIST}) -set(LIBS ${LIBS} ${Threads_LIBRARIES} ${Boost_LIBRARIES}) - -#message("-- CXXFLAGS: ${CMAKE_CXX_FLAGS}") -#message("-- LIBS: ${LIBS}") - -#add_executable(${PROJECT_NAME} ${SOURCE_FILES}) -#target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES}) -CUDA_ADD_EXECUTABLE(${PROJECT_NAME} ${SOURCE_FILES}) -target_link_libraries(${PROJECT_NAME} ${LIBS} ${CUDA_LIBRARIES} ) diff --git a/README.md b/README.md index 5b4a9ad79..e13bb1845 100644 --- a/README.md +++ b/README.md @@ -1,71 +1,58 @@ # Build instructions: ### Dependencies: - - Boost 1.54+ + - Boost 1.62+ ## Windows: Windows builds made by us are available here: https://github.com/nicehash/nheqminer/releases Download and 
install: -- [AMD APP SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/) (if not needed remove **USE_OCL_XMP** from **nheqminer** Preprocessor definitions under Properties > C/C++ > Preprocessor) -- [CUDA SDK](https://developer.nvidia.com/cuda-downloads) (if not needed remove **USE_CUDA_TROMP** from **nheqminer** Preprocessor definitions under Properties > C/C++ > Preprocessor) +- [CUDA SDK](https://developer.nvidia.com/cuda-downloads) (if not needed remove **USE_CUDA_TROMP** and **USE_CUDA_DJEZO** from **nheqminer** Preprocessor definitions under Properties > C/C++ > Preprocessor) - Visual Studio 2013 Community: https://www.visualstudio.com/en-us/news/releasenotes/vs2013-community-vs -- Visual Studio Update 5 installed +- [Visual Studio Update 5](https://www.microsoft.com/en-us/download/details.aspx?id=48129) installed - 64 bit version only Open **nheqminer.sln** under **nheqminer/nheqminer.sln** and build. You will have to build ReleaseSSE2 cpu_tromp project first, then Release7.5 cuda_tromp project, then select Release and build all. +### Enabled solvers: + - USE_CPU_TROMP + - USE_CPU_XENONCAT + - USE_CUDA_TROMP + - USE_CUDA_DJEZO -## Linux +If you don't wan't to build with all solvlers you can go to **nheqminer Properties > C/C++ > Preprocessor > Preprocessor Definitions** and remove the solver you don't need. +## Linux Work in progress. 
- -Working solvers CPU_TROMP, CPU_XENONCAT, CUDA_TROMP, OCL_XMP, OCL_SILENTARMY - -## Linux (Ubuntu 14.04 / 16.04) Build CPU_XENONCAT: - - - Open terminal and run the following commands: - - `sudo apt-get install cmake build-essential libboost-all-dev` - - `git clone -b Linux https://github.com/nicehash/nheqminer.git` - - `cd nheqminer/cpu_xenoncat/Linux/asm/` - - `sh assemble.sh` - - `cd ../../../Linux_cmake/nheqminer_cpu` - - `cmake .` - - `make -j $(nproc)` - -## Linux (Ubuntu 14.04 / 16.04) Build CUDA_TROMP: - - - Open terminal and run the following commands: - - **Ubuntu 14.04**: - - `wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_8.0.44-1_amd64.deb` - - `sudo dpkg -i cuda-repo-ubuntu1404_8.0.44-1_amd64.deb` - - **Ubuntu 16.04**: - - `wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.44-1_amd64.deb` - - `sudo dpkg -i cuda-repo-ubuntu1604_8.0.44-1_amd64.deb` - - `sudo apt-get update` - - `sudo apt-get install cuda` - - `sudo apt-get install cuda-toolkit-8-0` - - `sudo apt-get install cmake build-essential libboost-all-dev` - - `git clone -b Linux https://github.com/nicehash/nheqminer.git` - - `cd nheqminer/Linux_cmake/nheqminer_cuda_tromp && cmake . && make -j $(nproc)` - - or specify your compute version for example 50 like so `cd nheqminer/Linux_cmake/nheqminer_cuda_tromp && cmake COMPUTE=50 . 
&& make` - -## Linux (16.04) Build OCL_XMP, OCL_SILENTARMY: - - - Open terminal and run the following commands: - - [AMD APP SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/) - - and make sure you have the [AMD drivers](http://support.amd.com/en-us/download) installed - - install them to the default paths - - `sudo apt-get install mesa-common-dev` - - `sudo apt-get install cmake build-essential libboost-all-dev` - - `git clone -b Linux https://github.com/nicehash/nheqminer.git` - - `cd nheqminer/Linux_cmake/nheqminer_AMD && cmake . -DOPENCL_LIBRARY=/usr/lib/x86_64-linux-gnu/libOpenCL.so -DOPENCL_INCLUDE_DIRECTORY=/opt/AMDAPPSDK-3.0/include && make -j $(nproc)` - - `cp ../../3rdparty/amd_bins_linux/* -r .` - - `cp ../../3rdparty/amd_silentarmy_kernels/* -r .` - - - +Working solvers CPU_TROMP, CPU_XENONCAT, CUDA_TROMP, CUDA_DJEZO + +### General instructions: + - Install CUDA SDK v8 (make sure you have cuda libraries in **LD_LIBRARY_PATH** and cuda toolkit bins in **PATH**) + - example on Ubuntu: + - LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda-8.0/lib64:/usr/local/cuda-8.0/lib64/stubs" + - PATH="$PATH:/usr/local/cuda-8.0/" + - PATH="$PATH:/usr/local/cuda-8.0/bin" + + - Use Boost 1.62+ (if it is not available from the repos you will have to download and build it yourself) + - CMake v3.5 (if it is not available from the repos you will have to download and build it yourself) + - Currently support only static building (CPU_XENONCAT, CUDA_DJEZO are enabled by default, check **CMakeLists.txt** in **nheqminer** root folder) + - If not on Ubuntu make sure you have **fasm** installed and accessible in **PATH** + - After that open the terminal and run the following commands: + - `git clone https://github.com/nicehash/nheqminer.git` + - Generating asm object file: + - **On Ubuntu**: + - `cd nheqminer/cpu_xenoncat/asm_linux/` + - `sh assemble.sh` + - **bundeled fasm not compatible**: + - delete/replace (inside 
**nheqminer/cpu_xenoncat/asm_linux/** directory) with fasm binary compatible with your distro + - `cd nheqminer/cpu_xenoncat/asm_linux/` + - `sh assemble.sh` + - `cd ../../../` + - `mkdir build && cd build` + - `cmake ../nheqminer` + - `make -j $(nproc)` + # Run instructions: Parameters: diff --git a/blake2/blake2-config.h b/blake2/blake2-config.h new file mode 100644 index 000000000..3524209bf --- /dev/null +++ b/blake2/blake2-config.h @@ -0,0 +1,72 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2_CONFIG_H__ +#define __BLAKE2_CONFIG_H__ + +// These don't work everywhere +#if (defined(__SSE2__) || defined(_M_AMD_64) || defined(_M_X64)) +#define HAVE_SSE2 +#endif + +#if defined(__SSSE3__) +#define HAVE_SSSE3 +#endif + +#if defined(__SSE4_1__) +#define HAVE_SSE41 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__XOP__) +#define HAVE_XOP +#endif + + +#ifdef HAVE_AVX2 +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_XOP +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_AVX +#ifndef HAVE_SSE41 +#define HAVE_SSE41 +#endif +#endif + +#ifdef HAVE_SSE41 +#ifndef HAVE_SSSE3 +#define HAVE_SSSE3 +#endif +#endif + +#ifdef HAVE_SSSE3 +#define HAVE_SSE2 +#endif + +#if !defined(HAVE_SSE2) +#error "This code requires at least SSE2." 
+#endif + +#endif + diff --git a/blake2/blake2-impl.h b/blake2/blake2-impl.h new file mode 100644 index 000000000..16219dbcb --- /dev/null +++ b/blake2/blake2-impl.h @@ -0,0 +1,136 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>. +*/ +#pragma once +#ifndef __BLAKE2_IMPL_H__ +#define __BLAKE2_IMPL_H__ + +#include <stdint.h> /* NOTE(review): memcpy below relies on the includer providing <string.h> when NATIVE_LITTLE_ENDIAN is defined - confirm */ + +static inline uint32_t load32( const void *src ) /* read 4 bytes as a little-endian uint32_t */ +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint32_t w = *p++; + w |= ( uint32_t )( *p++ ) << 8; + w |= ( uint32_t )( *p++ ) << 16; + w |= ( uint32_t )( *p++ ) << 24; + return w; +#endif +} + +static inline uint64_t load64( const void *src ) /* read 8 bytes as a little-endian uint64_t */ +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + w |= ( uint64_t )( *p++ ) << 48; + w |= ( uint64_t )( *p++ ) << 56; + return w; +#endif +} + +static inline void store32( void *dst, uint32_t w ) /* write w to dst as 4 little-endian bytes */ +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline void store64( void *dst, uint64_t w ) /* write w to dst as 8 little-endian bytes */ +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + 
*p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline uint64_t load48( const void *src ) +{ + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + return w; +} + +static inline void store48( void *dst, uint64_t w ) +{ + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +} + +static inline uint32_t rotl32( const uint32_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 32 - c ) ); +} + +static inline uint64_t rotl64( const uint64_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 64 - c ) ); +} + +static inline uint32_t rotr32( const uint32_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 32 - c ) ); +} + +static inline uint64_t rotr64( const uint64_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/* prevents compiler optimizing out memset() */ +static inline void secure_zero_memory( void *v, size_t n ) +{ + volatile uint8_t *p = ( volatile uint8_t * )v; + while( n-- ) *p++ = 0; +} + +#endif + diff --git a/blake2/blake2-round.h b/blake2/blake2-round.h new file mode 100644 index 000000000..400ed2034 --- /dev/null +++ b/blake2/blake2-round.h @@ -0,0 +1,85 @@ +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? 
_mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + 
row4l = t1; \ + row4h = t0; + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); diff --git a/blake2/blake2.h b/blake2/blake2.h new file mode 100644 index 000000000..85d63866f --- /dev/null +++ b/blake2/blake2.h @@ -0,0 +1,156 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2_H__ +#define __BLAKE2_H__ + +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uint8_t .. uint64_t */ + +#if defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__ ((__aligned__(x))) +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + + enum blake2s_constant + { + BLAKE2S_BLOCKBYTES = 64, + BLAKE2S_OUTBYTES = 32, + BLAKE2S_KEYBYTES = 32, + BLAKE2S_SALTBYTES = 8, + BLAKE2S_PERSONALBYTES = 8 + }; + + enum blake2b_constant + { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 + }; + +#pragma pack(push, 1) + typedef struct __blake2s_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint8_t node_offset[6];// 14 + uint8_t node_depth; // 15 + uint8_t inner_length; // 16 + // uint8_t reserved[0]; + uint8_t salt[BLAKE2S_SALTBYTES]; // 24 + uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 + } blake2s_param; + + ALIGN( 64 ) typedef struct __blake2s_state + { + uint32_t h[8]; + uint32_t t[2]; + uint32_t f[2]; + uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; + size_t buflen; + uint8_t last_node; + } blake2s_state; + + typedef struct __blake2b_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint64_t node_offset; // 16 + uint8_t node_depth; // 17 + uint8_t inner_length; // 18 + uint8_t reserved[14]; // 32 + uint8_t salt[BLAKE2B_SALTBYTES]; // 48 + uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64 + } blake2b_param; + + ALIGN( 64 ) typedef struct __blake2b_state + { + uint64_t h[8]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + uint16_t counter; + uint8_t buflen; + uint8_t lastblock; + } blake2b_state; + + ALIGN( 64 ) typedef struct __blake2sp_state + { + blake2s_state S[8][1]; + blake2s_state R[1]; + uint8_t buf[8 * BLAKE2S_BLOCKBYTES]; + size_t buflen; + } blake2sp_state; + + ALIGN( 64 ) typedef struct 
__blake2bp_state + { + blake2b_state S[4][1]; + blake2b_state R[1]; + uint8_t buf[4 * BLAKE2B_BLOCKBYTES]; + size_t buflen; + } blake2bp_state; +#pragma pack(pop) + + // Streaming API + int blake2s_init( blake2s_state *S, const uint8_t outlen ); + int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2s_init_param( blake2s_state *S, const blake2s_param *P ); + int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ); + int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ); + + int blake2b_init( blake2b_state *S, const uint8_t outlen ); + int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2b_init_param( blake2b_state *S, const blake2b_param *P ); + int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ); + int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ); + + int blake2sp_init( blake2sp_state *S, const uint8_t outlen ); + int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen ); + int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen ); + + int blake2bp_init( blake2bp_state *S, const uint8_t outlen ); + int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen ); + int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen ); + + // Simple API + int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen); + + int blake2sp( uint8_t *out, 
const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + + static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) + { + return blake2b( out, in, key, outlen, inlen, keylen ); + } + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/blake2/blake2b-load-sse2.h b/blake2/blake2b-load-sse2.h new file mode 100644 index 000000000..1ba153c87 --- /dev/null +++ b/blake2/blake2b-load-sse2.h @@ -0,0 +1,68 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE2_H__ +#define __BLAKE2B_LOAD_SSE2_H__ + +#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) +#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) +#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2) +#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7) +#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1) +#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13) +#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2) +#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) +#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8) +#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, 
m12); b1 = _mm_set_epi64x(m3, m11) +#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15) +#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14) +#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14) +#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13) +#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9) +#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2) +#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12) +#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) +#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8) +#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6) +#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11) +#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3) +#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) +#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4) +#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7) +#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6) +#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3) +#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) +#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define 
LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) + + +#endif + diff --git a/blake2/blake2b-load-sse41.h b/blake2/blake2b-load-sse41.h new file mode 100644 index 000000000..f6c1bc839 --- /dev/null +++ b/blake2/blake2b-load-sse41.h @@ -0,0 +1,402 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE41_H__ +#define __BLAKE2B_LOAD_SSE41_H__ + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} 
while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m5, 8); \ +b1 = _mm_unpackhi_epi64(m2, m7); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m0); \ +b1 = _mm_blend_epi16(m1, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m5, m1, 0xF0); \ +b1 = _mm_unpackhi_epi64(m3, m4); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m3); \ +b1 = _mm_alignr_epi8(m2, m0, 8); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_unpackhi_epi64(m6, m5); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m0); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m2, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m5); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m2); \ +b1 = _mm_unpacklo_epi64(m1, m5); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m0, m3, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m7, m5, 0xF0); \ +b1 = _mm_blend_epi16(m3, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m0, 8); \ +b1 = _mm_blend_epi16(m4, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m3); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m5); \ +b1 = _mm_unpackhi_epi64(m5, m1); \ +} 
while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m2, m3, 0xF0); \ +b1 = _mm_unpackhi_epi64(m7, m0); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m2); \ +b1 = _mm_blend_epi16(m7, m4, 0xF0); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m6, m0, 0xF0); \ +b1 = _mm_unpacklo_epi64(m7, m2); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_alignr_epi8(m5, m6, 8); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m3); \ +b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_blend_epi16(m1, m5, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m3); \ +b1 = _mm_blend_epi16(m6, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpackhi_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_unpacklo_epi64(m4, m1); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m2); \ +b1 = _mm_unpacklo_epi64(m3, m5); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m7); \ +b1 = _mm_alignr_epi8(m0, m5, 8); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_alignr_epi8(m4, m1, 8); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +b0 = m6; \ +b1 = _mm_alignr_epi8(m5, m0, 8); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m3, 0xF0); \ +b1 = m2; \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_unpackhi_epi64(m3, m0); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ 
+do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m2); \ +b1 = _mm_blend_epi16(m3, m2, 0xF0); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_unpackhi_epi64(m1, m6); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpacklo_epi64(m6, m0); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#endif + diff --git a/blake2/blake2b-round.h b/blake2/blake2b-round.h new file mode 100644 index 000000000..3e6fd0cbe --- /dev/null +++ b/blake2/blake2b-round.h @@ -0,0 +1,170 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. 
+ + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_ROUND_H__ +#define __BLAKE2B_ROUND_H__ + +#define LOAD(p) _mm_load_si128( (const __m128i *)(p) ) +#define STORE(p,r) _mm_store_si128((__m128i *)(p), r) + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#ifndef HAVE_XOP +#ifdef HAVE_SSSE3 +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) +#else +#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) )) /* generic fallback: shift right by -(c), left by 64-(-(c)), then combine; c is parenthesised so an argument like -32 cannot expand to --32 */ +#endif +#else +/* ... 
*/ +#endif + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-32)); \ + row4h = _mm_roti_epi64(row4h, (-32)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-24)); \ + row2h = _mm_roti_epi64(row2h, (-24)); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-16)); \ + row4h = _mm_roti_epi64(row4h, (-16)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-63)); \ + row2h = _mm_roti_epi64(row2h, (-63)); \ + +#if defined(HAVE_SSSE3) +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + 
row4l = t1; \ + row4h = t0; +#else + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row4l;\ + t1 = row2l;\ + row4l = row3l;\ + row3l = row3h;\ + row3h = row4l;\ + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row3l;\ + row3l = row3h;\ + row3h = t0;\ + t0 = row2l;\ + t1 = row4l;\ + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) + +#endif + +#if defined(HAVE_SSE41) +#include "blake2b-load-sse41.h" +#else +#include "blake2b-load-sse2.h" +#endif + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + +#endif + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + 
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); diff --git a/blake2/blake2bx.cpp b/blake2/blake2bx.cpp new file mode 100644 index 000000000..2df512e95 --- /dev/null +++ b/blake2/blake2bx.cpp @@ -0,0 +1,346 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ + +#include +#include +#include + + +#include "blake2.h" +#include "blake2-impl.h" + +#include "blake2-config.h" + +#ifdef WIN32 +#include +#endif + +#include +#if defined(HAVE_SSSE3) +#include +#endif +#if defined(HAVE_SSE41) +#include +#endif +#if defined(HAVE_AVX) +#include +#endif +#if defined(HAVE_XOP) +#include +#endif + +#include "blake2b-round.h" + + + +ALIGN(64) static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/* init xors IV with input parameter block */ +int blake2b_init_param(blake2b_state *S, const blake2b_param *P) +{ + //blake2b_init0( S ); + const uint8_t * v = (const uint8_t *)(blake2b_IV); + const uint8_t * p = (const uint8_t *)(P); + uint8_t * h = (uint8_t *)(S->h); + /* IV XOR ParamBlock */ + memset(S, 0, sizeof(blake2b_state)); + + for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) h[i] = v[i] ^ p[i]; + + return 0; +} + +/* Some sort of default parameter block initialization, for sequential blake2b */ +int blake2b_init(blake2b_state *S, const uint8_t outlen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + const blake2b_param P = + { + outlen, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 
}, + { 0 }, + { 0 } + }; + return blake2b_init_param(S, &P); +} + +int blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + if ((!keylen) || keylen > BLAKE2B_KEYBYTES) return -1; + + const blake2b_param P = + { + outlen, + keylen, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 }, + { 0 }, + { 0 } + }; + + if (blake2b_init_param(S, &P) < 0) + return -1; /* propagate the failure: returning 0 here would report success to callers that test for < 0 */ + + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset(block, 0, BLAKE2B_BLOCKBYTES); + memcpy(block, key, keylen); + blake2b_update(S, block, BLAKE2B_BLOCKBYTES); + secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */ + } + return 0; +} + +static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU(block + 00); + const __m128i m1 = LOADU(block + 16); + const __m128i m2 = LOADU(block + 32); + const __m128i m3 = LOADU(block + 48); + const __m128i m4 = LOADU(block + 64); + const __m128i m5 = LOADU(block + 80); + const __m128i m6 = LOADU(block + 96); + const __m128i m7 = LOADU(block + 112); +#else + const uint64_t m0 = ( ( uint64_t * )block )[ 0]; + const uint64_t m1 = ( ( uint64_t * )block )[ 1]; + const uint64_t m2 = ( ( uint64_t * )block )[ 2]; + const uint64_t m3 = ( ( uint64_t * )block )[ 3]; + const uint64_t m4 = ( ( uint64_t * )block )[ 4]; + const uint64_t m5 = ( ( uint64_t * )block )[ 5]; + const uint64_t m6 = ( ( uint64_t * )block )[ 6]; + const uint64_t m7 = ( ( uint64_t * )block )[ 7]; + const uint64_t m8 = ( ( uint64_t * )block )[ 8]; + const 
uint64_t m9 = ( ( uint64_t * )block )[ 9]; + const uint64_t m10 = ( ( uint64_t * )block )[10]; + const uint64_t m11 = ( ( uint64_t * )block )[11]; + const uint64_t m12 = ( ( uint64_t * )block )[12]; + const uint64_t m13 = ( ( uint64_t * )block )[13]; + const uint64_t m14 = ( ( uint64_t * )block )[14]; + const uint64_t m15 = ( ( uint64_t * )block )[15]; +#endif + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter)); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock)); + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); + return 0; +} + + +int blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen) +{ + while (inlen > 0) + { + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + + if (inlen > fill) + { + memcpy(S->buf + left, in, fill); // Fill buffer + in += fill; + inlen -= fill; + S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); // Compress + S->buflen = 0; + } + else // inlen <= fill + { + memcpy(S->buf + left, in, inlen); + S->buflen += inlen; // not enough to compress + in += inlen; + inlen = 0; + } + } + + return 0; +} + + +int blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen) +{ + if (outlen > BLAKE2B_OUTBYTES) + return -1; + + if (S->buflen > BLAKE2B_BLOCKBYTES) + { + 
S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); + S->buflen -= BLAKE2B_BLOCKBYTES; + memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen); + } + + S->counter += S->buflen; + S->lastblock = 1; + memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + memcpy(out, &S->h[0], outlen); + S->lastblock = 0; + return 0; +} + + +int blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen) +{ + blake2b_state S[1]; + + /* Verify parameters */ + if (NULL == in) return -1; + + if (NULL == out) return -1; + + if (NULL == key) keylen = 0; + + if (keylen) + { + if (blake2b_init_key(S, outlen, key, keylen) < 0) return -1; + } + else + { + if (blake2b_init(S, outlen) < 0) return -1; + } + + blake2b_update(S, (const uint8_t *)in, inlen); + blake2b_final(S, out, outlen); + return 0; +} + +#if defined(SUPERCOP) +int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen ) +{ + return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 ); +} +#endif + +#if defined(BLAKE2B_SELFTEST) +#include +#include "blake2-kat.h" +int main( int argc, char **argv ) +{ + uint8_t key[BLAKE2B_KEYBYTES]; + uint8_t buf[KAT_LENGTH]; + + for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i ) + key[i] = ( uint8_t )i; + + for( size_t i = 0; i < KAT_LENGTH; ++i ) + buf[i] = ( uint8_t )i; + + for( size_t i = 0; i < KAT_LENGTH; ++i ) + { + uint8_t hash[BLAKE2B_OUTBYTES]; + blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES ); + + if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) ) + { + puts( "error" ); + return -1; + } + } + + puts( "ok" ); + return 0; +} +#endif + +int blake2b_long(uint8_t *out, const void *in, const uint32_t outlen, const uint64_t inlen) +{ + blake2b_state blake_state; + if (outlen <= BLAKE2B_OUTBYTES) + { + blake2b_init(&blake_state, outlen); + blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t)); + 
blake2b_update(&blake_state, (const uint8_t *)in, inlen); + blake2b_final(&blake_state, out, outlen); + } + else + { + uint8_t out_buffer[BLAKE2B_OUTBYTES]; + uint8_t in_buffer[BLAKE2B_OUTBYTES]; + blake2b_init(&blake_state, BLAKE2B_OUTBYTES); + blake2b_update(&blake_state, (const uint8_t*)&outlen, sizeof(uint32_t)); + blake2b_update(&blake_state, (const uint8_t *)in, inlen); + blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES); + memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); + out += BLAKE2B_OUTBYTES / 2; + uint32_t toproduce = outlen - BLAKE2B_OUTBYTES / 2; + while (toproduce > BLAKE2B_OUTBYTES) + { + memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); + blake2b(out_buffer, in_buffer, NULL, BLAKE2B_OUTBYTES, BLAKE2B_OUTBYTES, 0); + memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); + out += BLAKE2B_OUTBYTES / 2; + toproduce -= BLAKE2B_OUTBYTES / 2; + } + memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); + blake2b(out_buffer, in_buffer, NULL, toproduce, BLAKE2B_OUTBYTES, 0); + memcpy(out, out_buffer, toproduce); + + } + return 0; +} \ No newline at end of file diff --git a/cpu_tromp/CMakeLists.txt b/cpu_tromp/CMakeLists.txt new file mode 100644 index 000000000..8214d97ff --- /dev/null +++ b/cpu_tromp/CMakeLists.txt @@ -0,0 +1,19 @@ +set(EXECUTABLE cpu_tromp) + +#cpu_tromp/ +file(GLOB SRC_LIST + cpu_tromp.cpp ) +file(GLOB HEADERS + cpu_tromp.hpp + equi.h + equi_miner.h + ) + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(..) 
+ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS}) +TARGET_LINK_LIBRARIES(${EXECUTABLE} ) + +install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib ) +install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} ) diff --git a/cpu_tromp/equi.h b/cpu_tromp/equi.h index 94ad0ad8a..ed91cee14 100644 --- a/cpu_tromp/equi.h +++ b/cpu_tromp/equi.h @@ -2,6 +2,7 @@ // Equihash solver // Copyright (c) 2016-2016 John Tromp + #include "blake2/blake2.h" #ifdef __APPLE__ #include "osx_barrier.h" @@ -131,3 +132,4 @@ int verify(u32 indices[PROOFSIZE], const char *header, const u32 headerlen, cons uchar hash[WN/8]; return verifyrec(&ctx, indices, hash, WK); } + diff --git a/cpu_xenoncat/CMakeLists.txt b/cpu_xenoncat/CMakeLists.txt new file mode 100644 index 000000000..66c698d74 --- /dev/null +++ b/cpu_xenoncat/CMakeLists.txt @@ -0,0 +1,17 @@ +set(EXECUTABLE cpu_xenoncat) + +#cpu_xenoncat/ +file(GLOB SRC_LIST + xenoncat.cpp ) +file(GLOB HEADERS + cpu_xenoncat.hpp + ) + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(..) 
+ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS}) +TARGET_LINK_LIBRARIES(${EXECUTABLE} ) + +install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib ) +install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} ) diff --git a/cpu_xenoncat/Linux/asm/t2.bin b/cpu_xenoncat/Linux/asm/t2.bin deleted file mode 100644 index 432b9ab90..000000000 Binary files a/cpu_xenoncat/Linux/asm/t2.bin and /dev/null differ diff --git a/cpu_xenoncat/Linux/blake2b/asm/assemble.sh b/cpu_xenoncat/Linux/blake2b/asm/assemble.sh deleted file mode 100644 index 91990b5d0..000000000 --- a/cpu_xenoncat/Linux/blake2b/asm/assemble.sh +++ /dev/null @@ -1,2 +0,0 @@ -fasm zcblake2_avx1.asm -fasm zcblake2_avx2.asm diff --git a/cpu_xenoncat/Linux/blake2b/asm/data_blake2b.asm b/cpu_xenoncat/Linux/blake2b/asm/data_blake2b.asm deleted file mode 100644 index be2026b1b..000000000 --- a/cpu_xenoncat/Linux/blake2b/asm/data_blake2b.asm +++ /dev/null @@ -1,36 +0,0 @@ -xshufb_ror24 db 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10 -xshufb_ror16 db 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9 -xshufb_bswap8 db 7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8 -xctrinc dd 0,2, 0,2 - -align 32 -iv dq 0x6a09e667f3bcc908, 0xbb67ae8584caa73b -dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 -dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f -dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 - -s0 dq 0x6a09e667f3bcc908 xor 0x1010032, 0xbb67ae8584caa73b ;0x32=50 bytes output -s2 dq 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 -s4 dq 0x510e527fade682d1, 0x9b05688c2b3e6c1f -s6 dq 0x1f83d9abfb41bd6b xor 0x576f50687361635a ;Personalization -s7 dq 0x5be0cd19137e2179 xor 0x00000009000000c8 ;n=200, k=9 - -iv4xor128 dq 0x510e527fade682d1 xor 0x80, 0x9b05688c2b3e6c1f -dq 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 -iv4xor144 dq 0x510e527fade682d1 xor 144, 0x9b05688c2b3e6c1f -iv6inverted dq 0xe07c265404be4294, 0x5be0cd19137e2179 - -align 32 -yctrinit dd 0,0, 0,1, 0,2, 0,3 -yctrinc dd 0,4, 0,4, 0,4, 0,4 - -blake2sigma db 
0,2,4,6,1,3,5,7,8,10,12,14,9,11,13,15 -db 14,4,9,13,10,8,15,6,1,0,11,5,12,2,7,3 -db 11,12,5,15,8,0,2,13,10,3,7,9,14,6,1,4 -db 7,3,13,11,9,1,12,14,2,5,4,15,6,10,0,8 -db 9,5,2,10,0,7,4,15,14,11,6,3,1,12,8,13 -db 2,6,0,8,12,10,11,3,4,7,15,1,13,5,14,9 -db 12,1,14,4,5,15,13,10,0,6,9,8,7,3,2,11 -db 13,7,12,3,11,14,1,9,5,15,8,2,0,4,6,10 -db 6,14,11,0,15,9,3,8,12,13,1,10,2,7,4,5 -db 10,8,7,1,2,4,6,5,15,9,3,13,11,14,12,0 diff --git a/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx1.asm b/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx1.asm deleted file mode 100644 index fa3aeee8c..000000000 --- a/cpu_xenoncat/Linux/blake2b/asm/macro_blake2b_avx1.asm +++ /dev/null @@ -1,349 +0,0 @@ -macro hR0 m0,m1,m2,m3,m4,m5,m6,m7,lim,src -{ -vpaddq xmm0,xmm0,xmm4 -vpaddq xmm1,xmm1,xmm5 -vpaddq xmm2,xmm2,xmm6 -vpaddq xmm3,xmm3,xmm7 -if m0 -#include -#include - -void Blake2PrepareMidstate2(void *midstate, unsigned char *input); -//midstate: 256 bytes of buffer for output midstate, aligned by 32 -//input: 140 bytes header, preferably aligned by 8 - -void Blake2Run2(unsigned char *hashout, void *midstate, uint32_t indexctr); -//hashout: hash output buffer: 2*64 bytes -//midstate: 256 bytes from Blake2PrepareMidstate2 -//indexctr: For n=200, k=9: {0, 2, 4, ..., 1048574} - -unsigned char __attribute__((aligned(8))) testdata[140] = -{ - 0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06, - 0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C, - 0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09, - 0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7, - 0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 
0x00, 0xF7, 0xF1, - 0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF, - 0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00 -}; -//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin - -int main(void) -{ - unsigned char midstate_a[256+32]; - void *pmidstate = (void *) (((long) midstate_a+31L) & -32L); - unsigned char hashout_a[128+32]; - unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L); - unsigned char buf[128]; - FILE *outfile; - int i; - - Blake2PrepareMidstate2(pmidstate, testdata); - outfile = fopen("out.bin", "wb"); - - for (i=0; i<1048576; i+=2) { - Blake2Run2(phashout, pmidstate, i); - memcpy(buf, phashout, 50); - memcpy(buf+50, phashout+64, 50); - fwrite(buf, 100, 1, outfile); - } - - fclose(outfile); - - return 0; -} diff --git a/cpu_xenoncat/Linux/blake2b/example_avx2.c b/cpu_xenoncat/Linux/blake2b/example_avx2.c deleted file mode 100644 index bbf9782d3..000000000 --- a/cpu_xenoncat/Linux/blake2b/example_avx2.c +++ /dev/null @@ -1,53 +0,0 @@ -#include -#include -#include - -void Blake2PrepareMidstate4(void *midstate, unsigned char *input); -//midstate: 256 bytes of buffer for output midstate, aligned by 32 -//input: 140 bytes header, preferably aligned by 8 - -void Blake2Run4(unsigned char *hashout, void *midstate, uint32_t indexctr); -//hashout: hash output buffer: 4*64 bytes -//midstate: 256 bytes from Blake2PrepareMidstate4 -//indexctr: For n=200, k=9: {0, 4, 8, ..., 1048572} - -unsigned char __attribute__((aligned(8))) testdata[140] = -{ - 0x04, 0x00, 0x00, 0x00, 0x91, 0x5F, 0xA6, 0x1C, 0x4F, 0xA5, 0x92, 0x3C, 0xE6, 0xEE, 0xAD, 0x06, - 0x74, 0x6B, 0x61, 0x22, 0x54, 0x94, 0xEA, 0x5A, 0x2A, 0x97, 0xAE, 0x46, 0x6E, 0x6F, 0xAA, 0x9C, - 0x6E, 0xF6, 0x3A, 0x0D, 0xA5, 0xFC, 0x67, 0xD7, 0xF8, 0xDC, 0x78, 0xC3, 0xC8, 0x70, 0xCA, 0x09, - 0xBA, 0xAB, 0xAA, 0xF7, 0x02, 0x59, 0x68, 0xA8, 0x6F, 0xEB, 0x88, 0x75, 0xD3, 0xF3, 0xFF, 0xA7, - 
0x2E, 0xB0, 0x0F, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x66, 0xCE, 0xD2, 0x57, 0x0F, 0x0F, 0x0F, 0x20, 0x00, 0x00, 0xF7, 0xF1, - 0x94, 0xA2, 0x53, 0x8E, 0x42, 0x5F, 0x21, 0x33, 0xCF, 0xA8, 0xD3, 0xCB, 0xF4, 0xDF, 0x71, 0xEF, - 0x38, 0x28, 0x51, 0x75, 0xCF, 0xED, 0xCB, 0x3E, 0x63, 0xA2, 0x00, 0x00 -}; -//expected output: 281dd5fc6d878538e640987b9bc597dbbd4af2cdf8bf5fb03bdfcefa40d8747d out.bin - -int main(void) -{ - unsigned char midstate_a[256+32]; - void *pmidstate = (void *) (((long) midstate_a+31L) & -32L); - unsigned char hashout_a[256+32]; - unsigned char *phashout = (unsigned char *) (((long) hashout_a+31L) & -32L); - unsigned char buf[256]; - FILE *outfile; - int i; - - Blake2PrepareMidstate4(pmidstate, testdata); - outfile = fopen("out.bin", "wb"); - - for (i=0; i<1048576; i+=4) { - Blake2Run4(phashout, pmidstate, i); - memcpy(buf, phashout, 50); - memcpy(buf+50, phashout+64, 50); - memcpy(buf+100, phashout+128, 50); - memcpy(buf+150, phashout+192, 50); - fwrite(buf, 200, 1, outfile); - } - - fclose(outfile); - - return 0; -} diff --git a/cpu_xenoncat/Linux/demo/input.bin b/cpu_xenoncat/Linux/demo/input.bin deleted file mode 100644 index 432b9ab90..000000000 Binary files a/cpu_xenoncat/Linux/demo/input.bin and /dev/null differ diff --git a/cpu_xenoncat/Linux/demo/quickbench.c b/cpu_xenoncat/Linux/demo/quickbench.c deleted file mode 100644 index 036f8122a..000000000 --- a/cpu_xenoncat/Linux/demo/quickbench.c +++ /dev/null @@ -1,78 +0,0 @@ -//compile with -//gcc -o quickbench quickbench.c equihash_avx2.o -#include -#include -#include -#include -#include - -#define CONTEXT_SIZE 178033152 -#define ITERATIONS 10 - -//Linkage with assembly -//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver. 
-//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce. -void EhPrepare(void *context, void *input); -int32_t EhSolver(void *context, uint32_t nonce); -extern char testinput[]; - -int main(void) -{ - void *context_alloc, *context, *context_end; - uint32_t *pu32; - uint64_t *pu64, previous_rdtsc; - uint8_t inputheader[144]; //140 byte header - FILE *infile, *outfile; - struct timespec time0, time1; - long t0, t1; - int32_t numsolutions, total_solutions; - uint32_t nonce, delta_time, total_time; - int i, j; - - context_alloc = malloc(CONTEXT_SIZE+4096); - context = (void*) (((long) context_alloc+4095) & -4096); - context_end = context + CONTEXT_SIZE; - - infile = 0; - infile = fopen("input.bin", "rb"); - if (infile) { - puts("Reading input.bin"); - fread(inputheader, 140, 1, infile); - fclose(infile); - } else { - puts("input.bin not found, use sample data (beta1 testnet block 2)"); - memcpy(inputheader, testinput, 140); - } - - - EhPrepare(context, (void *) inputheader); - - //Warm up, timing not taken into average - nonce = 0; - clock_gettime(CLOCK_MONOTONIC, &time0); - numsolutions = EhSolver(context, nonce); - clock_gettime(CLOCK_MONOTONIC, &time1); - delta_time = (uint32_t) ((time1.tv_sec * 1000000000 + time1.tv_nsec) - - (time0.tv_sec * 1000000000 + time0.tv_nsec))/1000000; - printf("(Warm up) Time: %u ms, solutions: %u\n", delta_time, numsolutions); - - printf("Running %d iterations...\n", ITERATIONS); - nonce = 58; //arbritary number to get 19 solutions in 10 iterations (to match 1.88 solutions per run) - total_time = total_solutions = 0; - for (i=0; i -#include -#include -#include -#include -#include -#include //for rdtsc - -#define CONTEXT_SIZE 178033152 - -//Linkage with assembly -//EhPrepare takes in 136 bytes of input. The remaining 4 bytes of input is fed as nonce to EhSolver. -//EhPrepare saves the 136 bytes in context, and EhSolver can be called repeatedly with different nonce. 
-void EhPrepare(void *context, void *input); -int32_t EhSolver(void *context, uint32_t nonce); -extern char testinput[]; - -//context is the memory used for Equihash computation. It should be allocated outside of SolverFunction, the size is defined by CONTEXT_SIZE, about 180MB. -//SolverFunction API has slight overhead in mining due to missing opportunity to run EhSolver multiple times after a single EhPrepare. -int SolverFunction(void* context, const unsigned char* input, - bool (*validBlock)(void*, const unsigned char*), - void* validBlockData, - bool (*cancelled)(void*), - void* cancelledData, - int numThreads, - int n, int k) -{ - int numsolutions, i; - - EhPrepare(context, (void *) input); - numsolutions = EhSolver(context, *(uint32_t *)(input+136)); - - for (i=0; i Ubuntu 14.04 check gcc versions +#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11) + +file(GLOB SRC_LIST + cuda_djezo.cpp + equi_miner.cu ) +file(GLOB HEADERS + cuda_djezo.hpp + eqcuda.hpp + ) + + +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-D_FORCE_INLINES;--disable-warnings;--ptxas-options=-v;-Xptxas=-dlcm=ca;-Xptxas=-dscm=cs; -O3) + +FIND_PACKAGE(CUDA REQUIRED) +if(COMPUTE AND (COMPUTE GREATER 0)) + LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}) +else(COMPUTE AND (COMPUTE GREATER 0)) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_60,code=sm_60 ) +endif(COMPUTE AND (COMPUTE GREATER 0)) + +if(CUDA_FOUND) +message("CUDA FOUND") +else() +message("CUDA NOT FOUND") +endif() + + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(..) 
+CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS}) +TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES} cuda) + +message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}") + +install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib ) +install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} ) diff --git a/cuda_djezo/LICENSE b/cuda_djezo/LICENSE new file mode 100644 index 000000000..bb7b082bf --- /dev/null +++ b/cuda_djezo/LICENSE @@ -0,0 +1,675 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2016-2017 NiceHash (www.nicehash.com) + + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. 
Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. 
To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. 
If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. 
For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. 
Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. 
This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>. 
diff --git a/cuda_djezo/blake2b.cu b/cuda_djezo/blake2b.cu new file mode 100644 index 000000000..866c82592 --- /dev/null +++ b/cuda_djezo/blake2b.cu @@ -0,0 +1,336 @@ +// Blake2-B CUDA Implementation +// tpruvot@github July 2016 +// permission granted to use under MIT license +// modified for use in Zcash by John Tromp September 2016 + +/** + * uint2 direct ops by c++ operator definitions + */ +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { + return make_uint2(a.x ^ b.x, a.y ^ b.y); +} +static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +// uint2 ROR/ROL methods +__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) { + uint2 result; +#if __CUDA_ARCH__ > 300 +/* if (offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } else *//* if (offset < 64) */ { + /* offset SHOULD BE < 64 ! 
*/ + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } +#else + if (!offset) + result = a; + else if (offset < 32) { + result.y = ((a.y >> offset) | (a.x << (32 - offset))); + result.x = ((a.x >> offset) | (a.y << (32 - offset))); + } else if (offset == 32) { + result.y = a.x; + result.x = a.y; + } else { + result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); + result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); + } +#endif + return result; +} +__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) { + return make_uint2(value.y, value.x); +} +#ifdef __CUDA_ARCH__ +__device__ __inline__ uint2 ROR24(const uint2 a) { + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} +__device__ __inline__ uint2 ROR16(const uint2 a) { + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} +#else +#define ROR24(u) ROR2(u,24) +#define ROR16(u) ROR2(u,16) +#endif + +typedef uint64_t u64; + +static __constant__ const int8_t blake2b_sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } , + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + +__device__ __constant__ 
+static const u64 blake_iv[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +__device__ __forceinline__ +static void G(const int r, const int i, u64 &a, u64 &b, u64 &c, u64 &d, u64 const m[16]) { + a = a + b + m[ blake2b_sigma[r][2*i] ]; + ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); + c = c + d; + ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] ); + a = a + b + m[ blake2b_sigma[r][2*i+1] ]; + ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); + c = c + d; + ((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +//__device__ __forceinline__ +//static void G2(u64 &a, u64 &b, u64 &c, u64 &d, u64 x, u64 y) { +// a = a + b + x; +// ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); +// c = c + d; +// ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]); +// a = a + b + y; +// ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); +// c = c + d; +// ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +//} + +__device__ __forceinline__ +static void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) { + a = a + b + x; + ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]); + a = a + b + y; + ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +#define ROUND(r) \ + G(r, 0, v[0], v[4], v[ 8], v[12], m); \ + G(r, 1, v[1], v[5], v[ 9], v[13], m); \ + G(r, 2, v[2], v[6], v[10], v[14], m); \ + G(r, 3, v[3], v[7], v[11], v[15], m); \ + G(r, 4, v[0], v[5], v[10], v[15], m); \ + G(r, 5, v[1], v[6], v[11], v[12], m); \ + G(r, 6, v[2], v[7], v[ 8], v[13], m); \ + G(r, 7, v[3], v[4], v[ 9], v[14], m); + + +__forceinline__ __device__ void blake2b_gpu_hash3(uint64_t* h, u32 idx, u32 
nonce) { + u64 m = (u64)idx << 32 | (u64)nonce; + + u64 v[16]; + + v[0] = h[0]; + v[1] = h[1]; + v[2] = h[2]; + v[3] = h[3]; + v[4] = h[4]; + v[5] = h[5]; + v[6] = h[6]; + v[7] = h[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4] ^ (128 + 16); + v[13] = blake_iv[5]; + v[14] = blake_iv[6] ^ 0xffffffffffffffff; + v[15] = blake_iv[7]; + + // mix 1 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 2 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 3 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, m); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 4 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, m); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 5 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, m); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 6 + G2(v[0], v[4], 
v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], m, 0); + + // mix 7 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], m, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 8 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, m); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 9 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], m, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 10 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], m, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 11 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 12 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], 
v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + h[0] ^= v[0] ^ v[8]; + h[1] ^= v[1] ^ v[9]; + h[2] ^= v[2] ^ v[10]; + h[3] ^= v[3] ^ v[11]; + h[4] ^= v[4] ^ v[12]; + h[5] ^= v[5] ^ v[13]; + h[6] ^= v[6] ^ v[14]; +} + + +__forceinline__ __device__ void blake2b_gpu_hash2(uint64_t* h, u32 idx) { + u64 m[16] = { 0 }; + u32* ptr = (u32*)&m[1]; + + ptr[1] = idx; + + u64 v[16]; + + v[0] = h[0]; + v[1] = h[1]; + v[2] = h[2]; + v[3] = h[3]; + v[4] = h[4]; + v[5] = h[5]; + v[6] = h[6]; + v[7] = h[7]; + v[8] = 0x6a09e667f3bcc908; + v[9] = 0xbb67ae8584caa73b; + v[10] = 0x3c6ef372fe94f82b; + v[11] = 0xa54ff53a5f1d36f1; + v[12] = 0x510e527fade682d1 ^ (128 + 16); + v[13] = 0x9b05688c2b3e6c1f; + v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff; + v[15] = 0x5be0cd19137e2179; + + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + + h[0] ^= v[0] ^ v[8]; + h[1] ^= v[1] ^ v[9]; + h[2] ^= v[2] ^ v[10]; + h[3] ^= v[3] ^ v[11]; + h[4] ^= v[4] ^ v[12]; + h[5] ^= v[5] ^ v[13]; + h[6] ^= v[6] ^ v[14]; + //h[7] ^= v[7] ^ v[15]; + //memcpy(hash, (uchar *)h, outlen); +} \ No newline at end of file diff --git a/cuda_djezo/cuda_djezo.cpp b/cuda_djezo/cuda_djezo.cpp new file mode 100644 index 000000000..30d672e89 --- /dev/null +++ b/cuda_djezo/cuda_djezo.cpp @@ -0,0 +1,133 @@ +#include +#include +#include +#include +#include + +#include "cuda_djezo.hpp" + +struct proof; +#include "eqcuda.hpp" + + +// Binds this solver to CUDA device dev_id and picks combo_mode from its SM version. +// platf_id is unused (getinfo is always called with platform 0). +// Throws std::runtime_error for SM < 5.0 or an unparsable version string. +cuda_djezo::cuda_djezo(int platf_id, int dev_id) +{ + device_id = dev_id; + // Not created until start(); must be null so stop() is safe before start(). + context = nullptr; + getinfo(0, dev_id, m_gpu_name, m_sm_count, m_version); + + combo_mode = 1; + + int major, minor; + std::string::size_type n = m_version.find("."); + if (n != std::string::npos) + { + major = atoi(m_version.substr(0, n).c_str()); + minor = atoi(m_version.substr(n + 1, m_version.length() - n - 1).c_str()); + + if (major < 
5) + { + throw std::runtime_error("Only CUDA devices with SM 5.0 and higher are supported."); + } + else if (major == 5 && minor == 0) + { + combo_mode = 2; + } + } + else + throw std::runtime_error("Unknown Compute/SM version."); +} + + +std::string cuda_djezo::getdevinfo() +{ + return m_gpu_name + " (#" + std::to_string(device_id) + ") M=" + std::to_string(combo_mode); +} + + +int cuda_djezo::getcount() +{ + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + return device_count; +} + +void cuda_djezo::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) +{ + //int runtime_version; + //checkCudaErrors(cudaRuntimeGetVersion(&runtime_version)); + + cudaDeviceProp device_props; + + checkCudaErrors(cudaGetDeviceProperties(&device_props, d_id)); + + gpu_name = device_props.name; + sm_count = device_props.multiProcessorCount; + version = std::to_string(device_props.major) + "." + std::to_string(device_props.minor); +} + + +void cuda_djezo::start(cuda_djezo& device_context) +{ + switch (device_context.combo_mode) + { +#ifdef CONFIG_MODE_2 + case 2: + device_context.context = new eq_cuda_context(device_context.device_id); + break; +#endif +#ifdef CONFIG_MODE_3 + case 3: + device_context.context = new eq_cuda_context(device_context.device_id); + break; +#endif + default: + device_context.context = new eq_cuda_context(device_context.device_id); + break; + } +} + +void cuda_djezo::stop(cuda_djezo& device_context) +{ + if (device_context.context) + { + delete device_context.context; + device_context.context = nullptr; + } +} + +void cuda_djezo::solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef, + cuda_djezo& device_context) +{ + device_context.context->solve(tequihash_header, + tequihash_header_len, + nonce, + nonce_len, + cancelf, + 
solutionf, + hashdonef); +} + + +void eq_cuda_context_interface::solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef) +{ +} + + +eq_cuda_context_interface::~eq_cuda_context_interface() { } \ No newline at end of file diff --git a/ocl_xpm/ocl_xmp.hpp b/cuda_djezo/cuda_djezo.hpp similarity index 51% rename from ocl_xpm/ocl_xmp.hpp rename to cuda_djezo/cuda_djezo.hpp index 3e0d4054a..1843c462a 100644 --- a/ocl_xpm/ocl_xmp.hpp +++ b/cuda_djezo/cuda_djezo.hpp @@ -1,33 +1,22 @@ #pragma once + #ifdef _LIB -#define DLL_OCL_XMP __declspec(dllexport) +#define DLL_CUDA_DJEZO __declspec(dllexport) #else -#define DLL_OCL_XMP +#define DLL_CUDA_DJEZO #endif -// remove after -#include -#include -#include -#include - -struct MinerInstance; +struct eq_cuda_context_interface; -struct DLL_OCL_XMP ocl_xmp +struct DLL_CUDA_DJEZO cuda_djezo { - //int threadsperblock; + int threadsperblock; int blocks; int device_id; - int platform_id; - - MinerInstance* context; - // threads - unsigned threadsNum; // TMP - unsigned wokrsize; - - bool is_init_success = false; + int combo_mode; + eq_cuda_context_interface* context; - ocl_xmp(int platf_id, int dev_id); + cuda_djezo(int platf_id, int dev_id); std::string getdevinfo(); @@ -35,9 +24,9 @@ struct DLL_OCL_XMP ocl_xmp static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version); - static void start(ocl_xmp& device_context); + static void start(cuda_djezo& device_context); - static void stop(ocl_xmp& device_context); + static void stop(cuda_djezo& device_context); static void solve(const char *tequihash_header, unsigned int tequihash_header_len, @@ -46,11 +35,12 @@ struct DLL_OCL_XMP ocl_xmp std::function cancelf, std::function&, size_t, const unsigned char*)> solutionf, std::function hashdonef, - ocl_xmp& device_context); + 
cuda_djezo& device_context); - std::string getname() { return "OCL_XMP"; } + std::string getname() { return "CUDA-DJEZO"; } private: std::string m_gpu_name; std::string m_version; + int m_sm_count; }; \ No newline at end of file diff --git a/cuda_djezo/cuda_djezo.vcxproj b/cuda_djezo/cuda_djezo.vcxproj new file mode 100644 index 000000000..759a7cf20 --- /dev/null +++ b/cuda_djezo/cuda_djezo.vcxproj @@ -0,0 +1,117 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + + + + {268B10AD-D845-498B-8663-AB8911CA2039} + cuda_djezo + $(CUDA_PATH_V8_0) + + + + DynamicLibrary + true + MultiByte + v120 + + + DynamicLibrary + false + true + MultiByte + v120 + + + + + + + + + + + + + + true + + + + Level3 + Disabled + WIN32;WIN64;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) + 4334;4316;4244;4996;4251; + ..\3rdparty\include;%(AdditionalIncludeDirectories) + + + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + 64 + compute_61,sm_61;compute_52,sm_52;compute_50,sm_50; + true + false + + + + + Level3 + MaxSpeed + true + true + WIN32;WIN64;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) + true + 4334;4316;4244;4996;4251; + ..\3rdparty\include;%(AdditionalIncludeDirectories) + + + true + true + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + 64 + compute_50,sm_50;compute_52,sm_52;compute_61,sm_61; + true + -Xptxas -dlcm=ca -Xptxas -dscm=cs %(AdditionalOptions) + + + + + + + \ No newline at end of file diff --git a/cuda_djezo/eqcuda.hpp 
b/cuda_djezo/eqcuda.hpp new file mode 100644 index 000000000..48d663a45 --- /dev/null +++ b/cuda_djezo/eqcuda.hpp @@ -0,0 +1,99 @@ +#pragma once + +#include "cuda.h" +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "device_functions_decls.h" +#include "../cpu_tromp/blake2/blake2.h" +#include "cuda_djezo.hpp" + +#ifdef WIN32 +#define _SNPRINTF _snprintf +#else +#include +#define _SNPRINTF snprintf +#endif + +#define checkCudaErrors(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + char errorBuff[512]; \ + _SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \ + "CUDA error '%s' in func '%s' line %d", \ + cudaGetErrorString(err), __FUNCTION__, __LINE__); \ + throw std::runtime_error(errorBuff); \ + } \ +} while (0) + +#define checkCudaDriverErrors(call) \ +do { \ + CUresult err = call; \ + if (CUDA_SUCCESS != err) { \ + char errorBuff[512]; \ + _SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \ + "CUDA error DRIVER: '%d' in func '%s' line %d", \ + err, __FUNCTION__, __LINE__); \ + throw std::runtime_error(errorBuff); \ + } \ +} while (0) + +typedef uint64_t u64; +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint8_t u8; +typedef unsigned char uchar; + +struct packer_default; +struct packer_cantor; + +#define MAXREALSOLS 9 + +struct scontainerreal +{ + u32 sols[MAXREALSOLS][512]; + u32 nsols; +}; + +template +struct equi; + +struct eq_cuda_context_interface +{ + virtual ~eq_cuda_context_interface(); + + virtual void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef); +}; + + +template +struct eq_cuda_context : public eq_cuda_context_interface +{ + int threadsperblock; + int totalblocks; + int device_id; + equi* device_eq; + scontainerreal* solutions; + CUcontext pctx; + + eq_cuda_context(int id); + ~eq_cuda_context(); + + void solve(const char 
*tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef); +}; + +#define CONFIG_MODE_1 9, 1248, 12, 640, packer_cantor + +#define CONFIG_MODE_2 8, 640, 12, 512, packer_default \ No newline at end of file diff --git a/cuda_djezo/equi_miner.cu b/cuda_djezo/equi_miner.cu new file mode 100644 index 000000000..6ef9f45f2 --- /dev/null +++ b/cuda_djezo/equi_miner.cu @@ -0,0 +1,2159 @@ +/* + Equihash solver created by djeZo (l33tsoftw@gmail.com) for NiceHash + + Based on CUDA solver by John Tromp released under MIT license. + + Some helper functions taken out of OpenCL solver by Marc Bevand + released under MIT license. + + cuda_djezo solver is released by NiceHash (www.nicehash.com) under + GPL 3.0 license. If you don't have a copy, you can obtain one from + https://www.gnu.org/licenses/gpl-3.0.txt +*/ + +/* +The MIT License (MIT) + +Copyright (c) 2016 John Tromp + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software, and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* +The MIT License (MIT) + +Copyright (c) 2016 Marc Bevand + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software, and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#ifdef WIN32 +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eqcuda.hpp" +#include "sm_32_intrinsics.h" + +#define WN 200 +#define WK 9 +#define NDIGITS (WK+1) +#define DIGITBITS (WN/(NDIGITS)) +#define PROOFSIZE (1< +#include +#define __launch_bounds__(max_tpb, min_blocks) +#define __CUDA_ARCH__ 520 +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +void __threadfence_block(void); +uint32_t __ldg(const uint32_t* address); +uint64_t __ldg(const uint64_t* address); +uint4 __ldca(const uint4 *ptr); +u32 __ldca(const u32 *ptr); +u32 umin(const u32, const u32); +u32 umax(const u32, const u32); +#endif + + +typedef u32 proof[PROOFSIZE]; + + +struct __align__(32) slot +{ + u32 hash[8]; +}; + + +struct __align__(16) slotsmall +{ + u32 hash[4]; +}; + + +struct __align__(8) slottiny +{ + u32 hash[2]; +}; + + +template +struct equi +{ + slot round0trees[4096][RB8_NSLOTS]; + slot trees[1][NBUCKETS][NSLOTS]; + struct + { + slotsmall treessmall[NSLOTS]; + slottiny treestiny[NSLOTS]; + } round2trees[NBUCKETS]; + struct + { + slotsmall treessmall[NSLOTS]; + slottiny treestiny[NSLOTS]; + } round3trees[NBUCKETS]; + slotsmall treessmall[4][NBUCKETS][NSLOTS]; + slottiny treestiny[1][4096][RB8_NSLOTS_LD]; + u32 round4bidandsids[NBUCKETS][NSLOTS]; + union + { + u64 blake_h[8]; + u32 blake_h32[16]; + }; + struct + { + u32 nslots8[4096]; + u32 nslots0[4096]; + u32 nslots[9][NBUCKETS]; + scontainerreal srealcont; + } edata; +}; + + +__device__ __constant__ const u64 blake_iv[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 
0x5be0cd19137e2179, +}; + +__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) +{ + return make_uint2(a.x ^ b.x, a.y ^ b.y); +} + +__device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} + +__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +__device__ __forceinline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} + +__device__ __forceinline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} + +__device__ __forceinline__ void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) +{ + a = a + b + x; + ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]); + a = a + b + y; + ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + + +struct packer_default +{ + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (((bucketid << SLOTBITS) | s0) << SLOTBITS) | s1; + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + // BUCKMASK-ed to prevent illegal memory accesses in case of memory errors + return (bid >> (2 * SLOTBITS)) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const 
u32 s1, const u32 RB, const u32 SM) + { + return bid & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> SLOTBITS) & SLOTMASK; + } +}; + + +struct packer_cantor +{ + __device__ __forceinline__ static u32 cantor(const u32 s0, const u32 s1) + { + u32 a = umax(s0, s1); + u32 b = umin(s0, s1); + return a * (a + 1) / 2 + b; + } + + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (bucketid << CANTORBITS) | cantor(s0, s1); + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> CANTORBITS) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return ((bid & CANTORMASK) - cantor(0, s1)) & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + u32 k, q, sqr = 8 * (bid & CANTORMASK) + 1; + // this k=sqrt(sqr) computing loop averages 3.4 iterations out of maximum 9 + for (k = CANTORMAXSQRT; (q = sqr / k) < k; k = (k + q) / 2); + return ((k - 1) / 2) & SLOTMASK; + } +}; + + +template +__global__ void digit_first(equi* eq, u32 nonce) +{ + const u32 block = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ u64 hash_h[8]; + u32* hash_h32 = (u32*)hash_h; + + if (threadIdx.x < 16) + hash_h32[threadIdx.x] = __ldca(&eq->blake_h32[threadIdx.x]); + + __syncthreads(); + + u64 m = (u64)block << 32 | (u64)nonce; + + union + { + u64 v[16]; + u32 v32[32]; + uint4 v128[8]; + }; + + v[0] = hash_h[0]; + v[1] = hash_h[1]; + v[2] = hash_h[2]; + v[3] = hash_h[3]; + v[4] = hash_h[4]; + v[5] = hash_h[5]; + v[6] = hash_h[6]; + v[7] = hash_h[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4] ^ (128 + 16); + v[13] = blake_iv[5]; + v[14] = blake_iv[6] ^ 
0xffffffffffffffff; + v[15] = blake_iv[7]; + + // mix 1 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 2 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 3 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, m); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 4 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, m); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 5 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, m); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 6 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], m, 0); + + // mix 7 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], 
v[13], m, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 8 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, m); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 9 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], m, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 10 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], m, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 11 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 12 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + v[0] ^= hash_h[0] ^ v[8]; + v[1] ^= hash_h[1] ^ v[9]; + v[2] ^= hash_h[2] ^ v[10]; + v[3] ^= hash_h[3] ^ v[11]; + v[4] ^= hash_h[4] ^ v[12]; + v[5] ^= hash_h[5] ^ v[13]; + v32[12] ^= 
hash_h32[12] ^ v32[28]; + + u32 bexor = __byte_perm(v32[0], 0, 0x4012); // first 20 bits + u32 bucketid; + asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor)); + u32 slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1); + if (slotp < RB8_NSLOTS) + { + slot* s = &eq->round0trees[bucketid][slotp]; + + uint4 tt; + tt.x = __byte_perm(v32[0], v32[1], 0x1234); + tt.y = __byte_perm(v32[1], v32[2], 0x1234); + tt.z = __byte_perm(v32[2], v32[3], 0x1234); + tt.w = __byte_perm(v32[3], v32[4], 0x1234); + *(uint4*)(&s->hash[0]) = tt; + + tt.x = __byte_perm(v32[4], v32[5], 0x1234); + tt.y = __byte_perm(v32[5], v32[6], 0x1234); + tt.z = 0; + tt.w = block << 1; + *(uint4*)(&s->hash[4]) = tt; + } + + bexor = __byte_perm(v32[6], 0, 0x0123); + asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor)); + slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1); + if (slotp < RB8_NSLOTS) + { + slot* s = &eq->round0trees[bucketid][slotp]; + + uint4 tt; + tt.x = __byte_perm(v32[6], v32[7], 0x2345); + tt.y = __byte_perm(v32[7], v32[8], 0x2345); + tt.z = __byte_perm(v32[8], v32[9], 0x2345); + tt.w = __byte_perm(v32[9], v32[10], 0x2345); + *(uint4*)(&s->hash[0]) = tt; + + tt.x = __byte_perm(v32[10], v32[11], 0x2345); + tt.y = __byte_perm(v32[11], v32[12], 0x2345); + tt.z = 0; + tt.w = (block << 1) + 1; + *(uint4*)(&s->hash[4]) = tt; + } +} + +/* + Functions digit_1 to digit_8 work on the same principle: + Each thread does 2-3 slot loads (loads are coalesced). + The xorwork of the slots is loaded into shared memory and is kept in registers (except for digit_1). + At the same time, the restbits (8 or 9 bits) in the xorwork are used for collisions. + The restbits determine the position in ht. + Pair creation follows. The xorworks of the first one (or two) pairs are put into global memory + as soon as possible; the remaining pairs are saved in shared memory (one u32 per pair - 16 bit indices). + In most cases, all threads have one (or two) pairs, so with this trick we offload memory writes a bit in the last step.
+ In last step we save xorwork of pairs in memory. +*/ +template +__global__ void digit_1(equi* eq) +{ + __shared__ u16 ht[256][SSM - 1]; + __shared__ uint2 lastword1[RB8_NSLOTS]; + __shared__ uint4 lastword2[RB8_NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < 256) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots0[bucketid], RB8_NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + uint2 ta[2]; + uint4 tb[2]; + + u32 si[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + +#pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slot* pslot1 = eq->round0trees[bucketid] + si[i]; + + // get xhash + uint4 a1 = *(uint4*)(&pslot1->hash[0]); + uint2 a2 = *(uint2*)(&pslot1->hash[4]); + ta[i].x = a1.x; + ta[i].y = a1.y; + lastword1[si[i]] = ta[i]; + tb[i].x = a1.z; + tb[i].y = a1.w; + tb[i].z = a2.x; + tb[i].w = a2.y; + lastword2[si[i]] = tb[i]; + + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(hr[i]) : "r"(ta[i].x)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + int* pairs = ht_len; + + u32 xors[6]; + u32 xorbucketid, xorslot; + +#pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = ta[i] ^ lastword1[p]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[si[i]] ^ lastword2[p]; + + 
slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, si[i], p, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[i] ^ lastword2[k]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, i, k, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + } +} + + +template +__global__ void digit_2(equi* eq) +{ + __shared__ u16 ht[NRESTS][SSM - 1]; + __shared__ u32 lastword1[NSLOTS]; + __shared__ uint4 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slot* buck = eq->trees[0][bucketid]; + u32 bsize = umin(eq->edata.nslots[1][bucketid], NSLOTS); + + u32 
hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 ta[2]; + uint4 tt[2]; + + u32 si[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + +#pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + // get slot + const slot* pslot1 = buck + si[i]; + + uint4 ttx = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = ta[i] = ttx.x; + uint2 tty = *(uint2*)(&pslot1->hash[4]); + tt[i].x = ttx.y; + tt[i].y = ttx.z; + tt[i].z = ttx.w; + tt[i].w = tty.x; + lastword2[si[i]] = tt[i]; + + hr[i] = tty.y & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 xorbucketid, xorslot; + +#pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[0] = ta[i] ^ lastword1[p]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = tt[i] ^ lastword2[p]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[0] = lastword1[i] 
^ lastword1[k]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = lastword2[i] ^ lastword2[k]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } +} + + +template +__global__ void digit_3(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[2][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + u32 ta[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + +#pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round2trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round2trees[bucketid].treestiny[si[i]]; + + tt[i] = *(uint4*)(&xs.hash[0]); + lastword1[si[i]] = tt[i]; + ta[i] = xst.hash[0]; + lastword2[si[i]] = ta[i]; + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 bexor, xorbucketid, xorslot; + +#pragma unroll + for 
(u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[4] = ta[i] ^ lastword2[p]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = tt[i] ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[4] = lastword2[i] ^ lastword2[k]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + } +} + + +template +__global__ void digit_4(equi* eq) +{ + 
__shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[3][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + +#pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round3trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round3trees[bucketid].treestiny[si[i]]; + + // get xhash + tt[i] = *(uint4*)(&xs.hash[0]); + lastword[si[i]] = tt[i]; + hr[i] = xst.hash[0] & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 xorbucketid, xorslot; + +#pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 
prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + } + } + } +} + + +template +__global__ void digit_5(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slotsmall* buck = eq->treessmall[3][bucketid]; + u32 bsize = umin(eq->edata.nslots[4][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + +#pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword[si[i]] = tt[i]; + asm("bfe.u32 %0, %1, 4, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = 
atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 bexor, xorbucketid, xorslot; + +#pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = xors[3]; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[1]; + tt.y = xors[2]; + tt.z = xors[3]; + tt.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = tt; + } + } + } +} + + +template +__global__ void digit_6(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint2 lastword1[NSLOTS]; + __shared__ 
u32 lastword2[NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[5][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[2][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + +#pragma unroll + for (u32 i = 0; i != 3; ++i) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = *(uint2*)(&tt[i].x); + lastword2[si[i]] = tt[i].z; + asm("bfe.u32 %0, %1, 16, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + // doing this to save shared memory + int* pairs = ht_len; + __syncthreads(); + + u32 xors[3]; + u32 bexor, xorbucketid, xorslot; + +#pragma unroll + for (u32 i = 0; i != 3; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + 
*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + u32 pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + xors[2] = lastword2[i] ^ lastword2[k]; + if (xors[2] == 0) + continue; + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } +} + + +template +__global__ void digit_7(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == 
(NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[6][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[0][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + +#pragma unroll + for (u32 i = 0; i != 3; ++i) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint4*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].z), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 xorbucketid, xorslot; + +#pragma unroll + for (u32 i = 0; i != 3; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for 
(int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[0]; + tt.y = xors[1]; + tt.z = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + tt.w = 0; + *(uint4*)(&xs.hash[0]) = tt; + } +} + + +template +__global__ void digit_8(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + next_pair = 0; + pairs_len = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[7][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[1][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint2 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + +#pragma unroll + for (u32 i = 0; i != 3; ++i) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint2*)(&pslot1->hash[0]); + 
*(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 8, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 bexor, xorbucketid, xorslot; + +#pragma unroll + for (u32 i = 0; i != 3; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if 
(xors[1] == 0) + continue; + + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot >= RB8_NSLOTS_LD) continue; + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } +} + +/* + The last-round function is similar to the previous ones but has a different ending. + We use warps to process final candidates. Each warp processes one candidate. + The first two bidandsids (u32 of stored bucketid and two slotids) are retrieved by + lane 0 and lane 16, the next four bidandsids by lanes 0, 8, 16 and 24, ... until + all lanes in the warp have bidandsids from round 4. Next, each thread retrieves + 16 indices. While doing so, the indices are put into comparison using atomicExch + to determine if there are duplicates (Tromp's method). At the end, if no + duplicates are found, the candidate solution is saved (all indices). Note that this + dup check method is not exact, so CPU dup checking is needed afterwards.
+*/ +template +__global__ void digit_last_wdc(equi* eq) +{ + __shared__ u8 shared_data[8192]; + int* ht_len = (int*)(&shared_data[0]); + int* pairs = ht_len; + u32* lastword = (u32*)(&shared_data[256 * 4]); + u16* ht = (u16*)(&shared_data[256 * 4 + RB8_NSLOTS_LD * 4]); + u32* pairs_len = (u32*)(&shared_data[8188]); + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len +#pragma unroll + for (u32 i = 0; i != FCT; ++i) + ht_len[(i * (256 / FCT)) + threadid] = 0; + + if (threadid == ((256 / FCT) - 1)) + *pairs_len = 0; + + slottiny* buck = eq->treestiny[0][bucketid]; + u32 bsize = umin(eq->edata.nslots8[bucketid], RB8_NSLOTS_LD); + + u32 si[3 * FCT]; + u32 hr[3 * FCT]; + int pos[3 * FCT]; + u32 lw[3 * FCT]; +#pragma unroll + for (u32 i = 0; i != (3 * FCT); ++i) + pos[i] = SSM; + + __syncthreads(); + +#pragma unroll + for (u32 i = 0; i != (3 * FCT); ++i) + { + si[i] = i * (256 / FCT) + threadid; + if (si[i] >= bsize) break; + + const slottiny* pslot1 = buck + si[i]; + + // get xhash + uint2 tt = *(uint2*)(&pslot1->hash[0]); + lw[i] = tt.x; + lastword[si[i]] = lw[i]; + + u32 a; + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(a) : "r"(lw[i])); + hr[i] = a; + + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) + ht[hr[i] * (SSM - 1) + pos[i]] = si[i]; + } + + __syncthreads(); + +#pragma unroll + for (u32 i = 0; i != (3 * FCT); ++i) + { + if (pos[i] >= SSM) continue; + + for (int k = 0; k != pos[i]; ++k) + { + u16 prev = ht[hr[i] * (SSM - 1) + k]; + if (lw[i] != lastword[prev]) continue; + u32 pindex = atomicAdd(pairs_len, 1); + if (pindex >= MAXPAIRS) break; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + + __syncthreads(); + u32 plen = umin(*pairs_len, 64); + +#define CALC_LEVEL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + u32* bucks = eq->round4bidandsids[PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] 
= bucks[slot1]; \ + levels[c] = bucks[slot0]; \ + } + +#define CALC_LEVEL_SMALL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + slotsmall* bucks = eq->treessmall[a][PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1].hash[d]; \ + levels[c] = bucks[slot0].hash[d]; \ + } + + u32 lane = threadIdx.x & 0x1f; + u32 par = threadIdx.x >> 5; + + u32* levels = (u32*)&pairs[MAXPAIRS + (par << DUPBITS)]; + u32* susp = levels; + + while (par < plen) + { + int pair = pairs[par]; + par += W; + + if (lane % 16 == 0) + { + u32 plvl; + if (lane == 0) plvl = buck[__byte_perm(pair, 0, 0x4510)].hash[1]; + else plvl = buck[__byte_perm(pair, 0, 0x4532)].hash[1]; + slotsmall* bucks = eq->treessmall[1][PACKER::get_bucketid(plvl, RB, SM)]; + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); + levels[lane] = bucks[slot1].hash[2]; + levels[lane + 8] = bucks[slot0].hash[2]; + } + + if (lane % 8 == 0) + CALC_LEVEL_SMALL(0, lane, lane + 4, 3); + + if (lane % 4 == 0) + CALC_LEVEL_SMALL(2, lane, lane + 2, 3); + + if (lane % 2 == 0) + CALC_LEVEL(0, lane, lane + 1, 4); + + u32 ind[16]; + + u32 f1 = levels[lane]; + const slottiny* buck_v4 = &eq->round3trees[PACKER::get_bucketid(f1, RB, SM)].treestiny[0]; + const u32 slot1_v4 = PACKER::get_slot1(f1, RB, SM); + const u32 slot0_v4 = PACKER::get_slot0(f1, slot1_v4, RB, SM); + + susp[lane] = 0xffffffff; + susp[32 + lane] = 0xffffffff; + +#define CHECK_DUP(a) \ + __any(atomicExch(&susp[(ind[a] & ((1 << DUPBITS) - 1))], (ind[a] >> DUPBITS)) == (ind[a] >> DUPBITS)) + + u32 f2 = buck_v4[slot1_v4].hash[1]; + const slottiny* buck_v3_1 = &eq->round2trees[PACKER::get_bucketid(f2, RB, SM)].treestiny[0]; + const u32 slot1_v3_1 = PACKER::get_slot1(f2, RB, SM); + const u32 slot0_v3_1 = PACKER::get_slot0(f2, slot1_v3_1, RB, SM); + + susp[64 + lane] = 0xffffffff; + susp[96 + lane] = 0xffffffff; + + u32 f0 = 
buck_v3_1[slot1_v3_1].hash[1]; + const slot* buck_v2_1 = eq->trees[0][PACKER::get_bucketid(f0, RB, SM)]; + const u32 slot1_v2_1 = PACKER::get_slot1(f0, RB, SM); + const u32 slot0_v2_1 = PACKER::get_slot0(f0, slot1_v2_1, RB, SM); + + susp[128 + lane] = 0xffffffff; + susp[160 + lane] = 0xffffffff; + + u32 f3 = buck_v2_1[slot1_v2_1].hash[6]; + const slot* buck_fin_1 = eq->round0trees[packer_default::get_bucketid(f3, 8, RB8_NSLOTS)]; + const u32 slot1_fin_1 = packer_default::get_slot1(f3, 8, RB8_NSLOTS); + const u32 slot0_fin_1 = packer_default::get_slot0(f3, slot1_fin_1, 8, RB8_NSLOTS); + + susp[192 + lane] = 0xffffffff; + susp[224 + lane] = 0xffffffff; + + ind[0] = buck_fin_1[slot1_fin_1].hash[7]; + if (CHECK_DUP(0)) continue; + ind[1] = buck_fin_1[slot0_fin_1].hash[7]; + if (CHECK_DUP(1)) continue; + + u32 f4 = buck_v2_1[slot0_v2_1].hash[6]; + const slot* buck_fin_2 = eq->round0trees[packer_default::get_bucketid(f4, 8, RB8_NSLOTS)]; + const u32 slot1_fin_2 = packer_default::get_slot1(f4, 8, RB8_NSLOTS); + const u32 slot0_fin_2 = packer_default::get_slot0(f4, slot1_fin_2, 8, RB8_NSLOTS); + + ind[2] = buck_fin_2[slot1_fin_2].hash[7]; + if (CHECK_DUP(2)) continue; + ind[3] = buck_fin_2[slot0_fin_2].hash[7]; + if (CHECK_DUP(3)) continue; + + u32 f5 = buck_v3_1[slot0_v3_1].hash[1]; + const slot* buck_v2_2 = eq->trees[0][PACKER::get_bucketid(f5, RB, SM)]; + const u32 slot1_v2_2 = PACKER::get_slot1(f5, RB, SM); + const u32 slot0_v2_2 = PACKER::get_slot0(f5, slot1_v2_2, RB, SM); + + u32 f6 = buck_v2_2[slot1_v2_2].hash[6]; + const slot* buck_fin_3 = eq->round0trees[packer_default::get_bucketid(f6, 8, RB8_NSLOTS)]; + const u32 slot1_fin_3 = packer_default::get_slot1(f6, 8, RB8_NSLOTS); + const u32 slot0_fin_3 = packer_default::get_slot0(f6, slot1_fin_3, 8, RB8_NSLOTS); + + ind[4] = buck_fin_3[slot1_fin_3].hash[7]; + if (CHECK_DUP(4)) continue; + ind[5] = buck_fin_3[slot0_fin_3].hash[7]; + if (CHECK_DUP(5)) continue; + + u32 f7 = buck_v2_2[slot0_v2_2].hash[6]; + const slot* 
buck_fin_4 = eq->round0trees[packer_default::get_bucketid(f7, 8, RB8_NSLOTS)]; + const u32 slot1_fin_4 = packer_default::get_slot1(f7, 8, RB8_NSLOTS); + const u32 slot0_fin_4 = packer_default::get_slot0(f7, slot1_fin_4, 8, RB8_NSLOTS); + + ind[6] = buck_fin_4[slot1_fin_4].hash[7]; + if (CHECK_DUP(6)) continue; + ind[7] = buck_fin_4[slot0_fin_4].hash[7]; + if (CHECK_DUP(7)) continue; + + u32 f8 = buck_v4[slot0_v4].hash[1]; + const slottiny* buck_v3_2 = &eq->round2trees[PACKER::get_bucketid(f8, RB, SM)].treestiny[0]; + const u32 slot1_v3_2 = PACKER::get_slot1(f8, RB, SM); + const u32 slot0_v3_2 = PACKER::get_slot0(f8, slot1_v3_2, RB, SM); + + u32 f9 = buck_v3_2[slot1_v3_2].hash[1]; + const slot* buck_v2_3 = eq->trees[0][PACKER::get_bucketid(f9, RB, SM)]; + const u32 slot1_v2_3 = PACKER::get_slot1(f9, RB, SM); + const u32 slot0_v2_3 = PACKER::get_slot0(f9, slot1_v2_3, RB, SM); + + u32 f10 = buck_v2_3[slot1_v2_3].hash[6]; + const slot* buck_fin_5 = eq->round0trees[packer_default::get_bucketid(f10, 8, RB8_NSLOTS)]; + const u32 slot1_fin_5 = packer_default::get_slot1(f10, 8, RB8_NSLOTS); + const u32 slot0_fin_5 = packer_default::get_slot0(f10, slot1_fin_5, 8, RB8_NSLOTS); + + ind[8] = buck_fin_5[slot1_fin_5].hash[7]; + if (CHECK_DUP(8)) continue; + ind[9] = buck_fin_5[slot0_fin_5].hash[7]; + if (CHECK_DUP(9)) continue; + + u32 f11 = buck_v2_3[slot0_v2_3].hash[6]; + const slot* buck_fin_6 = eq->round0trees[packer_default::get_bucketid(f11, 8, RB8_NSLOTS)]; + const u32 slot1_fin_6 = packer_default::get_slot1(f11, 8, RB8_NSLOTS); + const u32 slot0_fin_6 = packer_default::get_slot0(f11, slot1_fin_6, 8, RB8_NSLOTS); + + ind[10] = buck_fin_6[slot1_fin_6].hash[7]; + if (CHECK_DUP(10)) continue; + ind[11] = buck_fin_6[slot0_fin_6].hash[7]; + if (CHECK_DUP(11)) continue; + + u32 f12 = buck_v3_2[slot0_v3_2].hash[1]; + const slot* buck_v2_4 = eq->trees[0][PACKER::get_bucketid(f12, RB, SM)]; + const u32 slot1_v2_4 = PACKER::get_slot1(f12, RB, SM); + const u32 slot0_v2_4 = 
PACKER::get_slot0(f12, slot1_v2_4, RB, SM); + + u32 f13 = buck_v2_4[slot1_v2_4].hash[6]; + const slot* buck_fin_7 = eq->round0trees[packer_default::get_bucketid(f13, 8, RB8_NSLOTS)]; + const u32 slot1_fin_7 = packer_default::get_slot1(f13, 8, RB8_NSLOTS); + const u32 slot0_fin_7 = packer_default::get_slot0(f13, slot1_fin_7, 8, RB8_NSLOTS); + + ind[12] = buck_fin_7[slot1_fin_7].hash[7]; + if (CHECK_DUP(12)) continue; + ind[13] = buck_fin_7[slot0_fin_7].hash[7]; + if (CHECK_DUP(13)) continue; + + u32 f14 = buck_v2_4[slot0_v2_4].hash[6]; + const slot* buck_fin_8 = eq->round0trees[packer_default::get_bucketid(f14, 8, RB8_NSLOTS)]; + const u32 slot1_fin_8 = packer_default::get_slot1(f14, 8, RB8_NSLOTS); + const u32 slot0_fin_8 = packer_default::get_slot0(f14, slot1_fin_8, 8, RB8_NSLOTS); + + ind[14] = buck_fin_8[slot1_fin_8].hash[7]; + if (CHECK_DUP(14)) continue; + ind[15] = buck_fin_8[slot0_fin_8].hash[7]; + if (CHECK_DUP(15)) continue; + + u32 soli; + if (lane == 0) + { + soli = atomicAdd(&eq->edata.srealcont.nsols, 1); + } + soli = __shfl(soli, 0); + + if (soli < MAXREALSOLS) + { + u32 pos = lane << 4; + *(uint4*)(&eq->edata.srealcont.sols[soli][pos]) = *(uint4*)(&ind[0]); + *(uint4*)(&eq->edata.srealcont.sols[soli][pos + 4]) = *(uint4*)(&ind[4]); + *(uint4*)(&eq->edata.srealcont.sols[soli][pos + 8]) = *(uint4*)(&ind[8]); + *(uint4*)(&eq->edata.srealcont.sols[soli][pos + 12]) = *(uint4*)(&ind[12]); + } + } +} + + +std::mutex dev_init; +int dev_init_done[8] = { 0 }; + + +__host__ int compu32(const void *pa, const void *pb) +{ + uint32_t a = *(uint32_t *)pa, b = *(uint32_t *)pb; + return a b[i]) + { + need_sorting = 1; + tmp = a[i]; + a[i] = b[i]; + b[i] = tmp; + } + else if (a[i] < b[i]) + return; +} + + +__host__ void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen) +{ + uint32_t le_N = WN; + uint32_t le_K = WK; + uchar personal[] = "ZcashPoW01230123"; + memcpy(personal + 8, &le_N, 4); + memcpy(personal + 
12, &le_K, 4); + blake2b_param P[1]; + P->digest_length = HASHOUT; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + P->leaf_length = 0; + P->node_offset = 0; + P->node_depth = 0; + P->inner_length = 0; + memset(P->reserved, 0, sizeof(P->reserved)); + memset(P->salt, 0, sizeof(P->salt)); + memcpy(P->personal, (const uint8_t *)personal, 16); + blake2b_init_param(ctx, P); + blake2b_update(ctx, (const uchar *)header, headerLen); + blake2b_update(ctx, (const uchar *)nce, nonceLen); +} + + +#ifdef WIN32 +typedef CUresult(CUDAAPI *dec_cuDeviceGet)(CUdevice*, int); +typedef CUresult(CUDAAPI *dec_cuCtxCreate)(CUcontext*, unsigned int, CUdevice); +typedef CUresult(CUDAAPI *dec_cuCtxPushCurrent)(CUcontext); +typedef CUresult(CUDAAPI *dec_cuCtxDestroy)(CUcontext); + +dec_cuDeviceGet _cuDeviceGet = nullptr; +dec_cuCtxCreate _cuCtxCreate = nullptr; +dec_cuCtxPushCurrent _cuCtxPushCurrent = nullptr; +dec_cuCtxDestroy _cuCtxDestroy = nullptr; +#endif + + +template +__host__ eq_cuda_context::eq_cuda_context(int id) + : device_id(id) +{ + solutions = nullptr; + + dev_init.lock(); + if (!dev_init_done[device_id]) + { + // only first thread shall init device + checkCudaErrors(cudaSetDevice(device_id)); + checkCudaErrors(cudaDeviceReset()); + checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + + pctx = nullptr; + } + else + { + // create new context + CUdevice dev; + +#ifdef WIN32 + if (_cuDeviceGet == nullptr) + { + HMODULE hmod = LoadLibraryA("nvcuda.dll"); + if (hmod == NULL) + throw std::runtime_error("Failed to load nvcuda.dll"); + _cuDeviceGet = (dec_cuDeviceGet)GetProcAddress(hmod, "cuDeviceGet"); + if (_cuDeviceGet == nullptr) + throw std::runtime_error("Failed to get cuDeviceGet address"); + _cuCtxCreate = (dec_cuCtxCreate)GetProcAddress(hmod, "cuCtxCreate_v2"); + if (_cuCtxCreate == nullptr) + throw std::runtime_error("Failed to get cuCtxCreate address"); + _cuCtxPushCurrent = (dec_cuCtxPushCurrent)GetProcAddress(hmod, "cuCtxPushCurrent_v2"); + if 
(_cuCtxPushCurrent == nullptr) + throw std::runtime_error("Failed to get cuCtxPushCurrent address"); + _cuCtxDestroy = (dec_cuCtxDestroy)GetProcAddress(hmod, "cuCtxDestroy_v2"); + if (_cuCtxDestroy == nullptr) + throw std::runtime_error("Failed to get cuCtxDestroy address"); + } + + + checkCudaDriverErrors(_cuDeviceGet(&dev, device_id)); + checkCudaDriverErrors(_cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev)); + checkCudaDriverErrors(_cuCtxPushCurrent(pctx)); +#else + checkCudaDriverErrors(cuDeviceGet(&dev, device_id)); + checkCudaDriverErrors(cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev)); + checkCudaDriverErrors(cuCtxPushCurrent(pctx)); +#endif + } + ++dev_init_done[device_id]; + dev_init.unlock(); + + if (cudaMalloc((void**)&device_eq, sizeof(equi)) != cudaSuccess) + throw std::runtime_error("CUDA: failed to alloc memory"); + + solutions = (scontainerreal*)malloc(sizeof(scontainerreal)); +} + + +template +__host__ void eq_cuda_context::solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef) +{ + blake2b_state blake_ctx; + + int blocks = NBUCKETS; + + setheader(&blake_ctx, tequihash_header, tequihash_header_len, nonce, nonce_len); + + // todo: improve + // djezo solver allows last 4 bytes of nonce to be iterrated + // this can be used to create internal loop - calc initial blake hash only once, then load 8*8 bytes on device (blake state h) + // then just iterate nn++ + // less CPU load, 1 cudaMemcpy less -> faster + //u32 nn = *(u32*)&nonce[28]; + u32 nn = 0; + + checkCudaErrors(cudaMemcpy(&device_eq->blake_h, &blake_ctx.h, sizeof(u64) * 8, cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata))); + + digit_first << > >(device_eq, nn); + + digit_1 << <4096, 512 >> >(device_eq); + + digit_2 << > >(device_eq); + + digit_3 << > 
>(device_eq); + + if (cancelf()) return; + + digit_4 << > >(device_eq); + + digit_5 << > >(device_eq); + + digit_6 << > >(device_eq); + + digit_7 << > >(device_eq); + + digit_8 << > >(device_eq); + + digit_last_wdc << <4096, 256 / 2 >> >(device_eq); + + checkCudaErrors(cudaMemcpy(solutions, &device_eq->edata.srealcont, (MAXREALSOLS * (512 * 4)) + 4, cudaMemcpyDeviceToHost)); + + //printf("nsols: %u\n", solutions->nsols); + //if (solutions->nsols > 9) + // printf("missing sol, total: %u\n", solutions->nsols); + + for (u32 s = 0; (s < solutions->nsols) && (s < MAXREALSOLS); s++) + { + // remove dups on CPU (dup removal on GPU is not fully exact and can pass on some invalid solutions) + if (duped(solutions->sols[s])) continue; + + // perform sort of pairs + for (uint32_t level = 0; level < 9; level++) + for (uint32_t i = 0; i < (1 << 9); i += (2 << level)) + sort_pair(&solutions->sols[s][i], 1 << level); + + std::vector index_vector(PROOFSIZE); + for (u32 i = 0; i < PROOFSIZE; i++) { + index_vector[i] = solutions->sols[s][i]; + } + + solutionf(index_vector, DIGITBITS, nullptr); + } + + hashdonef(); +} + + +template +__host__ eq_cuda_context::~eq_cuda_context() +{ + if (solutions) + free(solutions); + + cudaFree(device_eq); + + if (pctx) + { + // non primary thread, destroy context +#ifdef WIN32 + checkCudaDriverErrors(_cuCtxDestroy(pctx)); +#else + checkCudaDriverErrors(cuCtxDestroy(pctx)); +#endif + } + else + { + checkCudaErrors(cudaDeviceReset()); + + dev_init_done[device_id] = 0; + } +} + + +#ifdef CONFIG_MODE_1 +template class eq_cuda_context; +#endif + +#ifdef CONFIG_MODE_2 +template class eq_cuda_context; +#endif + +#ifdef CONFIG_MODE_3 +template class eq_cuda_context; +#endif diff --git a/cuda_tromp/CMakeLists.txt b/cuda_tromp/CMakeLists.txt new file mode 100644 index 000000000..12bdc8bf4 --- /dev/null +++ b/cuda_tromp/CMakeLists.txt @@ -0,0 +1,57 @@ +set(EXECUTABLE cuda_tromp) + +option(ENABLE_CUDA "Enable the cuda build" ON) + +# depending on gcc version +# 
;-std=c++11 => Ubuntu 14.04 check gcc versions +#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11) + +file(GLOB SRC_LIST + cuda_tromp.cpp + equi_miner.cu ) +file(GLOB HEADERS + cuda_tromp.hpp + eqcuda.hpp + ) + + +#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-m64;--std=c++11;--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo) + +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--disable-warnings;--ptxas-options=-v;-use_fast_math;-lineinfo) + +add_definitions(-DHIST) +#add_definitions(-DXINTREE) +#add_definitions(-DUNROLL) + +list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3) + + +FIND_PACKAGE(CUDA REQUIRED) +if(COMPUTE AND (COMPUTE GREATER 0)) + LIST(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}) +else(COMPUTE AND (COMPUTE GREATER 0)) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};; -gencode arch=compute_20,code=sm_21; -gencode arch=compute_30,code=sm_30; -gencode arch=compute_35,code=sm_35; -gencode arch=compute_50,code=sm_50; -gencode arch=compute_52,code=sm_52; -gencode arch=compute_61,code=sm_61 ) +endif(COMPUTE AND (COMPUTE GREATER 0)) + +include_directories(${CUDA_INCLUDE_DIRS}) + +find_package(Threads REQUIRED COMPONENTS) +find_package(Boost REQUIRED COMPONENTS system log_setup log date_time filesystem thread) + +if(CUDA_FOUND) +message("CUDA FOUND") +else() +message("CUDA NOT FOUND") +endif() + + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(..) 
+CUDA_ADD_LIBRARY(${EXECUTABLE} STATIC ${SRC_LIST} ${HEADERS}) +TARGET_LINK_LIBRARIES(${EXECUTABLE} ${CUDA_LIBRARIES}) + +message("-- CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}") + +install( TARGETS ${EXECUTABLE} RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib ) +install( FILES ${HEADERS} DESTINATION include/${EXECUTABLE} ) diff --git a/nheqminer/AvailableSolvers.h b/nheqminer/AvailableSolvers.h new file mode 100644 index 000000000..a071ae3b5 --- /dev/null +++ b/nheqminer/AvailableSolvers.h @@ -0,0 +1,97 @@ +#pragma once + +#include "Solver.h" +#include "SolverStub.h" + + +#ifdef USE_CPU_TROMP +#include "../cpu_tromp/cpu_tromp.hpp" +#else +CREATE_SOLVER_STUB(cpu_tromp, "cpu_tromp_STUB") +#endif +#ifdef USE_CPU_XENONCAT +#include "../cpu_xenoncat/cpu_xenoncat.hpp" +#else +CREATE_SOLVER_STUB(cpu_xenoncat, "cpu_xenoncat_STUB") +#endif +#ifdef USE_CUDA_TROMP +#include "../cuda_tromp/cuda_tromp.hpp" +#else +CREATE_SOLVER_STUB(cuda_tromp, "cuda_tromp_STUB") +#endif +#ifdef USE_CUDA_DJEZO +#include "../cuda_djezo/cuda_djezo.hpp" +#else +CREATE_SOLVER_STUB(cuda_djezo, "cuda_djezo_STUB") +#endif +// OpenCL solvers are fropped replace with new OS solvers +#ifdef USE_OCL_XMP +#include "../ocl_xpm/ocl_xmp.hpp" +#else +CREATE_SOLVER_STUB(ocl_xmp, "ocl_xmp_STUB") +#endif +#ifdef USE_OCL_SILENTARMY +#include "../ocl_silentarmy/ocl_silentarmy.hpp" +#else +CREATE_SOLVER_STUB(ocl_silentarmy, "ocl_silentarmy_STUB") +#endif + +//namespace AvailableSolvers +//{ +//} // AvailableSolvers + +// CPU solvers +class CPUSolverTromp : public Solver { +public: + CPUSolverTromp(int use_opt) : Solver(new cpu_tromp(), SolverType::CPU) { + _context->use_opt = use_opt; + } + virtual ~CPUSolverTromp() {} +}; +class CPUSolverXenoncat : public Solver { +public: + CPUSolverXenoncat(int use_opt) : Solver(new cpu_xenoncat(), SolverType::CPU) { + _context->use_opt = use_opt; + } + virtual ~CPUSolverXenoncat() {} +}; +// TODO remove platform id for cuda solvers +// CUDA solvers +class 
CUDASolverDjezo : public Solver { +public: + CUDASolverDjezo(int dev_id, int blocks, int threadsperblock) : Solver(new cuda_djezo(0, dev_id), SolverType::CUDA) { + if (blocks > 0) { + _context->blocks = blocks; + } + if (threadsperblock > 0) { + _context->threadsperblock = threadsperblock; + } + } + virtual ~CUDASolverDjezo() {} +}; +class CUDASolverTromp : public Solver { +public: + CUDASolverTromp(int dev_id, int blocks, int threadsperblock) : Solver(new cuda_tromp(0, dev_id), SolverType::CUDA) { + if (blocks > 0) { + _context->blocks = blocks; + } + if (threadsperblock > 0) { + _context->threadsperblock = threadsperblock; + } + } + virtual ~CUDASolverTromp() {} +}; +// OpenCL solvers +class OPENCLSolverSilentarmy : public Solver { +public: + OPENCLSolverSilentarmy(int platf_id, int dev_id) : Solver(new ocl_silentarmy(platf_id, dev_id), SolverType::OPENCL) { + } + virtual ~OPENCLSolverSilentarmy() {} +}; +class OPENCLSolverXMP : public Solver { +public: + OPENCLSolverXMP(int platf_id, int dev_id) : Solver(new ocl_xmp(platf_id, dev_id), SolverType::OPENCL) { + } + virtual ~OPENCLSolverXMP() {} +}; + diff --git a/nheqminer/ISolver.h b/nheqminer/ISolver.h new file mode 100644 index 000000000..bad815f66 --- /dev/null +++ b/nheqminer/ISolver.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include +#include +#include + +enum class SolverType { + CPU = 0, + CUDA, + OPENCL +}; + +class ISolver +{ +public: + //ISolver() { } + //virtual ~ISolver() { } + virtual void start() = 0; + virtual void stop() = 0; + + virtual void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef) = 0; + + virtual std::string getdevinfo() = 0; + virtual std::string getname() = 0; + virtual SolverType GetType() const = 0; +}; + diff --git a/nheqminer/MinerFactory.cpp b/nheqminer/MinerFactory.cpp new file mode 100644 index 
000000000..b8a701956 --- /dev/null +++ b/nheqminer/MinerFactory.cpp @@ -0,0 +1,94 @@ +#include "MinerFactory.h" + +#include + +extern int use_avx; +extern int use_avx2; + + + +MinerFactory::~MinerFactory() +{ + ClearAllSolvers(); +} + +std::vector MinerFactory::GenerateSolvers(int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { + std::vector solversPointers; + + for (int i = 0; i < cuda_count; ++i) { + solversPointers.push_back(GenCUDASolver(cuda_en[i], cuda_b[i], cuda_t[i])); + } + + for (int i = 0; i < opencl_count; ++i) + { + if (opencl_t[i] < 1) opencl_t[i] = 1; + + // add multiple threads if wanted + for (int k = 0; k < opencl_t[i]; ++k) { + // todo: save local&global work size, new solvers + solversPointers.push_back(GenOPENCLSolver(opencl_platf, opencl_en[i])); + } + } + + bool hasGpus = solversPointers.size() > 0; + if (cpu_threads < 0) { + cpu_threads = std::thread::hardware_concurrency(); + if (cpu_threads < 1) cpu_threads = 1; + else if (hasGpus) --cpu_threads; // decrease number of threads if there are GPU workers + } + + for (int i = 0; i < cpu_threads; ++i) + { + solversPointers.push_back(GenCPUSolver(use_avx2)); + } + + return solversPointers; +} + +void MinerFactory::ClearAllSolvers() { + for (ISolver * ds : _solvers) { + if (ds != nullptr) { + delete ds; + } + } + _solvers.clear(); +} + +ISolver * MinerFactory::GenCPUSolver(int use_opt) { + // TODO fix dynamic linking on Linux +#ifdef USE_CPU_XENONCAT + if (_use_xenoncat) { + _solvers.push_back(new CPUSolverXenoncat(use_opt)); + return _solvers.back(); + } else { + _solvers.push_back(new CPUSolverTromp(use_opt)); + return _solvers.back(); + } +#else + _solvers.push_back(new CPUSolverTromp(use_opt)); + return _solvers.back(); +#endif +} + +ISolver * MinerFactory::GenCUDASolver(int dev_id, int blocks, int threadsperblock) { + if (_use_cuda_djezo) { + _solvers.push_back(new CUDASolverDjezo(dev_id, blocks, 
threadsperblock)); + return _solvers.back(); + } + else { + _solvers.push_back(new CUDASolverTromp(dev_id, blocks, threadsperblock)); + return _solvers.back(); + } +} +// no OpenCL solvers at the moment keep for future reference +ISolver * MinerFactory::GenOPENCLSolver(int platf_id, int dev_id) { + if (_use_silentarmy) { + _solvers.push_back(new OPENCLSolverSilentarmy(platf_id, dev_id)); + return _solvers.back(); + } + else { + _solvers.push_back(new OPENCLSolverXMP(platf_id, dev_id)); + return _solvers.back(); + } +} diff --git a/nheqminer/MinerFactory.h b/nheqminer/MinerFactory.h new file mode 100644 index 000000000..94c63a0d7 --- /dev/null +++ b/nheqminer/MinerFactory.h @@ -0,0 +1,30 @@ +#pragma once + +#include + +class MinerFactory +{ +public: + MinerFactory(bool use_xenoncat, bool use_cuda_djezo, bool use_silentarmy) + : _use_xenoncat(use_xenoncat), _use_cuda_djezo(use_cuda_djezo), _use_silentarmy(use_silentarmy) { + } + + ~MinerFactory(); + + std::vector GenerateSolvers(int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, + int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); + void ClearAllSolvers(); + +private: + std::vector _solvers; + + bool _use_xenoncat = true; + bool _use_cuda_djezo = true; + bool _use_silentarmy = true; + + ISolver * GenCPUSolver(int use_opt); + ISolver * GenCUDASolver(int dev_id, int blocks, int threadsperblock); + ISolver * GenOPENCLSolver(int platf_id, int dev_id); + +}; + diff --git a/nheqminer/Solver.h b/nheqminer/Solver.h new file mode 100644 index 000000000..7ebb8c100 --- /dev/null +++ b/nheqminer/Solver.h @@ -0,0 +1,57 @@ +#pragma once + +#include "ISolver.h" + +template +class Solver : public ISolver +{ +protected: + const SolverType _type; + StaticInterface * const _context = nullptr; +public: + Solver(StaticInterface *contex, SolverType type) : _context(contex), _type(type){} + virtual ~Solver() { + // the solver owns the context should delete it + if (_context != nullptr) { + delete 
_context; + } + } + + virtual void start() override { + StaticInterface::start(*_context); + } + + virtual void stop() override { + StaticInterface::stop(*_context); + } + + virtual void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef) override { + StaticInterface::solve( + tequihash_header, + tequihash_header_len, + nonce, + nonce_len, + cancelf, + solutionf, + hashdonef, + *_context); + } + + virtual std::string getdevinfo() override { + return _context->getdevinfo(); + } + + virtual std::string getname() override { + return _context->getname(); + } + + virtual SolverType GetType() const override { + return _type; + } +}; \ No newline at end of file diff --git a/nheqminer/libstratum/StratumClient.cpp b/nheqminer/libstratum/StratumClient.cpp index 50dbb6f1f..165f1a007 100644 --- a/nheqminer/libstratum/StratumClient.cpp +++ b/nheqminer/libstratum/StratumClient.cpp @@ -421,13 +421,5 @@ bool StratumClient::submit(const Solution* solution, const return true; } -// XMP -template class StratumClient; -template class StratumClient; -template class StratumClient; -template class StratumClient; -// Silentarmy -template class StratumClient; -template class StratumClient; -template class StratumClient; -template class StratumClient; \ No newline at end of file +// create StratumClient class +template class StratumClient; \ No newline at end of file diff --git a/nheqminer/libstratum/StratumClient.h b/nheqminer/libstratum/StratumClient.h index 373baf183..eff327812 100644 --- a/nheqminer/libstratum/StratumClient.h +++ b/nheqminer/libstratum/StratumClient.h @@ -162,13 +162,5 @@ class StratumClient }; -// XMP -typedef StratumClient ZcashStratumClientAVXCUDA80_XMP; -typedef StratumClient ZcashStratumClientSSE2CUDA80_XMP; -typedef StratumClient ZcashStratumClientAVXCUDA75_XMP; -typedef StratumClient 
ZcashStratumClientSSE2CUDA75_XMP; -// Silentarmy -typedef StratumClient ZcashStratumClientAVXCUDA80_SA; -typedef StratumClient ZcashStratumClientSSE2CUDA80_SA; -typedef StratumClient ZcashStratumClientAVXCUDA75_SA; -typedef StratumClient ZcashStratumClientSSE2CUDA75_SA; \ No newline at end of file +// ZcashStratumClient +typedef StratumClient ZcashStratumClient; \ No newline at end of file diff --git a/nheqminer/libstratum/ZcashStratum.cpp b/nheqminer/libstratum/ZcashStratum.cpp index 6c7758c20..7eac71999 100644 --- a/nheqminer/libstratum/ZcashStratum.cpp +++ b/nheqminer/libstratum/ZcashStratum.cpp @@ -6,7 +6,6 @@ #include "ZcashStratum.h" #include "utilstrencodings.h" -//#include "trompequihash/equi_miner.h" #include "streams.h" #include @@ -96,10 +95,10 @@ std::vector GetMinimalFromIndices(std::vector indices, return ret; } -template -void static ZcashMinerThread(ZcashMiner* miner, int size, int pos, Solver& extra) + +void static ZcashMinerThread(ZcashMiner* miner, int size, int pos, ISolver *solver) { - BOOST_LOG_CUSTOM(info, pos) << "Starting thread #" << pos << " (" << extra.getname() << ") " << extra.getdevinfo(); + BOOST_LOG_CUSTOM(info, pos) << "Starting thread #" << pos << " (" << solver->getname() << ") " << solver->getdevinfo(); std::shared_ptr m_zmt(new std::mutex); CBlockHeader header; @@ -141,7 +140,7 @@ void static ZcashMinerThread(ZcashMiner* mi try { - Solver::start(extra); + solver->start(); while (true) { // Wait for work @@ -236,14 +235,13 @@ void static ZcashMinerThread(ZcashMiner* mi speed.AddHash(); }; - Solver::solve(tequihash_header, + solver->solve(tequihash_header, tequihash_header_len, (const char*)bNonce.begin(), bNonce.size(), cancelFun, solutionFound, - hashDone, - extra); + hashDone); // Check for stop if (!miner->minerThreadActive[pos]) @@ -278,18 +276,19 @@ void static ZcashMinerThread(ZcashMiner* mi catch (const std::runtime_error &e) { BOOST_LOG_CUSTOM(error, pos) << e.what(); + exit(0); } try { - Solver::stop(extra); + 
solver->stop(); } catch (const std::runtime_error &e) { BOOST_LOG_CUSTOM(error, pos) << e.what(); } - BOOST_LOG_CUSTOM(info, pos) << "Thread #" << pos << " ended (" << extra.getname() << ")"; + BOOST_LOG_CUSTOM(info, pos) << "Thread #" << pos << " ended (" << solver->getname() << ")"; } ZcashJob* ZcashJob::clone() const @@ -333,82 +332,29 @@ std::string ZcashJob::getSubmission(const EquihashSolution* solution) return stream.str(); } -template -ZcashMiner::ZcashMiner(int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) - : minerThreads{nullptr} + +ZcashMiner::ZcashMiner(const std::vector &i_solvers) + : minerThreads{ nullptr } { m_isActive = false; - nThreads = 0; - - for (int i = 0; i < cuda_count; ++i) - { - CUDASolver* context = new CUDASolver(0, cuda_en[i]); - if (cuda_b[i] > 0) - context->blocks = cuda_b[i]; - if (cuda_t[i] > 0) - context->threadsperblock = cuda_t[i]; - - cuda_contexts.push_back(context); - } - nThreads += cuda_contexts.size(); - - - for (int i = 0; i < opencl_count; ++i) - { - if (opencl_t[i] < 1) opencl_t[i] = 1; - - // add multiple threads if wanted - for (int k = 0; k < opencl_t[i]; ++k) - { - OPENCLSolver* context = new OPENCLSolver(opencl_platf, opencl_en[i]); - // todo: save local&global work size - opencl_contexts.push_back(context); - } - } - nThreads += opencl_contexts.size(); - - - - if (cpu_threads < 0) { - cpu_threads = std::thread::hardware_concurrency(); - if (cpu_threads < 1) cpu_threads = 1; - else if (cuda_contexts.size() + opencl_contexts.size() > 0) --cpu_threads; // decrease number of threads if there are GPU workers - } - - - for (int i = 0; i < cpu_threads; ++i) - { - CPUSolver* context = new CPUSolver(); - context->use_opt = use_avx2; - cpu_contexts.push_back(context); - } - nThreads += cpu_contexts.size(); - - -// nThreads = cpu_contexts.size() + cuda_contexts.size() + opencl_contexts.size(); + solvers = i_solvers; + nThreads = 
solvers.size(); } -template -ZcashMiner::~ZcashMiner() + +ZcashMiner::~ZcashMiner() { stop(); - for (auto it = cpu_contexts.begin(); it != cpu_contexts.end(); ++it) - delete (*it); - for (auto it = cuda_contexts.begin(); it != cuda_contexts.end(); ++it) - delete (*it); - cpu_contexts.clear(); - cuda_contexts.clear(); } -template -std::string ZcashMiner::userAgent() + +std::string ZcashMiner::userAgent() { - return "equihashminer/" STANDALONE_MINER_VERSION; + return "nheqminer/" STANDALONE_MINER_VERSION; } -template -void ZcashMiner::start() + +void ZcashMiner::start() { if (minerThreads) { stop(); @@ -419,60 +365,71 @@ void ZcashMiner::start() minerThreads = new std::thread[nThreads]; minerThreadActive = new bool[nThreads]; - - // start cpu threads - int i = 0; - for ( ; i < cpu_contexts.size(); ++i) - { - minerThreadActive[i] = true; - minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread, - this, nThreads, i, *cpu_contexts.at(i))); + // sort solvers CPU, CUDA, OPENCL + std::sort(solvers.begin(), solvers.end(), [](const ISolver* a, const ISolver* b) { return a->GetType() < b->GetType(); }); + + // start solvers + // #1 start cpu threads + // #2 start CUDA threads + // #3 start OPENCL threads + for (int i = 0; i < solvers.size(); ++i) { + minerThreadActive[i] = true; + minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread, this, nThreads, i, solvers[i])); + if (solvers[i]->GetType() == SolverType::CPU) { #ifdef WIN32 - HANDLE hThread = minerThreads[i].native_handle(); - if (!SetThreadPriority(hThread, THREAD_PRIORITY_LOWEST)) - { - BOOST_LOG_CUSTOM(warning, i) << "Failed to set low priority"; - } - else - { - BOOST_LOG_CUSTOM(debug, i) << "Priority set to " << GetThreadPriority(hThread); - } + HANDLE hThread = minerThreads[i].native_handle(); + if (!SetThreadPriority(hThread, THREAD_PRIORITY_LOWEST)) + { + BOOST_LOG_CUSTOM(warning, i) << "Failed to set low priority"; + } + else + { + BOOST_LOG_CUSTOM(debug, i) << "Priority set to " << 
GetThreadPriority(hThread); + } #else - // todo: linux set low priority + // todo: linux set low priority #endif - } + } + } + + + //for ( ; ) + //{ + // + //} - // start CUDA threads - for (; i < (cpu_contexts.size() + cuda_contexts.size()); ++i) - { - minerThreadActive[i] = true; - minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread, - this, nThreads, i, *cuda_contexts.at(i - cpu_contexts.size()))); - } + // + //for (; i < (cpu_contexts.size() + cuda_contexts.size()); ++i) + //{ + // minerThreadActive[i] = true; + // minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread, + // this, nThreads, i, *cuda_contexts.at(i - cpu_contexts.size()))); + //} - // start OPENCL threads - for (; i < (cpu_contexts.size() + cuda_contexts.size() + opencl_contexts.size()); ++i) - { - minerThreadActive[i] = true; - minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread, - this, nThreads, i, *opencl_contexts.at(i - cpu_contexts.size() - cuda_contexts.size()))); - } + // + //for (; i < (cpu_contexts.size() + cuda_contexts.size() + opencl_contexts.size()); ++i) + //{ + // minerThreadActive[i] = true; + // minerThreads[i] = std::thread(boost::bind(&ZcashMinerThread, + // this, nThreads, i, *opencl_contexts.at(i - cpu_contexts.size() - cuda_contexts.size()))); + //} - /*minerThreads = new boost::thread_group(); - for (int i = 0; i < nThreads; i++) { - minerThreads->create_thread(boost::bind(&ZcashMinerThread, this, nThreads, i)); - }*/ + + ///*minerThreads = new boost::thread_group(); + //for (int i = 0; i < nThreads; i++) { + // minerThreads->create_thread(boost::bind(&ZcashMinerThread, this, nThreads, i)); + //}*/ speed.Reset(); } -template -void ZcashMiner::stop() + +void ZcashMiner::stop() { m_isActive = false; if (minerThreads) @@ -492,8 +449,8 @@ void ZcashMiner::stop() }*/ } -template -void ZcashMiner::setServerNonce(const std::string& n1str) + +void ZcashMiner::setServerNonce(const std::string& n1str) { //auto n1str = params[1].get_str(); BOOST_LOG_TRIVIAL(info) 
<< "miner | Extranonce is " << n1str; @@ -518,8 +475,8 @@ void ZcashMiner::setServerNonce(const std:: nonce2Inc <<= nonce1Bits; } -template -ZcashJob* ZcashMiner::parseJob(const Array& params) + +ZcashJob* ZcashMiner::parseJob(const Array& params) { if (params.size() < 2) { throw std::logic_error("Invalid job params"); @@ -571,59 +528,47 @@ ZcashJob* ZcashMiner::parseJob(const Array& return ret; } -template -void ZcashMiner::setJob(ZcashJob* job) + +void ZcashMiner::setJob(ZcashJob* job) { NewJob(job); } -template -void ZcashMiner::onSolutionFound( + +void ZcashMiner::onSolutionFound( const std::function callback) { solutionFoundCallback = callback; } -template -void ZcashMiner::submitSolution(const EquihashSolution& solution, const std::string& jobid) + +void ZcashMiner::submitSolution(const EquihashSolution& solution, const std::string& jobid) { solutionFoundCallback(solution, jobid); speed.AddShare(); } -template -void ZcashMiner::acceptedSolution(bool stale) + +void ZcashMiner::acceptedSolution(bool stale) { speed.AddShareOK(); } -template -void ZcashMiner::rejectedSolution(bool stale) + +void ZcashMiner::rejectedSolution(bool stale) { } -template -void ZcashMiner::failedSolution() + +void ZcashMiner::failedSolution() { } -// XMP -template class ZcashMiner; -template class ZcashMiner; -template class ZcashMiner; -template class ZcashMiner; -// Silentarmy -template class ZcashMiner; -template class ZcashMiner; -template class ZcashMiner; -template class ZcashMiner; - std::mutex benchmark_work; std::vector benchmark_nonces; std::atomic_int benchmark_solutions; -template -bool benchmark_solve_equihash(const CBlock& pblock, const char *tequihash_header, unsigned int tequihash_header_len, Solver& extra) +bool benchmark_solve_equihash(const CBlock& pblock, const char *tequihash_header, unsigned int tequihash_header_len, ISolver *solver) { benchmark_work.lock(); if (benchmark_nonces.empty()) @@ -658,24 +603,23 @@ bool benchmark_solve_equihash(const CBlock& pblock, 
const char *tequihash_header ++benchmark_solutions; }; - Solver::solve(tequihash_header, + solver->solve(tequihash_header, tequihash_header_len, (const char*)nonce->begin(), nonce->size(), []() { return false; }, solutionFound, - []() {}, - extra); + []() {} + ); delete nonce; return true; } -template -int benchmark_thread(int tid, Solver& extra) +int benchmark_thread(int tid, ISolver *solver) { - BOOST_LOG_TRIVIAL(debug) << "Thread #" << tid << " started (" << extra.getname() << ")"; + BOOST_LOG_TRIVIAL(debug) << "Thread #" << tid << " started (" << solver->getname() << ")"; try { @@ -687,7 +631,11 @@ int benchmark_thread(int tid, Solver& extra) const char *tequihash_header = (char *)&ss[0]; unsigned int tequihash_header_len = ss.size(); - while (benchmark_solve_equihash(pblock, tequihash_header, tequihash_header_len, extra)) {} + solver->start(); + + while (benchmark_solve_equihash(pblock, tequihash_header, tequihash_header_len, solver)) {} + + solver->stop(); } catch (const std::runtime_error &e) { @@ -696,15 +644,12 @@ int benchmark_thread(int tid, Solver& extra) return 0; } - BOOST_LOG_TRIVIAL(debug) << "Thread #" << tid << " ended (" << extra.getname() << ")"; + BOOST_LOG_TRIVIAL(debug) << "Thread #" << tid << " ended (" << solver->getname() << ")"; return 0; } -template -void ZcashMiner::doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) -{ +void Solvers_doBenchmark(int hashes, const std::vector &solvers) { // generate array of various nonces std::srand(std::time(0)); benchmark_nonces.push_back(new uint256()); @@ -719,77 +664,39 @@ void ZcashMiner::doBenchmark(int hashes, in size_t total_hashes = benchmark_nonces.size(); - std::vector cpu_contexts; - std::vector cuda_contexts; - std::vector opencl_contexts; - - for (int i = 0; i < cuda_count; ++i) - { - CUDASolver* context = new CUDASolver(0, cuda_en[i]); - if (cuda_b[i] > 0) - context->blocks = 
cuda_b[i]; - if (cuda_t[i] > 0) - context->threadsperblock = cuda_t[i]; - - BOOST_LOG_TRIVIAL(info) << "Benchmarking CUDA worker (" << context->getname() << ") " << context->getdevinfo(); - - CUDASolver::start(*context); // init CUDA before to get more accurate benchmark - - cuda_contexts.push_back(context); - } - - for (int i = 0; i < opencl_count; ++i) - { - if (opencl_t[i] < 1) opencl_t[i] = 1; - - for (int k = 0; k < opencl_t[i]; ++k) - { - OPENCLSolver* context = new OPENCLSolver(opencl_platf, opencl_en[i]); - - // todo: save local&global work size - - BOOST_LOG_TRIVIAL(info) << "Benchmarking OPENCL worker (" << context->getname() << ") " << context->getdevinfo(); - - OPENCLSolver::start(*context); // init OPENCL before to get more accurate benchmark - - opencl_contexts.push_back(context); + // log what is benchmarking + for (ISolver* solver : solvers) { + if (solver->GetType() == SolverType::CPU) { + BOOST_LOG_TRIVIAL(info) << "Benchmarking CPU worker (" << solver->getname() << ") " << solver->getdevinfo(); + } + else if (solver->GetType() == SolverType::CUDA) { + BOOST_LOG_TRIVIAL(info) << "Benchmarking CUDA worker (" << solver->getname() << ") " << solver->getdevinfo(); + } + else if (solver->GetType() == SolverType::OPENCL) { + BOOST_LOG_TRIVIAL(info) << "Benchmarking OPENCL worker (" << solver->getname() << ") " << solver->getdevinfo(); } } - if (cpu_threads < 0) - { - cpu_threads = std::thread::hardware_concurrency(); - if (cpu_threads < 1) cpu_threads = 1; - else if (cuda_contexts.size() + opencl_contexts.size() > 0) --cpu_threads; // decrease number of threads if there are GPU workers - } - - for (int i = 0; i < cpu_threads; ++i) - { - CPUSolver* context = new CPUSolver(); - context->use_opt = use_avx2; - BOOST_LOG_TRIVIAL(info) << "Benchmarking CPU worker (" << context->getname() << ") " << context->getdevinfo(); - CPUSolver::start(*context); - cpu_contexts.push_back(context); - } - - int nThreads = cpu_contexts.size() + cuda_contexts.size() + 
opencl_contexts.size(); - + int nThreads = solvers.size(); std::thread* bthreads = new std::thread[nThreads]; + benchmark_work.lock(); + // bind benchmark threads + for (int i = 0; i < solvers.size(); ++i) { + bthreads[i] = std::thread(boost::bind(&benchmark_thread, i, solvers[i])); + } +#ifdef WIN32 + // TODO get back to this sleep + Sleep(1000); +#else + sleep(1); +#endif + BOOST_LOG_TRIVIAL(info) << "Benchmark starting... this may take several minutes, please wait..."; + benchmark_work.unlock(); auto start = std::chrono::high_resolution_clock::now(); - int i = 0; - for ( ; i < cpu_contexts.size(); ++i) - bthreads[i] = std::thread(boost::bind(&benchmark_thread, i, *cpu_contexts.at(i))); - - for (; i < (cuda_contexts.size() + cpu_contexts.size()); ++i) - bthreads[i] = std::thread(boost::bind(&benchmark_thread, i, *cuda_contexts.at(i - cpu_contexts.size()))); - - for (; i < (opencl_contexts.size() + cuda_contexts.size() + cpu_contexts.size()); ++i) - bthreads[i] = std::thread(boost::bind(&benchmark_thread, i, *opencl_contexts.at(i - cpu_contexts.size() - cuda_contexts.size()))); - for (int i = 0; i < nThreads; ++i) bthreads[i].join(); @@ -799,25 +706,6 @@ void ZcashMiner::doBenchmark(int hashes, in size_t hashes_done = total_hashes - benchmark_nonces.size(); - for (auto it = cpu_contexts.begin(); it != cpu_contexts.end(); ++it) - { - CPUSolver::stop(**it); - delete (*it); - } - for (auto it = cuda_contexts.begin(); it != cuda_contexts.end(); ++it) - { - CUDASolver::stop(**it); - delete (*it); - } - for (auto it = opencl_contexts.begin(); it != opencl_contexts.end(); ++it) - { - OPENCLSolver::stop(**it); - delete (*it); - } - cpu_contexts.clear(); - cuda_contexts.clear(); - opencl_contexts.clear(); - BOOST_LOG_TRIVIAL(info) << "Benchmark done!"; BOOST_LOG_TRIVIAL(info) << "Total time : " << msec << " ms"; BOOST_LOG_TRIVIAL(info) << "Total iterations: " << hashes_done; @@ -825,53 +713,3 @@ void ZcashMiner::doBenchmark(int hashes, in BOOST_LOG_TRIVIAL(info) << "Speed: 
" << ((double)hashes_done * 1000 / (double)msec) << " I/s"; BOOST_LOG_TRIVIAL(info) << "Speed: " << ((double)benchmark_solutions * 1000 / (double)msec) << " Sols/s"; } - - -//void ZMinerAVX_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, -// int opencl_count, int opencl_platf, int* opencl_en) { -// ZMinerAVX::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en); -//} -// -//void ZMinerSSE2_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, -// int opencl_count, int opencl_platf, int* opencl_en) { -// ZMinerSSE2::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en); -//} - - - -// ocl_xmp -// gcc static undefined reference workaround -void ZMinerAVXCUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { - ZMinerAVXCUDA80_XMP::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); -} -void ZMinerSSE2CUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { - ZMinerSSE2CUDA80_XMP::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); -} -void ZMinerAVXCUDA75_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { - ZMinerAVXCUDA75_XMP::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); -} -void ZMinerSSE2CUDA75_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int 
opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { - ZMinerSSE2CUDA75_XMP::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); -} -// ocl_silentarmy -void ZMinerAVXCUDA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { - ZMinerAVXCUDA80_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); -} -void ZMinerSSE2CUDA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { - ZMinerSSE2CUDA80_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); -} -void ZMinerAVXCUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { - ZMinerAVXCUDA75_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); -} -void ZMinerSSE2CUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t) { - ZMinerSSE2CUDA75_SA::doBenchmark(hashes, cpu_threads, cuda_count, cuda_en, cuda_b, cuda_t, opencl_count, opencl_platf, opencl_en, opencl_t); -} - diff --git a/nheqminer/libstratum/ZcashStratum.h b/nheqminer/libstratum/ZcashStratum.h index 7b8940fbb..ad659962a 100644 --- a/nheqminer/libstratum/ZcashStratum.h +++ b/nheqminer/libstratum/ZcashStratum.h @@ -16,40 +16,7 @@ #include "json/json_spirit_value.h" -#include "SolverStub.h" - -#ifdef USE_CPU_TROMP -#include "../cpu_tromp/cpu_tromp.hpp" -#else -CREATE_SOLVER_STUB(cpu_tromp, "cpu_tromp_STUB") 
-#endif -#ifdef USE_CPU_XENONCAT -#include "../cpu_xenoncat/cpu_xenoncat.hpp" -#else -CREATE_SOLVER_STUB(cpu_xenoncat, "cpu_xenoncat_STUB") -#endif -#ifdef USE_CUDA_TROMP -#include "../cuda_tromp/cuda_tromp.hpp" - -// TODO fix this -#ifndef WIN32 -CREATE_SOLVER_STUB(cuda_tromp_75, "cuda_tromp_75_STUB") -#endif - -#else -CREATE_SOLVER_STUB(cuda_tromp, "cuda_tromp_STUB") -CREATE_SOLVER_STUB(cuda_tromp_75, "cuda_tromp_75_STUB") -#endif -#ifdef USE_OCL_XMP -#include "../ocl_xpm/ocl_xmp.hpp" -#else -CREATE_SOLVER_STUB(ocl_xmp, "ocl_xmp_STUB") -#endif -#ifdef USE_OCL_SILENTARMY -#include "../ocl_silentarmy/ocl_silentarmy.hpp" -#else -CREATE_SOLVER_STUB(ocl_silentarmy, "ocl_silentarmy_STUB") -#endif +#include "ISolver.h" using namespace json_spirit; @@ -108,7 +75,7 @@ inline bool operator==(const ZcashJob& a, const ZcashJob& b) typedef boost::signals2::signal NewJob_t; -template + class ZcashMiner { int nThreads; @@ -121,18 +88,13 @@ class ZcashMiner std::function solutionFoundCallback; bool m_isActive; - - std::vector cpu_contexts; - std::vector cuda_contexts; - std::vector opencl_contexts; - + std::vector solvers; public: NewJob_t NewJob; bool* minerThreadActive; - ZcashMiner(int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); + ZcashMiner(const std::vector &i_solvers); ~ZcashMiner(); std::string userAgent(); @@ -147,39 +109,7 @@ class ZcashMiner void acceptedSolution(bool stale); void rejectedSolution(bool stale); void failedSolution(); - - static void doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); }; -// 8 combos make sure not to go beyond this -// ocl_xmp -typedef ZcashMiner ZMinerAVXCUDA80_XMP; -typedef ZcashMiner ZMinerSSE2CUDA80_XMP; -typedef ZcashMiner ZMinerAVXCUDA75_XMP; -typedef ZcashMiner ZMinerSSE2CUDA75_XMP; -// ocl_silentarmy -typedef ZcashMiner 
ZMinerAVXCUDA80_SA; -typedef ZcashMiner ZMinerSSE2CUDA80_SA; -typedef ZcashMiner ZMinerAVXCUDA75_SA; -typedef ZcashMiner ZMinerSSE2CUDA75_SA; - -// ocl_xmp -// gcc static undefined reference workaround -void ZMinerAVXCUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); -void ZMinerSSE2CUDA80_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); -void ZMinerAVXCUDA75_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); -void ZMinerSSE2CUDA75_XMP_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); -// ocl_silentarmy -void ZMinerAVXCUDA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); -void ZMinerSSE2CUDA80_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); -void ZMinerAVXCUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); -void ZMinerSSE2CUDA75_SA_doBenchmark(int hashes, int cpu_threads, int cuda_count, int* cuda_en, int* cuda_b, int* cuda_t, - int opencl_count, int opencl_platf, int* opencl_en, int* opencl_t); +void Solvers_doBenchmark(int hashes, const std::vector &solvers); + diff --git a/nheqminer/main.cpp b/nheqminer/main.cpp index ce818dae9..6f7a230c5 100644 --- a/nheqminer/main.cpp +++ b/nheqminer/main.cpp @@ -5,6 +5,8 @@ #include 
"primitives/block.h" #include "streams.h" +#include "MinerFactory.h" + #include "libstratum/StratumClient.h" #if defined(USE_OCL_XMP) || defined(USE_OCL_SILENTARMY) @@ -43,36 +45,27 @@ namespace keywords = boost::log::keywords; #endif // TODO: -// fix compiler issues with standard vs2013 compiler -// file logging -// mingw compilation for windows (faster?) +// #1 file logging +// #2 mingw compilation for windows (faster?) +// #3 benchmark accuracy fix: first wait for solvers to init and then measure speed +// #4 Linux fix cmake to generate all in one binary (just like Windows) +// #5 after #4 is done add solver chooser for CPU and CUDA devices (general and per device), example: [-s 0 automatic, -s 1 solver1, -s 2 solver2, ...] int use_avx = 0; int use_avx2 = 0; int use_old_cuda = 0; int use_old_xmp = 0; -// _XMP -static ZcashStratumClientAVXCUDA80_XMP* scSigAVXC80_XMP = nullptr; -static ZcashStratumClientSSE2CUDA80_XMP* scSigSSE2C80_XMP = nullptr; -static ZcashStratumClientAVXCUDA75_XMP* scSigAVXC75_XMP = nullptr; -static ZcashStratumClientSSE2CUDA75_XMP* scSigSSE2C75_XMP = nullptr; -// _SA -static ZcashStratumClientAVXCUDA80_SA* scSigAVXC80_SA = nullptr; -static ZcashStratumClientSSE2CUDA80_SA* scSigSSE2C80_SA = nullptr; -static ZcashStratumClientAVXCUDA75_SA* scSigAVXC75_SA = nullptr; -static ZcashStratumClientSSE2CUDA75_SA* scSigSSE2C75_SA = nullptr; +// TODO move somwhere else +MinerFactory *_MinerFactory = nullptr; + +// stratum client sig +static ZcashStratumClient* scSig = nullptr; extern "C" void stratum_sigint_handler(int signum) { - if (scSigAVXC80_XMP) scSigAVXC80_XMP->disconnect(); - if (scSigSSE2C80_XMP) scSigSSE2C80_XMP->disconnect(); - if (scSigAVXC75_XMP) scSigAVXC75_XMP->disconnect(); - if (scSigSSE2C75_XMP) scSigSSE2C75_XMP->disconnect(); - if (scSigAVXC80_SA) scSigAVXC80_SA->disconnect(); - if (scSigSSE2C80_SA) scSigSSE2C80_SA->disconnect(); - if (scSigAVXC75_SA) scSigAVXC75_SA->disconnect(); - if (scSigSSE2C75_SA) scSigSSE2C75_SA->disconnect(); 
+ if (scSig) scSig->disconnect(); + if (_MinerFactory) _MinerFactory->ClearAllSolvers(); } void print_help() @@ -96,28 +89,33 @@ void print_help() std::cout << std::endl; std::cout << "NVIDIA CUDA settings" << std::endl; std::cout << "\t-ci\t\tCUDA info" << std::endl; - std::cout << "\t-cv [ver]\tSet CUDA version (0 = default 8.0, 1 = 7.5)" << std::endl; + std::cout << "\t-cv [ver]\tSet CUDA solver (0 = djeZo, 1 = tromp)" << std::endl; std::cout << "\t-cd [devices]\tEnable CUDA mining on spec. devices" << std::endl; std::cout << "\t-cb [blocks]\tNumber of blocks" << std::endl; std::cout << "\t-ct [tpb]\tNumber of threads per block" << std::endl; std::cout << "Example: -cd 0 2 -cb 12 16 -ct 64 128" << std::endl; std::cout << std::endl; - std::cout << "OpenCL settings" << std::endl; - std::cout << "\t-oi\t\tOpenCL info" << std::endl; - std::cout << "\t-ov [ver]\tSet OpenCL solver (0 = silentarmy, 1 = xmp)" << std::endl; - std::cout << "\t-op [devices]\tSet OpenCL platform to selecd platform devices (-od)" << std::endl; - std::cout << "\t-od [devices]\tEnable OpenCL mining on spec. devices (specify plafrom number first -op)" << std::endl; - std::cout << "\t-ot [threads]\tSet number of threads per device" << std::endl; - //std::cout << "\t-cb [blocks]\tNumber of blocks" << std::endl; - //std::cout << "\t-ct [tpb]\tNumber of threads per block" << std::endl; - std::cout << "Example: -op 2 -od 0 2" << std::endl; //-cb 12 16 -ct 64 128" << std::endl; + //std::cout << "OpenCL settings" << std::endl; + //std::cout << "\t-oi\t\tOpenCL info" << std::endl; + //std::cout << "\t-ov [ver]\tSet OpenCL solver (0 = silentarmy, 1 = xmp)" << std::endl; + //std::cout << "\t-op [platf]\tSet OpenCL platform to selecd platform devices (-od)" << std::endl; + //std::cout << "\t-od [devices]\tEnable OpenCL mining on spec. 
devices (specify plafrom number first -op)" << std::endl; + //std::cout << "\t-ot [threads]\tSet number of threads per device" << std::endl; + ////std::cout << "\t-cb [blocks]\tNumber of blocks" << std::endl; + ////std::cout << "\t-ct [tpb]\tNumber of threads per block" << std::endl; + //std::cout << "Example: -op 2 -od 0 2" << std::endl; //-cb 12 16 -ct 64 128" << std::endl; std::cout << std::endl; } void print_cuda_info() { - int num_devices = cuda_tromp::getcount(); +#if defined(USE_CUDA_DJEZO) || defined(USE_CUDA_TROMP) +#ifdef USE_CUDA_DJEZO + int num_devices = cuda_djezo::getcount(); +#elif USE_CUDA_TROMP + int num_devices = cuda_tromp::getcount(); +#endif std::cout << "Number of CUDA devices found: " << num_devices << std::endl; @@ -125,9 +123,14 @@ void print_cuda_info() { std::string gpuname, version; int smcount; - cuda_tromp::getinfo(0, i, gpuname, smcount, version); +#ifdef USE_CUDA_DJEZO + cuda_djezo::getinfo(0, i, gpuname, smcount, version); +#elif USE_CUDA_TROMP + cuda_tromp::getinfo(0, i, gpuname, smcount, version); +#endif std::cout << "\t#" << i << " " << gpuname << " | SM version: " << version << " | SM count: " << smcount << std::endl; } +#endif } void print_opencl_info() { @@ -136,13 +139,14 @@ void print_opencl_info() { #endif } +#define MAX_INSTANCES 8 * 2 -int cuda_enabled[8] = { 0 }; -int cuda_blocks[8] = { 0 }; -int cuda_tpb[8] = { 0 }; +int cuda_enabled[MAX_INSTANCES] = { 0 }; +int cuda_blocks[MAX_INSTANCES] = { 0 }; +int cuda_tpb[MAX_INSTANCES] = { 0 }; -int opencl_enabled[8] = { 0 }; -int opencl_threads[8] = { 0 }; +int opencl_enabled[MAX_INSTANCES] = { 0 }; +int opencl_threads[MAX_INSTANCES] = { 0 }; // todo: opencl local and global worksize @@ -180,10 +184,10 @@ void detect_AVX_and_AVX2() } } -template -void start_mining(int api_port, int cpu_threads, int cuda_device_count, int opencl_device_count, int opencl_platform, - const std::string& host, const std::string& port, const std::string& user, const std::string& password, - 
StratumType* handler) + +void start_mining(int api_port, const std::string& host, const std::string& port, + const std::string& user, const std::string& password, + ZcashStratumClient* handler, const std::vector &i_solvers) { std::shared_ptr io_service(new boost::asio::io_service); @@ -197,9 +201,9 @@ void start_mining(int api_port, int cpu_threads, int cuda_device_count, int open api = nullptr; } } - - MinerType miner(cpu_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); - StratumType sc{ + + ZcashMiner miner(i_solvers); + ZcashStratumClient sc{ io_service, &miner, host, port, user, password, 0, 0 }; @@ -241,16 +245,15 @@ int main(int argc, char* argv[]) std::cout << "\t==================== www.nicehash.com ====================" << std::endl; std::cout << "\t\tEquihash CPU&GPU Miner for NiceHash v" STANDALONE_MINER_VERSION << std::endl; std::cout << "\tThanks to Zcash developers for providing base of the code." << std::endl; - std::cout << "\t Special thanks to tromp, xenoncat, mbevand "<< std::endl; - std::cout << "\t and eXtremal-ik7 for providing " << std::endl; - std::cout << "\t optimized CPU, CUDA and AMD equihash solvers." << std::endl; + std::cout << "\t Special thanks to tromp, xenoncat and djeZo for providing "<< std::endl; + std::cout << "\t optimized CPU and CUDA equihash solvers." 
<< std::endl; std::cout << "\t==================== www.nicehash.com ====================" << std::endl; std::cout << std::endl; std::string location = "equihash.eu.nicehash.com:3357"; - std::string user = ""; + std::string user = "34HKWdzLxWBduUfJE9JxaFhoXnfC6gmePG"; std::string password = "x"; - int num_threads = -1; + int num_threads = 0; bool benchmark = false; int log_level = 2; int num_hashes = 200; @@ -280,7 +283,7 @@ int main(int argc, char* argv[]) use_old_cuda = atoi(argv[++i]); break; case 'd': - while (cuda_device_count < 8 && i + 1 < argc) + while (cuda_device_count < MAX_INSTANCES && i + 1 < argc) { try { @@ -295,7 +298,7 @@ int main(int argc, char* argv[]) } break; case 'b': - while (cuda_bc < 8 && i + 1 < argc) + while (cuda_bc < MAX_INSTANCES && i + 1 < argc) { try { @@ -310,7 +313,7 @@ int main(int argc, char* argv[]) } break; case 't': - while (cuda_tbpc < 8 && i + 1 < argc) + while (cuda_tbpc < MAX_INSTANCES && i + 1 < argc) { try { @@ -327,53 +330,53 @@ int main(int argc, char* argv[]) } break; } - case 'o': - { - switch (argv[i][2]) - { - case 'i': - print_opencl_info(); - return 0; - case 'v': - use_old_xmp = atoi(argv[++i]); - break; - case 'p': - opencl_platform = std::stol(argv[++i]); - break; - case 'd': - while (opencl_device_count < 8 && i + 1 < argc) - { - try - { - opencl_enabled[opencl_device_count] = std::stol(argv[++i]); - ++opencl_device_count; - } - catch (...) - { - --i; - break; - } - } - break; - case 't': - while (opencl_t < 8 && i + 1 < argc) - { - try - { - opencl_threads[opencl_t] = std::stol(argv[++i]); - ++opencl_t; - } - catch (...) 
- { - --i; - break; - } - } - break; - // TODO extra parameters for OpenCL - } - break; - } + //case 'o': + //{ + // switch (argv[i][2]) + // { + // case 'i': + // print_opencl_info(); + // return 0; + // case 'v': + // use_old_xmp = atoi(argv[++i]); + // break; + // case 'p': + // opencl_platform = std::stol(argv[++i]); + // break; + // case 'd': + // while (opencl_device_count < 8 && i + 1 < argc) + // { + // try + // { + // opencl_enabled[opencl_device_count] = std::stol(argv[++i]); + // ++opencl_device_count; + // } + // catch (...) + // { + // --i; + // break; + // } + // } + // break; + // case 't': + // while (opencl_t < 8 && i + 1 < argc) + // { + // try + // { + // opencl_threads[opencl_t] = std::stol(argv[++i]); + // ++opencl_t; + // } + // catch (...) + // { + // --i; + // break; + // } + // } + // break; + // // TODO extra parameters for OpenCL + // } + // break; + //} case 'l': location = argv[++i]; break; @@ -445,6 +448,7 @@ int main(int argc, char* argv[]) try { + _MinerFactory = new MinerFactory(use_avx == 1, use_old_cuda == 0, use_old_xmp == 0); if (!benchmark) { if (user.length() == 0) @@ -457,97 +461,14 @@ int main(int argc, char* argv[]) std::string host = delim != std::string::npos ? location.substr(0, delim) : location; std::string port = delim != std::string::npos ? 
location.substr(delim + 1) : "2142"; - if (use_old_xmp) { - if (use_avx) - { - if (use_old_cuda) - { - start_mining(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform, - host, port, user, password, scSigAVXC75_XMP); - } - else - { - start_mining(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform, - host, port, user, password, scSigAVXC80_XMP); - } - } - else - { - if (use_old_cuda) - { - start_mining(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform, - host, port, user, password, scSigSSE2C75_XMP); - } - else - { - start_mining(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform, - host, port, user, password, scSigSSE2C80_XMP); - } - } - } - else { // sarmy - if (use_avx) - { - if (use_old_cuda) - { - start_mining(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform, - host, port, user, password, scSigAVXC75_SA); - } - else - { - start_mining(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform, - host, port, user, password, scSigAVXC80_SA); - } - } - else - { - if (use_old_cuda) - { - start_mining(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform, - host, port, user, password, scSigSSE2C75_SA); - } - else - { - start_mining(api_port, num_threads, cuda_device_count, opencl_device_count, opencl_platform, - host, port, user, password, scSigSSE2C80_SA); - } - } - } + start_mining(api_port, host, port, user, password, + scSig, + _MinerFactory->GenerateSolvers(num_threads, cuda_device_count, cuda_enabled, cuda_blocks, + cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads)); } else { - if (use_old_xmp) { - if (use_avx) - { - if (use_old_cuda) - ZMinerAVXCUDA75_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); - else - 
ZMinerAVXCUDA80_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); - } - else - { - if (use_old_cuda) - ZMinerSSE2CUDA75_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); - else - ZMinerSSE2CUDA80_XMP_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); - } - } - else { // sarmy - if (use_avx) - { - if (use_old_cuda) - ZMinerAVXCUDA75_SA_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); - else - ZMinerAVXCUDA80_SA_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); - } - else - { - if (use_old_cuda) - ZMinerSSE2CUDA75_SA_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); - else - ZMinerSSE2CUDA80_SA_doBenchmark(num_hashes, num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads); - } - } + Solvers_doBenchmark(num_hashes, _MinerFactory->GenerateSolvers(num_threads, cuda_device_count, cuda_enabled, cuda_blocks, cuda_tpb, opencl_device_count, opencl_platform, opencl_enabled, opencl_threads)); } } catch (std::runtime_error& er) diff --git a/nheqminer/nheqminer.sln b/nheqminer/nheqminer.sln index 0359f7bac..42fcd0e06 100644 --- a/nheqminer/nheqminer.sln +++ b/nheqminer/nheqminer.sln @@ -8,7 +8,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nheqminer", "nheqminer.vcxp {299E011B-5242-4EDA-B2F2-73C9B48F12FD} = 
{299E011B-5242-4EDA-B2F2-73C9B48F12FD} {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B} = {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B} {33C2B469-F025-4223-B9B6-E69D42FEA7D6} = {33C2B469-F025-4223-B9B6-E69D42FEA7D6} - {5EC9EDEB-8E49-4126-9161-1560683CBC71} = {5EC9EDEB-8E49-4126-9161-1560683CBC71} EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_tromp", "..\cuda_tromp\cuda_tromp.vcxproj", "{33C2B469-F025-4223-B9B6-E69D42FEA7D6}" @@ -17,130 +16,63 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cpu_xenoncat", "..\cpu_xeno EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cpu_tromp", "..\cpu_tromp\cpu_tromp.vcxproj", "{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}" EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_xpm", "..\ocl_xpm\ocl_xpm.vcxproj", "{5EC9EDEB-8E49-4126-9161-1560683CBC71}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_device_utils", "..\ocl_device_utils\ocl_device_utils.vcxproj", "{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ocl_silentarmy", "..\ocl_silentarmy\ocl_silentarmy.vcxproj", "{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuda_djezo", "..\cuda_djezo\cuda_djezo.vcxproj", "{268B10AD-D845-498B-8663-AB8911CA2039}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Mixed Platforms = Debug|Mixed Platforms Debug|Win32 = Debug|Win32 Debug|x64 = Debug|x64 - Release|Mixed Platforms = Release|Mixed Platforms Release|Win32 = Release|Win32 Release|x64 = Release|x64 - ReleaseSlow|Mixed Platforms = ReleaseSlow|Mixed Platforms ReleaseSlow|Win32 = ReleaseSlow|Win32 ReleaseSlow|x64 = ReleaseSlow|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution - {6FF7D209-05A3-4550-93CC-211D33503719}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 - {6FF7D209-05A3-4550-93CC-211D33503719}.Debug|Mixed Platforms.Build.0 = Debug|x64 
{6FF7D209-05A3-4550-93CC-211D33503719}.Debug|Win32.ActiveCfg = Debug|x64 {6FF7D209-05A3-4550-93CC-211D33503719}.Debug|x64.ActiveCfg = Debug|x64 {6FF7D209-05A3-4550-93CC-211D33503719}.Debug|x64.Build.0 = Debug|x64 - {6FF7D209-05A3-4550-93CC-211D33503719}.Release|Mixed Platforms.ActiveCfg = Release|x64 - {6FF7D209-05A3-4550-93CC-211D33503719}.Release|Mixed Platforms.Build.0 = Release|x64 {6FF7D209-05A3-4550-93CC-211D33503719}.Release|Win32.ActiveCfg = Release|x64 {6FF7D209-05A3-4550-93CC-211D33503719}.Release|x64.ActiveCfg = Release|x64 {6FF7D209-05A3-4550-93CC-211D33503719}.Release|x64.Build.0 = Release|x64 - {6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|x64 - {6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64 {6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|Win32.ActiveCfg = Release|x64 {6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|x64.ActiveCfg = Release|x64 {6FF7D209-05A3-4550-93CC-211D33503719}.ReleaseSlow|x64.Build.0 = Release|x64 - {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 - {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|Mixed Platforms.Build.0 = Debug|x64 {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|Win32.ActiveCfg = Debug|x64 {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|x64.ActiveCfg = Debug|x64 {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Debug|x64.Build.0 = Debug|x64 - {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|Mixed Platforms.ActiveCfg = Release|x64 - {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|Mixed Platforms.Build.0 = Release|x64 {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|Win32.ActiveCfg = Release|x64 {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|x64.ActiveCfg = Release|x64 {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.Release|x64.Build.0 = Release|x64 - {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release7.5|x64 - {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|Mixed Platforms.Build.0 = 
Release7.5|x64 {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|Win32.ActiveCfg = Release7.5|x64 {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|x64.ActiveCfg = Release7.5|x64 {33C2B469-F025-4223-B9B6-E69D42FEA7D6}.ReleaseSlow|x64.Build.0 = Release7.5|x64 - {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 - {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|Mixed Platforms.Build.0 = Debug|x64 {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|Win32.ActiveCfg = Debug|x64 {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|x64.ActiveCfg = Debug|x64 {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Debug|x64.Build.0 = Debug|x64 - {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|Mixed Platforms.ActiveCfg = Release|x64 - {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|Mixed Platforms.Build.0 = Release|x64 {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|Win32.ActiveCfg = Release|x64 {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|x64.ActiveCfg = Release|x64 {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.Release|x64.Build.0 = Release|x64 - {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|x64 - {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64 {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|Win32.ActiveCfg = Release|x64 {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|x64.ActiveCfg = Release|x64 {299E011B-5242-4EDA-B2F2-73C9B48F12FD}.ReleaseSlow|x64.Build.0 = Release|x64 - {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 - {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|Mixed Platforms.Build.0 = Debug|x64 {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|Win32.ActiveCfg = Debug|x64 {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|x64.ActiveCfg = Debug|x64 {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Debug|x64.Build.0 = Debug|x64 - {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|Mixed Platforms.ActiveCfg = Release|x64 - {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|Mixed 
Platforms.Build.0 = Release|x64 {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|Win32.ActiveCfg = Release|x64 {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|x64.ActiveCfg = Release|x64 {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.Release|x64.Build.0 = Release|x64 - {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|Mixed Platforms.ActiveCfg = ReleaseSSE2|x64 - {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|Mixed Platforms.Build.0 = ReleaseSSE2|x64 {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|Win32.ActiveCfg = ReleaseSSE2|x64 {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|x64.ActiveCfg = ReleaseSSE2|x64 {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}.ReleaseSlow|x64.Build.0 = ReleaseSSE2|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|Mixed Platforms.Build.0 = Debug|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|Win32.ActiveCfg = Debug|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|x64.ActiveCfg = Debug|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Debug|x64.Build.0 = Debug|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|Mixed Platforms.ActiveCfg = Release|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|Mixed Platforms.Build.0 = Release|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|Win32.ActiveCfg = Release|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|x64.ActiveCfg = Release|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.Release|x64.Build.0 = Release|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|Win32.ActiveCfg = Release|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|x64.ActiveCfg = Release|x64 - {5EC9EDEB-8E49-4126-9161-1560683CBC71}.ReleaseSlow|x64.Build.0 = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|Mixed Platforms.ActiveCfg = 
Debug|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|Mixed Platforms.Build.0 = Debug|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|Win32.ActiveCfg = Debug|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|x64.ActiveCfg = Debug|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Debug|x64.Build.0 = Debug|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|Mixed Platforms.ActiveCfg = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|Mixed Platforms.Build.0 = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|Win32.ActiveCfg = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|x64.ActiveCfg = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.Release|x64.Build.0 = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|Mixed Platforms.ActiveCfg = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|Win32.ActiveCfg = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|x64.ActiveCfg = Release|x64 - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135}.ReleaseSlow|x64.Build.0 = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|Mixed Platforms.ActiveCfg = Debug|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|Mixed Platforms.Build.0 = Debug|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|Win32.ActiveCfg = Debug|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|x64.ActiveCfg = Debug|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Debug|x64.Build.0 = Debug|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|Mixed Platforms.ActiveCfg = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|Mixed Platforms.Build.0 = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|Win32.ActiveCfg = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|x64.ActiveCfg = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.Release|x64.Build.0 = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|Mixed 
Platforms.ActiveCfg = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|Mixed Platforms.Build.0 = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|Win32.ActiveCfg = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|x64.ActiveCfg = Release|x64 - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48}.ReleaseSlow|x64.Build.0 = Release|x64 + {268B10AD-D845-498B-8663-AB8911CA2039}.Debug|Win32.ActiveCfg = Debug|x64 + {268B10AD-D845-498B-8663-AB8911CA2039}.Debug|x64.ActiveCfg = Debug|x64 + {268B10AD-D845-498B-8663-AB8911CA2039}.Debug|x64.Build.0 = Debug|x64 + {268B10AD-D845-498B-8663-AB8911CA2039}.Release|Win32.ActiveCfg = Release|x64 + {268B10AD-D845-498B-8663-AB8911CA2039}.Release|x64.ActiveCfg = Release|x64 + {268B10AD-D845-498B-8663-AB8911CA2039}.Release|x64.Build.0 = Release|x64 + {268B10AD-D845-498B-8663-AB8911CA2039}.ReleaseSlow|Win32.ActiveCfg = Release|x64 + {268B10AD-D845-498B-8663-AB8911CA2039}.ReleaseSlow|x64.ActiveCfg = Release|x64 + {268B10AD-D845-498B-8663-AB8911CA2039}.ReleaseSlow|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/nheqminer/nheqminer.vcxproj b/nheqminer/nheqminer.vcxproj index 032382760..f37bb361f 100644 --- a/nheqminer/nheqminer.vcxproj +++ b/nheqminer/nheqminer.vcxproj @@ -84,7 +84,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_OCL_XMP;USE_OCL_SILENTARMY;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_CUDA_DJEZO;%(PreprocessorDefinitions) NotSet -D_WIN32_WINNT=0x0601 %(AdditionalOptions) 4068;4996;4503;4267;4180;4290;4244;4800;4334;4251 @@ -96,8 +96,8 @@ true true true - cuda_tromp.lib;cuda_tromp_75.lib;cpu_xenoncat.lib;cpu_tromp_SSE2.lib;cpu_tromp_AVX.lib;ocl_device_utils.lib;ocl_xpm.lib;ocl_silentarmy.lib;OpenCL.lib - 
.\trompequihash\pthreads\x64;..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\;%(AdditionalLibraryDirectories) + cuda_djezo.lib;cuda_tromp.lib;cuda_tromp_75.lib;cpu_xenoncat.lib;cpu_tromp_SSE2.lib;cpu_tromp_AVX.lib + .\trompequihash\pthreads\x64;..\3rdparty\libs\win64;%(AdditionalLibraryDirectories) @@ -105,11 +105,13 @@ + + @@ -126,10 +128,13 @@ + + + @@ -158,6 +163,7 @@ + diff --git a/nheqminer/nheqminer.vcxproj.filters b/nheqminer/nheqminer.vcxproj.filters index 80dca38c7..92d63fe46 100644 --- a/nheqminer/nheqminer.vcxproj.filters +++ b/nheqminer/nheqminer.vcxproj.filters @@ -167,6 +167,21 @@ Header Files + + Header Files\solvers + + + Header Files + + + Header Files + + + Header Files + + + Header Files + @@ -211,5 +226,8 @@ Source Files\stuff + + Source Files + \ No newline at end of file diff --git a/nheqminer/speed.hpp b/nheqminer/speed.hpp index b758a1a2e..beeea05fd 100644 --- a/nheqminer/speed.hpp +++ b/nheqminer/speed.hpp @@ -1,6 +1,6 @@ #pragma once -#define INTERVAL_SECONDS 300 // 5 minutes +#define INTERVAL_SECONDS 15 // 15 seconds class Speed { diff --git a/nheqminer/version.h b/nheqminer/version.h index 716eb2bae..af4c23e33 100644 --- a/nheqminer/version.h +++ b/nheqminer/version.h @@ -34,7 +34,7 @@ static const int BIP0031_VERSION = 60000; //! 
"mempool" command, enhanced "getdata" behavior starts with this version static const int MEMPOOL_GD_VERSION = 60002; -#define STANDALONE_MINER_VERSION "0.4b" +#define STANDALONE_MINER_VERSION "0.5c" // uncomment to use with ZCash address //#define ZCASH_POOL diff --git a/ocl_device_utils/OpenCLDevice.h b/ocl_device_utils/OpenCLDevice.h deleted file mode 100644 index ab0f5f437..000000000 --- a/ocl_device_utils/OpenCLDevice.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include - -// This will list OpenCL devices, but AMD will only have aditional BusID -struct OpenCLDevice { - unsigned int DeviceID; - std::string _CL_DEVICE_NAME; - std::string _CL_DEVICE_TYPE; - unsigned long long _CL_DEVICE_GLOBAL_MEM_SIZE; - std::string _CL_DEVICE_VENDOR; - std::string _CL_DEVICE_VERSION; - std::string _CL_DRIVER_VERSION; -}; - diff --git a/ocl_device_utils/cl_ext.hpp b/ocl_device_utils/cl_ext.hpp deleted file mode 100644 index 507598171..000000000 --- a/ocl_device_utils/cl_ext.hpp +++ /dev/null @@ -1,12355 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2008-2013 The Khronos Group Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and/or associated documentation files (the -* "Materials"), to deal in the Materials without restriction, including -* without limitation the rights to use, copy, modify, merge, publish, -* distribute, sublicense, and/or sell copies of the Materials, and to -* permit persons to whom the Materials are furnished to do so, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Materials. -* -* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. -******************************************************************************/ - -/*! \file -* -* \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and -* OpenCL 1.2 (rev 15) -* \author Benedict R. Gaster, Laurent Morichetti and Lee Howes -* -* Additions and fixes from: -* Brian Cole, March 3rd 2010 and April 2012 -* Matt Gruenke, April 2012. -* Bruce Merry, February 2013. -* -* \version 1.2.5 -* \date June 2013 -* -* Optional extension support -* -* cl -* cl_ext_device_fission -* #define USE_CL_DEVICE_FISSION -*/ - -/*! \mainpage -* \section intro Introduction -* For many large applications C++ is the language of choice and so it seems -* reasonable to define C++ bindings for OpenCL. -* -* -* The interface is contained with a single C++ header file \em cl.hpp and all -* definitions are contained within the namespace \em cl. There is no additional -* requirement to include \em cl.h and to use either the C++ or original C -* bindings it is enough to simply include \em cl.hpp. -* -* The bindings themselves are lightweight and correspond closely to the -* underlying C API. Using the C++ bindings introduces no additional execution -* overhead. -* -* For detail documentation on the bindings see: -* -* The OpenCL C++ Wrapper API 1.2 (revision 09) -* http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf -* -* \section example Example -* -* The following example shows a general use case for the C++ -* bindings, including support for the optional exception feature and -* also the supplied vector and string classes, see following sections for -* decriptions of these features. 
-* -* \code -* #define __CL_ENABLE_EXCEPTIONS -* -* #if defined(__APPLE__) || defined(__MACOSX) -* #include -* #else -* #include -* #endif -* #include -* #include -* #include -* -* const char * helloStr = "__kernel void " -* "hello(void) " -* "{ " -* " " -* "} "; -* -* int -* main(void) -* { -* cl_int err = CL_SUCCESS; -* try { -* -* std::vector platforms; -* cl::Platform::get(&platforms); -* if (platforms.size() == 0) { -* std::cout << "Platform size 0\n"; -* return -1; -* } -* -* cl_context_properties properties[] = -* { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0}; -* cl::Context context(CL_DEVICE_TYPE_CPU, properties); -* -* std::vector devices = context.getInfo(); -* -* cl::Program::Sources source(1, -* std::make_pair(helloStr,strlen(helloStr))); -* cl::Program program_ = cl::Program(context, source); -* program_.build(devices); -* -* cl::Kernel kernel(program_, "hello", &err); -* -* cl::Event event; -* cl::CommandQueue queue(context, devices[0], 0, &err); -* queue.enqueueNDRangeKernel( -* kernel, -* cl::NullRange, -* cl::NDRange(4,4), -* cl::NullRange, -* NULL, -* &event); -* -* event.wait(); -* } -* catch (cl::Error err) { -* std::cerr -* << "ERROR: " -* << err.what() -* << "(" -* << err.err() -* << ")" -* << std::endl; -* } -* -* return EXIT_SUCCESS; -* } -* -* \endcode -* -*/ -#ifndef CL_HPP_ -#define CL_HPP_ - -#ifdef _WIN32 - -#include -#include -#include -#include - -#if defined(__CL_ENABLE_EXCEPTIONS) -#include -#endif // #if defined(__CL_ENABLE_EXCEPTIONS) - -#pragma push_macro("max") -#undef max -#if defined(USE_DX_INTEROP) -#include -#include -#endif -#endif // _WIN32 - -// -#if defined(USE_CL_DEVICE_FISSION) -#include // AMD topology not needed here -#endif - -#if defined(__APPLE__) || defined(__MACOSX) -#include -#include -#include -#else -#include -#include -#endif // !__APPLE__ - -// To avoid accidentally taking ownership of core OpenCL types -// such as cl_kernel constructors are made explicit -// under OpenCL 1.2 -#if 
defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -#define __CL_EXPLICIT_CONSTRUCTORS explicit -#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -#define __CL_EXPLICIT_CONSTRUCTORS -#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - -// Define deprecated prefixes and suffixes to ensure compilation -// in case they are not pre-defined -#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) -#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) -#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED) -#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) - -#if !defined(CL_CALLBACK) -#define CL_CALLBACK -#endif //CL_CALLBACK - -#include -#include - -#if !defined(__NO_STD_VECTOR) -#include -#endif - -#if !defined(__NO_STD_STRING) -#include -#endif - -#if defined(__linux__) || defined(__APPLE__) || defined(__MACOSX) -#include - -#include -#include -#endif // __linux__ - -#include - - -/*! \namespace cl -* -* \brief The OpenCL C++ bindings are defined within this namespace. -* -*/ -namespace cl { - - class Memory; - - /** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) -#define __INIT_CL_EXT_FCN_PTR(name) \ - if(!pfn_##name) { \ - pfn_##name = (PFN_##name) \ - clGetExtensionFunctionAddress(#name); \ - if(!pfn_##name) { \ - } \ - } -#endif // #if defined(CL_VERSION_1_1) - -#if defined(CL_VERSION_1_2) -#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \ - if(!pfn_##name) { \ - pfn_##name = (PFN_##name) \ - clGetExtensionFunctionAddressForPlatform(platform, #name); \ - if(!pfn_##name) { \ - } \ - } -#endif // #if defined(CL_VERSION_1_1) - - class Program; - class Device; - class Context; - class CommandQueue; - class Memory; - class Buffer; - -#if defined(__CL_ENABLE_EXCEPTIONS) - /*! 
\brief Exception class - * - * This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined. - */ - class Error : public std::exception - { - private: - cl_int err_; - const char * errStr_; - public: - /*! \brief Create a new CL error exception for a given error code - * and corresponding message. - * - * \param err error code value. - * - * \param errStr a descriptive string that must remain in scope until - * handling of the exception has concluded. If set, it - * will be returned by what(). - */ - Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr) - {} - - ~Error() throw() {} - - /*! \brief Get error string associated with exception - * - * \return A memory pointer to the error message string. - */ - virtual const char * what() const throw () - { - if (errStr_ == NULL) { - return "empty"; - } - else { - return errStr_; - } - } - - /*! \brief Get error code associated with exception - * - * \return The error code. - */ - cl_int err(void) const { return err_; } - }; - -#define __ERR_STR(x) #x -#else -#define __ERR_STR(x) NULL -#endif // __CL_ENABLE_EXCEPTIONS - - - namespace detail - { -#if defined(__CL_ENABLE_EXCEPTIONS) - static inline cl_int errHandler( - cl_int err, - const char * errStr = NULL) - { - if (err != CL_SUCCESS) { - throw Error(err, errStr); - } - return err; - } -#else - static inline cl_int errHandler(cl_int err, const char * errStr = NULL) - { - (void)errStr; // suppress unused variable warning - return err; - } -#endif // __CL_ENABLE_EXCEPTIONS - } - - - - //! 
\cond DOXYGEN_DETAIL -#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) -#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo) -#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo) -#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs) -#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs) -#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo) -#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo) -#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo) -#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo) -#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo) -#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo) -#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo) -#if defined(CL_VERSION_1_2) -#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo) -#endif // #if defined(CL_VERSION_1_2) -#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo) -#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo) -#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo) -#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo) - -#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext) -#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType) -#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats) - -#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer) -#define __COPY_ERR __ERR_STR(cl::copy) -#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer) -#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) -#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) -#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo) -#if defined(CL_VERSION_1_2) -#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage) -#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture) -#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions) -#endif // #if defined(CL_VERSION_1_2) -#define 
__CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler) -#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback) - -#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent) -#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus) -#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback) -#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents) - -#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel) -#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg) -#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource) -#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary) -#if defined(CL_VERSION_1_2) -#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels) -#endif // #if defined(CL_VERSION_1_2) -#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram) -#if defined(CL_VERSION_1_2) -#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram) - -#endif // #if defined(CL_VERSION_1_2) -#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram) - -#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue) -#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty) -#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer) -#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect) -#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer) -#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect) -#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer) -#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect) -#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer) -#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage) -#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage) -#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage) -#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage) 
-#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer) -#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage) -#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer) -#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage) -#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject) -#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel) -#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask) -#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel) -#if defined(CL_VERSION_1_2) -#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects) -#endif // #if defined(CL_VERSION_1_2) - -#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects) -#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects) - - -#define __RETAIN_ERR __ERR_STR(Retain Object) -#define __RELEASE_ERR __ERR_STR(Release Object) -#define __FLUSH_ERR __ERR_STR(clFlush) -#define __FINISH_ERR __ERR_STR(clFinish) -#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error) - - /** - * CL 1.2 version that uses device fission. 
- */ -#if defined(CL_VERSION_1_2) -#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices) -#else -#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT) -#endif // #if defined(CL_VERSION_1_2) - - /** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) -#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker) -#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents) -#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier) -#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler) -#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D) -#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D) -#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D) -#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D) -#endif // #if defined(CL_VERSION_1_1) - -#endif // __CL_USER_OVERRIDE_ERROR_STRINGS - //! \endcond - - /** - * CL 1.2 marker and barrier commands - */ -#if defined(CL_VERSION_1_2) -#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList) -#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList) -#endif // #if defined(CL_VERSION_1_2) - -#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING) - typedef std::string STRING_CLASS; -#elif !defined(__USE_DEV_STRING) - - /*! \class string - * \brief Simple string class, that provides a limited subset of std::string - * functionality but avoids many of the issues that come with that class. - - * \note Deprecated. Please use std::string as default or - * re-define the string class to match the std::string - * interface by defining STRING_CLASS - */ - class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - { - private: - ::size_t size_; - char * str_; - public: - //! \brief Constructs an empty string, allocating no memory. - string(void) : size_(0), str_(NULL) - { - } - - /*! 
\brief Constructs a string populated from an arbitrary value of - * specified size. - * - * An extra '\0' is added, in case none was contained in str. - * - * \param str the initial value of the string instance. Note that '\0' - * characters receive no special treatment. If NULL, - * the string is left empty, with a size of 0. - * - * \param size the number of characters to copy from str. - */ - string(const char * str, ::size_t size) : - size_(size), - str_(NULL) - { - if (size > 0) { - str_ = new char[size_ + 1]; - if (str_ != NULL) { - memcpy(str_, str, size_ * sizeof(char)); - str_[size_] = '\0'; - } - else { - size_ = 0; - } - } - } - - /*! \brief Constructs a string populated from a null-terminated value. - * - * \param str the null-terminated initial value of the string instance. - * If NULL, the string is left empty, with a size of 0. - */ - string(const char * str) : - size_(0), - str_(NULL) - { - if (str) { - size_ = ::strlen(str); - } - if (size_ > 0) { - str_ = new char[size_ + 1]; - if (str_ != NULL) { - memcpy(str_, str, (size_ + 1) * sizeof(char)); - } - } - } - - void resize(::size_t n) - { - if (size_ == n) { - return; - } - if (n == 0) { - if (str_) { - delete[] str_; - } - str_ = NULL; - size_ = 0; - } - else { - char *newString = new char[n + 1]; - int copySize = n; - if (size_ < n) { - copySize = size_; - } - size_ = n; - - if (str_) { - memcpy(newString, str_, (copySize + 1) * sizeof(char)); - } - if (copySize < size_) { - memset(newString + copySize, 0, size_ - copySize); - } - newString[size_] = '\0'; - - delete[] str_; - str_ = newString; - } - } - - const char& operator[] (::size_t pos) const - { - return str_[pos]; - } - - char& operator[] (::size_t pos) - { - return str_[pos]; - } - - /*! \brief Copies the value of another string to this one. - * - * \param rhs the string to copy. - * - * \returns a reference to the modified instance. 
- */ - string& operator=(const string& rhs) - { - if (this == &rhs) { - return *this; - } - - if (str_ != NULL) { - delete[] str_; - str_ = NULL; - size_ = 0; - } - - if (rhs.size_ == 0 || rhs.str_ == NULL) { - str_ = NULL; - size_ = 0; - } - else { - str_ = new char[rhs.size_ + 1]; - size_ = rhs.size_; - - if (str_ != NULL) { - memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char)); - } - else { - size_ = 0; - } - } - - return *this; - } - - /*! \brief Constructs a string by copying the value of another instance. - * - * \param rhs the string to copy. - */ - string(const string& rhs) : - size_(0), - str_(NULL) - { - *this = rhs; - } - - //! \brief Destructor - frees memory used to hold the current value. - ~string() - { - delete[] str_; - str_ = NULL; - } - - //! \brief Queries the length of the string, excluding any added '\0's. - ::size_t size(void) const { return size_; } - - //! \brief Queries the length of the string, excluding any added '\0's. - ::size_t length(void) const { return size(); } - - /*! \brief Returns a pointer to the private copy held by this instance, - * or "" if empty/unset. - */ - const char * c_str(void) const { return (str_) ? str_ : ""; } - }; - typedef cl::string STRING_CLASS; -#endif // #elif !defined(__USE_DEV_STRING) - -#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) -#define VECTOR_CLASS std::vector -#elif !defined(__USE_DEV_VECTOR) -#define VECTOR_CLASS cl::vector - -#if !defined(__MAX_DEFAULT_VECTOR_SIZE) -#define __MAX_DEFAULT_VECTOR_SIZE 10 -#endif - - /*! \class vector - * \brief Fixed sized vector implementation that mirroring - * - * \note Deprecated. Please use std::vector as default or - * re-define the vector class to match the std::vector - * interface by defining VECTOR_CLASS - - * \note Not recommended for use with custom objects as - * current implementation will construct N elements - * - * std::vector functionality. - * \brief Fixed sized vector compatible with std::vector. 
- * - * \note - * This differs from std::vector<> not just in memory allocation, - * but also in terms of when members are constructed, destroyed, - * and assigned instead of being copy constructed. - * - * \param T type of element contained in the vector. - * - * \param N maximum size of the vector. - */ - template - class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - { - private: - T data_[N]; - unsigned int size_; - - public: - //! \brief Constructs an empty vector with no memory allocated. - vector() : - size_(static_cast(0)) - {} - - //! \brief Deallocates the vector's memory and destroys all of its elements. - ~vector() - { - clear(); - } - - //! \brief Returns the number of elements currently contained. - unsigned int size(void) const - { - return size_; - } - - /*! \brief Empties the vector of all elements. - * \note - * This does not deallocate memory but will invoke destructors - * on contained elements. - */ - void clear() - { - while (!empty()) { - pop_back(); - } - } - - /*! \brief Appends an element after the last valid element. - * Calling this on a vector that has reached capacity will throw an - * exception if exceptions are enabled. - */ - void push_back(const T& x) - { - if (size() < N) { - new (&data_[size_]) T(x); - size_++; - } - else { - detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); - } - } - - /*! \brief Removes the last valid element from the vector. - * Calling this on an empty vector will throw an exception - * if exceptions are enabled. - */ - void pop_back(void) - { - if (size_ != 0) { - --size_; - data_[size_].~T(); - } - else { - detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); - } - } - - /*! \brief Constructs with a value copied from another. - * - * \param vec the vector to copy. - */ - vector(const vector& vec) : - size_(vec.size_) - { - if (size_ != 0) { - assign(vec.begin(), vec.end()); - } - } - - /*! 
\brief Constructs with a specified number of initial elements. - * - * \param size number of initial elements. - * - * \param val value of initial elements. - */ - vector(unsigned int size, const T& val = T()) : - size_(0) - { - for (unsigned int i = 0; i < size; i++) { - push_back(val); - } - } - - /*! \brief Overwrites the current content with that copied from another - * instance. - * - * \param rhs vector to copy. - * - * \returns a reference to this. - */ - vector& operator=(const vector& rhs) - { - if (this == &rhs) { - return *this; - } - - if (rhs.size_ != 0) { - assign(rhs.begin(), rhs.end()); - } - else { - clear(); - } - - return *this; - } - - /*! \brief Tests equality against another instance. - * - * \param vec the vector against which to compare. - */ - bool operator==(vector &vec) - { - if (size() != vec.size()) { - return false; - } - - for (unsigned int i = 0; i < size(); ++i) { - if (operator[](i) != vec[i]) { - return false; - } - } - return true; - } - - //! \brief Conversion operator to T*. - operator T* () { return data_; } - - //! \brief Conversion operator to const T*. - operator const T* () const { return data_; } - - //! \brief Tests whether this instance has any elements. - bool empty(void) const - { - return size_ == 0; - } - - //! \brief Returns the maximum number of elements this instance can hold. - unsigned int max_size(void) const - { - return N; - } - - //! \brief Returns the maximum number of elements this instance can hold. - unsigned int capacity() const - { - return N; - } - - /*! \brief Returns a reference to a given element. - * - * \param index which element to access. * - * \note - * The caller is responsible for ensuring index is >= 0 and < size(). - */ - T& operator[](int index) - { - return data_[index]; - } - - /*! \brief Returns a const reference to a given element. - * - * \param index which element to access. - * - * \note - * The caller is responsible for ensuring index is >= 0 and < size(). 
- */ - const T& operator[](int index) const - { - return data_[index]; - } - - /*! \brief Assigns elements of the vector based on a source iterator range. - * - * \param start Beginning iterator of source range - * \param end Enditerator of source range - * - * \note - * Will throw an exception if exceptions are enabled and size exceeded. - */ - template - void assign(I start, I end) - { - clear(); - while (start != end) { - push_back(*start); - start++; - } - } - - /*! \class iterator - * \brief Const iterator class for vectors - */ - class iterator - { - private: - const vector *vec_; - int index_; - - /** - * Internal iterator constructor to capture reference - * to the vector it iterates over rather than taking - * the vector by copy. - */ - iterator(const vector &vec, int index) : - vec_(&vec) - { - if (!vec.empty()) { - index_ = index; - } - else { - index_ = -1; - } - } - - public: - iterator(void) : - index_(-1), - vec_(NULL) - { - } - - iterator(const iterator& rhs) : - vec_(rhs.vec_), - index_(rhs.index_) - { - } - - ~iterator(void) {} - - static iterator begin(const cl::vector &vec) - { - iterator i(vec, 0); - - return i; - } - - static iterator end(const cl::vector &vec) - { - iterator i(vec, vec.size()); - - return i; - } - - bool operator==(iterator i) - { - return ((vec_ == i.vec_) && - (index_ == i.index_)); - } - - bool operator!=(iterator i) - { - return (!(*this == i)); - } - - iterator& operator++() - { - ++index_; - return *this; - } - - iterator operator++(int) - { - iterator retVal(*this); - ++index_; - return retVal; - } - - iterator& operator--() - { - --index_; - return *this; - } - - iterator operator--(int) - { - iterator retVal(*this); - --index_; - return retVal; - } - - const T& operator *() const - { - return (*vec_)[index_]; - } - }; - - iterator begin(void) - { - return iterator::begin(*this); - } - - iterator begin(void) const - { - return iterator::begin(*this); - } - - iterator end(void) - { - return iterator::end(*this); - } - 
- iterator end(void) const - { - return iterator::end(*this); - } - - T& front(void) - { - return data_[0]; - } - - T& back(void) - { - return data_[size_]; - } - - const T& front(void) const - { - return data_[0]; - } - - const T& back(void) const - { - return data_[size_ - 1]; - } - }; -#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) - - - - - - namespace detail { -#define __DEFAULT_NOT_INITIALIZED 1 -#define __DEFAULT_BEING_INITIALIZED 2 -#define __DEFAULT_INITIALIZED 4 - - /* - * Compare and exchange primitives are needed for handling of defaults - */ - inline int compare_exchange(volatile int * dest, int exchange, int comparand) - { -#ifdef _WIN32 - return (int)(InterlockedCompareExchange( - (volatile long*)dest, - (long)exchange, - (long)comparand)); -#elif defined(__APPLE__) || defined(__MACOSX) - return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest); -#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX) - return (__sync_val_compare_and_swap( - dest, - comparand, - exchange)); -#endif // !_WIN32 - } - - inline void fence() { _mm_mfence(); } - }; // namespace detail - - - /*! \brief class used to interface between C++ and - * OpenCL C calls that require arrays of size_t values, whose - * size is known statically. - */ - template - class size_t - { - private: - ::size_t data_[N]; - - public: - //! \brief Initialize size_t to all 0s - size_t() - { - for (int i = 0; i < N; ++i) { - data_[i] = 0; - } - } - - ::size_t& operator[](int index) - { - return data_[index]; - } - - const ::size_t& operator[](int index) const - { - return data_[index]; - } - - //! \brief Conversion operator to T*. - operator ::size_t* () { return data_; } - - //! \brief Conversion operator to const T*. - operator const ::size_t* () const { return data_; } - }; - - namespace detail { - - // Generic getInfoHelper. 
The final parameter is used to guide overload - // resolution: the actual parameter passed is an int, which makes this - // a worse conversion sequence than a specialization that declares the - // parameter as an int. - template - inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long) - { - return f(name, sizeof(T), param, NULL); - } - - // Specialized getInfoHelper for VECTOR_CLASS params - template - inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, long) - { - ::size_t required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - T* value = (T*)alloca(required); - err = f(name, required, value, NULL); - if (err != CL_SUCCESS) { - return err; - } - - param->assign(&value[0], &value[required / sizeof(T)]); - return CL_SUCCESS; - } - - /* Specialization for reference-counted types. This depends on the - * existence of Wrapper::cl_type, and none of the other types having the - * cl_type member. Note that simplify specifying the parameter as Wrapper - * does not work, because when using a derived type (e.g. Context) the generic - * template will provide a better match. 
- */ - template - inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int, typename T::cl_type = 0) - { - ::size_t required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - typename T::cl_type * value = (typename T::cl_type *) alloca(required); - err = f(name, required, value, NULL); - if (err != CL_SUCCESS) { - return err; - } - - ::size_t elements = required / sizeof(typename T::cl_type); - param->assign(&value[0], &value[elements]); - for (::size_t i = 0; i < elements; i++) - { - if (value[i] != NULL) - { - err = (*param)[i].retain(); - if (err != CL_SUCCESS) { - return err; - } - } - } - return CL_SUCCESS; - } - - // Specialized for getInfo - template - inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int) - { - cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL); - - if (err != CL_SUCCESS) { - return err; - } - - return CL_SUCCESS; - } - - // Specialized GetInfoHelper for STRING_CLASS params - template - inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long) - { - ::size_t required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - char* value = (char*)alloca(required); - err = f(name, required, value, NULL); - if (err != CL_SUCCESS) { - return err; - } - - *param = value; - return CL_SUCCESS; - } - - // Specialized GetInfoHelper for cl::size_t params - template - inline cl_int getInfoHelper(Func f, cl_uint name, size_t* param, long) - { - ::size_t required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - ::size_t* value = (::size_t*) alloca(required); - err = f(name, required, value, NULL); - if (err != CL_SUCCESS) { - return err; - } - - for (int i = 0; i < N; ++i) { - (*param)[i] = value[i]; - } - - return CL_SUCCESS; - } - - template struct ReferenceHandler; - - /* Specialization for reference-counted types. 
This depends on the - * existence of Wrapper::cl_type, and none of the other types having the - * cl_type member. Note that simplify specifying the parameter as Wrapper - * does not work, because when using a derived type (e.g. Context) the generic - * template will provide a better match. - */ - template - inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0) - { - typename T::cl_type value; - cl_int err = f(name, sizeof(value), &value, NULL); - if (err != CL_SUCCESS) { - return err; - } - *param = value; - if (value != NULL) - { - err = param->retain(); - if (err != CL_SUCCESS) { - return err; - } - } - return CL_SUCCESS; - } - -#define __PARAM_NAME_INFO_1_0(F) \ - F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \ - F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \ - F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \ - F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \ - F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \ - \ - F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \ - F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \ - F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \ - F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \ - 
F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \ - F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \ - F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \ - F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \ - F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \ - F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \ - F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \ - F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \ - F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \ - F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \ - F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \ - F(cl_device_info, 
CL_DRIVER_VERSION, STRING_CLASS) \ - F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \ - F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \ - F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \ - \ - F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ - F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS) \ - F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS) \ - \ - F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ - F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ - F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ - F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \ - \ - F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ - \ - F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ - F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ - F(cl_mem_info, CL_MEM_SIZE, ::size_t) \ - F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ - F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ - F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ - F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ - \ - F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ - F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \ - F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \ - F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \ - F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \ - F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \ - F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \ - \ - F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ - F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ - F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \ - F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \ - F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \ - \ - F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ - F(cl_program_info, 
CL_PROGRAM_CONTEXT, cl::Context) \ - F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ - F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS) \ - F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \ - F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \ - F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS) \ - \ - F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ - F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \ - F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \ - \ - F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \ - F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ - F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ - F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ - F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ - \ - F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \ - F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \ - F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ - \ - F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ - F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ - F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ - F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) - -#if defined(CL_VERSION_1_1) -#define __PARAM_NAME_INFO_1_1(F) \ - F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ - F(cl_device_info, 
CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \ - F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \ - \ - F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ - F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \ - \ - F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \ - F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ - \ - F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) -#endif // CL_VERSION_1_1 - - -#if defined(CL_VERSION_1_2) -#define __PARAM_NAME_INFO_1_2(F) \ - F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \ - \ - F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \ - F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \ - \ - F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ - \ - F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \ - \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \ - \ - F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \ - F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS) \ - F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS) \ - F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \ - F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ - F(cl_device_info, CL_DEVICE_TOPOLOGY_AMD, cl_device_topology_amd) \ - F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS) -#endif // #if defined(CL_VERSION_1_2) - -#if defined(USE_CL_DEVICE_FISSION) -#define __PARAM_NAME_DEVICE_FISSION(F) \ - F(cl_device_info, 
CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ - F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS) \ - F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS) \ - F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ - F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS) -#endif // USE_CL_DEVICE_FISSION - - template - struct param_traits {}; - -#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \ -struct token; \ -template<> \ -struct param_traits \ - { \ - enum { value = param_name }; \ - typedef T param_type; \ - }; - - __PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS) -#if defined(CL_VERSION_1_1) - __PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS) -#endif // CL_VERSION_1_1 -#if defined(CL_VERSION_1_2) - __PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS) -#endif // CL_VERSION_1_1 - -#if defined(USE_CL_DEVICE_FISSION) - __PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS); -#endif // USE_CL_DEVICE_FISSION - -#ifdef CL_PLATFORM_ICD_SUFFIX_KHR - __CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS) -#endif - -#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) -#endif - -#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>) -#endif -#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_SIMD_WIDTH_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD - 
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) -#endif - -#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) -#endif -#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint) -#endif -#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint) -#endif -#ifdef CL_DEVICE_WARP_SIZE_NV - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint) -#endif -#ifdef CL_DEVICE_GPU_OVERLAP_NV - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool) -#endif -#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool) -#endif -#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV - __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) -#endif - - // Convenience functions - - template - inline cl_int - getInfo(Func f, cl_uint name, T* param) - { - return getInfoHelper(f, name, param, 0); - } - - template - struct GetInfoFunctor0 - { - Func f_; const Arg0& arg0_; - cl_int operator ()( - cl_uint param, ::size_t size, void* value, ::size_t* size_ret) - { - return f_(arg0_, param, size, value, size_ret); - } - }; 
- - template - struct GetInfoFunctor1 - { - Func f_; const Arg0& arg0_; const Arg1& arg1_; - cl_int operator ()( - cl_uint param, ::size_t size, void* value, ::size_t* size_ret) - { - return f_(arg0_, arg1_, param, size, value, size_ret); - } - }; - - template - inline cl_int - getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) - { - GetInfoFunctor0 f0 = { f, arg0 }; - return getInfoHelper(f0, name, param, 0); - } - - template - inline cl_int - getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) - { - GetInfoFunctor1 f0 = { f, arg0, arg1 }; - return getInfoHelper(f0, name, param, 0); - } - - template - struct ReferenceHandler - { }; - -#if defined(CL_VERSION_1_2) - /** - * OpenCL 1.2 devices do have retain/release. - */ - template <> - struct ReferenceHandler - { - /** - * Retain the device. - * \param device A valid device created using createSubDevices - * \return - * CL_SUCCESS if the function executed successfully. - * CL_INVALID_DEVICE if device was not a valid subdevice - * CL_OUT_OF_RESOURCES - * CL_OUT_OF_HOST_MEMORY - */ - static cl_int retain(cl_device_id device) - { - return ::clRetainDevice(device); - } - /** - * Retain the device. - * \param device A valid device created using createSubDevices - * \return - * CL_SUCCESS if the function executed successfully. - * CL_INVALID_DEVICE if device was not a valid subdevice - * CL_OUT_OF_RESOURCES - * CL_OUT_OF_HOST_MEMORY - */ - static cl_int release(cl_device_id device) - { - return ::clReleaseDevice(device); - } - }; -#else // #if defined(CL_VERSION_1_2) - /** - * OpenCL 1.1 devices do not have retain/release. - */ - template <> - struct ReferenceHandler - { - // cl_device_id does not have retain(). - static cl_int retain(cl_device_id) - { - return CL_SUCCESS; - } - // cl_device_id does not have release(). 
- static cl_int release(cl_device_id) - { - return CL_SUCCESS; - } - }; -#endif // #if defined(CL_VERSION_1_2) - - template <> - struct ReferenceHandler - { - // cl_platform_id does not have retain(). - static cl_int retain(cl_platform_id) - { - return CL_SUCCESS; - } - // cl_platform_id does not have release(). - static cl_int release(cl_platform_id) - { - return CL_SUCCESS; - } - }; - - template <> - struct ReferenceHandler - { - static cl_int retain(cl_context context) - { - return ::clRetainContext(context); - } - static cl_int release(cl_context context) - { - return ::clReleaseContext(context); - } - }; - - template <> - struct ReferenceHandler - { - static cl_int retain(cl_command_queue queue) - { - return ::clRetainCommandQueue(queue); - } - static cl_int release(cl_command_queue queue) - { - return ::clReleaseCommandQueue(queue); - } - }; - - template <> - struct ReferenceHandler - { - static cl_int retain(cl_mem memory) - { - return ::clRetainMemObject(memory); - } - static cl_int release(cl_mem memory) - { - return ::clReleaseMemObject(memory); - } - }; - - template <> - struct ReferenceHandler - { - static cl_int retain(cl_sampler sampler) - { - return ::clRetainSampler(sampler); - } - static cl_int release(cl_sampler sampler) - { - return ::clReleaseSampler(sampler); - } - }; - - template <> - struct ReferenceHandler - { - static cl_int retain(cl_program program) - { - return ::clRetainProgram(program); - } - static cl_int release(cl_program program) - { - return ::clReleaseProgram(program); - } - }; - - template <> - struct ReferenceHandler - { - static cl_int retain(cl_kernel kernel) - { - return ::clRetainKernel(kernel); - } - static cl_int release(cl_kernel kernel) - { - return ::clReleaseKernel(kernel); - } - }; - - template <> - struct ReferenceHandler - { - static cl_int retain(cl_event event) - { - return ::clRetainEvent(event); - } - static cl_int release(cl_event event) - { - return ::clReleaseEvent(event); - } - }; - - - // Extracts version 
number with major in the upper 16 bits, minor in the lower 16 - static cl_uint getVersion(const char *versionInfo) - { - int highVersion = 0; - int lowVersion = 0; - int index = 7; - while (versionInfo[index] != '.') { - highVersion *= 10; - highVersion += versionInfo[index] - '0'; - ++index; - } - ++index; - while (versionInfo[index] != ' ') { - lowVersion *= 10; - lowVersion += versionInfo[index] - '0'; - ++index; - } - return (highVersion << 16) | lowVersion; - } - - static cl_uint getPlatformVersion(cl_platform_id platform) - { - ::size_t size = 0; - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); - char *versionInfo = (char *)alloca(size); - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size); - return getVersion(versionInfo); - } - - static cl_uint getDevicePlatformVersion(cl_device_id device) - { - cl_platform_id platform; - clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); - return getPlatformVersion(platform); - } - -#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - static cl_uint getContextPlatformVersion(cl_context context) - { - // The platform cannot be queried directly, so we first have to grab a - // device and obtain its context - ::size_t size = 0; - clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); - if (size == 0) - return 0; - cl_device_id *devices = (cl_device_id *)alloca(size); - clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL); - return getDevicePlatformVersion(devices[0]); - } -#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - - template - class Wrapper - { - public: - typedef T cl_type; - - protected: - cl_type object_; - - public: - Wrapper() : object_(NULL) { } - - Wrapper(const cl_type &obj) : object_(obj) { } - - ~Wrapper() - { - if (object_ != NULL) { release(); } - } - - Wrapper(const Wrapper& rhs) - { - object_ = rhs.object_; - if (object_ != NULL) { 
detail::errHandler(retain(), __RETAIN_ERR); } - } - - Wrapper& operator = (const Wrapper& rhs) - { - if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } - object_ = rhs.object_; - if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } - return *this; - } - - Wrapper& operator = (const cl_type &rhs) - { - if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } - object_ = rhs; - return *this; - } - - cl_type operator ()() const { return object_; } - - cl_type& operator ()() { return object_; } - - protected: - template - friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); - - cl_int retain() const - { - return ReferenceHandler::retain(object_); - } - - cl_int release() const - { - return ReferenceHandler::release(object_); - } - }; - - template <> - class Wrapper - { - public: - typedef cl_device_id cl_type; - - protected: - cl_type object_; - bool referenceCountable_; - - static bool isReferenceCountable(cl_device_id device) - { - bool retVal = false; - if (device != NULL) { - int version = getDevicePlatformVersion(device); - if (version > ((1 << 16) + 1)) { - retVal = true; - } - } - return retVal; - } - - public: - Wrapper() : object_(NULL), referenceCountable_(false) - { - } - - Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) - { - referenceCountable_ = isReferenceCountable(obj); - } - - ~Wrapper() - { - if (object_ != NULL) { release(); } - } - - Wrapper(const Wrapper& rhs) - { - object_ = rhs.object_; - referenceCountable_ = isReferenceCountable(object_); - if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } - } - - Wrapper& operator = (const Wrapper& rhs) - { - if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } - object_ = rhs.object_; - referenceCountable_ = rhs.referenceCountable_; - if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } - return *this; - } - - Wrapper& operator = (const cl_type &rhs) - { - 
if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } - object_ = rhs; - referenceCountable_ = isReferenceCountable(object_); - return *this; - } - - cl_type operator ()() const { return object_; } - - cl_type& operator ()() { return object_; } - - protected: - template - friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); - - template - friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS*, int, typename U::cl_type); - - cl_int retain() const - { - if (referenceCountable_) { - return ReferenceHandler::retain(object_); - } - else { - return CL_SUCCESS; - } - } - - cl_int release() const - { - if (referenceCountable_) { - return ReferenceHandler::release(object_); - } - else { - return CL_SUCCESS; - } - } - }; - - } // namespace detail - //! \endcond - - /*! \stuct ImageFormat - * \brief Adds constructors and member functions for cl_image_format. - * - * \see cl_image_format - */ - struct ImageFormat : public cl_image_format - { - //! \brief Default constructor - performs no initialization. - ImageFormat(){} - - //! \brief Initializing constructor. - ImageFormat(cl_channel_order order, cl_channel_type type) - { - image_channel_order = order; - image_channel_data_type = type; - } - - //! \brief Assignment operator. - ImageFormat& operator = (const ImageFormat& rhs) - { - if (this != &rhs) { - this->image_channel_data_type = rhs.image_channel_data_type; - this->image_channel_order = rhs.image_channel_order; - } - return *this; - } - }; - - /*! \brief Class interface for cl_device_id. - * - * \note Copies of these objects are inexpensive, since they don't 'own' - * any underlying resources or data structures. - * - * \see cl_device_id - */ - class Device : public detail::Wrapper - { - public: - //! \brief Default constructor - initializes to NULL. - Device() : detail::Wrapper() { } - - /*! \brief Copy constructor. - * - * This simply copies the device ID value, which is an inexpensive operation. 
- */ - Device(const Device& device) : detail::Wrapper(device) { } - - /*! \brief Constructor from cl_device_id. - * - * This simply copies the device ID value, which is an inexpensive operation. - */ - Device(const cl_device_id &device) : detail::Wrapper(device) { } - - /*! \brief Returns the first device on the default context. - * - * \see Context::getDefault() - */ - static Device getDefault(cl_int * err = NULL); - - /*! \brief Assignment operator from Device. - * - * This simply copies the device ID value, which is an inexpensive operation. - */ - Device& operator = (const Device& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_device_id. - * - * This simply copies the device ID value, which is an inexpensive operation. - */ - Device& operator = (const cl_device_id& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetDeviceInfo(). - template - cl_int getInfo(cl_device_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetDeviceInfo, object_, name, param), - __GET_DEVICE_INFO_ERR); - } - - //! \brief Wrapper for clGetDeviceInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_device_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /** - * CL 1.2 version - */ -#if defined(CL_VERSION_1_2) - //! \brief Wrapper for clCreateSubDevicesEXT(). 
- cl_int createSubDevices( - const cl_device_partition_property * properties, - VECTOR_CLASS* devices) - { - cl_uint n = 0; - cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES); - } - - cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id)); - err = clCreateSubDevices(object_, properties, n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES); - } - - devices->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } -#endif // #if defined(CL_VERSION_1_2) - - /** - * CL 1.1 version that uses device fission. - */ -#if defined(CL_VERSION_1_1) -#if defined(USE_CL_DEVICE_FISSION) - cl_int createSubDevices( - const cl_device_partition_property_ext * properties, - VECTOR_CLASS* devices) - { - typedef CL_API_ENTRY cl_int - (CL_API_CALL * PFN_clCreateSubDevicesEXT)( - cl_device_id /*in_device*/, - const cl_device_partition_property_ext * /* properties */, - cl_uint /*num_entries*/, - cl_device_id * /*out_devices*/, - cl_uint * /*num_devices*/) CL_EXT_SUFFIX__VERSION_1_1; - - static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; - __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT); - - cl_uint n = 0; - cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES); - } - - cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id)); - err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES); - } - - devices->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } -#endif // #if defined(USE_CL_DEVICE_FISSION) -#endif // #if defined(CL_VERSION_1_1) - }; - - /*! \brief Class interface for cl_platform_id. - * - * \note Copies of these objects are inexpensive, since they don't 'own' - * any underlying resources or data structures. 
- * - * \see cl_platform_id - */ - class Platform : public detail::Wrapper - { - public: - //! \brief Default constructor - initializes to NULL. - Platform() : detail::Wrapper() { } - - /*! \brief Copy constructor. - * - * This simply copies the platform ID value, which is an inexpensive operation. - */ - Platform(const Platform& platform) : detail::Wrapper(platform) { } - - /*! \brief Constructor from cl_platform_id. - * - * This simply copies the platform ID value, which is an inexpensive operation. - */ - Platform(const cl_platform_id &platform) : detail::Wrapper(platform) { } - - /*! \brief Assignment operator from Platform. - * - * This simply copies the platform ID value, which is an inexpensive operation. - */ - Platform& operator = (const Platform& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_platform_id. - * - * This simply copies the platform ID value, which is an inexpensive operation. - */ - Platform& operator = (const cl_platform_id& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetPlatformInfo(). - cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetPlatformInfo, object_, name, param), - __GET_PLATFORM_INFO_ERR); - } - - //! \brief Wrapper for clGetPlatformInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_platform_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! \brief Gets a list of devices for this platform. - * - * Wraps clGetDeviceIDs(). 
- */ - cl_int getDevices( - cl_device_type type, - VECTOR_CLASS* devices) const - { - cl_uint n = 0; - if (devices == NULL) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); - } - cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id)); - err = ::clGetDeviceIDs(object_, type, n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - devices->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } - -#if defined(USE_DX_INTEROP) - /*! \brief Get the list of available D3D10 devices. - * - * \param d3d_device_source. - * - * \param d3d_object. - * - * \param d3d_device_set. - * - * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device - * values returned in devices can be used to identify a specific OpenCL - * device. If \a devices argument is NULL, this argument is ignored. - * - * \return One of the following values: - * - CL_SUCCESS if the function is executed successfully. - * - * The application can query specific capabilities of the OpenCL device(s) - * returned by cl::getDevices. This can be used by the application to - * determine which device(s) to use. - * - * \note In the case that exceptions are enabled and a return value - * other than CL_SUCCESS is generated, then cl::Error exception is - * generated. 
- */ - cl_int getDevices( - cl_d3d10_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d10_device_set_khr d3d_device_set, - VECTOR_CLASS* devices) const - { - typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( - cl_platform_id platform, - cl_d3d10_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint* num_devices); - - if (devices == NULL) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); - } - - static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; - __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR); - - cl_uint n = 0; - cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( - object_, - d3d_device_source, - d3d_object, - d3d_device_set, - 0, - NULL, - &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - cl_device_id* ids = (cl_device_id*)alloca(n * sizeof(cl_device_id)); - err = pfn_clGetDeviceIDsFromD3D10KHR( - object_, - d3d_device_source, - d3d_object, - d3d_device_set, - n, - ids, - NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - devices->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } -#endif - - /*! \brief Gets a list of available platforms. - * - * Wraps clGetPlatformIDs(). 
- */ - static cl_int get( - VECTOR_CLASS* platforms) - { - cl_uint n = 0; - - if (platforms == NULL) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); - } - - cl_int err = ::clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - cl_platform_id* ids = (cl_platform_id*)alloca( - n * sizeof(cl_platform_id)); - err = ::clGetPlatformIDs(n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - platforms->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } - - /*! \brief Gets the first available platform. - * - * Wraps clGetPlatformIDs(), returning the first result. - */ - static cl_int get( - Platform * platform) - { - cl_uint n = 0; - - if (platform == NULL) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); - } - - cl_int err = ::clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - cl_platform_id* ids = (cl_platform_id*)alloca( - n * sizeof(cl_platform_id)); - err = ::clGetPlatformIDs(n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - *platform = ids[0]; - return CL_SUCCESS; - } - - /*! \brief Gets the first available platform, returning it by value. - * - * Wraps clGetPlatformIDs(), returning the first result. 
- */ - static Platform get( - cl_int * errResult = NULL) - { - Platform platform; - cl_uint n = 0; - cl_int err = ::clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) { - detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - if (errResult != NULL) { - *errResult = err; - } - } - - cl_platform_id* ids = (cl_platform_id*)alloca( - n * sizeof(cl_platform_id)); - err = ::clGetPlatformIDs(n, ids, NULL); - - if (err != CL_SUCCESS) { - detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - if (errResult != NULL) { - *errResult = err; - } - - return ids[0]; - } - - static Platform getDefault( - cl_int *errResult = NULL) - { - return get(errResult); - } - - -#if defined(CL_VERSION_1_2) - //! \brief Wrapper for clUnloadCompiler(). - cl_int - unloadCompiler() - { - return ::clUnloadPlatformCompiler(object_); - } -#endif // #if defined(CL_VERSION_1_2) - }; // class Platform - - /** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) - /** - * Unload the OpenCL compiler. - * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. - */ - inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int - UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - inline cl_int - UnloadCompiler() - { - return ::clUnloadCompiler(); - } -#endif // #if defined(CL_VERSION_1_1) - - /*! \brief Class interface for cl_context. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_context as the original. For details, see - * clRetainContext() and clReleaseContext(). - * - * \see cl_context - */ - class Context - : public detail::Wrapper - { - private: - static volatile int default_initialized_; - static Context default_; - static volatile cl_int default_error_; - public: - /*! \brief Destructor. - * - * This calls clReleaseContext() on the value held by this instance. - */ - ~Context() { } - - /*! 
\brief Constructs a context including a list of specified devices. - * - * Wraps clCreateContext(). - */ - Context( - const VECTOR_CLASS& devices, - cl_context_properties* properties = NULL, - void (CL_CALLBACK * notifyFptr)( - const char *, - const void *, - ::size_t, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int error; - - ::size_t numDevices = devices.size(); - cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id)); - for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - object_ = ::clCreateContext( - properties, (cl_uint)numDevices, - deviceIDs, - notifyFptr, data, &error); - - detail::errHandler(error, __CREATE_CONTEXT_ERR); - if (err != NULL) { - *err = error; - } - } - - Context( - const Device& device, - cl_context_properties* properties = NULL, - void (CL_CALLBACK * notifyFptr)( - const char *, - const void *, - ::size_t, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int error; - - cl_device_id deviceID = device(); - - object_ = ::clCreateContext( - properties, 1, - &deviceID, - notifyFptr, data, &error); - - detail::errHandler(error, __CREATE_CONTEXT_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Constructs a context including all devices of a specified type. - * - * Wraps clCreateContextFromType(). 
- */ - Context( - cl_device_type type, - cl_context_properties* properties = NULL, - void (CL_CALLBACK * notifyFptr)( - const char *, - const void *, - ::size_t, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int error; - -#if !defined(__APPLE__) || !defined(__MACOS) - cl_context_properties prop[4] = { CL_CONTEXT_PLATFORM, 0, 0, 0 }; - if (properties == NULL) { - prop[1] = (cl_context_properties)Platform::get(&error)(); - if (error != CL_SUCCESS) { - detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = error; - return; - } - } - - properties = &prop[0]; - } -#endif - object_ = ::clCreateContextFromType( - properties, type, notifyFptr, data, &error); - - detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. - * - * \note All calls to this function return the same cl_context as the first. - */ - static Context getDefault(cl_int * err = NULL) - { - int state = detail::compare_exchange( - &default_initialized_, - __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); - - if (state & __DEFAULT_INITIALIZED) { - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - if (state & __DEFAULT_BEING_INITIALIZED) { - // Assume writes will propagate eventually... - while (default_initialized_ != __DEFAULT_INITIALIZED) { - detail::fence(); - } - - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - cl_int error; - default_ = Context( - CL_DEVICE_TYPE_DEFAULT, - NULL, - NULL, - NULL, - &error); - - detail::fence(); - - default_error_ = error; - // Assume writes will propagate eventually... - default_initialized_ = __DEFAULT_INITIALIZED; - - detail::fence(); - - if (err != NULL) { - *err = default_error_; - } - return default_; - - } - - //! \brief Default constructor - initializes to NULL. - Context() : detail::Wrapper() { } - - /*! 
\brief Copy constructor. - * - * This calls clRetainContext() on the parameter's cl_context. - */ - Context(const Context& context) : detail::Wrapper(context) { } - - /*! \brief Constructor from cl_context - takes ownership. - * - * This effectively transfers ownership of a refcount on the cl_context - * into the new Context object. - */ - __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper(context) { } - - /*! \brief Assignment operator from Context. - * - * This calls clRetainContext() on the parameter and clReleaseContext() on - * the previous value held by this instance. - */ - Context& operator = (const Context& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_context - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseContext() on the value previously held by this instance. - */ - Context& operator = (const cl_context& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetContextInfo(). - template - cl_int getInfo(cl_context_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetContextInfo, object_, name, param), - __GET_CONTEXT_INFO_ERR); - } - - //! \brief Wrapper for clGetContextInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_context_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! \brief Gets a list of supported image formats. - * - * Wraps clGetSupportedImageFormats(). 
- */ - cl_int getSupportedImageFormats( - cl_mem_flags flags, - cl_mem_object_type type, - VECTOR_CLASS* formats) const - { - cl_uint numEntries; - cl_int err = ::clGetSupportedImageFormats( - object_, - flags, - type, - 0, - NULL, - &numEntries); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); - } - - ImageFormat* value = (ImageFormat*) - alloca(numEntries * sizeof(ImageFormat)); - err = ::clGetSupportedImageFormats( - object_, - flags, - type, - numEntries, - (cl_image_format*)value, - NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); - } - - formats->assign(&value[0], &value[numEntries]); - return CL_SUCCESS; - } - }; - - inline Device Device::getDefault(cl_int * err) - { - cl_int error; - Device device; - - Context context = Context::getDefault(&error); - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - - if (error != CL_SUCCESS) { - if (err != NULL) { - *err = error; - } - } - else { - device = context.getInfo()[0]; - if (err != NULL) { - *err = CL_SUCCESS; - } - } - - return device; - } - - -#ifdef _WIN32 - __declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; - __declspec(selectany) Context Context::default_; - __declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS; -#else - __attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; - __attribute__((weak)) Context Context::default_; - __attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS; -#endif - - /*! \brief Class interface for cl_event. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_event as the original. For details, see - * clRetainEvent() and clReleaseEvent(). - * - * \see cl_event - */ - class Event : public detail::Wrapper - { - public: - /*! \brief Destructor. 
- * - * This calls clReleaseEvent() on the value held by this instance. - */ - ~Event() { } - - //! \brief Default constructor - initializes to NULL. - Event() : detail::Wrapper() { } - - /*! \brief Copy constructor. - * - * This calls clRetainEvent() on the parameter's cl_event. - */ - Event(const Event& event) : detail::Wrapper(event) { } - - /*! \brief Constructor from cl_event - takes ownership. - * - * This effectively transfers ownership of a refcount on the cl_event - * into the new Event object. - */ - Event(const cl_event& event) : detail::Wrapper(event) { } - - /*! \brief Assignment operator from cl_event - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseEvent() on the value previously held by this instance. - */ - Event& operator = (const Event& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_event. - * - * This calls clRetainEvent() on the parameter and clReleaseEvent() on - * the previous value held by this instance. - */ - Event& operator = (const cl_event& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetEventInfo(). - template - cl_int getInfo(cl_event_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetEventInfo, object_, name, param), - __GET_EVENT_INFO_ERR); - } - - //! \brief Wrapper for clGetEventInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_event_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - //! \brief Wrapper for clGetEventProfilingInfo(). 
- template - cl_int getProfilingInfo(cl_profiling_info name, T* param) const - { - return detail::errHandler(detail::getInfo( - &::clGetEventProfilingInfo, object_, name, param), - __GET_EVENT_PROFILE_INFO_ERR); - } - - //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. - template typename - detail::param_traits::param_type - getProfilingInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_profiling_info, name>::param_type param; - cl_int result = getProfilingInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! \brief Blocks the calling thread until this event completes. - * - * Wraps clWaitForEvents(). - */ - cl_int wait() const - { - return detail::errHandler( - ::clWaitForEvents(1, &object_), - __WAIT_FOR_EVENTS_ERR); - } - -#if defined(CL_VERSION_1_1) - /*! \brief Registers a user callback function for a specific command execution status. - * - * Wraps clSetEventCallback(). - */ - cl_int setCallback( - cl_int type, - void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), - void * user_data = NULL) - { - return detail::errHandler( - ::clSetEventCallback( - object_, - type, - pfn_notify, - user_data), - __SET_EVENT_CALLBACK_ERR); - } -#endif - - /*! \brief Blocks the calling thread until every event specified is complete. - * - * Wraps clWaitForEvents(). - */ - static cl_int - waitForEvents(const VECTOR_CLASS& events) - { - return detail::errHandler( - ::clWaitForEvents( - (cl_uint)events.size(), (cl_event*)&events.front()), - __WAIT_FOR_EVENTS_ERR); - } - }; - -#if defined(CL_VERSION_1_1) - /*! \brief Class interface for user events (a subset of cl_event's). - * - * See Event for details about copy semantics, etc. - */ - class UserEvent : public Event - { - public: - /*! \brief Constructs a user event on a given context. - * - * Wraps clCreateUserEvent(). 
- */ - UserEvent( - const Context& context, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateUserEvent( - context(), - &error); - - detail::errHandler(error, __CREATE_USER_EVENT_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - UserEvent() : Event() { } - - //! \brief Copy constructor - performs shallow copy. - UserEvent(const UserEvent& event) : Event(event) { } - - //! \brief Assignment Operator - performs shallow copy. - UserEvent& operator = (const UserEvent& rhs) - { - if (this != &rhs) { - Event::operator=(rhs); - } - return *this; - } - - /*! \brief Sets the execution status of a user event object. - * - * Wraps clSetUserEventStatus(). - */ - cl_int setStatus(cl_int status) - { - return detail::errHandler( - ::clSetUserEventStatus(object_, status), - __SET_USER_EVENT_STATUS_ERR); - } - }; -#endif - - /*! \brief Blocks the calling thread until every event specified is complete. - * - * Wraps clWaitForEvents(). - */ - inline static cl_int - WaitForEvents(const VECTOR_CLASS& events) - { - return detail::errHandler( - ::clWaitForEvents( - (cl_uint)events.size(), (cl_event*)&events.front()), - __WAIT_FOR_EVENTS_ERR); - } - - /*! \brief Class interface for cl_mem. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_mem as the original. For details, see - * clRetainMemObject() and clReleaseMemObject(). - * - * \see cl_mem - */ - class Memory : public detail::Wrapper - { - public: - - /*! \brief Destructor. - * - * This calls clReleaseMemObject() on the value held by this instance. - */ - ~Memory() {} - - //! \brief Default constructor - initializes to NULL. - Memory() : detail::Wrapper() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * This calls clRetainMemObject() on the parameter's cl_mem. - */ - Memory(const Memory& memory) : detail::Wrapper(memory) { } - - /*! 
\brief Constructor from cl_mem - takes ownership. - * - * This effectively transfers ownership of a refcount on the cl_mem - * into the new Memory object. - */ - __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper(memory) { } - - /*! \brief Assignment operator from Memory. - * - * This calls clRetainMemObject() on the parameter and clReleaseMemObject() - * on the previous value held by this instance. - */ - Memory& operator = (const Memory& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_mem - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseMemObject() on the value previously held by this instance. - */ - Memory& operator = (const cl_mem& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetMemObjectInfo(). - template - cl_int getInfo(cl_mem_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetMemObjectInfo, object_, name, param), - __GET_MEM_OBJECT_INFO_ERR); - } - - //! \brief Wrapper for clGetMemObjectInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_mem_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - -#if defined(CL_VERSION_1_1) - /*! \brief Registers a callback function to be called when the memory object - * is no longer needed. - * - * Wraps clSetMemObjectDestructorCallback(). - * - * Repeated calls to this function, for a given cl_mem value, will append - * to the list of functions called (in reverse order) when memory object's - * resources are freed and the memory object is deleted. - * - * \note - * The registered callbacks are associated with the underlying cl_mem - * value - not the Memory class instance. 
- */ - cl_int setDestructorCallback( - void (CL_CALLBACK * pfn_notify)(cl_mem, void *), - void * user_data = NULL) - { - return detail::errHandler( - ::clSetMemObjectDestructorCallback( - object_, - pfn_notify, - user_data), - __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); - } -#endif - - }; - - // Pre-declare copy functions - class Buffer; - template< typename IteratorType > - cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer); - template< typename IteratorType > - cl_int copy(const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator); - - /*! \brief Class interface for Buffer Memory Objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ - class Buffer : public Memory - { - public: - - /*! \brief Constructs a Buffer in a specified context. - * - * Wraps clCreateBuffer(). - * - * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was - * specified. Note alignment & exclusivity requirements. - */ - Buffer( - const Context& context, - cl_mem_flags flags, - ::size_t size, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Constructs a Buffer in the default context. - * - * Wraps clCreateBuffer(). - * - * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was - * specified. Note alignment & exclusivity requirements. - * - * \see Context::getDefault() - */ - Buffer( - cl_mem_flags flags, - ::size_t size, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - - Context context = Context::getDefault(err); - - object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! 
- * \brief Construct a Buffer from a host container via iterators. - * If useHostPtr is specified iterators must be random access. - */ - template< typename IteratorType > - Buffer( - IteratorType startIterator, - IteratorType endIterator, - bool readOnly, - bool useHostPtr = false, - cl_int* err = NULL) - { - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - cl_mem_flags flags = 0; - if (readOnly) { - flags |= CL_MEM_READ_ONLY; - } - else { - flags |= CL_MEM_READ_WRITE; - } - if (useHostPtr) { - flags |= CL_MEM_USE_HOST_PTR; - } - - ::size_t size = sizeof(DataType)*(endIterator - startIterator); - - Context context = Context::getDefault(err); - - if (useHostPtr) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); - } - else { - object_ = ::clCreateBuffer(context(), flags, size, 0, &error); - } - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - if (!useHostPtr) { - error = cl::copy(startIterator, endIterator, *this); - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - } - - //! \brief Default constructor - initializes to NULL. - Buffer() : Memory() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Buffer(const Buffer& buffer) : Memory(buffer) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { } - - /*! \brief Assignment from Buffer - performs shallow copy. - * - * See Memory for further details. - */ - Buffer& operator = (const Buffer& rhs) - { - if (this != &rhs) { - Memory::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. 
- */ - Buffer& operator = (const cl_mem& rhs) - { - Memory::operator=(rhs); - return *this; - } - -#if defined(CL_VERSION_1_1) - /*! \brief Creates a new buffer object from this. - * - * Wraps clCreateSubBuffer(). - */ - Buffer createSubBuffer( - cl_mem_flags flags, - cl_buffer_create_type buffer_create_type, - const void * buffer_create_info, - cl_int * err = NULL) - { - Buffer result; - cl_int error; - result.object_ = ::clCreateSubBuffer( - object_, - flags, - buffer_create_type, - buffer_create_info, - &error); - - detail::errHandler(error, __CREATE_SUBBUFFER_ERR); - if (err != NULL) { - *err = error; - } - - return result; - } -#endif - }; - -#if defined (USE_DX_INTEROP) - /*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's. - * - * This is provided to facilitate interoperability with Direct3D. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ - class BufferD3D10 : public Buffer - { - public: - typedef CL_API_ENTRY cl_mem(CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( - cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, - cl_int* errcode_ret); - - /*! \brief Constructs a BufferD3D10, in a specified context, from a - * given ID3D10Buffer. - * - * Wraps clCreateFromD3D10BufferKHR(). 
- */ - BufferD3D10( - const Context& context, - cl_mem_flags flags, - ID3D10Buffer* bufobj, - cl_int * err = NULL) - { - static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL; - -#if defined(CL_VERSION_1_2) - vector props = context.getInfo(); - cl_platform platform = -1; - for (int i = 0; i < props.size(); ++i) { - if (props[i] == CL_CONTEXT_PLATFORM) { - platform = props[i + 1]; - } - } - __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR); -#endif -#if defined(CL_VERSION_1_1) - __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR); -#endif - - cl_int error; - object_ = pfn_clCreateFromD3D10BufferKHR( - context(), - flags, - bufobj, - &error); - - detail::errHandler(error, __CREATE_GL_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - BufferD3D10() : Buffer() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { } - - /*! \brief Assignment from BufferD3D10 - performs shallow copy. - * - * See Memory for further details. - */ - BufferD3D10& operator = (const BufferD3D10& rhs) - { - if (this != &rhs) { - Buffer::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - BufferD3D10& operator = (const cl_mem& rhs) - { - Buffer::operator=(rhs); - return *this; - } - }; -#endif - - /*! \brief Class interface for GL Buffer Memory Objects. - * - * This is provided to facilitate interoperability with OpenGL. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ - class BufferGL : public Buffer - { - public: - /*! 
\brief Constructs a BufferGL in a specified context, from a given - * GL buffer. - * - * Wraps clCreateFromGLBuffer(). - */ - BufferGL( - const Context& context, - cl_mem_flags flags, - GLuint bufobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLBuffer( - context(), - flags, - bufobj, - &error); - - detail::errHandler(error, __CREATE_GL_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - BufferGL() : Buffer() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - BufferGL(const BufferGL& buffer) : Buffer(buffer) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { } - - /*! \brief Assignment from BufferGL - performs shallow copy. - * - * See Memory for further details. - */ - BufferGL& operator = (const BufferGL& rhs) - { - if (this != &rhs) { - Buffer::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - BufferGL& operator = (const cl_mem& rhs) - { - Buffer::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetGLObjectInfo(). - cl_int getObjectInfo( - cl_gl_object_type *type, - GLuint * gl_object_name) - { - return detail::errHandler( - ::clGetGLObjectInfo(object_, type, gl_object_name), - __GET_GL_OBJECT_INFO_ERR); - } - }; - - /*! \brief Class interface for GL Render Buffer Memory Objects. - * - * This is provided to facilitate interoperability with OpenGL. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ - class BufferRenderGL : public Buffer - { - public: - /*! \brief Constructs a BufferRenderGL in a specified context, from a given - * GL Renderbuffer. - * - * Wraps clCreateFromGLRenderbuffer(). 
- */ - BufferRenderGL( - const Context& context, - cl_mem_flags flags, - GLuint bufobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLRenderbuffer( - context(), - flags, - bufobj, - &error); - - detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - BufferRenderGL() : Buffer() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { } - - /*! \brief Assignment from BufferGL - performs shallow copy. - * - * See Memory for further details. - */ - BufferRenderGL& operator = (const BufferRenderGL& rhs) - { - if (this != &rhs) { - Buffer::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - BufferRenderGL& operator = (const cl_mem& rhs) - { - Buffer::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetGLObjectInfo(). - cl_int getObjectInfo( - cl_gl_object_type *type, - GLuint * gl_object_name) - { - return detail::errHandler( - ::clGetGLObjectInfo(object_, type, gl_object_name), - __GET_GL_OBJECT_INFO_ERR); - } - }; - - /*! \brief C++ base class for Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ - class Image : public Memory - { - protected: - //! \brief Default constructor - initializes to NULL. - Image() : Memory() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Image(const Image& image) : Memory(image) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. 
- */ - __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { } - - /*! \brief Assignment from Image - performs shallow copy. - * - * See Memory for further details. - */ - Image& operator = (const Image& rhs) - { - if (this != &rhs) { - Memory::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image& operator = (const cl_mem& rhs) - { - Memory::operator=(rhs); - return *this; - } - - public: - //! \brief Wrapper for clGetImageInfo(). - template - cl_int getImageInfo(cl_image_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetImageInfo, object_, name, param), - __GET_IMAGE_INFO_ERR); - } - - //! \brief Wrapper for clGetImageInfo() that returns by value. - template typename - detail::param_traits::param_type - getImageInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_image_info, name>::param_type param; - cl_int result = getImageInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - }; - -#if defined(CL_VERSION_1_2) - /*! \brief Class interface for 1D Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ - class Image1D : public Image - { - public: - /*! \brief Constructs a 1D Image in a specified context. - * - * Wraps clCreateImage(). - */ - Image1D( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t width, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE1D; - desc.image_width = width; - desc.image_row_pitch = 0; - desc.num_mip_levels = 0; - desc.num_samples = 0; - desc.buffer = 0; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - //! 
\brief Default constructor - initializes to NULL. - Image1D() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Image1D(const Image1D& image1D) : Image(image1D) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { } - - /*! \brief Assignment from Image1D - performs shallow copy. - * - * See Memory for further details. - */ - Image1D& operator = (const Image1D& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image1D& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - }; - - /*! \class Image1DBuffer - * \brief Image interface for 1D buffer images. - */ - class Image1DBuffer : public Image - { - public: - Image1DBuffer( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t width, - const Buffer &buffer, - cl_int* err = NULL) - { - cl_int error; - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - desc.image_width = width; - desc.image_row_pitch = 0; - desc.num_mip_levels = 0; - desc.num_samples = 0; - desc.buffer = buffer(); - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - NULL, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - Image1DBuffer() { } - - Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { } - - __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { } - - Image1DBuffer& operator = (const Image1DBuffer& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - Image1DBuffer& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - }; - - /*! 
\class Image1DArray - * \brief Image interface for arrays of 1D images. - */ - class Image1DArray : public Image - { - public: - Image1DArray( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t arraySize, - ::size_t width, - ::size_t rowPitch, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; - desc.image_array_size = arraySize; - desc.image_width = width; - desc.image_row_pitch = rowPitch; - desc.num_mip_levels = 0; - desc.num_samples = 0; - desc.buffer = 0; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - Image1DArray() { } - - Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { } - - __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { } - - Image1DArray& operator = (const Image1DArray& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - Image1DArray& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - }; -#endif // #if defined(CL_VERSION_1_2) - - - /*! \brief Class interface for 2D Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ - class Image2D : public Image - { - public: - /*! \brief Constructs a 1D Image in a specified context. - * - * Wraps clCreateImage(). 
- */ - Image2D( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t width, - ::size_t height, - ::size_t row_pitch = 0, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - bool useCreateImage; - -#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above - } -#elif defined(CL_VERSION_1_2) - useCreateImage = true; -#else - useCreateImage = false; -#endif - -#if defined(CL_VERSION_1_2) - if (useCreateImage) - { - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE2D; - desc.image_width = width; - desc.image_height = height; - desc.image_row_pitch = row_pitch; - desc.num_mip_levels = 0; - desc.num_samples = 0; - desc.buffer = 0; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // #if defined(CL_VERSION_1_2) -#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - if (!useCreateImage) - { - object_ = ::clCreateImage2D( - context(), flags, &format, width, height, row_pitch, host_ptr, &error); - - detail::errHandler(error, __CREATE_IMAGE2D_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - } - - //! \brief Default constructor - initializes to NULL. - Image2D() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Image2D(const Image2D& image2D) : Image(image2D) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { } - - /*! \brief Assignment from Image2D - performs shallow copy. 
- * - * See Memory for further details. - */ - Image2D& operator = (const Image2D& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image2D& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - }; - - -#if !defined(CL_VERSION_1_2) - /*! \brief Class interface for GL 2D Image Memory objects. - * - * This is provided to facilitate interoperability with OpenGL. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. - */ - class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D - { - public: - /*! \brief Constructs an Image2DGL in a specified context, from a given - * GL Texture. - * - * Wraps clCreateFromGLTexture2D(). - */ - Image2DGL( - const Context& context, - cl_mem_flags flags, - GLenum target, - GLint miplevel, - GLuint texobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLTexture2D( - context(), - flags, - target, - miplevel, - texobj, - &error); - - detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); - if (err != NULL) { - *err = error; - } - - } - - //! \brief Default constructor - initializes to NULL. - Image2DGL() : Image2D() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Image2DGL(const Image2DGL& image) : Image2D(image) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { } - - /*! \brief Assignment from Image2DGL - performs shallow copy. - * - * See Memory for further details. - */ - Image2DGL& operator = (const Image2DGL& rhs) - { - if (this != &rhs) { - Image2D::operator=(rhs); - } - return *this; - } - - /*! 
\brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image2DGL& operator = (const cl_mem& rhs) - { - Image2D::operator=(rhs); - return *this; - } - }; -#endif // #if !defined(CL_VERSION_1_2) - -#if defined(CL_VERSION_1_2) - /*! \class Image2DArray - * \brief Image interface for arrays of 2D images. - */ - class Image2DArray : public Image - { - public: - Image2DArray( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t arraySize, - ::size_t width, - ::size_t height, - ::size_t rowPitch, - ::size_t slicePitch, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; - desc.image_array_size = arraySize; - desc.image_width = width; - desc.image_height = height; - desc.image_row_pitch = rowPitch; - desc.image_slice_pitch = slicePitch; - desc.num_mip_levels = 0; - desc.num_samples = 0; - desc.buffer = 0; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - Image2DArray() { } - - Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { } - - __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { } - - Image2DArray& operator = (const Image2DArray& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - Image2DArray& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - }; -#endif // #if defined(CL_VERSION_1_2) - - /*! \brief Class interface for 3D Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ - class Image3D : public Image - { - public: - /*! \brief Constructs a 3D Image in a specified context. - * - * Wraps clCreateImage(). 
- */ - Image3D( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t width, - ::size_t height, - ::size_t depth, - ::size_t row_pitch = 0, - ::size_t slice_pitch = 0, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - bool useCreateImage; - -#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above - } -#elif defined(CL_VERSION_1_2) - useCreateImage = true; -#else - useCreateImage = false; -#endif - -#if defined(CL_VERSION_1_2) - if (useCreateImage) - { - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE3D; - desc.image_width = width; - desc.image_height = height; - desc.image_depth = depth; - desc.image_row_pitch = row_pitch; - desc.image_slice_pitch = slice_pitch; - desc.num_mip_levels = 0; - desc.num_samples = 0; - desc.buffer = 0; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // #if defined(CL_VERSION_1_2) -#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - if (!useCreateImage) - { - object_ = ::clCreateImage3D( - context(), flags, &format, width, height, depth, row_pitch, - slice_pitch, host_ptr, &error); - - detail::errHandler(error, __CREATE_IMAGE3D_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - } - - //! \brief Default constructor - initializes to NULL. - Image3D() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Image3D(const Image3D& image3D) : Image(image3D) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. 
- */ - __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { } - - /*! \brief Assignment from Image3D - performs shallow copy. - * - * See Memory for further details. - */ - Image3D& operator = (const Image3D& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image3D& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - }; - -#if !defined(CL_VERSION_1_2) - /*! \brief Class interface for GL 3D Image Memory objects. - * - * This is provided to facilitate interoperability with OpenGL. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ - class Image3DGL : public Image3D - { - public: - /*! \brief Constructs an Image3DGL in a specified context, from a given - * GL Texture. - * - * Wraps clCreateFromGLTexture3D(). - */ - Image3DGL( - const Context& context, - cl_mem_flags flags, - GLenum target, - GLint miplevel, - GLuint texobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLTexture3D( - context(), - flags, - target, - miplevel, - texobj, - &error); - - detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - Image3DGL() : Image3D() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Image3DGL(const Image3DGL& image) : Image3D(image) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { } - - /*! \brief Assignment from Image3DGL - performs shallow copy. - * - * See Memory for further details. - */ - Image3DGL& operator = (const Image3DGL& rhs) - { - if (this != &rhs) { - Image3D::operator=(rhs); - } - return *this; - } - - /*! 
\brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image3DGL& operator = (const cl_mem& rhs) - { - Image3D::operator=(rhs); - return *this; - } - }; -#endif // #if !defined(CL_VERSION_1_2) - -#if defined(CL_VERSION_1_2) - /*! \class ImageGL - * \brief general image interface for GL interop. - * We abstract the 2D and 3D GL images into a single instance here - * that wraps all GL sourced images on the grounds that setup information - * was performed by OpenCL anyway. - */ - class ImageGL : public Image - { - public: - ImageGL( - const Context& context, - cl_mem_flags flags, - GLenum target, - GLint miplevel, - GLuint texobj, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateFromGLTexture( - context(), - flags, - target, - miplevel, - texobj, - &error); - - detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); - if (err != NULL) { - *err = error; - } - } - - ImageGL() : Image() { } - - ImageGL(const ImageGL& image) : Image(image) { } - - __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { } - - ImageGL& operator = (const ImageGL& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - ImageGL& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } - }; -#endif // #if defined(CL_VERSION_1_2) - - /*! \brief Class interface for cl_sampler. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_sampler as the original. For details, see - * clRetainSampler() and clReleaseSampler(). - * - * \see cl_sampler - */ - class Sampler : public detail::Wrapper - { - public: - /*! \brief Destructor. - * - * This calls clReleaseSampler() on the value held by this instance. - */ - ~Sampler() { } - - //! \brief Default constructor - initializes to NULL. - Sampler() { } - - /*! \brief Constructs a Sampler in a specified context. - * - * Wraps clCreateSampler(). 
- */ - Sampler( - const Context& context, - cl_bool normalized_coords, - cl_addressing_mode addressing_mode, - cl_filter_mode filter_mode, - cl_int* err = NULL) - { - cl_int error; - object_ = ::clCreateSampler( - context(), - normalized_coords, - addressing_mode, - filter_mode, - &error); - - detail::errHandler(error, __CREATE_SAMPLER_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Copy constructor - performs shallow copy. - * - * This calls clRetainSampler() on the parameter's cl_sampler. - */ - Sampler(const Sampler& sampler) : detail::Wrapper(sampler) { } - - /*! \brief Constructor from cl_sampler - takes ownership. - * - * This effectively transfers ownership of a refcount on the cl_sampler - * into the new Sampler object. - */ - Sampler(const cl_sampler& sampler) : detail::Wrapper(sampler) { } - - /*! \brief Assignment operator from Sampler. - * - * This calls clRetainSampler() on the parameter and clReleaseSampler() - * on the previous value held by this instance. - */ - Sampler& operator = (const Sampler& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_sampler - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseSampler() on the value previously held by this instance. - */ - Sampler& operator = (const cl_sampler& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetSamplerInfo(). - template - cl_int getInfo(cl_sampler_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetSamplerInfo, object_, name, param), - __GET_SAMPLER_INFO_ERR); - } - - //! \brief Wrapper for clGetSamplerInfo() that returns by value. 
- template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_sampler_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - }; - - class Program; - class CommandQueue; - class Kernel; - - //! \brief Class interface for specifying NDRange values. - class NDRange - { - private: - size_t<3> sizes_; - cl_uint dimensions_; - - public: - //! \brief Default constructor - resulting range has zero dimensions. - NDRange() - : dimensions_(0) - { } - - //! \brief Constructs one-dimensional range. - NDRange(::size_t size0) - : dimensions_(1) - { - sizes_[0] = size0; - } - - //! \brief Constructs two-dimensional range. - NDRange(::size_t size0, ::size_t size1) - : dimensions_(2) - { - sizes_[0] = size0; - sizes_[1] = size1; - } - - //! \brief Constructs three-dimensional range. - NDRange(::size_t size0, ::size_t size1, ::size_t size2) - : dimensions_(3) - { - sizes_[0] = size0; - sizes_[1] = size1; - sizes_[2] = size2; - } - - /*! \brief Conversion operator to const ::size_t *. - * - * \returns a pointer to the size of the first dimension. - */ - operator const ::size_t*() const { - return (const ::size_t*) sizes_; - } - - //! \brief Queries the number of dimensions in the range. - ::size_t dimensions() const { return dimensions_; } - }; - - //! \brief A zero-dimensional range. - static const NDRange NullRange; - - //! \brief Local address wrapper for use with Kernel::setArg - struct LocalSpaceArg - { - ::size_t size_; - }; - - namespace detail { - - template - struct KernelArgumentHandler - { - static ::size_t size(const T&) { return sizeof(T); } - static T* ptr(T& value) { return &value; } - }; - - template <> - struct KernelArgumentHandler - { - static ::size_t size(const LocalSpaceArg& value) { return value.size_; } - static void* ptr(LocalSpaceArg&) { return NULL; } - }; - - } - //! \endcond - - /*! 
__local - * \brief Helper function for generating LocalSpaceArg objects. - * Deprecated. Replaced with Local. - */ - inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg - __local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - inline LocalSpaceArg - __local(::size_t size) - { - LocalSpaceArg ret = { size }; - return ret; - } - - /*! Local - * \brief Helper function for generating LocalSpaceArg objects. - */ - inline LocalSpaceArg - Local(::size_t size) - { - LocalSpaceArg ret = { size }; - return ret; - } - - //class KernelFunctor; - - /*! \brief Class interface for cl_kernel. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_kernel as the original. For details, see - * clRetainKernel() and clReleaseKernel(). - * - * \see cl_kernel - */ - class Kernel : public detail::Wrapper - { - public: - inline Kernel(const Program& program, const char* name, cl_int* err = NULL); - - /*! \brief Destructor. - * - * This calls clReleaseKernel() on the value held by this instance. - */ - ~Kernel() { } - - //! \brief Default constructor - initializes to NULL. - Kernel() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * This calls clRetainKernel() on the parameter's cl_kernel. - */ - Kernel(const Kernel& kernel) : detail::Wrapper(kernel) { } - - /*! \brief Constructor from cl_kernel - takes ownership. - * - * This effectively transfers ownership of a refcount on the cl_kernel - * into the new Kernel object. - */ - __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper(kernel) { } - - /*! \brief Assignment operator from Kernel. - * - * This calls clRetainKernel() on the parameter and clReleaseKernel() - * on the previous value held by this instance. - */ - Kernel& operator = (const Kernel& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_kernel - takes ownership. 
- * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseKernel() on the value previously held by this instance. - */ - Kernel& operator = (const cl_kernel& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - template - cl_int getInfo(cl_kernel_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetKernelInfo, object_, name, param), - __GET_KERNEL_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_kernel_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - -#if defined(CL_VERSION_1_2) - template - cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param), - __GET_KERNEL_ARG_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getArgInfo(cl_uint argIndex, cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_kernel_arg_info, name>::param_type param; - cl_int result = getArgInfo(argIndex, name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } -#endif // #if defined(CL_VERSION_1_2) - - template - cl_int getWorkGroupInfo( - const Device& device, cl_kernel_work_group_info name, T* param) const - { - return detail::errHandler( - detail::getInfo( - &::clGetKernelWorkGroupInfo, object_, device(), name, param), - __GET_KERNEL_WORK_GROUP_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getWorkGroupInfo(const Device& device, cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_kernel_work_group_info, name>::param_type param; - cl_int result = getWorkGroupInfo(device, name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - template - cl_int setArg(cl_uint index, 
T value) - { - return detail::errHandler( - ::clSetKernelArg( - object_, - index, - detail::KernelArgumentHandler::size(value), - detail::KernelArgumentHandler::ptr(value)), - __SET_KERNEL_ARGS_ERR); - } - - cl_int setArg(cl_uint index, ::size_t size, void* argPtr) - { - return detail::errHandler( - ::clSetKernelArg(object_, index, size, argPtr), - __SET_KERNEL_ARGS_ERR); - } - }; - - /*! \class Program - * \brief Program interface that implements cl_program. - */ - class Program : public detail::Wrapper - { - public: - typedef VECTOR_CLASS > Binaries; - typedef VECTOR_CLASS > Sources; - - Program( - const STRING_CLASS& source, - cl_int* err = NULL) - { - cl_int error; - - const char * strings = source.c_str(); - const ::size_t length = source.size(); - - Context context = Context::getDefault(err); - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)1, &strings, &length, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - - if (error == CL_SUCCESS) { - - error = ::clBuildProgram( - object_, - 0, - NULL, - "", - NULL, - NULL); - - detail::errHandler(error, __BUILD_PROGRAM_ERR); - } - - if (err != NULL) { - *err = error; - } - } - - Program( - const STRING_CLASS& source, - bool build, - cl_int* err = NULL) - { - cl_int error; - - const char * strings = source.c_str(); - const ::size_t length = source.size(); - - Context context = Context::getDefault(err); - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)1, &strings, &length, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - - if (error == CL_SUCCESS && build) { - - error = ::clBuildProgram( - object_, - 0, - NULL, - "", - NULL, - NULL); - - detail::errHandler(error, __BUILD_PROGRAM_ERR); - } - - if (err != NULL) { - *err = error; - } - } - - Program( - const Context& context, - const STRING_CLASS& source, - bool build = false, - cl_int* err = NULL) - { - cl_int error; - - const char * strings = source.c_str(); - const ::size_t length = 
source.size(); - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)1, &strings, &length, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - - if (error == CL_SUCCESS && build) { - - error = ::clBuildProgram( - object_, - 0, - NULL, - "", - NULL, - NULL); - - detail::errHandler(error, __BUILD_PROGRAM_ERR); - } - - if (err != NULL) { - *err = error; - } - } - - Program( - const Context& context, - const Sources& sources, - cl_int* err = NULL) - { - cl_int error; - - const ::size_t n = (::size_t)sources.size(); - ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t)); - const char** strings = (const char**)alloca(n * sizeof(const char*)); - - for (::size_t i = 0; i < n; ++i) { - strings[i] = sources[(int)i].first; - lengths[i] = sources[(int)i].second; - } - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)n, strings, lengths, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - if (err != NULL) { - *err = error; - } - } - - /** - * Construct a program object from a list of devices and a per-device list of binaries. - * \param context A valid OpenCL context in which to construct the program. - * \param devices A vector of OpenCL device objects for which the program will be created. - * \param binaries A vector of pairs of a pointer to a binary object and its length. - * \param binaryStatus An optional vector that on completion will be resized to - * match the size of binaries and filled with values to specify if each binary - * was successfully loaded. - * Set to CL_SUCCESS if the binary was successfully loaded. - * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL. - * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device. - * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors: - * CL_INVALID_CONTEXT if context is not a valid context. 
- * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; - * or if any entry in binaries is NULL or has length 0. - * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context. - * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device. - * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host. - */ - Program( - const Context& context, - const VECTOR_CLASS& devices, - const Binaries& binaries, - VECTOR_CLASS* binaryStatus = NULL, - cl_int* err = NULL) - { - cl_int error; - - const ::size_t numDevices = devices.size(); - - // Catch size mismatch early and return - if (binaries.size() != numDevices) { - error = CL_INVALID_VALUE; - detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); - if (err != NULL) { - *err = error; - } - return; - } - - ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t)); - const unsigned char** images = (const unsigned char**)alloca(numDevices * sizeof(const unsigned char**)); - - for (::size_t i = 0; i < numDevices; ++i) { - images[i] = (const unsigned char*)binaries[i].first; - lengths[i] = binaries[(int)i].second; - } - - cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id)); - for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - if (binaryStatus) { - binaryStatus->resize(numDevices); - } - - object_ = ::clCreateProgramWithBinary( - context(), (cl_uint)devices.size(), - deviceIDs, - lengths, images, binaryStatus != NULL - ? &binaryStatus->front() - : NULL, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); - if (err != NULL) { - *err = error; - } - } - - -#if defined(CL_VERSION_1_2) - /** - * Create program using builtin kernels. 
- * \param kernelNames Semi-colon separated list of builtin kernel names - */ - Program( - const Context& context, - const VECTOR_CLASS& devices, - const STRING_CLASS& kernelNames, - cl_int* err = NULL) - { - cl_int error; - - - ::size_t numDevices = devices.size(); - cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id)); - for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - object_ = ::clCreateProgramWithBuiltInKernels( - context(), - (cl_uint)devices.size(), - deviceIDs, - kernelNames.c_str(), - &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // #if defined(CL_VERSION_1_2) - - Program() { } - - Program(const Program& program) : detail::Wrapper(program) { } - - __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper(program) { } - - Program& operator = (const Program& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - Program& operator = (const cl_program& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - cl_int build( - const VECTOR_CLASS& devices, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - ::size_t numDevices = devices.size(); - cl_device_id* deviceIDs = (cl_device_id*)alloca(numDevices * sizeof(cl_device_id)); - for (::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - return detail::errHandler( - ::clBuildProgram( - object_, - (cl_uint) - devices.size(), - deviceIDs, - options, - notifyFptr, - data), - __BUILD_PROGRAM_ERR); - } - - cl_int build( - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - return detail::errHandler( - ::clBuildProgram( - 
object_, - 0, - NULL, - options, - notifyFptr, - data), - __BUILD_PROGRAM_ERR); - } - -#if defined(CL_VERSION_1_2) - cl_int compile( - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - return detail::errHandler( - ::clCompileProgram( - object_, - 0, - NULL, - options, - 0, - NULL, - NULL, - notifyFptr, - data), - __COMPILE_PROGRAM_ERR); - } -#endif - - template - cl_int getInfo(cl_program_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetProgramInfo, object_, name, param), - __GET_PROGRAM_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_program_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - template - cl_int getBuildInfo( - const Device& device, cl_program_build_info name, T* param) const - { - return detail::errHandler( - detail::getInfo( - &::clGetProgramBuildInfo, object_, device(), name, param), - __GET_PROGRAM_BUILD_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getBuildInfo(const Device& device, cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_program_build_info, name>::param_type param; - cl_int result = getBuildInfo(device, name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - cl_int createKernels(VECTOR_CLASS* kernels) - { - cl_uint numKernels; - cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); - } - - Kernel* value = (Kernel*)alloca(numKernels * sizeof(Kernel)); - err = ::clCreateKernelsInProgram( - object_, numKernels, (cl_kernel*)value, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); - } - - 
kernels->assign(&value[0], &value[numKernels]); - return CL_SUCCESS; - } - }; - -#if defined(CL_VERSION_1_2) - inline Program linkProgram( - Program input1, - Program input2, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int err_local = CL_SUCCESS; - - cl_program programs[2] = { input1(), input2() }; - - Context ctx = input1.getInfo(); - - cl_program prog = ::clLinkProgram( - ctx(), - 0, - NULL, - options, - 2, - programs, - notifyFptr, - data, - &err_local); - - detail::errHandler(err_local, __COMPILE_PROGRAM_ERR); - if (err != NULL) { - *err = err_local; - } - - return Program(prog); - } - - inline Program linkProgram( - VECTOR_CLASS inputPrograms, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int err_local = CL_SUCCESS; - - cl_program * programs = (cl_program*)alloca(inputPrograms.size() * sizeof(cl_program)); - - if (programs != NULL) { - for (unsigned int i = 0; i < inputPrograms.size(); i++) { - programs[i] = inputPrograms[i](); - } - } - - cl_program prog = ::clLinkProgram( - Context::getDefault()(), - 0, - NULL, - options, - (cl_uint)inputPrograms.size(), - programs, - notifyFptr, - data, - &err_local); - - detail::errHandler(err_local, __COMPILE_PROGRAM_ERR); - if (err != NULL) { - *err = err_local; - } - - return Program(prog); - } -#endif - - template<> - inline VECTOR_CLASS cl::Program::getInfo(cl_int* err) const - { - VECTOR_CLASS< ::size_t> sizes = getInfo(); - VECTOR_CLASS binaries; - for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s) - { - char *ptr = NULL; - if (*s != 0) - ptr = new char[*s]; - binaries.push_back(ptr); - } - - cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries); - if (err != NULL) { - *err = result; - } - return binaries; - } - - inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) 
- { - cl_int error; - - object_ = ::clCreateKernel(program(), name, &error); - detail::errHandler(error, __CREATE_KERNEL_ERR); - - if (err != NULL) { - *err = error; - } - - } - - /*! \class CommandQueue - * \brief CommandQueue interface for cl_command_queue. - */ - class CommandQueue : public detail::Wrapper - { - private: - static volatile int default_initialized_; - static CommandQueue default_; - static volatile cl_int default_error_; - public: - CommandQueue( - cl_command_queue_properties properties, - cl_int* err = NULL) - { - cl_int error; - - Context context = Context::getDefault(&error); - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - - if (error != CL_SUCCESS) { - if (err != NULL) { - *err = error; - } - } - else { - Device device = context.getInfo()[0]; - - object_ = ::clCreateCommandQueue( - context(), device(), properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } - } - - CommandQueue( - const Context& context, - const Device& device, - cl_command_queue_properties properties = 0, - cl_int* err = NULL) - { - cl_int error; - object_ = ::clCreateCommandQueue( - context(), device(), properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } - - static CommandQueue getDefault(cl_int * err = NULL) - { - int state = detail::compare_exchange( - &default_initialized_, - __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); - - if (state & __DEFAULT_INITIALIZED) { - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - if (state & __DEFAULT_BEING_INITIALIZED) { - // Assume writes will propagate eventually... 
- while (default_initialized_ != __DEFAULT_INITIALIZED) { - detail::fence(); - } - - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - cl_int error; - - Context context = Context::getDefault(&error); - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - - if (error != CL_SUCCESS) { - if (err != NULL) { - *err = error; - } - } - else { - Device device = context.getInfo()[0]; - - default_ = CommandQueue(context, device, 0, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } - - detail::fence(); - - default_error_ = error; - // Assume writes will propagate eventually... - default_initialized_ = __DEFAULT_INITIALIZED; - - detail::fence(); - - if (err != NULL) { - *err = default_error_; - } - return default_; - - } - - CommandQueue() { } - - CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper(commandQueue) { } - - CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper(commandQueue) { } - - CommandQueue& operator = (const CommandQueue& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - CommandQueue& operator = (const cl_command_queue& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - template - cl_int getInfo(cl_command_queue_info name, T* param) const - { - return detail::errHandler( - detail::getInfo( - &::clGetCommandQueueInfo, object_, name, param), - __GET_COMMAND_QUEUE_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_command_queue_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - cl_int enqueueReadBuffer( - const Buffer& buffer, - cl_bool blocking, - ::size_t offset, - ::size_t size, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = 
detail::errHandler( - ::clEnqueueReadBuffer( - object_, buffer(), blocking, offset, size, - ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_READ_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueWriteBuffer( - const Buffer& buffer, - cl_bool blocking, - ::size_t offset, - ::size_t size, - const void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueWriteBuffer( - object_, buffer(), blocking, offset, size, - ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_WRITE_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyBuffer( - const Buffer& src, - const Buffer& dst, - ::size_t src_offset, - ::size_t dst_offset, - ::size_t size, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyBuffer( - object_, src(), dst(), src_offset, dst_offset, size, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQEUE_COPY_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueReadBufferRect( - const Buffer& buffer, - cl_bool blocking, - const size_t<3>& buffer_offset, - const size_t<3>& host_offset, - const size_t<3>& region, - ::size_t buffer_row_pitch, - ::size_t buffer_slice_pitch, - ::size_t host_row_pitch, - ::size_t host_slice_pitch, - void *ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueReadBufferRect( - object_, - buffer(), - blocking, - (const ::size_t *)buffer_offset, - (const ::size_t *)host_offset, - (const ::size_t *)region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_READ_BUFFER_RECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueWriteBufferRect( - const Buffer& buffer, - cl_bool blocking, - const size_t<3>& buffer_offset, - const size_t<3>& host_offset, - const size_t<3>& region, - ::size_t buffer_row_pitch, - ::size_t buffer_slice_pitch, - ::size_t host_row_pitch, - ::size_t host_slice_pitch, - void *ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueWriteBufferRect( - object_, - buffer(), - blocking, - (const ::size_t *)buffer_offset, - (const ::size_t *)host_offset, - (const ::size_t *)region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_WRITE_BUFFER_RECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyBufferRect( - const Buffer& src, - const Buffer& dst, - const size_t<3>& src_origin, - const size_t<3>& dst_origin, - const size_t<3>& region, - ::size_t src_row_pitch, - ::size_t src_slice_pitch, - ::size_t dst_row_pitch, - ::size_t dst_slice_pitch, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyBufferRect( - object_, - src(), - dst(), - (const ::size_t *)src_origin, - (const ::size_t *)dst_origin, - (const ::size_t *)region, - src_row_pitch, - src_slice_pitch, - dst_row_pitch, - dst_slice_pitch, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQEUE_COPY_BUFFER_RECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if defined(CL_VERSION_1_2) - /** - * Enqueue a command to fill a buffer object with a pattern - * of a given size. The pattern is specified a as vector. - * \tparam PatternType The datatype of the pattern field. - * The pattern type must be an accepted OpenCL data type. - */ - template - cl_int enqueueFillBuffer( - const Buffer& buffer, - PatternType pattern, - ::size_t offset, - ::size_t size, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillBuffer( - object_, - buffer(), - static_cast(&pattern), - sizeof(PatternType), - offset, - size, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_FILL_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // #if defined(CL_VERSION_1_2) - - cl_int enqueueReadImage( - const Image& image, - cl_bool blocking, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t row_pitch, - ::size_t slice_pitch, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueReadImage( - object_, image(), blocking, (const ::size_t *) origin, - (const ::size_t *) region, row_pitch, slice_pitch, ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_READ_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueWriteImage( - const Image& image, - cl_bool blocking, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t row_pitch, - ::size_t slice_pitch, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueWriteImage( - object_, image(), blocking, (const ::size_t *) origin, - (const ::size_t *) region, row_pitch, slice_pitch, ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_WRITE_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyImage( - const Image& src, - const Image& dst, - const size_t<3>& src_origin, - const size_t<3>& dst_origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyImage( - object_, src(), dst(), (const ::size_t *) src_origin, - (const ::size_t *)dst_origin, (const ::size_t *) region, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_COPY_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if defined(CL_VERSION_1_2) - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. - * This is a four component RGBA floating-point color value if - * the image channel data type is not an unnormalized signed or - * unsigned data type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_float4 fillColor, - const size_t<3>& origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - (const ::size_t *) origin, - (const ::size_t *) region, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. 
- * This is a four component RGBA signed integer color value if - * the image channel data type is an unnormalized signed integer - * type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_int4 fillColor, - const size_t<3>& origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - (const ::size_t *) origin, - (const ::size_t *) region, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. - * This is a four component RGBA unsigned integer color value if - * the image channel data type is an unnormalized unsigned integer - * type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_uint4 fillColor, - const size_t<3>& origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - (const ::size_t *) origin, - (const ::size_t *) region, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // #if defined(CL_VERSION_1_2) - - cl_int enqueueCopyImageToBuffer( - const Image& src, - const Buffer& dst, - const size_t<3>& src_origin, - const size_t<3>& region, - ::size_t dst_offset, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyImageToBuffer( - object_, src(), dst(), (const ::size_t *) src_origin, - (const ::size_t *) region, dst_offset, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyBufferToImage( - const Buffer& src, - const Image& dst, - ::size_t src_offset, - const size_t<3>& dst_origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyBufferToImage( - object_, src(), dst(), src_offset, - (const ::size_t *) dst_origin, (const ::size_t *) region, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - void* enqueueMapBuffer( - const Buffer& buffer, - cl_bool blocking, - cl_map_flags flags, - ::size_t offset, - ::size_t size, - const VECTOR_CLASS* events = NULL, - Event* event = NULL, - cl_int* err = NULL) const - { - cl_int error; - void * result = ::clEnqueueMapBuffer( - object_, buffer(), blocking, flags, offset, size, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? 
(cl_event*)&events->front() : NULL, - (cl_event*)event, - &error); - - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - return result; - } - - void* enqueueMapImage( - const Image& buffer, - cl_bool blocking, - cl_map_flags flags, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t * row_pitch, - ::size_t * slice_pitch, - const VECTOR_CLASS* events = NULL, - Event* event = NULL, - cl_int* err = NULL) const - { - cl_int error; - void * result = ::clEnqueueMapImage( - object_, buffer(), blocking, flags, - (const ::size_t *) origin, (const ::size_t *) region, - row_pitch, slice_pitch, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (cl_event*)event, - &error); - - detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - return result; - } - - cl_int enqueueUnmapMemObject( - const Memory& memory, - void* mapped_ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueUnmapMemObject( - object_, memory(), mapped_ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if defined(CL_VERSION_1_2) - /** - * Enqueues a marker command which waits for either a list of events to complete, - * or all previously enqueued commands to complete. - * - * Enqueues a marker command which waits for either a list of events to complete, - * or if the list is empty it waits for all commands previously enqueued in command_queue - * to complete before it completes. This command returns an event which can be waited on, - * i.e. 
this event can be waited on to insure that all events either in the event_wait_list - * or all previously enqueued commands, queued before this command to command_queue, - * have completed. - */ - cl_int enqueueMarkerWithWaitList( - const VECTOR_CLASS *events = 0, - Event *event = 0) - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueMarkerWithWaitList( - object_, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_MARKER_WAIT_LIST_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * A synchronization point that enqueues a barrier operation. - * - * Enqueues a barrier command which waits for either a list of events to complete, - * or if the list is empty it waits for all commands previously enqueued in command_queue - * to complete before it completes. This command blocks command execution, that is, any - * following commands enqueued after it do not execute until it completes. This command - * returns an event which can be waited on, i.e. this event can be waited on to insure that - * all events either in the event_wait_list or all previously enqueued commands, queued - * before this command to command_queue, have completed. - */ - cl_int enqueueBarrierWithWaitList( - const VECTOR_CLASS *events = 0, - Event *event = 0) - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueBarrierWithWaitList( - object_, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_BARRIER_WAIT_LIST_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueues a command to indicate with which device a set of memory objects - * should be associated. 
- */ - cl_int enqueueMigrateMemObjects( - const VECTOR_CLASS &memObjects, - cl_mem_migration_flags flags, - const VECTOR_CLASS* events = NULL, - Event* event = NULL - ) - { - cl_event tmp; - - cl_mem* localMemObjects = static_cast(alloca(memObjects.size() * sizeof(cl_mem))); - for (int i = 0; i < (int)memObjects.size(); ++i) { - localMemObjects[i] = memObjects[i](); - } - - - cl_int err = detail::errHandler( - ::clEnqueueMigrateMemObjects( - object_, - (cl_uint)memObjects.size(), - static_cast(localMemObjects), - flags, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // #if defined(CL_VERSION_1_2) - - cl_int enqueueNDRangeKernel( - const Kernel& kernel, - const NDRange& offset, - const NDRange& global, - const NDRange& local = NullRange, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueNDRangeKernel( - object_, kernel(), (cl_uint)global.dimensions(), - offset.dimensions() != 0 ? (const ::size_t*) offset : NULL, - (const ::size_t*) global, - local.dimensions() != 0 ? (const ::size_t*) local : NULL, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_NDRANGE_KERNEL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueTask( - const Kernel& kernel, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueTask( - object_, kernel(), - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_TASK_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueNativeKernel( - void (CL_CALLBACK *userFptr)(void *), - std::pair args, - const VECTOR_CLASS* mem_objects = NULL, - const VECTOR_CLASS* mem_locs = NULL, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) - ? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem)) - : NULL; - - if (mems != NULL) { - for (unsigned int i = 0; i < mem_objects->size(); i++) { - mems[i] = ((*mem_objects)[i])(); - } - } - - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueNativeKernel( - object_, userFptr, args.first, args.second, - (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0, - mems, - (mem_locs != NULL) ? (const void **)&mem_locs->front() : NULL, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_NATIVE_KERNEL); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - { - return detail::errHandler( - ::clEnqueueMarker(object_, (cl_event*)event), - __ENQUEUE_MARKER_ERR); - } - - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueWaitForEvents(const VECTOR_CLASS& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - { - return detail::errHandler( - ::clEnqueueWaitForEvents( - object_, - (cl_uint)events.size(), - (const cl_event*)&events.front()), - __ENQUEUE_WAIT_FOR_EVENTS_ERR); - } -#endif // #if defined(CL_VERSION_1_1) - - cl_int enqueueAcquireGLObjects( - const VECTOR_CLASS* mem_objects = NULL, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueAcquireGLObjects( - object_, - (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0, - (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_ACQUIRE_GL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueReleaseGLObjects( - const VECTOR_CLASS* mem_objects = NULL, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueReleaseGLObjects( - object_, - (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0, - (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? 
(cl_event*)&events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_RELEASE_GL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if defined (USE_DX_INTEROP) - typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem* mem_objects, cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, cl_event* event); - typedef CL_API_ENTRY cl_int(CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( - cl_command_queue command_queue, cl_uint num_objects, - const cl_mem* mem_objects, cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, cl_event* event); - - cl_int enqueueAcquireD3D10Objects( - const VECTOR_CLASS* mem_objects = NULL, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL; -#if defined(CL_VERSION_1_2) - cl_context context = getInfo(); - cl::Device device(getInfo()); - cl_platform_id platform = device.getInfo(); - __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR); -#endif -#if defined(CL_VERSION_1_1) - __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR); -#endif - - cl_event tmp; - cl_int err = detail::errHandler( - pfn_clEnqueueAcquireD3D10ObjectsKHR( - object_, - (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0, - (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_ACQUIRE_GL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueReleaseD3D10Objects( - const VECTOR_CLASS* mem_objects = NULL, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL; -#if defined(CL_VERSION_1_2) - cl_context context = getInfo(); - cl::Device device(getInfo()); - cl_platform_id platform = device.getInfo(); - __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR); -#endif // #if defined(CL_VERSION_1_2) -#if defined(CL_VERSION_1_1) - __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR); -#endif // #if defined(CL_VERSION_1_1) - - cl_event tmp; - cl_int err = detail::errHandler( - pfn_clEnqueueReleaseD3D10ObjectsKHR( - object_, - (mem_objects != NULL) ? (cl_uint)mem_objects->size() : 0, - (mem_objects != NULL) ? (const cl_mem *)&mem_objects->front() : NULL, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_RELEASE_GL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif - - /** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - { - return detail::errHandler( - ::clEnqueueBarrier(object_), - __ENQUEUE_BARRIER_ERR); - } -#endif // #if defined(CL_VERSION_1_1) - - cl_int flush() const - { - return detail::errHandler(::clFlush(object_), __FLUSH_ERR); - } - - cl_int finish() const - { - return detail::errHandler(::clFinish(object_), __FINISH_ERR); - } - }; - -#ifdef _WIN32 - __declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; - __declspec(selectany) CommandQueue CommandQueue::default_; - __declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; -#else - __attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; - __attribute__((weak)) CommandQueue CommandQueue::default_; - __attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; -#endif - - inline cl_int enqueueReadBuffer( - const Buffer& buffer, - cl_bool blocking, - ::size_t offset, - ::size_t size, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event); - } - - inline cl_int enqueueWriteBuffer( - const Buffer& buffer, - cl_bool blocking, - ::size_t offset, - ::size_t size, - const void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return 
queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event); - } - - inline void* enqueueMapBuffer( - const Buffer& buffer, - cl_bool blocking, - cl_map_flags flags, - ::size_t offset, - ::size_t size, - const VECTOR_CLASS* events = NULL, - Event* event = NULL, - cl_int* err = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - void * result = ::clEnqueueMapBuffer( - queue(), buffer(), blocking, flags, offset, size, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (cl_event*)event, - &error); - - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - return result; - } - - inline cl_int enqueueUnmapMemObject( - const Memory& memory, - void* mapped_ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (error != CL_SUCCESS) { - return error; - } - - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueUnmapMemObject( - queue(), memory(), mapped_ptr, - (events != NULL) ? (cl_uint)events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - inline cl_int enqueueCopyBuffer( - const Buffer& src, - const Buffer& dst, - ::size_t src_offset, - ::size_t dst_offset, - ::size_t size, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event); - } - - /** - * Blocking copy operation between iterators and a buffer. - */ - template< typename IteratorType > - inline cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer) - { - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - ::size_t length = endIterator - startIterator; - ::size_t byteLength = length*sizeof(DataType); - - DataType *pointer = - static_cast(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error)); - // if exceptions enabled, enqueueMapBuffer will throw - if (error != CL_SUCCESS) { - return error; - } -#if defined(_MSC_VER) - std::copy( - startIterator, - endIterator, - stdext::checked_array_iterator( - pointer, length)); -#else - std::copy(startIterator, endIterator, pointer); -#endif - Event endEvent; - error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); - // if exceptions enabled, enqueueUnmapMemObject will throw - if (error != CL_SUCCESS) { - return error; - } - endEvent.wait(); - return CL_SUCCESS; - } - - /** - * Blocking copy operation between iterators and a buffer. 
- */ - template< typename IteratorType > - inline cl_int copy(const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator) - { - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - ::size_t length = endIterator - startIterator; - ::size_t byteLength = length*sizeof(DataType); - - DataType *pointer = - static_cast(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error)); - // if exceptions enabled, enqueueMapBuffer will throw - if (error != CL_SUCCESS) { - return error; - } - std::copy(pointer, pointer + length, startIterator); - Event endEvent; - error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); - // if exceptions enabled, enqueueUnmapMemObject will throw - if (error != CL_SUCCESS) { - return error; - } - endEvent.wait(); - return CL_SUCCESS; - } - -#if defined(CL_VERSION_1_1) - inline cl_int enqueueReadBufferRect( - const Buffer& buffer, - cl_bool blocking, - const size_t<3>& buffer_offset, - const size_t<3>& host_offset, - const size_t<3>& region, - ::size_t buffer_row_pitch, - ::size_t buffer_slice_pitch, - ::size_t host_row_pitch, - ::size_t host_slice_pitch, - void *ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueReadBufferRect( - buffer, - blocking, - buffer_offset, - host_offset, - region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - events, - event); - } - - inline cl_int enqueueWriteBufferRect( - const Buffer& buffer, - cl_bool blocking, - const size_t<3>& buffer_offset, - const size_t<3>& host_offset, - const size_t<3>& region, - ::size_t buffer_row_pitch, - ::size_t buffer_slice_pitch, - ::size_t host_row_pitch, - ::size_t host_slice_pitch, - void *ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = 
CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueWriteBufferRect( - buffer, - blocking, - buffer_offset, - host_offset, - region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - events, - event); - } - - inline cl_int enqueueCopyBufferRect( - const Buffer& src, - const Buffer& dst, - const size_t<3>& src_origin, - const size_t<3>& dst_origin, - const size_t<3>& region, - ::size_t src_row_pitch, - ::size_t src_slice_pitch, - ::size_t dst_row_pitch, - ::size_t dst_slice_pitch, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyBufferRect( - src, - dst, - src_origin, - dst_origin, - region, - src_row_pitch, - src_slice_pitch, - dst_row_pitch, - dst_slice_pitch, - events, - event); - } -#endif - - inline cl_int enqueueReadImage( - const Image& image, - cl_bool blocking, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t row_pitch, - ::size_t slice_pitch, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueReadImage( - image, - blocking, - origin, - region, - row_pitch, - slice_pitch, - ptr, - events, - event); - } - - inline cl_int enqueueWriteImage( - const Image& image, - cl_bool blocking, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t row_pitch, - ::size_t slice_pitch, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueWriteImage( - image, - blocking, - origin, - region, - row_pitch, - slice_pitch, - ptr, - events, - event); - } - - 
inline cl_int enqueueCopyImage( - const Image& src, - const Image& dst, - const size_t<3>& src_origin, - const size_t<3>& dst_origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyImage( - src, - dst, - src_origin, - dst_origin, - region, - events, - event); - } - - inline cl_int enqueueCopyImageToBuffer( - const Image& src, - const Buffer& dst, - const size_t<3>& src_origin, - const size_t<3>& region, - ::size_t dst_offset, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyImageToBuffer( - src, - dst, - src_origin, - region, - dst_offset, - events, - event); - } - - inline cl_int enqueueCopyBufferToImage( - const Buffer& src, - const Image& dst, - ::size_t src_offset, - const size_t<3>& dst_origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyBufferToImage( - src, - dst, - src_offset, - dst_origin, - region, - events, - event); - } - - - inline cl_int flush(void) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.flush(); - } - - inline cl_int finish(void) - { - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - - return queue.finish(); - } - - // Kernel Functor support - // New interface as of September 2011 - // Requires the C++11 std::tr1::function (note do not support TR1) - // Visual Studio 2010 and GCC 4.2 - - struct EnqueueArgs - { - 
CommandQueue queue_; - const NDRange offset_; - const NDRange global_; - const NDRange local_; - VECTOR_CLASS events_; - - EnqueueArgs(NDRange global) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(NullRange) - { - - } - - EnqueueArgs(NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(local) - { - - } - - EnqueueArgs(NDRange offset, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(offset), - global_(global), - local_(local) - { - - } - - EnqueueArgs(Event e, NDRange global) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(NullRange) - { - events_.push_back(e); - } - - EnqueueArgs(Event e, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(offset), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(const VECTOR_CLASS &events, NDRange global) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(NullRange), - events_(events) - { - - } - - EnqueueArgs(const VECTOR_CLASS &events, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(local), - events_(events) - { - - } - - EnqueueArgs(const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(offset), - global_(global), - local_(local), - events_(events) - { - - } - - EnqueueArgs(CommandQueue &queue, NDRange global) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(NullRange) - { - - } - - EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : - queue_(queue), - offset_(NullRange), - 
global_(global), - local_(local) - { - - } - - EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : - queue_(queue), - offset_(offset), - global_(global), - local_(local) - { - - } - - EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(NullRange) - { - events_.push_back(e); - } - - EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : - queue_(queue), - offset_(offset), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(NullRange), - events_(events) - { - - } - - EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global, NDRange local) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(local), - events_(events) - { - - } - - EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : - queue_(queue), - offset_(offset), - global_(global), - local_(local), - events_(events) - { - - } - }; - - namespace detail { - - class NullType {}; - - template - struct SetArg - { - static void set(Kernel kernel, T0 arg) - { - kernel.setArg(index, arg); - } - }; - - template - struct SetArg - { - static void set(Kernel, NullType) - { - } - }; - - template < - typename T0, typename T1, typename T2, typename T3, - typename T4, typename T5, typename T6, typename T7, - typename T8, typename T9, typename T10, typename T11, - typename T12, typename T13, typename T14, typename T15, - typename T16, typename T17, typename T18, typename T19, - typename T20, typename T21, typename T22, typename T23, - typename T24, typename 
T25, typename T26, typename T27, - typename T28, typename T29, typename T30, typename T31 - > - class KernelFunctorGlobal - { - private: - Kernel kernel_; - - public: - KernelFunctorGlobal( - Kernel kernel) : - kernel_(kernel) - {} - - KernelFunctorGlobal( - const Program& program, - const STRING_CLASS name, - cl_int * err = NULL) : - kernel_(program, name.c_str(), err) - {} - - Event operator() ( - const EnqueueArgs& args, - T0 t0, - T1 t1 = NullType(), - T2 t2 = NullType(), - T3 t3 = NullType(), - T4 t4 = NullType(), - T5 t5 = NullType(), - T6 t6 = NullType(), - T7 t7 = NullType(), - T8 t8 = NullType(), - T9 t9 = NullType(), - T10 t10 = NullType(), - T11 t11 = NullType(), - T12 t12 = NullType(), - T13 t13 = NullType(), - T14 t14 = NullType(), - T15 t15 = NullType(), - T16 t16 = NullType(), - T17 t17 = NullType(), - T18 t18 = NullType(), - T19 t19 = NullType(), - T20 t20 = NullType(), - T21 t21 = NullType(), - T22 t22 = NullType(), - T23 t23 = NullType(), - T24 t24 = NullType(), - T25 t25 = NullType(), - T26 t26 = NullType(), - T27 t27 = NullType(), - T28 t28 = NullType(), - T29 t29 = NullType(), - T30 t30 = NullType(), - T31 t31 = NullType() - ) - { - Event event; - SetArg<0, T0>::set(kernel_, t0); - SetArg<1, T1>::set(kernel_, t1); - SetArg<2, T2>::set(kernel_, t2); - SetArg<3, T3>::set(kernel_, t3); - SetArg<4, T4>::set(kernel_, t4); - SetArg<5, T5>::set(kernel_, t5); - SetArg<6, T6>::set(kernel_, t6); - SetArg<7, T7>::set(kernel_, t7); - SetArg<8, T8>::set(kernel_, t8); - SetArg<9, T9>::set(kernel_, t9); - SetArg<10, T10>::set(kernel_, t10); - SetArg<11, T11>::set(kernel_, t11); - SetArg<12, T12>::set(kernel_, t12); - SetArg<13, T13>::set(kernel_, t13); - SetArg<14, T14>::set(kernel_, t14); - SetArg<15, T15>::set(kernel_, t15); - SetArg<16, T16>::set(kernel_, t16); - SetArg<17, T17>::set(kernel_, t17); - SetArg<18, T18>::set(kernel_, t18); - SetArg<19, T19>::set(kernel_, t19); - SetArg<20, T20>::set(kernel_, t20); - SetArg<21, T21>::set(kernel_, t21); - 
SetArg<22, T22>::set(kernel_, t22); - SetArg<23, T23>::set(kernel_, t23); - SetArg<24, T24>::set(kernel_, t24); - SetArg<25, T25>::set(kernel_, t25); - SetArg<26, T26>::set(kernel_, t26); - SetArg<27, T27>::set(kernel_, t27); - SetArg<28, T28>::set(kernel_, t28); - SetArg<29, T29>::set(kernel_, t29); - SetArg<30, T30>::set(kernel_, t30); - SetArg<31, T31>::set(kernel_, t31); - - args.queue_.enqueueNDRangeKernel( - kernel_, - args.offset_, - args.global_, - args.local_, - &args.events_, - &event); - - return event; - } - - }; - - //------------------------------------------------------------------------------------------------------ - - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26, - typename T27, - typename T28, - typename T29, - typename T30, - typename T31> - struct functionImplementation_ - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30, - T31> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! 
\brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30, - T31); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27, - T28 arg28, - T29 arg29, - T30 arg30, - T31 arg31) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27, - arg28, - arg29, - arg30, - arg31); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26, - typename T27, - typename T28, - typename T29, - typename T30> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - 
T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27, - T28 arg28, - T29 arg29, - T30 arg30) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27, - arg28, - arg29, - arg30); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - 
typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26, - typename T27, - typename T28, - typename T29> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27, - T28 arg28, - T29 arg29) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27, - arg28, - arg29); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26, - typename T27, - typename T28> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - NullType, - NullType, - NullType> 
FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27, - T28 arg28) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27, - arg28); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename 
T26, - typename T27> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const 
FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - 
T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && 
defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef 
detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - 
typename T18, - typename T19, - typename T20, - typename T21, - typename T22> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22)) - // Fail variadic expansion for dev11 - static_assert(0, 
"Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - NullType, - NullType, - NullType, - NullType, - NullType, - 
NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, 
- NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18> - struct functionImplementation_ - 
< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. 
If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 
17)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - 
functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - 
NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, 
- NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - NullType, - NullType, - NullType, - 
NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - NullType, - NullType, - NullType, - 
NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - 
NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - 
NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - 
NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - 
functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - T6, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7)) - // Fail variadic expansion for dev11 - static_assert(0, 
"Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - T5, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. 
If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - T4, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! 
\brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4); - } - - - }; - - template< - typename T0, - typename T1, - typename T2, - typename T3> - struct functionImplementation_ - < T0, - T1, - T2, - T3, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3); - } - - - }; - - template< - typename T0, - typename T1, - typename T2> - struct functionImplementation_ - < T0, - T1, - T2, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2); - } - - - }; - - template< - typename T0, - typename T1> - struct functionImplementation_ - < T0, - T1, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - T1, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1) - { - return functor_( - enqueueArgs, - arg0, - arg1); - } - - - }; - - template< - typename T0> - struct functionImplementation_ - < T0, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> - { - typedef detail::KernelFunctorGlobal< - T0, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - -#if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); -#endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0) - { - return functor_( - enqueueArgs, - arg0); - } - - - }; - - - - - - } // namespace detail - - //---------------------------------------------------------------------------------------------- - - template < - typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType, - typename T3 = detail::NullType, typename T4 = detail::NullType, - typename T5 = detail::NullType, typename T6 = detail::NullType, - typename T7 = detail::NullType, typename T8 = detail::NullType, - typename T9 = detail::NullType, typename T10 = detail::NullType, - typename T11 = detail::NullType, typename T12 = detail::NullType, - typename T13 = detail::NullType, typename T14 = detail::NullType, - typename T15 = detail::NullType, typename T16 = detail::NullType, - typename T17 = detail::NullType, typename T18 = detail::NullType, - typename T19 = detail::NullType, typename T20 = detail::NullType, - typename T21 = detail::NullType, typename T22 = detail::NullType, - typename T23 = detail::NullType, typename T24 = detail::NullType, - typename T25 = detail::NullType, typename T26 = detail::NullType, - typename T27 = detail::NullType, typename T28 = detail::NullType, - typename T29 = detail::NullType, typename T30 = detail::NullType, - typename T31 = detail::NullType - > - struct make_kernel : - public detail::functionImplementation_< - T0, T1, T2, T3, - T4, T5, T6, T7, - T8, T9, T10, T11, - T12, T13, T14, T15, - T16, T17, T18, T19, - T20, T21, T22, T23, - T24, T25, T26, T27, - T28, T29, T30, T31 - > - { - public: - typedef detail::KernelFunctorGlobal< - T0, T1, T2, T3, - T4, T5, T6, T7, - T8, T9, T10, T11, - T12, T13, T14, T15, - T16, T17, T18, T19, - T20, T21, T22, T23, - T24, T25, T26, T27, - T28, T29, T30, T31 - > FunctorType; - - make_kernel( - const Program& program, - const STRING_CLASS name, - cl_int * err = NULL) : - detail::functionImplementation_< - T0, T1, 
T2, T3, - T4, T5, T6, T7, - T8, T9, T10, T11, - T12, T13, T14, T15, - T16, T17, T18, T19, - T20, T21, T22, T23, - T24, T25, T26, T27, - T28, T29, T30, T31 - >( - FunctorType(program, name, err)) - {} - - make_kernel( - const Kernel kernel) : - detail::functionImplementation_< - T0, T1, T2, T3, - T4, T5, T6, T7, - T8, T9, T10, T11, - T12, T13, T14, T15, - T16, T17, T18, T19, - T20, T21, T22, T23, - T24, T25, T26, T27, - T28, T29, T30, T31 - >( - FunctorType(kernel)) - {} - }; - - - //---------------------------------------------------------------------------------------------------------------------- - -#undef __ERR_STR -#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) -#undef __GET_DEVICE_INFO_ERR -#undef __GET_PLATFORM_INFO_ERR -#undef __GET_DEVICE_IDS_ERR -#undef __GET_CONTEXT_INFO_ERR -#undef __GET_EVENT_INFO_ERR -#undef __GET_EVENT_PROFILE_INFO_ERR -#undef __GET_MEM_OBJECT_INFO_ERR -#undef __GET_IMAGE_INFO_ERR -#undef __GET_SAMPLER_INFO_ERR -#undef __GET_KERNEL_INFO_ERR -#undef __GET_KERNEL_ARG_INFO_ERR -#undef __GET_KERNEL_WORK_GROUP_INFO_ERR -#undef __GET_PROGRAM_INFO_ERR -#undef __GET_PROGRAM_BUILD_INFO_ERR -#undef __GET_COMMAND_QUEUE_INFO_ERR - -#undef __CREATE_CONTEXT_ERR -#undef __CREATE_CONTEXT_FROM_TYPE_ERR -#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR - -#undef __CREATE_BUFFER_ERR -#undef __CREATE_SUBBUFFER_ERR -#undef __CREATE_IMAGE2D_ERR -#undef __CREATE_IMAGE3D_ERR -#undef __CREATE_SAMPLER_ERR -#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR - -#undef __CREATE_USER_EVENT_ERR -#undef __SET_USER_EVENT_STATUS_ERR -#undef __SET_EVENT_CALLBACK_ERR -#undef __SET_PRINTF_CALLBACK_ERR - -#undef __WAIT_FOR_EVENTS_ERR - -#undef __CREATE_KERNEL_ERR -#undef __SET_KERNEL_ARGS_ERR -#undef __CREATE_PROGRAM_WITH_SOURCE_ERR -#undef __CREATE_PROGRAM_WITH_BINARY_ERR -#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR -#undef __BUILD_PROGRAM_ERR -#undef __CREATE_KERNELS_IN_PROGRAM_ERR - -#undef __CREATE_COMMAND_QUEUE_ERR -#undef __SET_COMMAND_QUEUE_PROPERTY_ERR -#undef 
__ENQUEUE_READ_BUFFER_ERR -#undef __ENQUEUE_WRITE_BUFFER_ERR -#undef __ENQUEUE_READ_BUFFER_RECT_ERR -#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR -#undef __ENQEUE_COPY_BUFFER_ERR -#undef __ENQEUE_COPY_BUFFER_RECT_ERR -#undef __ENQUEUE_READ_IMAGE_ERR -#undef __ENQUEUE_WRITE_IMAGE_ERR -#undef __ENQUEUE_COPY_IMAGE_ERR -#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR -#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR -#undef __ENQUEUE_MAP_BUFFER_ERR -#undef __ENQUEUE_MAP_IMAGE_ERR -#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR -#undef __ENQUEUE_NDRANGE_KERNEL_ERR -#undef __ENQUEUE_TASK_ERR -#undef __ENQUEUE_NATIVE_KERNEL - -#undef __CL_EXPLICIT_CONSTRUCTORS - -#undef __UNLOAD_COMPILER_ERR -#endif //__CL_USER_OVERRIDE_ERROR_STRINGS - -#undef __CL_FUNCTION_TYPE - - // Extensions - /** - * Deprecated APIs for 1.2 - */ -#if defined(CL_VERSION_1_1) -#undef __INIT_CL_EXT_FCN_PTR -#endif // #if defined(CL_VERSION_1_1) -#undef __CREATE_SUB_DEVICES - -#if defined(USE_CL_DEVICE_FISSION) -#undef __PARAM_NAME_DEVICE_FISSION -#endif // USE_CL_DEVICE_FISSION - -#undef __DEFAULT_NOT_INITIALIZED -#undef __DEFAULT_BEING_INITIALIZED -#undef __DEFAULT_INITIALIZED - -} // namespace cl - -#ifdef _WIN32 -#pragma pop_macro("max") -#endif // _WIN32 - -#endif // CL_HPP_ diff --git a/ocl_device_utils/ocl_device_utils.cpp b/ocl_device_utils/ocl_device_utils.cpp deleted file mode 100644 index f7a802d00..000000000 --- a/ocl_device_utils/ocl_device_utils.cpp +++ /dev/null @@ -1,146 +0,0 @@ -#include "ocl_device_utils.h" - -#include -#include -#include -#include - -using namespace std; -using namespace cl; - - -bool ocl_device_utils::_hasQueried = false; -std::vector ocl_device_utils::_platformNames; -std::vector ocl_device_utils::_devicesPlatformsDevices; - -vector ocl_device_utils::getPlatforms() { - vector platforms; - try { - Platform::get(&platforms); - } - catch (Error const& err) { -#if defined(CL_PLATFORM_NOT_FOUND_KHR) - if (err.err() == CL_PLATFORM_NOT_FOUND_KHR) - cout << "No OpenCL platforms found" << endl; - else 
-#endif - throw err; - } - return platforms; -} - -void ocl_device_utils::print_opencl_devices() { - ocl_device_utils::QueryDevices(); - ocl_device_utils::PrintDevices(); -} - -vector ocl_device_utils::getDevices(vector const& _platforms, unsigned _platformId) { - vector devices; - try { - _platforms[_platformId].getDevices(/*CL_DEVICE_TYPE_CPU| */CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, &devices); - } - catch (Error const& err) { - // if simply no devices found return empty vector - if (err.err() != CL_DEVICE_NOT_FOUND) - throw err; - } - return devices; -} - -string ocl_device_utils::StringnNullTerminatorFix(const string& str) { - return string(str.c_str(), strlen(str.c_str())); -} - -bool ocl_device_utils::QueryDevices() { - if (!_hasQueried) { - _hasQueried = true; - try { - // get platforms - auto platforms = getPlatforms(); - if (platforms.empty()) { - cout << "No OpenCL platforms found" << endl; - return false; - } - else { - for (auto i_pId = 0u; i_pId < platforms.size(); ++i_pId) { - string platformName = StringnNullTerminatorFix(platforms[i_pId].getInfo()); - if (std::find(_platformNames.begin(), _platformNames.end(), platformName) == _platformNames.end()) { - PrintInfo current; - _platformNames.push_back(platformName); - // new - current.PlatformName = platformName; - current.PlatformNum = i_pId; - - auto clDevs = getDevices(platforms, i_pId); - for (auto i_devId = 0u; i_devId < clDevs.size(); ++i_devId) { - OpenCLDevice curDevice; - curDevice.DeviceID = i_devId; - curDevice._CL_DEVICE_NAME = StringnNullTerminatorFix(clDevs[i_devId].getInfo()); - switch (clDevs[i_devId].getInfo()) { - case CL_DEVICE_TYPE_CPU: - curDevice._CL_DEVICE_TYPE = "CPU"; - break; - case CL_DEVICE_TYPE_GPU: - curDevice._CL_DEVICE_TYPE = "GPU"; - break; - case CL_DEVICE_TYPE_ACCELERATOR: - curDevice._CL_DEVICE_TYPE = "ACCELERATOR"; - break; - default: - curDevice._CL_DEVICE_TYPE = "DEFAULT"; - break; - } - - - curDevice._CL_DEVICE_GLOBAL_MEM_SIZE = 
clDevs[i_devId].getInfo(); - curDevice._CL_DEVICE_VENDOR = StringnNullTerminatorFix(clDevs[i_devId].getInfo()); - curDevice._CL_DEVICE_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo()); - curDevice._CL_DRIVER_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo()); - - current.Devices.push_back(curDevice); - } - _devicesPlatformsDevices.push_back(current); - } - } - } - } - catch (exception &ex) { - // TODO - cout << "ocl_device_utils::QueryDevices() exception: " << ex.what() << endl; - return false; - } - return true; - } - - return false; -} - -int ocl_device_utils::GetCountForPlatform(int platformID) { - for (const auto &platInfo : _devicesPlatformsDevices) - { - if (platformID == platInfo.PlatformNum) { - return platInfo.Devices.size(); - } - } - return 0; -} - -void ocl_device_utils::PrintDevices() { - int allDevsCount = 0; - for (const auto &platInfo : _devicesPlatformsDevices) { - allDevsCount += platInfo.Devices.size(); - } - cout << "Number of OpenCL devices found: " << allDevsCount << endl; - { - int devPlatformsComma = _devicesPlatformsDevices.size(); - for (const auto &platInfo : _devicesPlatformsDevices) { - cout << "\tPlatform: " << platInfo.PlatformName << " | " << "PlatformNum: " << platInfo.PlatformNum << endl; - cout << "\t\tDevices: " << endl; - // device print - int devComma = platInfo.Devices.size(); - for (const auto &dev : platInfo.Devices) { - cout << "\t\t\t#" << dev.DeviceID << " " << dev._CL_DEVICE_NAME << " | " << dev._CL_DEVICE_TYPE << endl; - } - } - } -} \ No newline at end of file diff --git a/ocl_device_utils/ocl_device_utils.h b/ocl_device_utils/ocl_device_utils.h deleted file mode 100644 index cf74aaf2b..000000000 --- a/ocl_device_utils/ocl_device_utils.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -#define __CL_ENABLE_EXCEPTIONS -#define CL_USE_DEPRECATED_OPENCL_2_0_APIS - -#include "cl_ext.hpp" -#include -#include -#include "OpenCLDevice.h" - - -struct PrintInfo { - std::string PlatformName; - int 
PlatformNum; - std::vector Devices; -}; - -class ocl_device_utils { -public: - static bool QueryDevices(); - static void PrintDevices(); - static int GetCountForPlatform(int platformID); - static void print_opencl_devices(); - -private: - static std::vector getDevices(std::vector const& _platforms, unsigned _platformId); - static std::vector getPlatforms(); - - static bool _hasQueried; - static std::vector _platformNames; - static std::vector _devicesPlatformsDevices; - - static std::string StringnNullTerminatorFix(const std::string& str); -}; \ No newline at end of file diff --git a/ocl_device_utils/ocl_device_utils.vcxproj b/ocl_device_utils/ocl_device_utils.vcxproj deleted file mode 100644 index 4830defa1..000000000 --- a/ocl_device_utils/ocl_device_utils.vcxproj +++ /dev/null @@ -1,95 +0,0 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - - - - - - - - {5DBCE38A-C8D2-4498-A92A-9AF8D5196135} - Win32Proj - ocl_device_utils - - - - StaticLibrary - true - v120 - Unicode - - - StaticLibrary - false - v120 - true - Unicode - - - - - - - - - - - - - $(SolutionDir)$(Platform)\$(Configuration)\ - $(Platform)\$(Configuration)\ - - - $(SolutionDir)$(Platform)\$(Configuration)\ - $(Platform)\$(Configuration)\ - - - - - - Level3 - Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) - $(AMDAPPSDKROOT)\include\ - - - Windows - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) - $(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories) - - - Windows - true - true - true - - - - - - \ No newline at end of file diff --git a/ocl_device_utils/ocl_device_utils.vcxproj.filters b/ocl_device_utils/ocl_device_utils.vcxproj.filters deleted file mode 100644 index 1c4a6cd21..000000000 --- a/ocl_device_utils/ocl_device_utils.vcxproj.filters +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/ocl_device_utils/opencl.cpp b/ocl_device_utils/opencl.cpp deleted file mode 100644 index 
cea4c9082..000000000 --- a/ocl_device_utils/opencl.cpp +++ /dev/null @@ -1,174 +0,0 @@ -#include "opencl.h" -#include -#include -#include -#include - -extern cl_platform_id gPlatform; -// extern cl_program gProgram; - -bool clInitialize(int requiredPlatform, std::vector &gpus) -{ - cl_platform_id platforms[64]; - cl_uint numPlatforms; - OCLR(clGetPlatformIDs(sizeof(platforms)/sizeof(cl_platform_id), platforms, &numPlatforms), false); - if (!numPlatforms) { - printf(" no OpenCL platforms found\n"); - return false; - } - - /*int platformIdx = -1; - if (requiredPlatform) { - for (decltype(numPlatforms) i = 0; i < numPlatforms; i++) { - char name[1024] = {0}; - OCLR(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, 0), false); - printf("found platform[%i] name = '%s'\n", (int)i, name); - if (strcmp(name, requiredPlatform) == 0) { - platformIdx = i; - break; - } - } - } else { - platformIdx = 0; - }*/ - - int platformIdx = requiredPlatform; - - - if (platformIdx == -1) { - printf(" platform %s not exists\n", requiredPlatform); - return false; - } - - gPlatform = platforms[platformIdx]; - - cl_uint numDevices = 0; - cl_device_id devices[64]; - clGetDeviceIDs(gPlatform, CL_DEVICE_TYPE_GPU, sizeof(devices)/sizeof(cl_device_id), devices, &numDevices); - if (numDevices) { - printf(" found %d devices\n", numDevices); - } else { - printf(" no OpenCL GPU devices found.\n"); - return false; - } - - for (decltype(numDevices) i = 0; i < numDevices; i++) { - gpus.push_back(devices[i]); - } - - return true; -} - -bool clCompileKernel(cl_context gContext, - cl_device_id gpu, - const char *binaryName, - const std::vector &sources, - const char *arguments, - cl_int *binstatus, - cl_program *gProgram) -{ - std::ifstream testfile(binaryName); - -// size_t binsizes[64]; - -// const unsigned char *binaries[64]; - - if(!testfile) { - - - printf(" compiling ...\n"); - - std::string sourceFile; - for (auto &i: sources) { - std::ifstream stream; - 
stream.exceptions(std::ifstream::failbit | std::ifstream::badbit); - try { - stream.open(i); - } catch (std::system_error& e) { - fprintf(stderr, " %s\n", e.code().message().c_str()); - return false; - } - std::string str((std::istreambuf_iterator(stream)), std::istreambuf_iterator()); - sourceFile.append(str); - } - - printf(" source: %u bytes\n", (unsigned)sourceFile.size()); - if(sourceFile.size() < 1){ - fprintf(stderr, " source files not found or empty\n"); - return false; - } - - cl_int error; - const char *sources[] = { sourceFile.c_str(), 0 }; - *gProgram = clCreateProgramWithSource(gContext, 1, sources, 0, &error); - OCLR(error, false); - - if (clBuildProgram(*gProgram, 1, &gpu, arguments, 0, 0) != CL_SUCCESS) { - size_t logSize; - clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, 0, 0, &logSize); - - std::unique_ptr log(new char[logSize]); - clGetProgramBuildInfo(*gProgram, gpu, CL_PROGRAM_BUILD_LOG, logSize, log.get(), 0); - printf("%s\n", log.get()); - - return false; - } - - size_t binsize; - OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binsize, 0), false); -// for (size_t i = 0; i < 1; i++) { - if(!binsize) { - printf(" no binary available!\n"); - return false; - } -// } - - printf(" binsize = %u bytes\n", (unsigned)binsize); -// std::unique_ptr binary(new unsigned char[binsize+1]); - -// for (size_t i = 0; i < gpus.size(); i++) - std::unique_ptr binary(new unsigned char[binsize+1]); -// binaries[i] = new unsigned char[binsizes[i]]; - -// for (auto &b: binaries) -// b = binary.get(); - OCLR(clGetProgramInfo(*gProgram, CL_PROGRAM_BINARIES, sizeof(void*), &binary, 0), false); - - { - std::ofstream bin(binaryName, std::ofstream::binary | std::ofstream::trunc); - bin.write((const char*)binary.get(), binsize); - bin.close(); - } - - OCLR(clReleaseProgram(*gProgram), false); - } - - std::ifstream bfile(binaryName, std::ifstream::binary); - if(!bfile) { - printf(" %s not found\n", binaryName); - return false; - } - - 
bfile.seekg(0, bfile.end); - size_t binsize = bfile.tellg(); - bfile.seekg(0, bfile.beg); - if(!binsize){ - printf(" %s empty\n", binaryName); - return false; - } - - std::vector binary(binsize+1); - bfile.read(&binary[0], binsize); - bfile.close(); - - cl_int error; -// binstatus.resize(gpus.size(), 0); -// std::vector binsizes(gpus.size(), binsize); -// std::vector binaries(gpus.size(), (const unsigned char*)&binary[0]); - const unsigned char *binaryPtr = (const unsigned char*)&binary[0]; - - *gProgram = clCreateProgramWithBinary(gContext, 1, &gpu, &binsize, &binaryPtr, binstatus, &error); - OCLR(error, false); - OCLR(clBuildProgram(*gProgram, 1, &gpu, 0, 0, 0), false); - return true; -} diff --git a/ocl_device_utils/opencl.h b/ocl_device_utils/opencl.h deleted file mode 100644 index 566d9ffa2..000000000 --- a/ocl_device_utils/opencl.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * opencl.h - * - * Created on: 01.05.2014 - * Author: mad - */ - -#ifndef OPENCL_H_ -#define OPENCL_H_ - -#pragma warning(disable: 4996) -#include -#include -#include -#include - -// extern cl_context gContext; - - - -#define OCL(error) \ - if(cl_int err = error){ \ - printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \ - return; \ - } - -#define OCLR(error, ret) \ - if(cl_int err = error){ \ - printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \ - return ret; \ - } - -#define OCLE(error) \ - if(cl_int err = error){ \ - printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); \ - exit(err); \ - } - - - - - -template -class clBuffer { -public: - - clBuffer() { - - Size = 0; - HostData = 0; - DeviceData = 0; - - } - - ~clBuffer() { - - if(HostData) - delete [] HostData; - - if(DeviceData) - clReleaseMemObject(DeviceData); - - } - - void init(cl_context gContext, int size, cl_mem_flags flags = 0) { - - Size = size; - - if(!(flags & CL_MEM_HOST_NO_ACCESS)){ - HostData = new T[Size]; - memset(HostData, 0, Size*sizeof(T)); - }else - HostData = 0; - - 
//printf("clCreateBuffer: size = %d, %d bytes\n", Size, Size*sizeof(T)); - - cl_int error; - if (flags & CL_MEM_HOST_NO_ACCESS) - flags = CL_MEM_READ_WRITE; - DeviceData = clCreateBuffer(gContext, flags, Size*sizeof(T), 0, &error); - OCL(error); - - } - - void copyToDevice(cl_command_queue cq, bool blocking = true) { - - OCL(clEnqueueWriteBuffer(cq, DeviceData, blocking, 0, Size*sizeof(T), HostData, 0, 0, 0)); - - } - - void copyToHost(cl_command_queue cq, bool blocking = true, unsigned size = 0) { - - if(size == 0) - size = Size; - - OCL(clEnqueueReadBuffer(cq, DeviceData, blocking, 0, size*sizeof(T), HostData, 0, 0, 0)); - - } - - T& get(int index) { - return HostData[index]; - } - - T& operator[](int index) { - return HostData[index]; - } - -public: - - int Size; - T* HostData; - cl_mem DeviceData; - - -}; - - -bool clInitialize(int requiredPlatform, std::vector &gpus); -bool clCompileKernel(cl_context gContext, - cl_device_id gpu, - const char *binaryName, - const std::vector &sources, - const char *arguments, - cl_int *binstatus, - cl_program *gProgram); - - - - - -#endif /* OPENCL_H_ */ diff --git a/ocl_silentarmy/ocl_silentarmy.cpp b/ocl_silentarmy/ocl_silentarmy.cpp deleted file mode 100644 index d67a3fa76..000000000 --- a/ocl_silentarmy/ocl_silentarmy.cpp +++ /dev/null @@ -1,536 +0,0 @@ -#include "ocl_silentarmy.hpp" - -//#define _CRT_SECURE_NO_WARNINGS - -#include -#include -#include -#include -#include -#include -#include -//#include -#include -#include -//#include -//#include -#include - - -#include "opencl.h" - -#include - -#include "sa_blake.h" - -typedef uint8_t uchar; -typedef uint32_t uint; -typedef uint64_t ulong; -#include "param.h" - -#define MIN(A, B) (((A) < (B)) ? (A) : (B)) -#define MAX(A, B) (((A) > (B)) ? 
(A) : (B)) - -#define WN PARAM_N -#define WK PARAM_K - -#define COLLISION_BIT_LENGTH (WN / (WK+1)) -#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8) -#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK))) - -#define NDIGITS (WK+1) -#define DIGITBITS (WN/(NDIGITS)) -#define PROOFSIZE (1u<= '0' && c <= '9') return c - '0'; - else if (c >= 'a' && c <= 'f') return 10 + c - 'a'; - else if (c >= 'A' && c <= 'F') return 10 + c - 'A'; - printf("Invalid hex char at offset %zd: ...%c...\n", off, c); - return 0; -} - -unsigned nr_compute_units(const char *gpu) -{ - if (!strcmp(gpu, "rx480")) return 36; - fprintf(stderr, "Unknown GPU: %s\n", gpu); - return 0; -} - -static void compress(uint8_t *out, uint32_t *inputs, uint32_t n) -{ - uint32_t byte_pos = 0; - int32_t bits_left = PREFIX + 1; - uint8_t x = 0; - uint8_t x_bits_used = 0; - uint8_t *pOut = out; - while (byte_pos < n) - { - if (bits_left >= 8 - x_bits_used) - { - x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used); - bits_left -= 8 - x_bits_used; - x_bits_used = 8; - } - else if (bits_left > 0) - { - uint32_t mask = ~(-1 << (8 - x_bits_used)); - mask = ((~mask) >> bits_left) & mask; - x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask; - x_bits_used += bits_left; - bits_left = 0; - } - else if (bits_left <= 0) - { - assert(!bits_left); - byte_pos++; - bits_left = PREFIX + 1; - } - if (x_bits_used == 8) - { - *pOut++ = x; - x = x_bits_used = 0; - } - } -} - -void get_program_build_log(cl_program program, cl_device_id device) -{ - cl_int status; - char val[2 * 1024 * 1024]; - size_t ret = 0; - status = clGetProgramBuildInfo(program, device, - CL_PROGRAM_BUILD_LOG, - sizeof(val), // size_t param_value_size - &val, // void *param_value - &ret); // size_t *param_value_size_ret - if (status != CL_SUCCESS) - printf("clGetProgramBuildInfo (%d)\n", status); - fprintf(stderr, "%s\n", val); -} - -size_t select_work_size_blake(void) -{ - size_t work_size = - 64 * /* thread per 
wavefront */ - BLAKE_WPS * /* wavefront per simd */ - 4 * /* simd per compute unit */ - nr_compute_units("rx480"); - // Make the work group size a multiple of the nr of wavefronts, while - // dividing the number of inputs. This results in the worksize being a - // power of 2. - while (NR_INPUTS % work_size) - work_size += 64; - //debug("Blake: work size %zd\n", work_size); - return work_size; -} - -static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht) -{ - size_t global_ws = NR_ROWS; - size_t local_ws = 64; - cl_int status; -#if 0 - uint32_t pat = -1; - status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0, - NR_ROWS * NR_SLOTS * SLOT_LEN, - 0, // cl_uint num_events_in_wait_list - NULL, // cl_event *event_wait_list - NULL); // cl_event *event - if (status != CL_SUCCESS) - fatal("clEnqueueFillBuffer (%d)\n", status); -#endif - status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht); - if (status != CL_SUCCESS) - printf("clSetKernelArg (%d)\n", status); - check_clEnqueueNDRangeKernel(queue, k_init_ht, - 1, // cl_uint work_dim - NULL, // size_t *global_work_offset - &global_ws, // size_t *global_work_size - &local_ws, // size_t *local_work_size - 0, // cl_uint num_events_in_wait_list - NULL, // cl_event *event_wait_list - NULL); // cl_event *event -} - - -/* -** Sort a pair of binary blobs (a, b) which are consecutive in memory and -** occupy a total of 2*len 32-bit words. 
-** -** a points to the pair -** len number of 32-bit words in each pair -*/ -void sort_pair(uint32_t *a, uint32_t len) -{ - uint32_t *b = a + len; - uint32_t tmp, need_sorting = 0; - for (uint32_t i = 0; i < len; i++) - if (need_sorting || a[i] > b[i]) - { - need_sorting = 1; - tmp = a[i]; - a[i] = b[i]; - b[i] = tmp; - } - else if (a[i] < b[i]) - return; -} -static uint32_t verify_sol(sols_t *sols, unsigned sol_i) -{ - uint32_t *inputs = sols->values[sol_i]; - uint32_t seen_len = (1 << (PREFIX + 1)) / 8; - uint8_t seen[(1 << (PREFIX + 1)) / 8]; - uint32_t i; - uint8_t tmp; - // look for duplicate inputs - memset(seen, 0, seen_len); - for (i = 0; i < (1 << PARAM_K); i++) - { - tmp = seen[inputs[i] / 8]; - seen[inputs[i] / 8] |= 1 << (inputs[i] & 7); - if (tmp == seen[inputs[i] / 8]) - { - // at least one input value is a duplicate - sols->valid[sol_i] = 0; - return 0; - } - } - // the valid flag is already set by the GPU, but set it again because - // I plan to change the GPU code to not set it - sols->valid[sol_i] = 1; - // sort the pairs in place - for (uint32_t level = 0; level < PARAM_K; level++) - for (i = 0; i < (1 << PARAM_K); i += (2 << level)) - sort_pair(&inputs[i], 1 << level); - return 1; -} - - - -ocl_silentarmy::ocl_silentarmy(int platf_id, int dev_id) { - platform_id = platf_id; - device_id = dev_id; - // TODO - threadsNum = 8192; - wokrsize = 128; // 256; -} - -std::string ocl_silentarmy::getdevinfo() { - /*TODO get name*/ - return "GPU_ID(" + std::to_string(device_id)+ ")"; -} - -// STATICS START -int ocl_silentarmy::getcount() { /*TODO*/ - return 0; -} - -void ocl_silentarmy::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) { /*TODO*/ } - -void ocl_silentarmy::start(ocl_silentarmy& device_context) { - /*TODO*/ - device_context.is_init_success = false; - device_context.oclc = new OclContext(); - - std::vector allGpus; - if (!clInitialize(device_context.platform_id, allGpus)) { - return; - } - - // this is 
kinda stupid but it works - std::vector gpus; - for (unsigned i = 0; i < allGpus.size(); ++i) { - if (i == device_context.device_id) { - printf("Using device %d as GPU %d\n", i, (int)gpus.size()); - device_context.oclc->_dev_id = allGpus[i]; - gpus.push_back(allGpus[i]); - } - } - - if (!gpus.size()){ - printf("Device id %d not found\n", device_context.device_id); - return; - } - - // context create - for (unsigned i = 0; i < gpus.size(); i++) { - cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)device_context.oclc->platform_id, 0 }; - cl_int error; - device_context.oclc->_context = clCreateContext(NULL, 1, &gpus[i], 0, 0, &error); - //OCLR(error, false); - if (cl_int err = error) { - printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); - return; - } - } - - std::vector binstatus; - binstatus.resize(gpus.size()); - - for (size_t i = 0; i < gpus.size(); i++) { - char kernelName[64]; - sprintf(kernelName, "silentarmy_gpu%u.bin", (unsigned)i); - if (!clCompileKernel(device_context.oclc->_context, - gpus[i], - kernelName, - { "zcash/gpu/kernel.cl" }, - "", - &binstatus[i], - &device_context.oclc->_program)) { - return; - } - } - - for (unsigned i = 0; i < gpus.size(); ++i) { - if (binstatus[i] == CL_SUCCESS) { - if (!device_context.oclc->init(gpus[i], device_context.threadsNum, device_context.wokrsize)) { - printf("Init failed"); - return; - } - } - else { - printf("GPU %d: failed to load kernel\n", i); - return; - } - } - - device_context.is_init_success = true; -} - -void ocl_silentarmy::stop(ocl_silentarmy& device_context) { - if (device_context.oclc != nullptr) delete device_context.oclc; -} - -void ocl_silentarmy::solve(const char *tequihash_header, - unsigned int tequihash_header_len, - const char* nonce, - unsigned int nonce_len, - std::function cancelf, - std::function&, size_t, const unsigned char*)> solutionf, - std::function hashdonef, - ocl_silentarmy& device_context) { - - unsigned char context[140]; - 
memset(context, 0, 140); - memcpy(context, tequihash_header, tequihash_header_len); - memcpy(context + tequihash_header_len, nonce, nonce_len); - - OclContext *miner = device_context.oclc; - clFlush(miner->queue); - - blake2b_state_t initialCtx; - zcash_blake2b_init(&initialCtx, ZCASH_HASH_LEN, PARAM_N, PARAM_K); - zcash_blake2b_update(&initialCtx, (const uint8_t*)context, 128, 0); - - cl_mem buf_blake_st; - buf_blake_st = check_clCreateBuffer(miner->_context, CL_MEM_READ_ONLY | - CL_MEM_COPY_HOST_PTR, sizeof(blake2b_state_s), &initialCtx); - - - for (unsigned round = 0; round < PARAM_K; round++) - { - if (round < 2) - init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round % 2]); - if (!round) - { - check_clSetKernelArg(miner->k_rounds[round], 0, &buf_blake_st); - check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round % 2]); - miner->global_ws = select_work_size_blake(); - } - else - { - check_clSetKernelArg(miner->k_rounds[round], 0, &miner->buf_ht[(round - 1) % 2]); - check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round % 2]); - miner->global_ws = NR_ROWS; - } - check_clSetKernelArg(miner->k_rounds[round], 2, &miner->buf_dbg); - if (round == PARAM_K - 1) - check_clSetKernelArg(miner->k_rounds[round], 3, &miner->buf_sols); - check_clEnqueueNDRangeKernel(miner->queue, miner->k_rounds[round], 1, NULL, - &miner->global_ws, &miner->local_work_size, 0, NULL, NULL); - // cancel function - if (cancelf()) return; - } - check_clSetKernelArg(miner->k_sols, 0, &miner->buf_ht[0]); - check_clSetKernelArg(miner->k_sols, 1, &miner->buf_ht[1]); - check_clSetKernelArg(miner->k_sols, 2, &miner->buf_sols); - miner->global_ws = NR_ROWS; - check_clEnqueueNDRangeKernel(miner->queue, miner->k_sols, 1, NULL, - &miner->global_ws, &miner->local_work_size, 0, NULL, NULL); - - check_clEnqueueReadBuffer(miner->queue, miner->buf_sols, - CL_TRUE, // cl_bool blocking_read - 0, // size_t offset - sizeof(*miner->sols), // size_t size - miner->sols, // void *ptr - 
0, // cl_uint num_events_in_wait_list - NULL, // cl_event *event_wait_list - NULL); // cl_event *event - - if (miner->sols->nr > MAX_SOLS) - miner->sols->nr = MAX_SOLS; - - clReleaseMemObject(buf_blake_st); - - for (unsigned sol_i = 0; sol_i < miner->sols->nr; sol_i++) { - verify_sol(miner->sols, sol_i); - } - - uint8_t proof[COMPRESSED_PROOFSIZE * 2]; - for (uint32_t i = 0; i < miner->sols->nr; i++) { - if (miner->sols->valid[i]) { - compress(proof, (uint32_t *)(miner->sols->values[i]), 1 << PARAM_K); - solutionf(std::vector(0), 1344, proof); - } - } - hashdonef(); -} - -// STATICS END - diff --git a/ocl_silentarmy/ocl_silentarmy.hpp b/ocl_silentarmy/ocl_silentarmy.hpp deleted file mode 100644 index 4740ac8b5..000000000 --- a/ocl_silentarmy/ocl_silentarmy.hpp +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once -#ifdef _LIB -#define DLL_OCL_SILENTARMY __declspec(dllexport) -#else -#define DLL_OCL_SILENTARMY -#endif - -// remove after -#include -#include -#include -#include - -struct OclContext; - - - -struct DLL_OCL_SILENTARMY ocl_silentarmy -{ - //int threadsperblock; - int blocks; - int device_id; - int platform_id; - - OclContext* oclc; - // threads - unsigned threadsNum; // TMP - unsigned wokrsize; - - bool is_init_success = false; - - ocl_silentarmy(int platf_id, int dev_id); - - std::string getdevinfo(); - - static int getcount(); - - static void getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version); - - static void start(ocl_silentarmy& device_context); - - static void stop(ocl_silentarmy& device_context); - - static void solve(const char *tequihash_header, - unsigned int tequihash_header_len, - const char* nonce, - unsigned int nonce_len, - std::function cancelf, - std::function&, size_t, const unsigned char*)> solutionf, - std::function hashdonef, - ocl_silentarmy& device_context); - - std::string getname() { return "OCL_SILENTARMY"; } - -private: - std::string m_gpu_name; - std::string m_version; -}; \ No newline at end of 
file diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj b/ocl_silentarmy/ocl_silentarmy.vcxproj deleted file mode 100644 index 1aae0f6ca..000000000 --- a/ocl_silentarmy/ocl_silentarmy.vcxproj +++ /dev/null @@ -1,98 +0,0 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - - - - - - - - - - - {AB01E715-795A-4089-8DF0-AE6EBDC1AB48} - Win32Proj - ocl_silentarmy - - - - StaticLibrary - true - v120 - Unicode - - - StaticLibrary - false - v120 - true - Unicode - - - - - - - - - - - - - $(SolutionDir)$(Platform)\$(Configuration)\ - $(Platform)\$(Configuration)\ - - - $(Platform)\$(Configuration)\ - $(SolutionDir)$(Platform)\$(Configuration)\ - - - - - - Level3 - Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) - ..\ocl_device_utils;$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories) - - - Windows - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) - ..\ocl_device_utils;$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories) - - - Windows - true - true - true - - - - - - \ No newline at end of file diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj.filters b/ocl_silentarmy/ocl_silentarmy.vcxproj.filters deleted file mode 100644 index 9659f2c07..000000000 --- a/ocl_silentarmy/ocl_silentarmy.vcxproj.filters +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - - - - - - - - - {34381c66-ca5c-4daa-aa30-58dcf33e2d66} - - - {c7687099-e206-4d36-8836-f7032bffc7da} - - - - - zcash\gpu - - - zcash\gpu - - - \ No newline at end of file diff --git a/ocl_silentarmy/param.h b/ocl_silentarmy/param.h deleted file mode 100644 index 51ef42ea9..000000000 --- a/ocl_silentarmy/param.h +++ /dev/null @@ -1,66 +0,0 @@ -#define PARAM_N 200 -#define PARAM_K 9 -#define PREFIX (PARAM_N / (PARAM_K + 1)) -#define NR_INPUTS (1 << PREFIX) -// Approximate log base 2 of number of elements in hash tables -#define APX_NR_ELMS_LOG (PREFIX + 1) -// Number of rows and slots is affected by this. 
20 offers the best performance -// but occasionally misses ~1% of solutions. -#define NR_ROWS_LOG 20 - -// Make hash tables OVERHEAD times larger than necessary to store the average -// number of elements per row. The ideal value is as small as possible to -// reduce memory usage, but not too small or else elements are dropped from the -// hash tables. -// -// The actual number of elements per row is closer to the theoretical average -// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be -// smaller. -// -// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease -// performance as they cause VRAM channel conflicts. -#if NR_ROWS_LOG == 16 -#define OVERHEAD 3 -#elif NR_ROWS_LOG == 18 -#define OVERHEAD 5 -#elif NR_ROWS_LOG == 19 -#define OVERHEAD 9 -#elif NR_ROWS_LOG == 20 -#define OVERHEAD 13 -#endif - -#define NR_ROWS (1 << NR_ROWS_LOG) -#define NR_SLOTS ((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD) -// Length of 1 element (slot) in bytes -#define SLOT_LEN 32 -// Total size of hash table -#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) -// Length of Zcash block header and nonce -#define ZCASH_BLOCK_HEADER_LEN 140 -#define ZCASH_NONCE_LEN 32 -// Number of bytes Zcash needs out of Blake -#define ZCASH_HASH_LEN 50 -// Number of wavefronts per SIMD for the Blake kernel. -// Blake is ALU-bound (beside the atomic counter being incremented) so we need -// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer -// instructions. 10 is the max supported by the hw. -#define BLAKE_WPS 10 -#define MAX_SOLS 2000 - -// Optional features -#undef ENABLE_DEBUG - -/* -** Return the offset of Xi in bytes from the beginning of the slot. 
-*/ -#define xi_offset_for_round(round) (8 + ((round) / 2) * 4) - -// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values -#define SOL_SIZE ((1 << PARAM_K) * 4) -typedef struct sols_s -{ - uint nr; - uint likely_invalids; - uchar valid[MAX_SOLS]; - uint values[MAX_SOLS][(1 << PARAM_K)]; -} sols_t; diff --git a/ocl_silentarmy/sa_blake.cpp b/ocl_silentarmy/sa_blake.cpp deleted file mode 100644 index c10800de8..000000000 --- a/ocl_silentarmy/sa_blake.cpp +++ /dev/null @@ -1,104 +0,0 @@ -#include -#include -#include -#include "sa_blake.h" - -static const uint32_t blake2b_block_len = 128; -static const uint32_t blake2b_rounds = 12; -static const uint64_t blake2b_iv[8] = -{ - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, -}; -static const uint8_t blake2b_sigma[12][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, -}; - -/* -** Init the state according to Zcash parameters. 
-*/ -void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, - uint32_t n, uint32_t k) -{ - assert(n > k); - assert(hash_len <= 64); - st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); - for (uint32_t i = 1; i <= 5; i++) - st->h[i] = blake2b_iv[i]; - st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; - st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); - st->bytes = 0; -} - -static uint64_t rotr64(uint64_t a, uint8_t bits) -{ - return (a >> bits) | (a << (64 - bits)); -} - -static void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd, - uint64_t x, uint64_t y) -{ - *va = (*va + *vb + x); - *vd = rotr64(*vd ^ *va, 32); - *vc = (*vc + *vd); - *vb = rotr64(*vb ^ *vc, 24); - *va = (*va + *vb + y); - *vd = rotr64(*vd ^ *va, 16); - *vc = (*vc + *vd); - *vb = rotr64(*vb ^ *vc, 63); -} - -/* -** Process either a full message block or the final partial block. -** Note that v[13] is not XOR'd because st->bytes is assumed to never overflow. -** -** _msg pointer to message (must be zero-padded to 128 bytes if final block) -** msg_len must be 128 (<= 128 allowed only for final partial block) -** is_final indicate if this is the final block -*/ -void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, - uint32_t msg_len, uint32_t is_final) -{ - const uint64_t *m = (const uint64_t *)_msg; - uint64_t v[16]; - assert(msg_len <= 128); - assert(st->bytes <= UINT64_MAX - msg_len); - memcpy(v + 0, st->h, 8 * sizeof (*v)); - memcpy(v + 8, blake2b_iv, 8 * sizeof (*v)); - v[12] ^= (st->bytes += msg_len); - v[14] ^= is_final ? 
-1 : 0; - for (uint32_t round = 0; round < blake2b_rounds; round++) - { - const uint8_t *s = blake2b_sigma[round]; - mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]); - mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]); - mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]); - mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]); - mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]); - mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]); - mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]); - mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]); - } - for (uint32_t i = 0; i < 8; i++) - st->h[i] ^= v[i] ^ v[i + 8]; -} - -void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen) -{ - assert(outlen <= 64); - memcpy(out, st->h, outlen); -} diff --git a/ocl_silentarmy/sa_blake.h b/ocl_silentarmy/sa_blake.h deleted file mode 100644 index 40270a95e..000000000 --- a/ocl_silentarmy/sa_blake.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once -typedef struct blake2b_state_s -{ - uint64_t h[8]; - uint64_t bytes; -} blake2b_state_t; -void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, - uint32_t n, uint32_t k); -void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, - uint32_t msg_len, uint32_t is_final); -void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen); diff --git a/ocl_silentarmy/zcash/gpu/input.cl b/ocl_silentarmy/zcash/gpu/input.cl deleted file mode 100644 index f5112c816..000000000 --- a/ocl_silentarmy/zcash/gpu/input.cl +++ /dev/null @@ -1,704 +0,0 @@ -#include "param.h" - -/* -** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in -** bytes in parens): -** -** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1) -** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3) -** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2) -** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4) -** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3) -** round 5, 
table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5) -** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4) -** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6) -** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5) -** -** If the first byte of Xi is 0xAB then: -** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi -** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but -** 'A' is considered redundant padding as it was used to compute the row # -** -** - cnt is an atomic counter keeping track of the number of used slots. -** it is used in the first slot only; subsequent slots replace it with -** 4 padding bytes -** - i encodes either the 21-bit input value (round 0) or a reference to two -** inputs from the previous round -** -** Formula for Xi length and pad length above: -** > for i in range(9): -** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi -** -** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds -** is the 4 most significant bits of the last byte of Xi. -*/ - -__constant ulong blake_iv[] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, -}; - -/* -** Reset counters in hash table. -*/ -__kernel -void kernel_init_ht(__global char *ht) -{ - uint tid = get_global_id(0); - *(__global uint *)(ht + tid * NR_SLOTS * SLOT_LEN) = 0; -} - -/* -** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they -** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): -** aa aa ab bb bb cc cc cd dd... [round 0] -** -------------------- -** ...ab bb bb cc cc cd dd... [odd round] -** -------------- -** ...cc cc cd dd... [next even round] -** ----- -** Bytes underlined are going to be stored in the slot. 
Preceding bytes -** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are -** used to compute the row number. -** -** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) -** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble) -** TODO: update lines below with padding nibbles -** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) -** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) -** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) -** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) -** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) -** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) -** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) -** -** Return 0 if successfully stored, or 1 if the row overflowed. -*/ -uint ht_store(uint round, __global char *ht, uint i, - ulong xi0, ulong xi1, ulong xi2, ulong xi3) -{ - uint row; - __global char *p; - uint cnt; -#if NR_ROWS_LOG == 16 - if (!(round % 2)) - row = (xi0 & 0xffff); - else - // if we have in hex: "ab cd ef..." (little endian xi0) then this - // formula computes the row as 0xdebc. it skips the 'a' nibble as it - // is part of the PREFIX. 
The Xi will be stored starting with "ef..."; - // 'e' will be considered padding and 'f' is part of the current PREFIX - row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 18 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6); - else - row = ((xi0 & 0xc0000) >> 2) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 19 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5); - else - row = ((xi0 & 0xe0000) >> 1) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 20 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); - else - row = ((xi0 & 0xf0000) >> 0) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#else -#error "unsupported NR_ROWS_LOG" -#endif - xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); - xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); - xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); - p = ht + row * NR_SLOTS * SLOT_LEN; - cnt = atomic_inc((__global uint *)p); - if (cnt >= NR_SLOTS) - return 1; - p += cnt * SLOT_LEN + xi_offset_for_round(round); - // store "i" (always 4 bytes before Xi) - *(__global uint *)(p - 4) = i; - if (round == 0 || round == 1) - { - // store 24 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global ulong *)(p + 16) = xi2; - } - else if (round == 2) - { - // store 20 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global uint *)(p + 16) = xi2; - } - else if (round == 3 || round == 4) - { - // store 16 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - - } - else if (round == 5) - { - // store 12 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global uint *)(p + 8) = xi1; - } - else if (round == 6 || round == 7) - { - // store 8 bytes - 
*(__global ulong *)(p + 0) = xi0; - } - else if (round == 8) - { - // store 4 bytes - *(__global uint *)(p + 0) = xi0; - } - return 0; -} - -#define mix(va, vb, vc, vd, x, y) \ - va = (va + vb + x); \ - vd = rotate((vd ^ va), (ulong)64 - 32); \ - vc = (vc + vd); \ - vb = rotate((vb ^ vc), (ulong)64 - 24); \ - va = (va + vb + y); \ - vd = rotate((vd ^ va), (ulong)64 - 16); \ - vc = (vc + vd); \ - vb = rotate((vb ^ vc), (ulong)64 - 63); - -/* -** Execute round 0 (blake). -** -** Note: making the work group size less than or equal to the wavefront size -** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local -** Memory (LDS) Optimization 2-10" in: -** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ -*/ -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -void kernel_round0(__global ulong *blake_state, __global char *ht, - __global uint *debug) -{ - uint tid = get_global_id(0); - ulong v[16]; - uint inputs_per_thread = NR_INPUTS / get_global_size(0); - uint input = tid * inputs_per_thread; - uint input_end = (tid + 1) * inputs_per_thread; - uint dropped = 0; - while (input < input_end) - { - // shift "i" to occupy the high 32 bits of the second ulong word in the - // message block - ulong word1 = (ulong)input << 32; - // init vector v - v[0] = blake_state[0]; - v[1] = blake_state[1]; - v[2] = blake_state[2]; - v[3] = blake_state[3]; - v[4] = blake_state[4]; - v[5] = blake_state[5]; - v[6] = blake_state[6]; - v[7] = blake_state[7]; - v[8] = blake_iv[0]; - v[9] = blake_iv[1]; - v[10] = blake_iv[2]; - v[11] = blake_iv[3]; - v[12] = blake_iv[4]; - v[13] = blake_iv[5]; - v[14] = blake_iv[6]; - v[15] = blake_iv[7]; - // mix in length of data - v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; - // last block - v[14] ^= -1; - - // round 1 - mix(v[0], v[4], v[8], v[12], 0, word1); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], 
v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 2 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], word1, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 3 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, word1); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 4 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, word1); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 5 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, word1); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 6 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], word1, 0); - // round 7 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], word1, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], 
v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 8 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, word1); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 9 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], word1, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 10 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], word1, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 11 - mix(v[0], v[4], v[8], v[12], 0, word1); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 12 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], word1, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - - // compress v into the blake state; this produces the 50-byte hash - // (two Xi values) - ulong h[7]; - h[0] = blake_state[0] ^ v[0] ^ v[8]; - h[1] = blake_state[1] ^ v[1] ^ v[9]; - h[2] = blake_state[2] ^ v[2] ^ v[10]; - h[3] = 
blake_state[3] ^ v[3] ^ v[11]; - h[4] = blake_state[4] ^ v[4] ^ v[12]; - h[5] = blake_state[5] ^ v[5] ^ v[13]; - h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; - - // store the two Xi values in the hash table -#if ZCASH_HASH_LEN == 50 - dropped += ht_store(0, ht, input * 2, - h[0], - h[1], - h[2], - h[3]); - dropped += ht_store(0, ht, input * 2 + 1, - (h[3] >> 8) | (h[4] << (64 - 8)), - (h[4] >> 8) | (h[5] << (64 - 8)), - (h[5] >> 8) | (h[6] << (64 - 8)), - (h[6] >> 8)); -#else -#error "unsupported ZCASH_HASH_LEN" -#endif - - input++; - } -#ifdef ENABLE_DEBUG - debug[tid * 2] = 0; - debug[tid * 2 + 1] = dropped; -#endif -} - -#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) -#define DECODE_ROW(REF) (REF >> 16) -#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) -#define DECODE_SLOT0(REF) (REF & 0xff) - -#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) -#define DECODE_ROW(REF) (REF >> 14) -#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) -#define DECODE_SLOT0(REF) (REF & 0x7f) - -#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ -#define DECODE_ROW(REF) (REF >> 13) -#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) -#define DECODE_ROW(REF) (REF >> 12) -#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#else -#error "unsupported NR_ROWS_LOG" -#endif - -/* -** XOR a pair of Xi values computed at "round - 1" and store the result in the -** hash table being built for "round". 
Note that when building the table for -** even rounds we need to skip 1 padding byte present in the "round - 1" table -** (the "0xAB" byte mentioned in the description at the top of this file.) But -** also note we can't load data directly past this byte because this would -** cause an unaligned memory access which is undefined per the OpenCL spec. -** -** Return 0 if successfully stored, or 1 if the row overflowed. -*/ -uint xor_and_store(uint round, __global char *ht_dst, uint row, - uint slot_a, uint slot_b, __global ulong *a, __global ulong *b) -{ - ulong xi0, xi1, xi2; -#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 - // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not - // storing the byte containing bits from the previous PREFIX block for - if (round == 1 || round == 2) - { - // xor 24 bytes - xi0 = *(a++) ^ *(b++); - xi1 = *(a++) ^ *(b++); - xi2 = *a ^ *b; - if (round == 2) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); - xi2 = (xi2 >> 8); - } - } - else if (round == 3) - { - // xor 20 bytes - xi0 = *a++ ^ *b++; - xi1 = *a++ ^ *b++; - xi2 = *(__global uint *)a ^ *(__global uint *)b; - } - else if (round == 4 || round == 5) - { - // xor 16 bytes - xi0 = *a++ ^ *b++; - xi1 = *a ^ *b; - xi2 = 0; - if (round == 4) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 6) - { - // xor 12 bytes - xi0 = *a++ ^ *b++; - xi1 = *(__global uint *)a ^ *(__global uint *)b; - xi2 = 0; - if (round == 6) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 7 || round == 8) - { - // xor 8 bytes - xi0 = *a ^ *b; - xi1 = 0; - xi2 = 0; - if (round == 8) - { - // skip padding byte - xi0 = (xi0 >> 8); - } - } - // invalid solutions (which start happenning in round 5) have duplicate - // inputs and xor to zero, so discard them - if (!xi0 && !xi1) - return 0; -#else -#error "unsupported 
NR_ROWS_LOG" -#endif - return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), - xi0, xi1, xi2, 0); -} - -/* -** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, -** store them in ht_dst. -*/ -void equihash_round(uint round, __global char *ht_src, __global char *ht_dst, - __global uint *debug) -{ - uint tid = get_global_id(0); - uint tlid = get_local_id(0); - __global char *p; - uint cnt; - uchar first_words[NR_SLOTS]; - uchar mask; - uint i, j; - // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to - // make it even larger - ushort collisions[NR_SLOTS * 3]; - uint nr_coll = 0; - uint n; - uint dropped_coll, dropped_stor; - __global ulong *a, *b; - uint xi_offset; - // read first words of Xi from the previous (round - 1) hash table - xi_offset = xi_offset_for_round(round - 1); - // the mask is also computed to read data from the previous round -#if NR_ROWS_LOG == 16 - mask = ((!(round % 2)) ? 0x0f : 0xf0); -#elif NR_ROWS_LOG == 18 - mask = ((!(round % 2)) ? 0x03 : 0x30); -#elif NR_ROWS_LOG == 19 - mask = ((!(round % 2)) ? 0x01 : 0x10); -#elif NR_ROWS_LOG == 20 - mask = 0; /* we can vastly simplify the code below */ -#else -#error "unsupported NR_ROWS_LOG" -#endif - p = (ht_src + tid * NR_SLOTS * SLOT_LEN); - cnt = *(__global uint *)p; - cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round - p += xi_offset; - for (i = 0; i < cnt; i++, p += SLOT_LEN) - first_words[i] = *(__global uchar *)p; - // find collisions - nr_coll = 0; - dropped_coll = 0; - for (i = 0; i < cnt; i++) - for (j = i + 1; j < cnt; j++) - if ((first_words[i] & mask) == - (first_words[j] & mask)) - { - // collision! 
- if (nr_coll >= sizeof (collisions) / sizeof (*collisions)) - dropped_coll++; - else -#if NR_SLOTS <= (1 << 8) - // note: this assumes slots can be encoded in 8 bits - collisions[nr_coll++] = - ((ushort)j << 8) | ((ushort)i & 0xff); -#else -#error "unsupported NR_SLOTS" -#endif - } - // XOR colliding pairs of Xi - dropped_stor = 0; - for (n = 0; n < nr_coll; n++) - { - i = collisions[n] & 0xff; - j = collisions[n] >> 8; - a = (__global ulong *) - (ht_src + tid * NR_SLOTS * SLOT_LEN + i * SLOT_LEN + xi_offset); - b = (__global ulong *) - (ht_src + tid * NR_SLOTS * SLOT_LEN + j * SLOT_LEN + xi_offset); - dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b); - } - if (round < 8) - // reset the counter in preparation of the next round - *(__global uint *)(ht_src + tid * NR_SLOTS * SLOT_LEN) = 0; -#ifdef ENABLE_DEBUG - debug[tid * 2] = dropped_coll; - debug[tid * 2 + 1] = dropped_stor; -#endif -} - -/* -** This defines kernel_round1, kernel_round2, ..., kernel_round7. -*/ -#define KERNEL_ROUND(N) \ -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) \ -void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ - __global uint *debug) \ -{ \ - equihash_round(N, ht_src, ht_dst, debug); \ -} -KERNEL_ROUND(1) -KERNEL_ROUND(2) -KERNEL_ROUND(3) -KERNEL_ROUND(4) -KERNEL_ROUND(5) -KERNEL_ROUND(6) -KERNEL_ROUND(7) - -// kernel_round8 takes an extra argument, "sols" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -void kernel_round8(__global char *ht_src, __global char *ht_dst, - __global uint *debug, __global sols_t *sols) -{ - uint tid = get_global_id(0); - equihash_round(8, ht_src, ht_dst, debug); - if (!tid) - sols->nr = sols->likely_invalids = 0; -} - -uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) -{ - return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + - slot * SLOT_LEN + xi_offset - 4); -} - -void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs, - uint round) -{ - __global char *ht 
= htabs[round % 2]; - uint i = nr_inputs - 1; - uint j = nr_inputs * 2 - 1; - uint xi_offset = xi_offset_for_round(round); - do - { - ins[j] = expand_ref(ht, xi_offset, - DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); - ins[j - 1] = expand_ref(ht, xi_offset, - DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); - if (!i) - break ; - i--; - j -= 2; - } - while (1); -} - -/* -** Verify if a potential solution is in fact valid. -*/ -void potential_sol(__global char **htabs, __global sols_t *sols, - uint ref0, uint ref1) -{ - uint sol_i; - uint nr_values; - sol_i = atomic_inc(&sols->nr); - if (sol_i >= MAX_SOLS) - return ; - sols->valid[sol_i] = 0; - nr_values = 0; - sols->values[sol_i][nr_values++] = ref0; - sols->values[sol_i][nr_values++] = ref1; - uint round = PARAM_K - 1; - do - { - round--; - expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round); - nr_values *= 2; - } - while (round > 0); - sols->valid[sol_i] = 1; -} - -/* -** Scan the hash tables to find Equihash solutions. -*/ -__kernel -void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols) -{ - uint tid = get_global_id(0); - __global char *htabs[2] = { ht0, ht1 }; - uint ht_i = (PARAM_K - 1) % 2; // table filled at last round - uint cnt; - uint xi_offset = xi_offset_for_round(PARAM_K - 1); - uint i, j; - __global char *a, *b; - uint ref_i, ref_j; - // it's ok for the collisions array to be so small, as if it fills up - // the potential solutions are likely invalid (many duplicate inputs) - ulong collisions[5]; - uint coll; -#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 - // in the final hash table, we are looking for a match on both the bits - // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
- uint mask = 0xffffff; -#else -#error "unsupported NR_ROWS_LOG" -#endif - a = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; - cnt = *(__global uint *)a; - cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round - coll = 0; - a += xi_offset; - for (i = 0; i < cnt; i++, a += SLOT_LEN) - for (j = i + 1, b = a + SLOT_LEN; j < cnt; j++, b += SLOT_LEN) - if (((*(__global uint *)a) & mask) == - ((*(__global uint *)b) & mask)) - { - ref_i = *(__global uint *)(a - 4); - ref_j = *(__global uint *)(b - 4); - if (coll < sizeof (collisions) / sizeof (*collisions)) - collisions[coll++] = ((ulong)ref_i << 32) | ref_j; - else - atomic_inc(&sols->likely_invalids); - } - if (!coll) - return ; - for (i = 0; i < coll; i++) - potential_sol(htabs, sols, collisions[i] >> 32, - collisions[i] & 0xffffffff); -} diff --git a/ocl_silentarmy/zcash/gpu/kernel.cl b/ocl_silentarmy/zcash/gpu/kernel.cl deleted file mode 100644 index 0fdc74d83..000000000 --- a/ocl_silentarmy/zcash/gpu/kernel.cl +++ /dev/null @@ -1,555 +0,0 @@ -# 1 "input.cl" -# 1 "" -# 1 "" -# 1 "/usr/include/stdc-predef.h" 1 3 4 -# 1 "" 2 -# 1 "input.cl" -# 1 "param.h" 1 -# 60 "param.h" -typedef struct sols_s -{ - uint nr; - uint likely_invalids; - uchar valid[2000]; - uint values[2000][(1 << 9)]; -} sols_t; -# 2 "input.cl" 2 -# 36 "input.cl" -__constant ulong blake_iv[] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, -}; - - - - -__kernel -void kernel_init_ht(__global char *ht) -{ - uint tid = get_global_id(0); - *(__global uint *)(ht + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0; -} -# 80 "input.cl" -uint ht_store(uint round, __global char *ht, uint i, - ulong xi0, ulong xi1, ulong xi2, ulong xi3) -{ - uint row; - __global char *p; - uint cnt; -# 111 "input.cl" - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); - else - row = ((xi0 & 0xf0000) >> 0) | - ((xi0 & 
0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); - - - - xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); - xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); - xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); - p = ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32; - cnt = atomic_inc((__global uint *)p); - if (cnt >= ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)) - return 1; - p += cnt * 32 + (8 + ((round) / 2) * 4); - - *(__global uint *)(p - 4) = i; - if (round == 0 || round == 1) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global ulong *)(p + 16) = xi2; - } - else if (round == 2) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global uint *)(p + 16) = xi2; - } - else if (round == 3 || round == 4) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - - } - else if (round == 5) - { - - *(__global ulong *)(p + 0) = xi0; - *(__global uint *)(p + 8) = xi1; - } - else if (round == 6 || round == 7) - { - - *(__global ulong *)(p + 0) = xi0; - } - else if (round == 8) - { - - *(__global uint *)(p + 0) = xi0; - } - return 0; -} -# 188 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -void kernel_round0(__global ulong *blake_state, __global char *ht, - __global uint *debug) -{ - uint tid = get_global_id(0); - ulong v[16]; - uint inputs_per_thread = (1 << (200 / (9 + 1))) / get_global_size(0); - uint input = tid * inputs_per_thread; - uint input_end = (tid + 1) * inputs_per_thread; - uint dropped = 0; - while (input < input_end) - { - - - ulong word1 = (ulong)input << 32; - - v[0] = blake_state[0]; - v[1] = blake_state[1]; - v[2] = blake_state[2]; - v[3] = blake_state[3]; - v[4] = blake_state[4]; - v[5] = blake_state[5]; - v[6] = blake_state[6]; - v[7] = blake_state[7]; - v[8] = blake_iv[0]; - v[9] = blake_iv[1]; - v[10] = blake_iv[2]; - v[11] = blake_iv[3]; - v[12] = blake_iv[4]; - v[13] = blake_iv[5]; - v[14] = blake_iv[6]; - v[15] = 
blake_iv[7]; - - v[12] ^= 140 + 4 ; - - v[14] ^= -1; - - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = 
rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), 
(ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = 
(v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); 
v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), 
(ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 
16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + word1); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + 
v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + word1); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = 
rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + word1); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), 
(ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] 
+ v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + word1); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = 
rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + word1); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); v[0] = (v[0] + v[4] + word1); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ 
v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 32); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 24); 
v[0] = (v[0] + v[4] + 0); v[12] = rotate((v[12] ^ v[0]), (ulong)64 - 16); v[8] = (v[8] + v[12]); v[4] = rotate((v[4] ^ v[8]), (ulong)64 - 63);; - v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 32); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 24); v[1] = (v[1] + v[5] + 0); v[13] = rotate((v[13] ^ v[1]), (ulong)64 - 16); v[9] = (v[9] + v[13]); v[5] = rotate((v[5] ^ v[9]), (ulong)64 - 63);; - v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 32); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 24); v[2] = (v[2] + v[6] + 0); v[14] = rotate((v[14] ^ v[2]), (ulong)64 - 16); v[10] = (v[10] + v[14]); v[6] = rotate((v[6] ^ v[10]), (ulong)64 - 63);; - v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 32); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 24); v[3] = (v[3] + v[7] + 0); v[15] = rotate((v[15] ^ v[3]), (ulong)64 - 16); v[11] = (v[11] + v[15]); v[7] = rotate((v[7] ^ v[11]), (ulong)64 - 63);; - v[0] = (v[0] + v[5] + word1); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 32); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 24); v[0] = (v[0] + v[5] + 0); v[15] = rotate((v[15] ^ v[0]), (ulong)64 - 16); v[10] = (v[10] + v[15]); v[5] = rotate((v[5] ^ v[10]), (ulong)64 - 63);; - v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 32); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 24); v[1] = (v[1] + v[6] + 0); v[12] = rotate((v[12] ^ v[1]), (ulong)64 - 16); v[11] = (v[11] + v[12]); v[6] = rotate((v[6] ^ v[11]), (ulong)64 - 63);; - v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 32); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 24); v[2] = (v[2] + v[7] + 0); v[13] = rotate((v[13] ^ v[2]), (ulong)64 - 16); v[8] = (v[8] + v[13]); v[7] = rotate((v[7] ^ v[8]), (ulong)64 - 63);; - v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 32); v[9] = (v[9] + 
v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 24); v[3] = (v[3] + v[4] + 0); v[14] = rotate((v[14] ^ v[3]), (ulong)64 - 16); v[9] = (v[9] + v[14]); v[4] = rotate((v[4] ^ v[9]), (ulong)64 - 63);; - - - - ulong h[7]; - h[0] = blake_state[0] ^ v[0] ^ v[8]; - h[1] = blake_state[1] ^ v[1] ^ v[9]; - h[2] = blake_state[2] ^ v[2] ^ v[10]; - h[3] = blake_state[3] ^ v[3] ^ v[11]; - h[4] = blake_state[4] ^ v[4] ^ v[12]; - h[5] = blake_state[5] ^ v[5] ^ v[13]; - h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; - - - - dropped += ht_store(0, ht, input * 2, - h[0], - h[1], - h[2], - h[3]); - dropped += ht_store(0, ht, input * 2 + 1, - (h[3] >> 8) | (h[4] << (64 - 8)), - (h[4] >> 8) | (h[5] << (64 - 8)), - (h[5] >> 8) | (h[6] << (64 - 8)), - (h[6] >> 8)); - - - - - input++; - } - - - - -} -# 415 "input.cl" -uint xor_and_store(uint round, __global char *ht_dst, uint row, - uint slot_a, uint slot_b, __global ulong *a, __global ulong *b) -{ - ulong xi0, xi1, xi2; - - - - if (round == 1 || round == 2) - { - - xi0 = *(a++) ^ *(b++); - xi1 = *(a++) ^ *(b++); - xi2 = *a ^ *b; - if (round == 2) - { - - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); - xi2 = (xi2 >> 8); - } - } - else if (round == 3) - { - - xi0 = *a++ ^ *b++; - xi1 = *a++ ^ *b++; - xi2 = *(__global uint *)a ^ *(__global uint *)b; - } - else if (round == 4 || round == 5) - { - - xi0 = *a++ ^ *b++; - xi1 = *a ^ *b; - xi2 = 0; - if (round == 4) - { - - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 6) - { - - xi0 = *a++ ^ *b++; - xi1 = *(__global uint *)a ^ *(__global uint *)b; - xi2 = 0; - if (round == 6) - { - - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 7 || round == 8) - { - - xi0 = *a ^ *b; - xi1 = 0; - xi2 = 0; - if (round == 8) - { - - xi0 = (xi0 >> 8); - } - } - - - if (!xi0 && !xi1) - return 0; - - - - return ht_store(round, ht_dst, ((row << 12) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)), - xi0, xi1, 
xi2, 0); -} - - - - - -void equihash_round(uint round, __global char *ht_src, __global char *ht_dst, - __global uint *debug) -{ - uint tid = get_global_id(0); - uint tlid = get_local_id(0); - __global char *p; - uint cnt; - uchar first_words[((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)]; - uchar mask; - uint i, j; - - - ushort collisions[((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 3]; - uint nr_coll = 0; - uint n; - uint dropped_coll, dropped_stor; - __global ulong *a, *b; - uint xi_offset; - - xi_offset = (8 + ((round - 1) / 2) * 4); -# 524 "input.cl" - mask = 0; - - - - p = (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32); - cnt = *(__global uint *)p; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)); - p += xi_offset; - for (i = 0; i < cnt; i++, p += 32) - first_words[i] = *(__global uchar *)p; - - nr_coll = 0; - dropped_coll = 0; - for (i = 0; i < cnt; i++) - for (j = i + 1; j < cnt; j++) - if ((first_words[i] & mask) == - (first_words[j] & mask)) - { - - if (nr_coll >= sizeof (collisions) / sizeof (*collisions)) - dropped_coll++; - else - - - collisions[nr_coll++] = - ((ushort)j << 8) | ((ushort)i & 0xff); - - - - } - - dropped_stor = 0; - for (n = 0; n < nr_coll; n++) - { - i = collisions[n] & 0xff; - j = collisions[n] >> 8; - a = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + i * 32 + xi_offset); - b = (__global ulong *) - (ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + j * 32 + xi_offset); - dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b); - } - if (round < 8) - - *(__global uint *)(ht_src + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32) = 0; - - - - -} -# 585 "input.cl" -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round2(__global char 
*ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); } -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); } - - -__kernel __attribute__((reqd_work_group_size(64, 1, 1))) -void kernel_round8(__global char *ht_src, __global char *ht_dst, - __global uint *debug, __global sols_t *sols) -{ - uint tid = get_global_id(0); - equihash_round(8, ht_src, ht_dst, debug); - if (!tid) - sols->nr = sols->likely_invalids = 0; -} - -uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) -{ - return *(__global uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32 + - slot * 32 + xi_offset - 4); -} - -void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs, - uint round) -{ - __global char *ht = htabs[round % 2]; - uint i = nr_inputs - 1; - uint j = nr_inputs * 2 - 1; - uint xi_offset = (8 + ((round) / 2) * 4); - do - { - ins[j] = expand_ref(ht, xi_offset, - (ins[i] >> 12), ((ins[i] >> 6) & 0x3f)); - ins[j - 1] = expand_ref(ht, xi_offset, - (ins[i] >> 12), (ins[i] & 0x3f)); - if (!i) - break ; - i--; - j -= 
2; - } - while (1); -} - - - - -void potential_sol(__global char **htabs, __global sols_t *sols, - uint ref0, uint ref1) -{ - uint sol_i; - uint nr_values; - sol_i = atomic_inc(&sols->nr); - if (sol_i >= 2000) - return ; - sols->valid[sol_i] = 0; - nr_values = 0; - sols->values[sol_i][nr_values++] = ref0; - sols->values[sol_i][nr_values++] = ref1; - uint round = 9 - 1; - do - { - round--; - expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round); - nr_values *= 2; - } - while (round > 0); - sols->valid[sol_i] = 1; -} - - - - -__kernel -void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols) -{ - uint tid = get_global_id(0); - __global char *htabs[2] = { ht0, ht1 }; - uint ht_i = (9 - 1) % 2; - uint cnt; - uint xi_offset = (8 + ((9 - 1) / 2) * 4); - uint i, j; - __global char *a, *b; - uint ref_i, ref_j; - - - ulong collisions[5]; - uint coll; - - - - uint mask = 0xffffff; - - - - a = htabs[ht_i] + tid * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 9) * 32; - cnt = *(__global uint *)a; - cnt = min(cnt, (uint)((1 << (((200 / (9 + 1)) + 1) - 20)) * 9)); - coll = 0; - a += xi_offset; - for (i = 0; i < cnt; i++, a += 32) - for (j = i + 1, b = a + 32; j < cnt; j++, b += 32) - if (((*(__global uint *)a) & mask) == - ((*(__global uint *)b) & mask)) - { - ref_i = *(__global uint *)(a - 4); - ref_j = *(__global uint *)(b - 4); - if (coll < sizeof (collisions) / sizeof (*collisions)) - collisions[coll++] = ((ulong)ref_i << 32) | ref_j; - else - atomic_inc(&sols->likely_invalids); - } - if (!coll) - return ; - for (i = 0; i < coll; i++) - potential_sol(htabs, sols, collisions[i] >> 32, - collisions[i] & 0xffffffff); -} diff --git a/ocl_xpm/ocl_xmp.cpp b/ocl_xpm/ocl_xmp.cpp deleted file mode 100644 index d0a96a2a8..000000000 --- a/ocl_xpm/ocl_xmp.cpp +++ /dev/null @@ -1,305 +0,0 @@ -#include "ocl_xmp.hpp" - - - -// miner instance -#include "opencl.h" -#include - -#include - -// is this really needed? 
-//#include "uint256.h" - -// hardcoded defines, looks like not working -// hardcoded defines fix this -#define RESTBITS 4 -#define XINTREE -#define UNROLL -#define __OPENCL_HOST__ -#include "zcash/gpu/common.h" - -struct MinerInstance { - cl_context _context; - cl_program _program; - - cl_command_queue queue; - clBuffer blake2bState; - clBuffer heap0; - clBuffer heap1; - clBuffer nslots; - clBuffer sols; - clBuffer numSols; - cl_kernel _digitHKernel; - cl_kernel _digitOKernel; - cl_kernel _digitEKernel; - cl_kernel _digitKKernel; - cl_kernel _digitKernels[9]; - - //hide_xmp_hack::uint256 nonce; // TODO IS THIS NEEDED???? - - bool init(cl_context context, cl_program program, cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock); -}; - -cl_context gContext = 0; -cl_program gProgram = 0; -cl_platform_id gPlatform = 0; - - -bool MinerInstance::init(cl_context context, - cl_program program, - cl_device_id dev, - unsigned int threadsNum, - unsigned int threadsPerBlock) -{ - cl_int error; - - _context = context; - _program = program; - queue = clCreateCommandQueue(context, dev, 0, &error); - - blake2bState.init(context, 1, CL_MEM_READ_WRITE); - heap0.init(context, sizeof(digit0) / sizeof(uint32_t), CL_MEM_HOST_NO_ACCESS); - heap1.init(context, sizeof(digit1) / sizeof(uint32_t), CL_MEM_HOST_NO_ACCESS); - nslots.init(context, 2, CL_MEM_READ_WRITE); - sols.init(context, MAXSOLS, CL_MEM_READ_WRITE); - numSols.init(context, 1, CL_MEM_READ_WRITE); - - _digitHKernel = clCreateKernel(program, "digitH", &error); - _digitOKernel = clCreateKernel(program, "digitOdd", &error); - _digitEKernel = clCreateKernel(program, "digitEven", &error); - _digitKKernel = clCreateKernel(program, "digitK", &error); - OCLR(clSetKernelArg(_digitHKernel, 0, sizeof(cl_mem), &blake2bState.DeviceData), 1); - OCLR(clSetKernelArg(_digitHKernel, 1, sizeof(cl_mem), &heap0.DeviceData), 1); - OCLR(clSetKernelArg(_digitHKernel, 2, sizeof(cl_mem), &nslots.DeviceData), 1); - - 
OCLR(clSetKernelArg(_digitOKernel, 1, sizeof(cl_mem), &heap0.DeviceData), 1); - OCLR(clSetKernelArg(_digitOKernel, 2, sizeof(cl_mem), &heap1.DeviceData), 1); - OCLR(clSetKernelArg(_digitOKernel, 3, sizeof(cl_mem), &nslots.DeviceData), 1); - OCLR(clSetKernelArg(_digitEKernel, 1, sizeof(cl_mem), &heap0.DeviceData), 1); - OCLR(clSetKernelArg(_digitEKernel, 2, sizeof(cl_mem), &heap1.DeviceData), 1); - OCLR(clSetKernelArg(_digitEKernel, 3, sizeof(cl_mem), &nslots.DeviceData), 1); - - for (unsigned i = 1; i <= 8; i++) { - char kernelName[32]; - sprintf(kernelName, "digit_%u", i); - _digitKernels[i] = clCreateKernel(program, kernelName, &error); - OCLR(clSetKernelArg(_digitKernels[i], 0, sizeof(cl_mem), &heap0.DeviceData), 1); - OCLR(clSetKernelArg(_digitKernels[i], 1, sizeof(cl_mem), &heap1.DeviceData), 1); - OCLR(clSetKernelArg(_digitKernels[i], 2, sizeof(cl_mem), &nslots.DeviceData), 1); - } - - OCLR(clSetKernelArg(_digitKKernel, 0, sizeof(cl_mem), &heap0.DeviceData), 1); - OCLR(clSetKernelArg(_digitKKernel, 1, sizeof(cl_mem), &heap1.DeviceData), 1); - OCLR(clSetKernelArg(_digitKKernel, 2, sizeof(cl_mem), &nslots.DeviceData), 1); - OCLR(clSetKernelArg(_digitKKernel, 3, sizeof(cl_mem), &sols.DeviceData), 1); - OCLR(clSetKernelArg(_digitKKernel, 4, sizeof(cl_mem), &numSols.DeviceData), 1); - - return true; -} - -//////////////////////////// -////statics non class START - -static void setheader(blake2b_state *ctx, const char *header, const uint32_t headerlen) -{ - uint32_t le_N = WN; - uint32_t le_K = WK; - char personal[] = "ZcashPoW01230123"; - memcpy(personal + 8, &le_N, 4); - memcpy(personal + 12, &le_K, 4); - blake2b_param P[1]; - P->digest_length = HASHOUT; - P->key_length = 0; - P->fanout = 1; - P->depth = 1; - P->leaf_length = 0; - P->node_offset = 0; - P->node_depth = 0; - P->inner_length = 0; - memset(P->reserved, 0, sizeof(P->reserved)); - memset(P->salt, 0, sizeof(P->salt)); - memcpy(P->personal, (const uint8_t *)personal, 16); - blake2b_init_param(ctx, P); - 
blake2b_update(ctx, (const uint8_t*)header, headerlen); -} - -static void setnonce(blake2b_state *ctx, const uint8_t *nonce) -{ - blake2b_update(ctx, nonce, 32); -} - -static int inline digit(cl_command_queue clQueue, cl_kernel kernel, size_t nthreads, size_t threadsPerBlock) -{ - size_t globalSize[] = { nthreads, 1, 1 }; - size_t localSize[] = { threadsPerBlock, 1 }; - OCLR(clEnqueueNDRangeKernel(clQueue, kernel, 1, 0, globalSize, localSize, 0, 0, 0), 1); - return 0; -} - - -////statics non class END -//////////////////////////// - -ocl_xmp::ocl_xmp(int platf_id, int dev_id) { /*TODO*/ - platform_id = platf_id; - device_id = dev_id; - // TODO - threadsNum = 8192; - wokrsize = 128; // 256; - //threadsperblock = 128; -} - -std::string ocl_xmp::getdevinfo() { /*TODO*/ - return "GPU_ID(" + std::to_string(device_id) + ")"; -} - -// STATICS START -int ocl_xmp::getcount() { /*TODO*/ - return 0; -} - -void ocl_xmp::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) { /*TODO*/ } - -void ocl_xmp::start(ocl_xmp& device_context) { - /*TODO*/ - device_context.is_init_success = false; - cl_context gContext[64] = { 0 }; - cl_program gProgram[64] = { 0 }; - - - std::vector allGpus; - if (!clInitialize(device_context.platform_id, allGpus)) { - return; - } - - // this is kinda stupid but it works - std::vector gpus; - for (unsigned i = 0; i < allGpus.size(); ++i) { - if (i == device_context.device_id) { - printf("Using device %d as GPU %d\n", i, (int)gpus.size()); - gpus.push_back(allGpus[i]); - } - } - - if (!gpus.size()){ - printf("Device id %d not found\n", device_context.device_id); - return; - } - - // context create - for (unsigned i = 0; i < gpus.size(); i++) { - cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)gPlatform, 0 }; - cl_int error; - gContext[i] = clCreateContext(props, 1, &gpus[i], 0, 0, &error); - //OCLR(error, false); - if (cl_int err = error) { - printf("OpenCL error: %d at %s:%d\n", err, 
__FILE__, __LINE__); - return; - } - } - - std::vector binstatus; - binstatus.resize(gpus.size()); - - for (size_t i = 0; i < gpus.size(); i++) { - char kernelName[64]; - sprintf(kernelName, "equiw200k9_gpu%u.bin", (unsigned)i); - if (!clCompileKernel(gContext[i], - gpus[i], - kernelName, - { "zcash/gpu/equihash.cl" }, - "-I./zcash/gpu -DXINTREE -DWN=200 -DWK=9 -DRESTBITS=4 -DUNROLL", - &binstatus[i], - &gProgram[i])) { - return; - } - } - - for (unsigned i = 0; i < gpus.size(); ++i) { - if (binstatus[i] == CL_SUCCESS) { - device_context.context = new MinerInstance(); - if (!device_context.context->init(gContext[i], gProgram[i], gpus[i], device_context.threadsNum, device_context.wokrsize)) { - printf("Init failed"); - return; - } - } - else { - printf("GPU %d: failed to load kernel\n", i); - return; - } - } - - device_context.is_init_success = true; -} - -void ocl_xmp::stop(ocl_xmp& device_context) { /*TODO*/ } - -void ocl_xmp::solve(const char *tequihash_header, - unsigned int tequihash_header_len, - const char* nonce, - unsigned int nonce_len, - std::function cancelf, - std::function&, size_t, const unsigned char*)> solutionf, - std::function hashdonef, - ocl_xmp& device_context) { - if (device_context.is_init_success == false) { - printf("fail OCL\n"); - //cancelf(); - return; - } - - // move to context or somewhere or leave? 
- blake2b_state initialCtx; - setheader(&initialCtx, tequihash_header, tequihash_header_len); - - MinerInstance *miner = device_context.context; - clFlush(miner->queue); - - /*hide_xmp_hack::uint256 nNonce = hide_xmp_hack::uint256(nonce); - miner->nonce = nNonce;*/ - *miner->blake2bState.HostData = initialCtx; - setnonce(miner->blake2bState.HostData, (const uint8_t*)nonce); - memset(miner->nslots.HostData, 0, 2 * sizeof(bsizes)); - *miner->numSols.HostData = 0; - miner->blake2bState.copyToDevice(miner->queue, false); - miner->nslots.copyToDevice(miner->queue, false); - miner->numSols.copyToDevice(miner->queue, false); - - digit(miner->queue, miner->_digitHKernel, device_context.threadsNum, device_context.wokrsize); -#if BUCKBITS == 16 && RESTBITS == 4 && defined XINTREE && defined(UNROLL) - for (unsigned i = 1; i <= 8; i++) - digit(miner->queue, miner->_digitKernels[i], device_context.threadsNum, device_context.wokrsize); -#else - size_t globalSize[] = { _threadsNum, 1, 1 }; - size_t localSize[] = { _threadsPerBlocksNum, 1 }; - for (unsigned r = 1; r < WK; r++) { - if (r & 1) { - OCL(clSetKernelArg(miner->_digitOKernel, 0, sizeof(cl_uint), &r)); - OCL(clEnqueueNDRangeKernel(miner->queue, miner->_digitOKernel, 1, 0, globalSize, localSize, 0, 0, 0)); - } - else { - OCL(clSetKernelArg(miner->_digitEKernel, 0, sizeof(cl_uint), &r)); - OCL(clEnqueueNDRangeKernel(miner->queue, miner->_digitEKernel, 1, 0, globalSize, localSize, 0, 0, 0)); - } - } -#endif - digit(miner->queue, miner->_digitKKernel, device_context.threadsNum, device_context.wokrsize); - - // get solutions - miner->sols.copyToHost(miner->queue, true); - miner->numSols.copyToHost(miner->queue, true); - for (unsigned s = 0; s < miner->numSols.HostData[0]; s++) - { - std::vector index_vector(PROOFSIZE); - for (u32 i = 0; i < PROOFSIZE; i++) { - index_vector[i] = miner->sols[s][i]; - } - - solutionf(index_vector, DIGITBITS, nullptr); - if (cancelf()) return; - } - hashdonef(); -} - -// STATICS END \ No newline 
at end of file diff --git a/ocl_xpm/ocl_xpm.vcxproj b/ocl_xpm/ocl_xpm.vcxproj deleted file mode 100644 index 7c2c299c5..000000000 --- a/ocl_xpm/ocl_xpm.vcxproj +++ /dev/null @@ -1,100 +0,0 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - - - - - - - - - {5EC9EDEB-8E49-4126-9161-1560683CBC71} - Win32Proj - ocl_xpm - - - - StaticLibrary - true - v120 - MultiByte - - - StaticLibrary - false - v120 - true - MultiByte - - - - - - - - - - - - - true - - - false - - - - NotUsing - Level3 - Disabled - WIN32;_DEBUG;_WINDOWS;_USRDLL;OCL_XPM_EXPORTS;%(PreprocessorDefinitions) - - - ..\ocl_device_utils;..\cpu_tromp;..\3rdparty\include;$(AMDAPPSDKROOT)\include\ - - - Windows - true - OpenCL.lib;%(AdditionalDependencies) - ..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\ - - - - - Level3 - NotUsing - MaxSpeed - true - true - WIN32;NDEBUG;_WINDOWS;_USRDLL;OCL_XPM_EXPORTS;%(PreprocessorDefinitions) - - - ..\ocl_device_utils;..\3rdparty\include;$(AMDAPPSDKROOT)\include\;%(AdditionalIncludeDirectories) - - - Windows - true - true - true - OpenCL.lib;%(AdditionalDependencies) - ..\3rdparty\libs\win64;$(AMDAPPSDKROOT)\lib\x86_64\ - - - - - - \ No newline at end of file diff --git a/ocl_xpm/ocl_xpm.vcxproj.filters b/ocl_xpm/ocl_xpm.vcxproj.filters deleted file mode 100644 index ae440bef8..000000000 --- a/ocl_xpm/ocl_xpm.vcxproj.filters +++ /dev/null @@ -1,26 +0,0 @@ - - - - - {69f1aa4c-1be3-4265-a93c-b58266bad10b} - - - {a95c2e64-90c0-48d9-9287-46723392025d} - - - - - - zcash\gpu - - - - - - - - - zcash\gpu - - - \ No newline at end of file diff --git a/ocl_xpm/zcash/gpu/blake2bcl.h b/ocl_xpm/zcash/gpu/blake2bcl.h deleted file mode 100644 index 13cad965c..000000000 --- a/ocl_xpm/zcash/gpu/blake2bcl.h +++ /dev/null @@ -1,150 +0,0 @@ -// Blake2-B CUDA Implementation -// tpruvot@github July 2016 -// permission granted to use under MIT license -// modified for use in Zcash by John Tromp September 2016 - -/** - * uint2 direct ops by c++ operator definitions - */ - -// static 
__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { -// return make_uint2(a.x ^ b.x, a.y ^ b.y); -// } - -// uint2 ROR/ROL methods -uint2 ROR2(const uint2 a, const int offset) { - uint2 result; - if (!offset) - result = a; - else if (offset < 32) { - result.y = ((a.y >> offset) | (a.x << (32 - offset))); - result.x = ((a.x >> offset) | (a.y << (32 - offset))); - } else if (offset == 32) { - result.y = a.x; - result.x = a.y; - } else { - result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); - result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); - } - return result; -} - -uint2 SWAPUINT2(uint2 value) { - uint2 result; - result.x = value.y; - result.y = value.x; - return result; -// return make_uint2(value.y, value.x); -} - -#define ROR24(u) ROR2(u,24) -#define ROR16(u) ROR2(u,16) - -__constant int8_t blake2b_sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; - -void G(const int32_t r, const int32_t i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, uint64_t const m[16]) { - *a += *b + m[ blake2b_sigma[r][2*i] ]; - ((uint2*)d)[0] = SWAPUINT2( ((uint2*)d)[0] ^ ((uint2*)a)[0] ); - *c += *d; - ((uint2*)b)[0] = ROR24( ((uint2*)b)[0] ^ ((uint2*)c)[0] ); - *a += *b + m[ blake2b_sigma[r][2*i+1] ]; - ((uint2*)d)[0] = ROR16( ((uint2*)d)[0] ^ ((uint2*)a)[0] 
); - *c += *d; - ((uint2*)b)[0] = ROR2( ((uint2*)b)[0] ^ ((uint2*)c)[0], 63U); -} - -#define ROUND(r) \ - G(r, 0, &v[0], &v[4], &v[ 8], &v[12], m); \ - G(r, 1, &v[1], &v[5], &v[ 9], &v[13], m); \ - G(r, 2, &v[2], &v[6], &v[10], &v[14], m); \ - G(r, 3, &v[3], &v[7], &v[11], &v[15], m); \ - G(r, 4, &v[0], &v[5], &v[10], &v[15], m); \ - G(r, 5, &v[1], &v[6], &v[11], &v[12], m); \ - G(r, 6, &v[2], &v[7], &v[ 8], &v[13], m); \ - G(r, 7, &v[3], &v[4], &v[ 9], &v[14], m); - -void blake2b_gpu_hash(blake2b_state *state, uint32_t idx, uint8_t *hash, uint32_t outlen) { - const uint32_t leb = idx; - *(uint32_t*)(state->buf + state->buflen) = leb; - state->buflen += 4; - state->counter += state->buflen; - for (unsigned i = 0; i < BLAKE2B_BLOCKBYTES - state->buflen; i++) - state->buf[i+state->buflen] = 0; - - uint64_t *d_data = (uint64_t *)state->buf; - uint64_t m[16]; - - m[0] = d_data[0]; - m[1] = d_data[1]; - m[2] = d_data[2]; - m[3] = d_data[3]; - m[4] = d_data[4]; - m[5] = d_data[5]; - m[6] = d_data[6]; - m[7] = d_data[7]; - m[8] = d_data[8]; - m[9] = d_data[9]; - m[10] = d_data[10]; - m[11] = d_data[11]; - m[12] = d_data[12]; - m[13] = d_data[13]; - m[14] = d_data[14]; - m[15] = d_data[15]; - - uint64_t v[16]; - - v[0] = state->h[0]; - v[1] = state->h[1]; - v[2] = state->h[2]; - v[3] = state->h[3]; - v[4] = state->h[4]; - v[5] = state->h[5]; - v[6] = state->h[6]; - v[7] = state->h[7]; - v[8] = 0x6a09e667f3bcc908; - v[9] = 0xbb67ae8584caa73b; - v[10] = 0x3c6ef372fe94f82b; - v[11] = 0xa54ff53a5f1d36f1; - v[12] = 0x510e527fade682d1 ^ state->counter; - v[13] = 0x9b05688c2b3e6c1f; - v[14] = 0x1f83d9abfb41bd6b ^ 0xffffffffffffffff; - v[15] = 0x5be0cd19137e2179; - - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - - state->h[0] ^= v[0] ^ v[ 8]; - state->h[1] ^= v[1] ^ v[ 9]; - state->h[2] ^= v[2] ^ v[10]; - state->h[3] ^= v[3] ^ v[11]; - state->h[4] ^= v[4] ^ 
v[12]; - state->h[5] ^= v[5] ^ v[13]; - state->h[6] ^= v[6] ^ v[14]; - state->h[7] ^= v[7] ^ v[15]; - - for (unsigned i = 0; i < outlen; i++) - hash[i] = ((uint8_t*)state->h)[i]; -} diff --git a/ocl_xpm/zcash/gpu/common.h b/ocl_xpm/zcash/gpu/common.h deleted file mode 100644 index 8c7727406..000000000 --- a/ocl_xpm/zcash/gpu/common.h +++ /dev/null @@ -1,159 +0,0 @@ -#if defined(__OPENCL_HOST__) -#define __global -//#include "blake2/blake2.h" -//#include "equi.h" -#include "../cpu_tromp/equi.h" - -#else -typedef char int8_t; -typedef uchar uint8_t; -typedef short int16_t; -typedef ushort uint16_t; -typedef int int32_t; -typedef uint uint32_t; -typedef long int64_t; -typedef ulong uint64_t; - -#if defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#else -#define ALIGN(x) __attribute__ ((__aligned__(x))) -#endif - -enum blake2b_constant -{ - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 -}; - -#pragma pack(push, 1) -ALIGN( 64 ) typedef struct __blake2b_state { - uint64_t h[8]; - uint8_t buf[BLAKE2B_BLOCKBYTES]; - uint16_t counter; - uint8_t buflen; - uint8_t lastblock; -} blake2b_state; -#pragma pack(pop) -#endif - -#define COLLISION_BIT_LENGTH (WN / (WK+1)) -#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH+7)/8) -#define FINAL_FULL_WIDTH (2*COLLISION_BYTE_LENGTH+sizeof(uint32_t)*(1 << (WK))) - - -#define NDIGITS (WK+1) -#define DIGITBITS (WN/(NDIGITS)) -//#define PROOFSIZE (1u< 64 -#error cant use XBITMAP with more than 64 slots -#endif - uint64_t xhashmap[NRESTS]; - uint64_t xmap; -#else - xslot nxhashslots[NRESTS]; - xslot xhashslots[NRESTS][XFULL]; - xslot *xx; - uint32_t n0; - uint32_t n1; -#endif - uint32_t s0; -} collisiondata; - - -typedef struct equi { - blake2b_state blake_ctx; - htalloc hta; - __global bsizes *nslots; - __global proof *sols; - uint32_t nsols; - uint32_t nthreads; -} equi; diff --git a/ocl_xpm/zcash/gpu/equihash.cl b/ocl_xpm/zcash/gpu/equihash.cl 
deleted file mode 100644 index 213a8e4d6..000000000 --- a/ocl_xpm/zcash/gpu/equihash.cl +++ /dev/null @@ -1,1038 +0,0 @@ -#include "common.h" - -#include "blake2bcl.h" - -#define tree0_ptr(heap, r) ((__global bucket0 *)(heap + r)) -#define tree1_ptr(heap, r) ((__global bucket1 *)(heap + r)) - -uint32_t tree_bucket(tree t) -{ - const uint32_t bucketMask = ((1u<> BUCKBITS) & SLOTMASK; -} - -uint32_t tree_slotid1(tree t) -{ - const uint32_t slotMask = ((1u<> (BUCKBITS+SLOTBITS)) & SLOTMASK; -} - -uint32_t tree_xhash(tree t) -{ - return t >> (2*SLOTBITS + BUCKBITS); -} - -uint32_t tree_getindex(const tree t) -{ - const uint32_t bucketMask = ((1u<> BUCKBITS); -} - -void tree_setindex(tree *t, uint32_t idx) -{ - const uint32_t bucketMask = ((1u<> SLOTBITS); - (*t) |= ((idx & slotMask) << BUCKBITS); -} - -void tree_setxhash(tree *t, uint32_t xhash) -{ - const uint32_t xhashMask = ((1u << RESTBITS)-1); - (*t) &= ~(xhashMask << (2*SLOTBITS + BUCKBITS)); - (*t) |= (xhash << (2*SLOTBITS + BUCKBITS)); -} - -tree tree_create3(uint32_t bucketId, uint32_t s0, uint32_t s1) -{ - return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)); -} - -tree tree_create4(uint32_t bucketId, uint32_t s0, uint32_t s1, uint32_t xhash) -{ - return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)) | (xhash << (2*SLOTBITS+BUCKBITS));; -} - -// size (in bytes) of hash in round 0 <= r < WK -uint32_t hashsize(const uint32_t r) -{ -#ifdef XINTREE - const uint32_t hashbits = WN - (r+1) * DIGITBITS; -#else - const uint32_t hashbits = WN - (r+1) * DIGITBITS + RESTBITS; -#endif - return (hashbits + 7) / 8; -} - -uint32_t hashwords(uint32_t bytes) -{ - return (bytes + 3) / 4; -} - -htlayout htlayout_create_2(uint32_t r) -{ - htlayout R; - R.prevhashunits = 0; - R.dunits = 0; - - uint32_t nexthashbytes = hashsize(r); - R.nexthashunits = hashwords(nexthashbytes); - - R.prevbo = 0; - R.nextbo = R.nexthashunits * sizeof(hashunit) - nexthashbytes; // 0-3 - if (r) { - uint32_t prevhashbytes = 
hashsize(r-1); - R.prevhashunits = hashwords(prevhashbytes); - R.prevbo = R.prevhashunits * sizeof(hashunit) - prevhashbytes; // 0-3 - R.dunits = R.prevhashunits - R.nexthashunits; - } - - return R; -} - -uint32_t htlayout_getxhash0(uint32_t prevbo, __global const slot0 *pslot) -{ -#ifdef XINTREE - return tree_xhash(pslot->attr); -#elif WN == 200 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] >> 4; -#elif WN == 200 && RESTBITS == 8 - return (pslot->hash->bytes[prevbo] & 0xf) << 4 | pslot->hash->bytes[prevbo+1] >> 4; -#elif WN == 144 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 6 - return (pslot->hash->bytes[prevbo] & 0x3) << 4 | pslot->hash->bytes[prevbo+1] >> 4; -#else -#error non implemented -#endif -} - -uint32_t htlayout_getxhash1(uint32_t prevbo, __global const slot1 *pslot) -{ -#ifdef XINTREE - return tree_xhash(pslot->attr); -#elif WN == 200 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 8 - return pslot->hash->bytes[prevbo]; -#elif WN == 144 && RESTBITS == 4 - return pslot->hash->bytes[prevbo] & 0xf; -#elif WN == 200 && RESTBITS == 6 - return pslot->hash->bytes[prevbo] & 0x3f; -#else -#error non implemented -#endif -} - -bool htlayout_equal(uint32_t prevhashunits, __global const hashunit *hash0, __global const hashunit *hash1) -{ - return hash0[prevhashunits-1].word == hash1[prevhashunits-1].word; -} - -void collisiondata_clear(collisiondata *data) -{ -#ifdef XBITMAP - // memset(xhashmap, 0, NRESTS * sizeof(u64)); - for (unsigned i = 0; i < NRESTS; i++) - data->xhashmap[i] = 0; -#else - // memset(nxhashslots, 0, NRESTS * sizeof(xslot)); - for (unsigned i = 0; i < NRESTS; i++) - data->nxhashslots[i] = 0; -#endif -} - -bool collisiondata_addslot(collisiondata *data, uint32_t s1, uint32_t xh) -{ -#ifdef XBITMAP - data->xmap = data->xhashmap[xh]; - data->xhashmap[xh] |= (uint64_t)1 << s1; - data->s0 = ~0; - return true; -#else - data->n1 = 
(uint32_t)data->nxhashslots[xh]++; - if (data->n1 >= XFULL) - return false; - data->xx = data->xhashslots[xh]; - data->xx[data->n1] = s1; - data->n0 = 0; - return true; -#endif -} - -bool collisiondata_nextcollision(collisiondata *data) -{ -#ifdef XBITMAP - return data->xmap != 0; -#else - return data->n0 < data->n1; -#endif -} - -uint64_t __ffsll(uint64_t x) -{ - return x ? (64 - clz(x & -x)) : 0; -} - -uint32_t collisiondata_slot(collisiondata *data) { -#ifdef XBITMAP - const uint32_t ffs = __ffsll(xmap); - data->s0 += ffs; - data->xmap >>= ffs; - return data->s0; -#else - return (uint32_t)data->xx[data->n0++]; -#endif -} - -uint32_t equi_getnslots(__global bsizes *nslots, const uint32_t r, const uint32_t bid) -{ - __global uint32_t *nslot = &nslots[r&1][bid]; - const uint32_t n = min(*nslot, NSLOTS); - *nslot = 0; - return n; -} - -void equi_orderindices(__global uint32_t *indices, uint32_t size) -{ - if (indices[0] > indices[size]) { - for (uint32_t i = 0; i < size; i++) { - const uint32_t tmp = indices[i]; - indices[i] = indices[size+i]; - indices[size+i] = tmp; - } - } -} - -void local_orderindices(uint32_t *indices, uint32_t size) -{ - if (indices[0] > indices[size]) { - for (uint32_t i = 0; i < size; i++) { - const uint32_t tmp = indices[i]; - indices[i] = indices[size+i]; - indices[size+i] = tmp; - } - } -} - - -void equi_listindices1(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 0; - indices[0] = tree_getindex((*buck)[tree_slotid0(t)].attr); - indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr); - equi_orderindices(indices, size); -} - -void equi_listindices2(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 1; - equi_listindices1(heap0, 
heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices3(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 2; - equi_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices4(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 3; - equi_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices5(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 4; - equi_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices6(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 5; - equi_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices7(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ 
- const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 6; - equi_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices8(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 7; - equi_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void equi_listindices9(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - __global uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)]; - const uint32_t size = 1 << 8; - equi_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - equi_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - equi_orderindices(indices, size); -} - -void local_listindices1(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 0; - indices[0] = tree_getindex((*buck)[tree_slotid0(t)].attr); - indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr); - local_orderindices(indices, size); -} - -void local_listindices2(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)]; - const uint32_t size = 1 << 1; - local_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices3(__global 
uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 2; - local_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices4(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)]; - const uint32_t size = 1 << 3; - local_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices5(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 4; - local_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices6(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)]; - const uint32_t size = 1 << 5; - local_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices7(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 6; - local_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices6(heap0, heap1, 
(*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices8(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)]; - const uint32_t size = 1 << 7; - local_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -void local_listindices9(__global uint32_t *heap0, - __global uint32_t *heap1, - const tree t, - uint32_t *indices) -{ - const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)]; - const uint32_t size = 1 << 8; - local_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices); - local_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size); - local_orderindices(indices, size); -} - -// proper dupe test is a little costly on GPU, so allow false negatives -bool equi_probdupe(uint32_t *prf) { - unsigned short susp[PROOFSIZE]; - for (unsigned i = 0; i < PROOFSIZE; i++) - susp[i] = 0xFFFF; - - for (unsigned i = 0; i < PROOFSIZE; i++) { - uint32_t bin = prf[i] & (PROOFSIZE-1); - unsigned short msb = prf[i] >> WK; - if (msb == susp[bin]) - return true; - susp[bin] = msb; - } - - return false; -} - -void equi_candidate(__global uint32_t *heap0, - __global uint32_t *heap1, - __global proof *sols, - __global uint32_t *nsols, - const tree t) -{ - proof prf; -#if WK==9 - local_listindices9(heap0, heap1, t, (uint32_t*)&prf); -#elif WK==5 - local_listindices5(heap0, heap1, t, (uint32_t*)&prf); -#else -#error not implemented -#endif - if (equi_probdupe(prf)) - return; - uint32_t soli = atomic_inc(nsols); - if (soli < MAXSOLS) -#if WK==9 - equi_listindices9(heap0, heap1, t, sols[soli]); -#elif WK==5 - equi_listindices5(heap0, heap1, t, sols[soli]); -#else -#error not implemented -#endif -} - - -__kernel void digitH(__global blake2b_state 
*blake2bState, - __global const uint32_t *heap0, - __global bsizes *nslots) -{ - uint8_t hash[HASHOUT]; - blake2b_state state; - // equi::htlayout htl(eq, 0); - htlayout htl = htlayout_create_2(0); - const uint32_t hashbytes = hashsize(0); - // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x; - const uint32_t id = get_global_id(0); - for (uint32_t block = id; block < NBLOCKS; block += get_global_size(0)) { - state = *blake2bState; - blake2b_gpu_hash(&state, block, hash, HASHOUT); - for (uint32_t i = 0; i < HASHESPERBLAKE; i++) { - const uint8_t *ph = hash + i * WN/8; -#if BUCKBITS == 16 && RESTBITS == 4 - const uint32_t bucketid = ((uint32_t)ph[0] << 8) | ph[1]; -#ifdef XINTREE - const uint32_t xhash = ph[2] >> 4; -#endif -#elif BUCKBITS == 14 && RESTBITS == 6 - const uint32_t bucketid = ((uint32_t)ph[0] << 6) | ph[1] >> 2; -#elif BUCKBITS == 12 && RESTBITS == 8 - const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4; -#elif BUCKBITS == 20 && RESTBITS == 4 - const uint32_t bucketid = ((((uint32_t)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4; -#ifdef XINTREE - const uint32_t xhash = ph[2] & 0xf; -#endif -#elif BUCKBITS == 12 && RESTBITS == 4 - const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4; - const uint32_t xhash = ph[1] & 0xf; -#else -#error not implemented -#endif - const uint32_t slot = atomic_inc(&nslots[0][bucketid]); - if (slot >= NSLOTS) - continue; - tree leaf; - tree_setindex(&leaf, block*HASHESPERBLAKE+i); -#ifdef XINTREE - tree_setxhash(&leaf, xhash); -#endif - __global slot0 *s = &tree0_ptr(heap0, 0)[bucketid][slot]; - s->attr = leaf; - - // memcpy(s.hash->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes); - for (unsigned i = 0; i < hashbytes; i++) - ((__global uint8_t*)s->hash->bytes+htl.nextbo)[i] = ((uint8_t*)(ph+WN/8-hashbytes))[i]; - } - } -} - -__kernel void digitOdd(const uint32_t r, - __global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) -{ - // equi::htlayout htl(eq, r); -// htlayout htl = 
htlayout_create(eq, r); - htlayout htl = htlayout_create_2(r); - collisiondata cd; - - // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x; - const uint32_t id = get_global_id(0); - - for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - // cd.clear(); - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[(r-1)/2][bucketid]; // optimize by updating previous buck?! - __global slot0 *buck = tree0_ptr(heap0, (r-1)/2)[bucketid]; // optimize by updating previous buck?! - uint32_t bsize = equi_getnslots(nslots, r-1, bucketid); // optimize by putting bucketsize with block?! - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; // optimize by updating previous pslot1?! - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; -#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE) - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; -#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4 - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4) - | (xhash = bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4; - xhash &= 0xf; -#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4 - xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4) - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; -#elif WN == 200 && BUCKBITS 
== 14 && RESTBITS == 6 - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 2 - | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 6; -#else -#error not implemented -#endif - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; -#ifdef XINTREE - tree xort = tree_create4(bucketid, s0, s1, xhash); -#else - tree xort = tree_create3(bucketid, s0, s1); -#endif -// __global slot1 *xs = &htl.hta.trees1[r/2][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, r/2)[xorbucketid][xorslot]; - xs->attr = xort; - for (uint32_t i = htl.dunits; i < htl.prevhashunits; i++) - xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word; - } - } - } -} - - -__kernel void digitEven(const uint32_t r, - __global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) -{ - // equi::htlayout htl(eq, r); -// htlayout htl = htlayout_create(eq, r); - htlayout htl = htlayout_create_2(r); - collisiondata cd; - - // const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x; - const uint32_t id = get_global_id(0); - - for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - // cd.clear(); - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[(r-1)/2][bucketid]; // OPTIMIZE BY UPDATING PREVIOUS - __global slot1 *buck = tree1_ptr(heap1, (r-1)/2)[bucketid]; // OPTIMIZE BY UPDATING PREVIOUS - uint32_t bsize = equi_getnslots(nslots, r-1, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; // OPTIMIZE BY UPDATING PREVIOUS - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t 
xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; -#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE) - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; -#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4 - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4) - | (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4; -#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4 - xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; -#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6 - xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 6) - | (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 2; -#else -#error not implemented -#endif - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; -#ifdef XINTREE - tree xort = tree_create4(bucketid, s0, s1, xhash); -#else - tree xort = tree_create3(bucketid, s0, s1); -#endif -// __global slot0 *xs = &htl.hta.trees0[r/2][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, r/2)[xorbucketid][xorslot]; - xs->attr = xort; - for (uint32_t i=htl.dunits; i < htl.prevhashunits; i++) - xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word; - } - } - } -} - - -#ifdef UNROLL - -__kernel void digit_1(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) -{ - htlayout htl = htlayout_create_2(1); - collisiondata cd; - const uint32_t id = get_global_id(0); - - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); - __global slot0 *buck = tree0_ptr(heap0, 0)[bucketid]; - uint32_t bsize = 
equi_getnslots(nslots, 0, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[0][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 0)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word; - xs->hash[4].word = pslot0->hash[5].word ^ pslot1->hash[5].word; - } - } - } -} -__kernel void digit_2(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(2); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[0][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 0)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 1, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = 
buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); - // __global slot0 *xs = &htl.hta.trees0[1][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 1)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word; - xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - xs->hash[4].word = pslot0->hash[4].word ^ pslot1->hash[4].word; - } - } - } -} -__kernel void digit_3(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(3); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[1][bucketid]; - __global slot0 *buck = tree0_ptr(heap0, 1)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 2, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) 
{ - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[1][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 1)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word; - } - } - } -} -__kernel void digit_4(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(4); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[1][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 1)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 3, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - 
uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot0 *xs = &htl.hta.trees0[2][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 2)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word; - xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - } - } - } -} -__kernel void digit_5(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(5); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[2][bucketid]; - __global slot0 *buck = tree0_ptr(heap0, 2)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 4, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - 
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[2][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 2)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word; - } - } - } -} -__kernel void digit_6(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(6); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[2][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 2)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 5, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, 
xhash); -// __global slot0 *xs = &htl.hta.trees0[3][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 3)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word; - } - } - } -} -__kernel void digit_7(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(7); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot0 *buck = htl.hta.trees0[3][bucketid]; - __global slot0 *buck = tree0_ptr(heap0, 3)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 6, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4 - | (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - xhash &= 0xf; - const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot1 *xs = &htl.hta.trees1[3][xorbucketid][xorslot]; - __global slot1 *xs = &tree1_ptr(heap1, 3)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word; - xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - } - } - } -} 
-__kernel void digit_8(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots) { - htlayout htl = htlayout_create_2(8); - collisiondata cd; - const uint32_t id = get_global_id(0); - for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - collisiondata_clear(&cd); -// __global slot1 *buck = htl.hta.trees1[3][bucketid]; - __global slot1 *buck = tree1_ptr(heap1, 3)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, 7, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot1 *pslot1 = buck + s1; // OPTIMIZE BY UPDATING PREVIOUS - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1))) - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot1 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) - continue; - uint32_t xorbucketid; - uint32_t xhash; - __global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes; - xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8) - | (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]); - xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4; - const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]); - if (xorslot >= NSLOTS) - continue; - tree xort = tree_create4(bucketid, s0, s1, xhash); -// __global slot0 *xs = &htl.hta.trees0[4][xorbucketid][xorslot]; - __global slot0 *xs = &tree0_ptr(heap0, 4)[xorbucketid][xorslot]; - xs->attr = xort; - xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word; - } - } - } -} -#endif //UNROLL - -__kernel void digitK(__global uint32_t *heap0, - __global uint32_t *heap1, - __global bsizes *nslots, - __global proof *sols, - __global uint32_t *nsols) { - collisiondata cd; - htlayout htl = htlayout_create_2(WK); - const uint32_t id = get_global_id(0); - for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) { - 
collisiondata_clear(&cd); - __global slot0 *buck = tree0_ptr(heap0, (WK-1)/2)[bucketid]; - uint32_t bsize = equi_getnslots(nslots, WK-1, bucketid); - for (uint32_t s1 = 0; s1 < bsize; s1++) { - __global const slot0 *pslot1 = buck + s1; - if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) // assume WK odd - continue; - for (; collisiondata_nextcollision(&cd); ) { - const uint32_t s0 = collisiondata_slot(&cd); - __global const slot0 *pslot0 = buck + s0; - if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) { - tree xort = tree_create3(bucketid, s0, s1); - equi_candidate(heap0, heap1, sols, nsols, xort); - } - } - } - } -}