forked from nicehash/nheqminer
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
maztheman
committed
Dec 20, 2016
1 parent
b860e27
commit 1c57e7b
Showing
44 changed files
with
19,292 additions
and
170 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
#pragma once | ||
|
||
#include <cassert> | ||
|
||
namespace blake { | ||
|
||
|
||
namespace impl { | ||
|
||
static const uint32_t blake2b_block_len = 128; | ||
static const uint32_t blake2b_rounds = 12; | ||
static const uint64_t blake2b_iv[8] = | ||
{ | ||
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, | ||
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, | ||
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, | ||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, | ||
}; | ||
|
||
static const uint8_t blake2b_sigma[12][16] = | ||
{ | ||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, | ||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, | ||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, | ||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, | ||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, | ||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, | ||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, | ||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, | ||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, | ||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, | ||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, | ||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, | ||
}; | ||
|
||
inline uint64_t rotr64(uint64_t a, uint8_t bits) | ||
{ | ||
return (a >> bits) | (a << (64 - bits)); | ||
} | ||
|
||
inline void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd, | ||
uint64_t x, uint64_t y) | ||
{ | ||
*va = (*va + *vb + x); | ||
*vd = rotr64(*vd ^ *va, 32); | ||
*vc = (*vc + *vd); | ||
*vb = rotr64(*vb ^ *vc, 24); | ||
*va = (*va + *vb + y); | ||
*vd = rotr64(*vd ^ *va, 16); | ||
*vc = (*vc + *vd); | ||
*vb = rotr64(*vb ^ *vc, 63); | ||
} | ||
|
||
} | ||
|
||
|
||
typedef struct blake2b_state_s | ||
{ | ||
uint64_t h[8]; | ||
uint64_t bytes; | ||
} blake2b_state_t; | ||
|
||
inline void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k) | ||
{ | ||
using namespace blake::impl; | ||
|
||
assert(n > k); | ||
assert(hash_len <= 64); | ||
st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); | ||
for (uint32_t i = 1; i <= 5; i++) | ||
st->h[i] = blake2b_iv[i]; | ||
st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; | ||
st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); | ||
st->bytes = 0; | ||
|
||
} | ||
|
||
inline void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, uint32_t msg_len, uint32_t is_final) | ||
{ | ||
using namespace blake::impl; | ||
|
||
const uint64_t *m = (const uint64_t *)_msg; | ||
uint64_t v[16]; | ||
assert(msg_len <= 128); | ||
assert(st->bytes <= UINT64_MAX - msg_len); | ||
memcpy(v + 0, st->h, 8 * sizeof (*v)); | ||
memcpy(v + 8, blake2b_iv, 8 * sizeof (*v)); | ||
v[12] ^= (st->bytes += msg_len); | ||
v[14] ^= is_final ? -1 : 0; | ||
for (uint32_t round = 0; round < blake2b_rounds; round++) | ||
{ | ||
const uint8_t *s = blake2b_sigma[round]; | ||
mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]); | ||
mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]); | ||
mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]); | ||
mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]); | ||
mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]); | ||
mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]); | ||
mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]); | ||
mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]); | ||
} | ||
for (uint32_t i = 0; i < 8; i++) | ||
st->h[i] ^= v[i] ^ v[i + 8]; | ||
} | ||
|
||
inline void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen) | ||
{ | ||
assert(outlen <= 64); | ||
memcpy(out, st->h, outlen); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#pragma once | ||
|
||
namespace ocl { | ||
namespace algorithm { | ||
|
||
template<int _PREFIX> | ||
inline void compress(uint8_t *out, uint32_t *inputs, uint32_t n) | ||
{ | ||
uint32_t byte_pos = 0; | ||
int32_t bits_left = _PREFIX + 1; | ||
uint8_t x = 0; | ||
uint8_t x_bits_used = 0; | ||
uint8_t *pOut = out; | ||
while (byte_pos < n) | ||
{ | ||
if (bits_left >= 8 - x_bits_used) | ||
{ | ||
x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used); | ||
bits_left -= 8 - x_bits_used; | ||
x_bits_used = 8; | ||
} | ||
else if (bits_left > 0) | ||
{ | ||
uint32_t mask = ~(-1 << (8 - x_bits_used)); | ||
mask = ((~mask) >> bits_left) & mask; | ||
x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask; | ||
x_bits_used += bits_left; | ||
bits_left = 0; | ||
} | ||
else if (bits_left <= 0) | ||
{ | ||
assert(!bits_left); | ||
byte_pos++; | ||
bits_left = _PREFIX + 1; | ||
} | ||
if (x_bits_used == 8) | ||
{ | ||
*pOut++ = x; | ||
x = x_bits_used = 0; | ||
} | ||
} | ||
} | ||
|
||
|
||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#pragma once | ||
|
||
namespace ocl { | ||
namespace algorithm { | ||
namespace algorithm_detail { | ||
|
||
struct gatelessgate_context { | ||
|
||
|
||
|
||
}; | ||
|
||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
#pragma once | ||
#include <ocl/opencl.hpp> | ||
#include <ocl/sols.hpp> | ||
#include <cstdint> | ||
|
||
namespace ocl { | ||
namespace algorithm { | ||
namespace algorithm_detail { | ||
|
||
|
||
|
||
struct silentarmy_context { | ||
cl_context _context; | ||
cl_program _program; | ||
cl_device_id _dev_id; | ||
cl_platform_id platform_id = 0; | ||
cl_command_queue queue; | ||
|
||
|
||
cl_kernel k_init_ht; | ||
cl_kernel k_rounds[SA_PARAM_K]; | ||
cl_kernel k_sols; | ||
|
||
cl_mem buf_ht[2], buf_sols, buf_dbg, rowCounters[2]; | ||
size_t global_ws; | ||
size_t local_work_size = 64; | ||
|
||
sols_t *sols; | ||
|
||
bool init(cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock) { | ||
cl_int error; | ||
|
||
queue = clCreateCommandQueue(_context, dev, 0, &error); | ||
|
||
#ifdef SA_ENABLE_DEBUG | ||
size_t dbg_size = SA_NR_ROWS; | ||
#else | ||
size_t dbg_size = 1; | ||
#endif | ||
|
||
buf_dbg = check_clCreateBuffer(_context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, dbg_size, NULL); | ||
buf_ht[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL); | ||
buf_ht[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL); | ||
buf_sols = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, sizeof(sols_t), NULL); | ||
|
||
rowCounters[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL); | ||
rowCounters[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL); | ||
|
||
|
||
|
||
fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * SA_HT_SIZE / 1e6); | ||
|
||
k_init_ht = clCreateKernel(_program, "kernel_init_ht", &error); | ||
for (unsigned i = 0; i < SA_PARAM_K; i++) { | ||
char kernelName[128]; | ||
sprintf(kernelName, "kernel_round%d", i); | ||
k_rounds[i] = clCreateKernel(_program, kernelName, &error); | ||
} | ||
|
||
sols = (sols_t *)malloc(sizeof(*sols)); | ||
|
||
k_sols = clCreateKernel(_program, "kernel_sols", &error); | ||
return true; | ||
|
||
|
||
} | ||
|
||
~silentarmy_context() { | ||
clReleaseMemObject(buf_dbg); | ||
clReleaseMemObject(buf_ht[0]); | ||
clReleaseMemObject(buf_ht[1]); | ||
clReleaseMemObject(rowCounters[0]); | ||
clReleaseMemObject(rowCounters[1]); | ||
free(sols); | ||
} | ||
|
||
|
||
}; | ||
|
||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
#pragma once | ||
#include <ocl/opencl.hpp> | ||
#include <ocl/sols.hpp> | ||
#include <cstdint> | ||
|
||
namespace ocl { | ||
namespace algorithm { | ||
namespace algorithm_detail { | ||
|
||
inline void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht, cl_mem rowCounters) | ||
{ | ||
size_t global_ws = SA_NR_ROWS / SA_ROWS_PER_UINT; | ||
size_t local_ws = 256; | ||
cl_int status; | ||
#if 0 | ||
uint32_t pat = -1; | ||
status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0, | ||
SA_NR_ROWS * SA_NR_SLOTS * SA_SLOT_LEN, | ||
0, // cl_uint num_events_in_wait_list | ||
NULL, // cl_event *event_wait_list | ||
NULL); // cl_event *event | ||
if (status != CL_SUCCESS) | ||
fatal("clEnqueueFillBuffer (%d)\n", status); | ||
#endif | ||
status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht); | ||
status = clSetKernelArg(k_init_ht, 1, sizeof(rowCounters), &rowCounters); | ||
if (status != CL_SUCCESS) | ||
printf("clSetKernelArg (%d)\n", status); | ||
check_clEnqueueNDRangeKernel(queue, k_init_ht, | ||
1, // cl_uint work_dim | ||
NULL, // size_t *global_work_offset | ||
&global_ws, // size_t *global_work_size | ||
&local_ws, // size_t *local_work_size | ||
0, // cl_uint num_events_in_wait_list | ||
NULL, // cl_event *event_wait_list | ||
NULL); // cl_event *event | ||
} | ||
|
||
|
||
/* | ||
** Sort a pair of binary blobs (a, b) which are consecutive in memory and | ||
** occupy a total of 2*len 32-bit words. | ||
** | ||
** a points to the pair | ||
** len number of 32-bit words in each pair | ||
*/ | ||
inline void sort_pair(uint32_t *a, uint32_t len) | ||
{ | ||
uint32_t *b = a + len; | ||
uint32_t tmp, need_sorting = 0; | ||
for (uint32_t i = 0; i < len; i++) | ||
if (need_sorting || a[i] > b[i]) | ||
{ | ||
need_sorting = 1; | ||
tmp = a[i]; | ||
a[i] = b[i]; | ||
b[i] = tmp; | ||
} | ||
else if (a[i] < b[i]) | ||
return; | ||
} | ||
|
||
inline uint32_t verify_sol(sols_t *sols, unsigned sol_i) | ||
{ | ||
uint32_t *inputs = sols->values[sol_i]; | ||
uint32_t seen_len = (1 << (SA_PREFIX + 1)) / 8; | ||
uint8_t seen[(1 << (SA_PREFIX + 1)) / 8]; | ||
uint32_t i; | ||
uint8_t tmp; | ||
// look for duplicate inputs | ||
memset(seen, 0, seen_len); | ||
for (i = 0; i < (1 << SA_PARAM_K); i++) | ||
{ | ||
tmp = seen[inputs[i] / 8]; | ||
seen[inputs[i] / 8] |= 1 << (inputs[i] & 7); | ||
if (tmp == seen[inputs[i] / 8]) | ||
{ | ||
// at least one input value is a duplicate | ||
sols->valid[sol_i] = 0; | ||
return 0; | ||
} | ||
} | ||
// the valid flag is already set by the GPU, but set it again because | ||
// I plan to change the GPU code to not set it | ||
sols->valid[sol_i] = 1; | ||
// sort the pairs in place | ||
for (uint32_t level = 0; level < SA_PARAM_K; level++) | ||
for (i = 0; i < (1 << SA_PARAM_K); i += (2 << level)) | ||
sort_pair(&inputs[i], 1 << level); | ||
return 1; | ||
} | ||
|
||
|
||
inline size_t select_work_size_blake(cl_device_id device_id) | ||
{ | ||
|
||
size_t work_size = | ||
64 * /* thread per wavefront */ | ||
SA_BLAKE_WPS * /* wavefront per simd */ | ||
4 * /* simd per compute unit */ | ||
nr_compute_units(device_id); | ||
// Make the work group size a multiple of the nr of wavefronts, while | ||
// dividing the number of inputs. This results in the worksize being a | ||
// power of 2. | ||
while (SA_NR_INPUTS % work_size) | ||
work_size += 64; | ||
|
||
return work_size; | ||
} | ||
|
||
|
||
} | ||
} | ||
} |
Oops, something went wrong.