Skip to content

Commit

Permalink
AMD/NVIDIA GATELESS GATE v0.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
maztheman committed Dec 20, 2016
1 parent b860e27 commit 1c57e7b
Show file tree
Hide file tree
Showing 44 changed files with 19,292 additions and 170 deletions.
112 changes: 112 additions & 0 deletions contrib/blake/blake.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#pragma once

#include <cassert>

namespace blake {


namespace impl {

static const uint32_t blake2b_block_len = 128;
static const uint32_t blake2b_rounds = 12;
static const uint64_t blake2b_iv[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
};

static const uint8_t blake2b_sigma[12][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
};

inline uint64_t rotr64(uint64_t a, uint8_t bits)
{
return (a >> bits) | (a << (64 - bits));
}

inline void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd,
uint64_t x, uint64_t y)
{
*va = (*va + *vb + x);
*vd = rotr64(*vd ^ *va, 32);
*vc = (*vc + *vd);
*vb = rotr64(*vb ^ *vc, 24);
*va = (*va + *vb + y);
*vd = rotr64(*vd ^ *va, 16);
*vc = (*vc + *vd);
*vb = rotr64(*vb ^ *vc, 63);
}

}


typedef struct blake2b_state_s
{
uint64_t h[8];
uint64_t bytes;
} blake2b_state_t;

inline void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k)
{
using namespace blake::impl;

assert(n > k);
assert(hash_len <= 64);
st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
for (uint32_t i = 1; i <= 5; i++)
st->h[i] = blake2b_iv[i];
st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
st->bytes = 0;

}

inline void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, uint32_t msg_len, uint32_t is_final)
{
using namespace blake::impl;

const uint64_t *m = (const uint64_t *)_msg;
uint64_t v[16];
assert(msg_len <= 128);
assert(st->bytes <= UINT64_MAX - msg_len);
memcpy(v + 0, st->h, 8 * sizeof (*v));
memcpy(v + 8, blake2b_iv, 8 * sizeof (*v));
v[12] ^= (st->bytes += msg_len);
v[14] ^= is_final ? -1 : 0;
for (uint32_t round = 0; round < blake2b_rounds; round++)
{
const uint8_t *s = blake2b_sigma[round];
mix(v + 0, v + 4, v + 8, v + 12, m[s[0]], m[s[1]]);
mix(v + 1, v + 5, v + 9, v + 13, m[s[2]], m[s[3]]);
mix(v + 2, v + 6, v + 10, v + 14, m[s[4]], m[s[5]]);
mix(v + 3, v + 7, v + 11, v + 15, m[s[6]], m[s[7]]);
mix(v + 0, v + 5, v + 10, v + 15, m[s[8]], m[s[9]]);
mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]);
mix(v + 2, v + 7, v + 8, v + 13, m[s[12]], m[s[13]]);
mix(v + 3, v + 4, v + 9, v + 14, m[s[14]], m[s[15]]);
}
for (uint32_t i = 0; i < 8; i++)
st->h[i] ^= v[i] ^ v[i + 8];
}

inline void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
{
assert(outlen <= 64);
memcpy(out, st->h, outlen);
}

}
46 changes: 46 additions & 0 deletions contrib/ocl/algorithm/compress.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#pragma once

namespace ocl {
namespace algorithm {

template<int _PREFIX>
inline void compress(uint8_t *out, uint32_t *inputs, uint32_t n)
{
uint32_t byte_pos = 0;
int32_t bits_left = _PREFIX + 1;
uint8_t x = 0;
uint8_t x_bits_used = 0;
uint8_t *pOut = out;
while (byte_pos < n)
{
if (bits_left >= 8 - x_bits_used)
{
x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
bits_left -= 8 - x_bits_used;
x_bits_used = 8;
}
else if (bits_left > 0)
{
uint32_t mask = ~(-1 << (8 - x_bits_used));
mask = ((~mask) >> bits_left) & mask;
x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
x_bits_used += bits_left;
bits_left = 0;
}
else if (bits_left <= 0)
{
assert(!bits_left);
byte_pos++;
bits_left = _PREFIX + 1;
}
if (x_bits_used == 8)
{
*pOut++ = x;
x = x_bits_used = 0;
}
}
}


}
}
15 changes: 15 additions & 0 deletions contrib/ocl/algorithm/detail/gatelessgate_context.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#pragma once

namespace ocl {
namespace algorithm {
namespace algorithm_detail {

struct gatelessgate_context {



};

}
}
}
82 changes: 82 additions & 0 deletions contrib/ocl/algorithm/detail/silentarmy_context.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#pragma once
#include <ocl/opencl.hpp>
#include <ocl/sols.hpp>
#include <cstdint>

namespace ocl {
namespace algorithm {
namespace algorithm_detail {



struct silentarmy_context {
cl_context _context;
cl_program _program;
cl_device_id _dev_id;
cl_platform_id platform_id = 0;
cl_command_queue queue;


cl_kernel k_init_ht;
cl_kernel k_rounds[SA_PARAM_K];
cl_kernel k_sols;

cl_mem buf_ht[2], buf_sols, buf_dbg, rowCounters[2];
size_t global_ws;
size_t local_work_size = 64;

sols_t *sols;

bool init(cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock) {
cl_int error;

queue = clCreateCommandQueue(_context, dev, 0, &error);

#ifdef SA_ENABLE_DEBUG
size_t dbg_size = SA_NR_ROWS;
#else
size_t dbg_size = 1;
#endif

buf_dbg = check_clCreateBuffer(_context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, dbg_size, NULL);
buf_ht[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL);
buf_ht[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL);
buf_sols = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, sizeof(sols_t), NULL);

rowCounters[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL);
rowCounters[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL);



fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * SA_HT_SIZE / 1e6);

k_init_ht = clCreateKernel(_program, "kernel_init_ht", &error);
for (unsigned i = 0; i < SA_PARAM_K; i++) {
char kernelName[128];
sprintf(kernelName, "kernel_round%d", i);
k_rounds[i] = clCreateKernel(_program, kernelName, &error);
}

sols = (sols_t *)malloc(sizeof(*sols));

k_sols = clCreateKernel(_program, "kernel_sols", &error);
return true;


}

~silentarmy_context() {
clReleaseMemObject(buf_dbg);
clReleaseMemObject(buf_ht[0]);
clReleaseMemObject(buf_ht[1]);
clReleaseMemObject(rowCounters[0]);
clReleaseMemObject(rowCounters[1]);
free(sols);
}


};

}
}
}
114 changes: 114 additions & 0 deletions contrib/ocl/algorithm/detail/silentarmy_detail.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#pragma once
#include <ocl/opencl.hpp>
#include <ocl/sols.hpp>
#include <cstdint>

namespace ocl {
namespace algorithm {
namespace algorithm_detail {

inline void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht, cl_mem rowCounters)
{
size_t global_ws = SA_NR_ROWS / SA_ROWS_PER_UINT;
size_t local_ws = 256;
cl_int status;
#if 0
uint32_t pat = -1;
status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0,
SA_NR_ROWS * SA_NR_SLOTS * SA_SLOT_LEN,
0, // cl_uint num_events_in_wait_list
NULL, // cl_event *event_wait_list
NULL); // cl_event *event
if (status != CL_SUCCESS)
fatal("clEnqueueFillBuffer (%d)\n", status);
#endif
status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht);
status = clSetKernelArg(k_init_ht, 1, sizeof(rowCounters), &rowCounters);
if (status != CL_SUCCESS)
printf("clSetKernelArg (%d)\n", status);
check_clEnqueueNDRangeKernel(queue, k_init_ht,
1, // cl_uint work_dim
NULL, // size_t *global_work_offset
&global_ws, // size_t *global_work_size
&local_ws, // size_t *local_work_size
0, // cl_uint num_events_in_wait_list
NULL, // cl_event *event_wait_list
NULL); // cl_event *event
}


/*
** Sort a pair of binary blobs (a, b) which are consecutive in memory and
** occupy a total of 2*len 32-bit words.
**
** a points to the pair
** len number of 32-bit words in each pair
*/
inline void sort_pair(uint32_t *a, uint32_t len)
{
uint32_t *b = a + len;
uint32_t tmp, need_sorting = 0;
for (uint32_t i = 0; i < len; i++)
if (need_sorting || a[i] > b[i])
{
need_sorting = 1;
tmp = a[i];
a[i] = b[i];
b[i] = tmp;
}
else if (a[i] < b[i])
return;
}

inline uint32_t verify_sol(sols_t *sols, unsigned sol_i)
{
uint32_t *inputs = sols->values[sol_i];
uint32_t seen_len = (1 << (SA_PREFIX + 1)) / 8;
uint8_t seen[(1 << (SA_PREFIX + 1)) / 8];
uint32_t i;
uint8_t tmp;
// look for duplicate inputs
memset(seen, 0, seen_len);
for (i = 0; i < (1 << SA_PARAM_K); i++)
{
tmp = seen[inputs[i] / 8];
seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
if (tmp == seen[inputs[i] / 8])
{
// at least one input value is a duplicate
sols->valid[sol_i] = 0;
return 0;
}
}
// the valid flag is already set by the GPU, but set it again because
// I plan to change the GPU code to not set it
sols->valid[sol_i] = 1;
// sort the pairs in place
for (uint32_t level = 0; level < SA_PARAM_K; level++)
for (i = 0; i < (1 << SA_PARAM_K); i += (2 << level))
sort_pair(&inputs[i], 1 << level);
return 1;
}


inline size_t select_work_size_blake(cl_device_id device_id)
{

size_t work_size =
64 * /* thread per wavefront */
SA_BLAKE_WPS * /* wavefront per simd */
4 * /* simd per compute unit */
nr_compute_units(device_id);
// Make the work group size a multiple of the nr of wavefronts, while
// dividing the number of inputs. This results in the worksize being a
// power of 2.
while (SA_NR_INPUTS % work_size)
work_size += 64;

return work_size;
}


}
}
}
Loading

0 comments on commit 1c57e7b

Please sign in to comment.