AMD/NVIDIA GATELESS GATE v0.0.1

maztheman · Dec 20, 2016 · 1c57e7b · 1c57e7b
1 parent b860e27
commit 1c57e7b
Show file tree

Hide file tree

Showing 44 changed files with 19,292 additions and 170 deletions.
diff --git a/contrib/blake/blake.hpp b/contrib/blake/blake.hpp
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <cassert>
+
+namespace blake {
+
+
+namespace impl {
+
+static const uint32_t   blake2b_block_len = 128;
+static const uint32_t   blake2b_rounds = 12;
+static const uint64_t   blake2b_iv[8] =
+{
+    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
+};
+
+static const uint8_t    blake2b_sigma[12][16] =
+{
+      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+      { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+      {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+      {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+      {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+      { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+      { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+      {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+      { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+};
+
+inline uint64_t rotr64(uint64_t a, uint8_t bits)
+{
+    return (a >> bits) | (a << (64 - bits));
+}
+
+inline void mix(uint64_t *va, uint64_t *vb, uint64_t *vc, uint64_t *vd,
+        uint64_t x, uint64_t y)
+{
+    *va = (*va + *vb + x);
+    *vd = rotr64(*vd ^ *va, 32);
+    *vc = (*vc + *vd);
+    *vb = rotr64(*vb ^ *vc, 24);
+    *va = (*va + *vb + y);
+    *vd = rotr64(*vd ^ *va, 16);
+    *vc = (*vc + *vd);
+    *vb = rotr64(*vb ^ *vc, 63);
+}
+
+}
+
+
+typedef struct  blake2b_state_s
+{
+    uint64_t    h[8];
+    uint64_t    bytes;
+}               blake2b_state_t;
+
+inline void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, uint32_t n, uint32_t k)
+{
+	using namespace blake::impl;
+
+	assert(n > k);
+    assert(hash_len <= 64);
+    st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
+    for (uint32_t i = 1; i <= 5; i++)
+        st->h[i] = blake2b_iv[i];
+    st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
+    st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
+    st->bytes = 0;	
+
+}
+
+inline void zcash_blake2b_update(blake2b_state_t *st, const uint8_t *_msg, uint32_t msg_len, uint32_t is_final)
+{
+	using namespace blake::impl;
+
+	const uint64_t      *m = (const uint64_t *)_msg;
+    uint64_t            v[16];
+    assert(msg_len <= 128);
+    assert(st->bytes <= UINT64_MAX - msg_len);
+    memcpy(v + 0, st->h, 8 * sizeof (*v));
+    memcpy(v + 8, blake2b_iv, 8 * sizeof (*v));
+    v[12] ^= (st->bytes += msg_len);
+    v[14] ^= is_final ? -1 : 0;
+    for (uint32_t round = 0; round < blake2b_rounds; round++)
+      {
+        const uint8_t   *s = blake2b_sigma[round];
+        mix(v + 0, v + 4, v + 8,  v + 12, m[s[0]],  m[s[1]]);
+        mix(v + 1, v + 5, v + 9,  v + 13, m[s[2]],  m[s[3]]);
+        mix(v + 2, v + 6, v + 10, v + 14, m[s[4]],  m[s[5]]);
+        mix(v + 3, v + 7, v + 11, v + 15, m[s[6]],  m[s[7]]);
+        mix(v + 0, v + 5, v + 10, v + 15, m[s[8]],  m[s[9]]);
+        mix(v + 1, v + 6, v + 11, v + 12, m[s[10]], m[s[11]]);
+        mix(v + 2, v + 7, v + 8,  v + 13, m[s[12]], m[s[13]]);
+        mix(v + 3, v + 4, v + 9,  v + 14, m[s[14]], m[s[15]]);
+      }
+    for (uint32_t i = 0; i < 8; i++)
+        st->h[i] ^= v[i] ^ v[i + 8];
+}
+
+inline void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
+{
+	assert(outlen <= 64);
+    memcpy(out, st->h, outlen);
+}
+
+}
diff --git a/contrib/ocl/algorithm/compress.hpp b/contrib/ocl/algorithm/compress.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+namespace ocl {
+namespace algorithm {
+
+template<int _PREFIX>
+inline void compress(uint8_t *out, uint32_t *inputs, uint32_t n)
+{
+	uint32_t byte_pos = 0;
+	int32_t bits_left = _PREFIX + 1;
+	uint8_t x = 0;
+	uint8_t x_bits_used = 0;
+	uint8_t *pOut = out;
+	while (byte_pos < n)
+	{
+		if (bits_left >= 8 - x_bits_used)
+		{
+			x |= inputs[byte_pos] >> (bits_left - 8 + x_bits_used);
+			bits_left -= 8 - x_bits_used;
+			x_bits_used = 8;
+		}
+		else if (bits_left > 0)
+		{
+			uint32_t mask = ~(-1 << (8 - x_bits_used));
+			mask = ((~mask) >> bits_left) & mask;
+			x |= (inputs[byte_pos] << (8 - x_bits_used - bits_left)) & mask;
+			x_bits_used += bits_left;
+			bits_left = 0;
+		}
+		else if (bits_left <= 0)
+		{
+			assert(!bits_left);
+			byte_pos++;
+			bits_left = _PREFIX + 1;
+		}
+		if (x_bits_used == 8)
+		{
+			*pOut++ = x;
+			x = x_bits_used = 0;
+		}
+	}
+}
+
+
+}
+}
diff --git a/contrib/ocl/algorithm/detail/gatelessgate_context.hpp b/contrib/ocl/algorithm/detail/gatelessgate_context.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+namespace ocl {
+namespace algorithm {
+namespace algorithm_detail {
+
+struct gatelessgate_context {
+
+
+
+};
+
+}	
+}
+}
diff --git a/contrib/ocl/algorithm/detail/silentarmy_context.hpp b/contrib/ocl/algorithm/detail/silentarmy_context.hpp
@@ -0,0 +1,82 @@
+#pragma once
+#include <ocl/opencl.hpp>
+#include <ocl/sols.hpp>
+#include <cstdint>
+
+namespace ocl {
+namespace algorithm {
+namespace algorithm_detail {
+
+
+
+struct silentarmy_context {
+	cl_context _context;
+	cl_program _program;
+	cl_device_id _dev_id;
+	cl_platform_id platform_id = 0;
+	cl_command_queue queue;
+
+
+	cl_kernel k_init_ht;
+	cl_kernel k_rounds[SA_PARAM_K];
+	cl_kernel k_sols;
+
+	cl_mem buf_ht[2], buf_sols, buf_dbg, rowCounters[2];
+	size_t global_ws;
+	size_t local_work_size = 64;
+
+	sols_t	*sols;
+
+	bool init(cl_device_id dev, unsigned threadsNum, unsigned threadsPerBlock) {
+		cl_int error;
+
+		queue = clCreateCommandQueue(_context, dev, 0, &error);
+
+	#ifdef SA_ENABLE_DEBUG
+		size_t              dbg_size = SA_NR_ROWS;
+	#else
+		size_t              dbg_size = 1;
+	#endif
+
+		buf_dbg = check_clCreateBuffer(_context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, dbg_size, NULL);
+		buf_ht[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL);
+		buf_ht[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_HT_SIZE, NULL);
+		buf_sols = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, sizeof(sols_t), NULL);
+
+		rowCounters[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL);
+		rowCounters[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, SA_NR_ROWS, NULL);
+
+
+
+		fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * SA_HT_SIZE / 1e6);
+
+		k_init_ht = clCreateKernel(_program, "kernel_init_ht", &error);
+		for (unsigned i = 0; i < SA_PARAM_K; i++) {
+			char kernelName[128];
+			sprintf(kernelName, "kernel_round%d", i);
+			k_rounds[i] = clCreateKernel(_program, kernelName, &error);
+		}
+
+		sols = (sols_t *)malloc(sizeof(*sols));
+
+		k_sols = clCreateKernel(_program, "kernel_sols", &error);
+		return true;		
+
+
+	}
+
+	~silentarmy_context() {
+		clReleaseMemObject(buf_dbg);
+		clReleaseMemObject(buf_ht[0]);
+		clReleaseMemObject(buf_ht[1]);
+		clReleaseMemObject(rowCounters[0]);
+		clReleaseMemObject(rowCounters[1]);
+		free(sols);
+	}
+
+
+};
+
+}	
+}
+}
diff --git a/contrib/ocl/algorithm/detail/silentarmy_detail.hpp b/contrib/ocl/algorithm/detail/silentarmy_detail.hpp
@@ -0,0 +1,114 @@
+#pragma once
+#include <ocl/opencl.hpp>
+#include <ocl/sols.hpp>
+#include <cstdint>
+
+namespace ocl {
+namespace algorithm {
+namespace algorithm_detail {
+
+inline void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht, cl_mem rowCounters)
+{
+	size_t      global_ws = SA_NR_ROWS / SA_ROWS_PER_UINT;
+	size_t      local_ws = 256;
+	cl_int      status;
+#if 0
+	uint32_t    pat = -1;
+	status = clEnqueueFillBuffer(queue, buf_ht, &pat, sizeof(pat), 0,
+		SA_NR_ROWS * SA_NR_SLOTS * SA_SLOT_LEN,
+		0,		// cl_uint	num_events_in_wait_list
+		NULL,	// cl_event	*event_wait_list
+		NULL);	// cl_event	*event
+	if (status != CL_SUCCESS)
+		fatal("clEnqueueFillBuffer (%d)\n", status);
+#endif
+	status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht);
+	status = clSetKernelArg(k_init_ht, 1, sizeof(rowCounters), &rowCounters);
+	if (status != CL_SUCCESS)
+		printf("clSetKernelArg (%d)\n", status);
+	check_clEnqueueNDRangeKernel(queue, k_init_ht,
+		1,		// cl_uint	work_dim
+		NULL,	// size_t	*global_work_offset
+		&global_ws,	// size_t	*global_work_size
+		&local_ws,	// size_t	*local_work_size
+		0,		// cl_uint	num_events_in_wait_list
+		NULL,	// cl_event	*event_wait_list
+		NULL);	// cl_event	*event
+}
+
+
+/*
+** Sort a pair of binary blobs (a, b) which are consecutive in memory and
+** occupy a total of 2*len 32-bit words.
+**
+** a            points to the pair
+** len          number of 32-bit words in each pair
+*/
+inline void sort_pair(uint32_t *a, uint32_t len)
+{
+	uint32_t    *b = a + len;
+	uint32_t     tmp, need_sorting = 0;
+	for (uint32_t i = 0; i < len; i++)
+		if (need_sorting || a[i] > b[i])
+		{
+			need_sorting = 1;
+			tmp = a[i];
+			a[i] = b[i];
+			b[i] = tmp;
+		}
+		else if (a[i] < b[i])
+			return;
+}
+
+inline uint32_t verify_sol(sols_t *sols, unsigned sol_i)
+{
+	uint32_t  *inputs = sols->values[sol_i];
+	uint32_t  seen_len = (1 << (SA_PREFIX + 1)) / 8;
+	uint8_t seen[(1 << (SA_PREFIX + 1)) / 8];
+	uint32_t  i;
+	uint8_t tmp;
+	// look for duplicate inputs
+	memset(seen, 0, seen_len);
+	for (i = 0; i < (1 << SA_PARAM_K); i++)
+	{
+		tmp = seen[inputs[i] / 8];
+		seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
+		if (tmp == seen[inputs[i] / 8])
+		{
+			// at least one input value is a duplicate
+			sols->valid[sol_i] = 0;
+			return 0;
+		}
+	}
+	// the valid flag is already set by the GPU, but set it again because
+	// I plan to change the GPU code to not set it
+	sols->valid[sol_i] = 1;
+	// sort the pairs in place
+	for (uint32_t level = 0; level < SA_PARAM_K; level++)
+		for (i = 0; i < (1 << SA_PARAM_K); i += (2 << level))
+			sort_pair(&inputs[i], 1 << level);
+	return 1;
+}
+
+
+inline size_t select_work_size_blake(cl_device_id device_id)
+{
+
+	size_t work_size =
+		64 * /* thread per wavefront */
+		SA_BLAKE_WPS * /* wavefront per simd */
+		4 * /* simd per compute unit */
+		nr_compute_units(device_id);
+	// Make the work group size a multiple of the nr of wavefronts, while
+	// dividing the number of inputs. This results in the worksize being a
+	// power of 2.
+	while (SA_NR_INPUTS % work_size)
+		work_size += 64;
+
+	return work_size;
+}
+
+
+}
+}
+}