added in ocl silent army v5 and reverted some changes in the cuda

version...probably wont work great on any pre-maxwell cards
maztheman · Nov 14, 2016 · 4c5ba6f · 4c5ba6f
1 parent 5929dca
commit 4c5ba6f
Show file tree

Hide file tree

Showing 7 changed files with 795 additions and 519 deletions.
diff --git a/cuda_silentarmy/cuda_silentarmy.vcxproj b/cuda_silentarmy/cuda_silentarmy.vcxproj
@@ -156,9 +156,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
     </PostBuildEvent>
     <CudaCompile>
       <TargetMachinePlatform>64</TargetMachinePlatform>
-      <CodeGeneration>compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30</CodeGeneration>
+      <CodeGeneration>compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;</CodeGeneration>
       <PtxAsOptionV>false</PtxAsOptionV>
     </CudaCompile>
+    <CudaLink>
+      <AdditionalOptions>-E %(AdditionalOptions)</AdditionalOptions>
+    </CudaLink>
   </ItemDefinitionGroup>
   <ItemGroup>
     <CudaCompile Include="kernel.cu" />

diff --git a/cuda_silentarmy/kernel.cu b/cuda_silentarmy/kernel.cu
@@ -83,7 +83,7 @@ __constant__ ulong blake_iv[] =
 ** Reset counters in hash table.
 */
 __global__
-void kernel_init_ht(uint *rowCounters)
+void kernel_init_ht(uint* rowCounters)
 {
 	rowCounters[blockIdx.x * blockDim.x + threadIdx.x] = 0;
 }
@@ -159,11 +159,16 @@ __device__ uint ht_store(uint round, char *ht, uint i,
 	xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
 	xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
 	p = ht + row * NR_SLOTS * SLOT_LEN;
-	uint xcnt = atomicAdd(&rowCounters[row], 1);
+	uint rowIdx = row / ROWS_PER_UINT;
+	uint rowOffset = BITS_PER_ROW * (row & (ROWS_PER_UINT - 1));//ASSUME ROWS_PER_UINT is POWER OF 2
+	uint xcnt = atomicAdd(&rowCounters[rowIdx], 1 << rowOffset);
+	//printf("inc index %u round %u\n", rowIdx, round);
+	xcnt = (xcnt >> rowOffset) & ROW_MASK;
 	cnt = xcnt;
+	//printf("row %u rowOffset %u count is %u\n", rowIdx, rowOffset, cnt);
 	if (cnt >= NR_SLOTS) {
 		// avoid overflows
-		atomicSub(&rowCounters[row], 1);
+		atomicSub(&rowCounters[rowIdx], 1 << rowOffset);
 		return 1;
 	}
 	p += cnt * SLOT_LEN + xi_offset_for_round(round);
@@ -611,7 +616,9 @@ __device__ void equihash_round(uint round,
 	collisionsNum = 0;
 	__syncthreads();
 	p = (ht_src + tid * NR_SLOTS * SLOT_LEN);
-	cnt = rowCountersSrc[blockIdx.x * blockDim.x + threadIdx.x];
+	uint rowIdx = tid / ROWS_PER_UINT;
+	uint rowOffset = BITS_PER_ROW * (tid & (ROWS_PER_UINT - 1));
+	cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK;
 	cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round
 	if (!cnt) {
 		// no elements in row, no collisions
@@ -829,9 +836,11 @@ void __launch_bounds__(64) kernel_sols(char *ht0, char *ht1, sols_t *sols, uint
 #else
 #error "unsupported NR_ROWS_LOG"
 #endif
-	
+
 	a = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN;
-	cnt = rowCountersSrc[blockIdx.x * blockDim.x + threadIdx.x];
+	uint rowIdx = tid / ROWS_PER_UINT;
+	uint rowOffset = BITS_PER_ROW * (tid & (ROWS_PER_UINT - 1));
+	cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK;
 	cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round
 	coll = 0;
 	a += xi_offset;
@@ -853,6 +862,7 @@ exit1:
 }
 struct __align__(64) c_context {
 	char* buf_ht[2], *buf_sols, *buf_dbg;
+	uint *rowCounters[2];
 	sols_t	*sols;
 	u32 nthreads;
 	size_t global_ws;
@@ -987,6 +997,8 @@ sa_cuda_context::sa_cuda_context(int tpb, int blocks, int id)
 	checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[0], HT_SIZE));
 	checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[1], HT_SIZE));
 	checkCudaErrors(cudaMalloc((void**)&eq->buf_sols, sizeof(sols_t)));
+	checkCudaErrors(cudaMalloc((void**)&eq->rowCounters[0], NR_ROWS));
+	checkCudaErrors(cudaMalloc((void**)&eq->rowCounters[1], NR_ROWS));
 
 	eq->sols = (sols_t *)malloc(sizeof(sols_t));
 }
@@ -1007,13 +1019,14 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
 {
 	checkCudaErrors(cudaSetDevice(device_id));
 
+
 	unsigned char context[140];
 	memset(context, 0, 140);
 	memcpy(context, tequihash_header, tequihash_header_len);
 	memcpy(context + tequihash_header_len, nonce, nonce_len);
 
 	c_context *miner = eq;
-	
+
 	//FUNCTION<<<totalblocks, threadsperblock>>>(ARGUMENTS)
 
 	blake2b_state_t initialCtx;
@@ -1023,52 +1036,47 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
 	void* buf_blake_st;
 	checkCudaErrors(cudaMalloc((void**)&buf_blake_st, sizeof(blake2b_state_s)));
 	checkCudaErrors(cudaMemcpy(buf_blake_st, &initialCtx, sizeof(blake2b_state_s), cudaMemcpyHostToDevice));
-
-	uint* rowCounters[2] = {0};
-
-	checkCudaErrors(cudaMalloc((void**)&rowCounters[0], NR_ROWS * sizeof(uint)));
-	checkCudaErrors(cudaMalloc((void**)&rowCounters[1], NR_ROWS * sizeof(uint)));
 
 	const size_t blake_work_size = select_work_size_blake() / 64;
 	const size_t round_work_size = NR_ROWS / 64;
 
 	for (unsigned round = 0; round < PARAM_K; round++) {
 		// Now on every round!!!!
-		kernel_init_ht<<<round_work_size, 64>> >(rowCounters[round & 1]);
+		kernel_init_ht << <NR_ROWS / ROWS_PER_UINT / 256, 256 >> >(miner->rowCounters[round & 1]);
 		cudaThreadSynchronize();
 
 		switch (round) {
 		case 0:
-			kernel_round0 << <blake_work_size, 64 >> >((ulong*)buf_blake_st, miner->buf_ht[round & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
+			kernel_round0 << <blake_work_size, 64 >> >((ulong*)buf_blake_st, miner->buf_ht[round & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
 			break;
 		case 1:
-			kernel_round1 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
+			kernel_round1 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
 			break;
 		case 2:
-			kernel_round2 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
+			kernel_round2 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
 			break;
 		case 3:
-			kernel_round3 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
+			kernel_round3 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
 			break;
 		case 4:
-			kernel_round4 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
+			kernel_round4 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
 			break;
 		case 5:
-			kernel_round5 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
+			kernel_round5 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
 			break;
 		case 6:
-			kernel_round6 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
+			kernel_round6 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
 			break;
 		case 7:
-			kernel_round7 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
+			kernel_round7 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
 			break;
 		case 8:
-			kernel_round8 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg, (sols_t*)miner->buf_sols);
+			kernel_round8 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg, (sols_t*)miner->buf_sols);
 			break;
 		}
 		if (cancelf()) return;
 	}
-	kernel_sols<<<round_work_size, 64>>>(miner->buf_ht[0], miner->buf_ht[1], (sols_t*)miner->buf_sols, rowCounters[0], rowCounters[1]);
+	kernel_sols << <round_work_size, 64 >> >(miner->buf_ht[0], miner->buf_ht[1], (sols_t*)miner->buf_sols, miner->rowCounters[0], miner->rowCounters[1]);
 
 	checkCudaErrors(cudaMemcpy(miner->sols, miner->buf_sols, sizeof(sols_t), cudaMemcpyDeviceToHost));
 
@@ -1079,6 +1087,8 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
 		verify_sol(miner->sols, sol_i);
 	}
 
+	checkCudaErrors(cudaFree(buf_blake_st));
+
 	uint8_t proof[COMPRESSED_PROOFSIZE * 2];
 	for (uint32_t i = 0; i < miner->sols->nr; i++) {
 		if (miner->sols->valid[i]) {

diff --git a/cuda_silentarmy/param.h b/cuda_silentarmy/param.h
@@ -11,7 +11,7 @@
 #define OPTIM_SIMPLIFY_ROUND		1
 
 // Number of collision items to track, per thread
-#define COLL_DATA_SIZE_PER_TH		(NR_SLOTS * 5)
+#define COLL_DATA_SIZE_PER_TH		(NR_SLOTS * 1)
 
 // Make hash tables OVERHEAD times larger than necessary to store the average
 // number of elements per row. The ideal value is as small as possible to

diff --git a/cuda_silentarmy/sa_cuda_context.hpp b/cuda_silentarmy/sa_cuda_context.hpp
@@ -20,6 +20,7 @@ do {														\
 			"CUDA error '%s' in func '%s' line %d",			\
 			cudaGetErrorString(err), __FUNCTION__, __LINE__);	\
 		printf("<error> %s\n", errorBuff); \
+		exit(0); \
 		}														\
 } while (0)
 

diff --git a/ocl_silentarmy/ocl_silentarmy.cpp b/ocl_silentarmy/ocl_silentarmy.cpp
@@ -62,7 +62,7 @@ struct OclContext {
 	cl_kernel k_rounds[PARAM_K];
 	cl_kernel k_sols;
 
-	cl_mem buf_ht[2], buf_sols, buf_dbg;
+	cl_mem buf_ht[2], buf_sols, buf_dbg, rowCounters[2];
 	size_t global_ws;
 	size_t local_work_size = 64;
 
@@ -74,6 +74,8 @@ struct OclContext {
 		clReleaseMemObject(buf_dbg);
 		clReleaseMemObject(buf_ht[0]);
 		clReleaseMemObject(buf_ht[1]);
+		clReleaseMemObject(rowCounters[0]);
+		clReleaseMemObject(rowCounters[1]);
 		free(sols);
 	}
 };
@@ -101,6 +103,10 @@ bool OclContext::init(
 	buf_ht[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, HT_SIZE, NULL);
 	buf_sols = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, sizeof(sols_t), NULL);
 
+	rowCounters[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, NR_ROWS, NULL);
+	rowCounters[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, NR_ROWS, NULL);
+
+
 
 	fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * HT_SIZE / 1e6);
 
@@ -268,10 +274,10 @@ size_t select_work_size_blake(void)
 	return work_size;
 }
 
-static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht)
+static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht, cl_mem rowCounters)
 {
-	size_t      global_ws = NR_ROWS;
-	size_t      local_ws = 64;
+	size_t      global_ws = NR_ROWS / ROWS_PER_UINT;
+	size_t      local_ws = 256;
 	cl_int      status;
 #if 0
 	uint32_t    pat = -1;
@@ -284,6 +290,7 @@ static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht)
 		fatal("clEnqueueFillBuffer (%d)\n", status);
 #endif
 	status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht);
+	status = clSetKernelArg(k_init_ht, 1, sizeof(rowCounters), &rowCounters);
 	if (status != CL_SUCCESS)
 		printf("clSetKernelArg (%d)\n", status);
 	check_clEnqueueNDRangeKernel(queue, k_init_ht,
@@ -494,24 +501,25 @@ void ocl_silentarmy::solve(const char *tequihash_header,
 
 	for (unsigned round = 0; round < PARAM_K; round++)
 	{
-		if (round < 2) {
-			init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round & 1]);
-		}
+		init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round & 1], miner->rowCounters[round & 1]);
 		if (!round)
 		{
 			check_clSetKernelArg(miner->k_rounds[round], 0, &buf_blake_st);
 			check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round & 1]);
+			check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[round & 2]);
 			miner->global_ws = select_work_size_blake();
 		}
 		else
 		{
 			check_clSetKernelArg(miner->k_rounds[round], 0, &miner->buf_ht[(round - 1) & 1]);
 			check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round & 1]);
+			check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[(round - 1) & 1]);
+			check_clSetKernelArg(miner->k_rounds[round], 3, &miner->rowCounters[round & 1]);
 			miner->global_ws = NR_ROWS;
 		}
-		check_clSetKernelArg(miner->k_rounds[round], 2, &miner->buf_dbg);
+		check_clSetKernelArg(miner->k_rounds[round], round == 0 ? 3 : 4, &miner->buf_dbg);
 		if (round == PARAM_K - 1)
-			check_clSetKernelArg(miner->k_rounds[round], 3, &miner->buf_sols);
+			check_clSetKernelArg(miner->k_rounds[round], 5, &miner->buf_sols);
 		check_clEnqueueNDRangeKernel(miner->queue, miner->k_rounds[round], 1, NULL,
 			&miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
 		// cancel function
@@ -520,6 +528,8 @@ void ocl_silentarmy::solve(const char *tequihash_header,
 	check_clSetKernelArg(miner->k_sols, 0, &miner->buf_ht[0]);
 	check_clSetKernelArg(miner->k_sols, 1, &miner->buf_ht[1]);
 	check_clSetKernelArg(miner->k_sols, 2, &miner->buf_sols);
+	check_clSetKernelArg(miner->k_sols, 3, &miner->rowCounters[0]);
+	check_clSetKernelArg(miner->k_sols, 4, &miner->rowCounters[1]);
 	miner->global_ws = NR_ROWS;
 	check_clEnqueueNDRangeKernel(miner->queue, miner->k_sols, 1, NULL,
 		&miner->global_ws, &miner->local_work_size, 0, NULL, NULL);

diff --git a/ocl_silentarmy/param.h b/ocl_silentarmy/param.h
@@ -4,11 +4,15 @@
 #define NR_INPUTS                       (1 << PREFIX)
 // Approximate log base 2 of number of elements in hash tables
 #define APX_NR_ELMS_LOG                 (PREFIX + 1)
-// Number of rows and slots is affected by this. 20 offers the best performance
-// but occasionally misses ~1% of solutions.
+// Number of rows and slots is affected by this; 20 offers the best performance
 #define NR_ROWS_LOG                     20
 
-#define OPTIM_SIMPLIFY_ROUND			1
+// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md
+#define OPTIM_SIMPLIFY_ROUND		1
+
+// Number of collision items to track, per thread
+#define COLL_DATA_SIZE_PER_TH		(NR_SLOTS * 1)
+
 // Make hash tables OVERHEAD times larger than necessary to store the average
 // number of elements per row. The ideal value is as small as possible to
 // reduce memory usage, but not too small or else elements are dropped from the
@@ -21,13 +25,14 @@
 // Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
 // performance as they cause VRAM channel conflicts.
 #if NR_ROWS_LOG == 16
+#error "NR_ROWS_LOG = 16 is currently broken - do not use"
 #define OVERHEAD                        3
 #elif NR_ROWS_LOG == 18
 #define OVERHEAD                        3
 #elif NR_ROWS_LOG == 19
-#define OVERHEAD						5
+#define OVERHEAD                        5
 #elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND
-#define OVERHEAD						6
+#define OVERHEAD                        6
 #elif NR_ROWS_LOG == 20
 #define OVERHEAD                        9
 #endif
@@ -38,17 +43,41 @@
 #define SLOT_LEN                        32
 // Total size of hash table
 #define HT_SIZE				(NR_ROWS * NR_SLOTS * SLOT_LEN)
-// Length of Zcash block header and nonce
+// Length of Zcash block header, nonce (part of header)
 #define ZCASH_BLOCK_HEADER_LEN		140
+// Offset of nTime in header
+#define ZCASH_BLOCK_OFFSET_NTIME        (4 + 3 * 32)
+// Length of nonce
 #define ZCASH_NONCE_LEN			32
+// Length of encoded representation of solution size
+#define ZCASH_SOLSIZE_LEN		3
+// Solution size (1344 = 0x540) represented as a compact integer, in hex
+#define ZCASH_SOLSIZE_HEX               "fd4005"
+// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
+#define ZCASH_SOL_LEN                   ((1 << PARAM_K) * (PREFIX + 1) / 8)
+// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
+#define N_ZERO_BYTES			12
 // Number of bytes Zcash needs out of Blake
 #define ZCASH_HASH_LEN                  50
 // Number of wavefronts per SIMD for the Blake kernel.
 // Blake is ALU-bound (beside the atomic counter being incremented) so we need
 // at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
 // instructions. 10 is the max supported by the hw.
 #define BLAKE_WPS               	10
-#define MAX_SOLS					10
+// Maximum number of solutions reported by kernel to host
+#define MAX_SOLS			10
+// Length of SHA256 target
+#define SHA256_TARGET_LEN               (256 / 8)
+
+#if (NR_SLOTS < 16)
+#define BITS_PER_ROW 4
+#define ROWS_PER_UINT 8
+#define ROW_MASK 0x0F
+#else
+#define BITS_PER_ROW 8
+#define ROWS_PER_UINT 4
+#define ROW_MASK 0xFF
+#endif
 
 // Optional features
 #undef ENABLE_DEBUG
@@ -60,10 +89,11 @@
 
 // An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
 #define SOL_SIZE			((1 << PARAM_K) * 4)
-typedef struct	sols_s
+
+typedef struct sols_s
 {
-    uint	nr;
-    uint	likely_invalids;
-    uchar	valid[MAX_SOLS];
-    uint	values[MAX_SOLS][(1 << PARAM_K)];
-}		sols_t;
+	uint nr;
+	uint likely_invalids;
+	uchar valid[MAX_SOLS];
+	uint values[MAX_SOLS][(1 << PARAM_K)];
+} sols_t;