Skip to content

Commit

Permalink
added in ocl silent army v5 and reverted some changes in the cuda
Browse files Browse the repository at this point in the history
version...probably wont work great on any pre-maxwell cards
  • Loading branch information
maztheman committed Nov 14, 2016
1 parent 5929dca commit 4c5ba6f
Show file tree
Hide file tree
Showing 7 changed files with 795 additions and 519 deletions.
5 changes: 4 additions & 1 deletion cuda_silentarmy/cuda_silentarmy.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
</PostBuildEvent>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
<CodeGeneration>compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30</CodeGeneration>
<CodeGeneration>compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;</CodeGeneration>
<PtxAsOptionV>false</PtxAsOptionV>
</CudaCompile>
<CudaLink>
<AdditionalOptions>-E %(AdditionalOptions)</AdditionalOptions>
</CudaLink>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="kernel.cu" />
Expand Down
56 changes: 33 additions & 23 deletions cuda_silentarmy/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ __constant__ ulong blake_iv[] =
** Reset counters in hash table.
*/
__global__
void kernel_init_ht(uint *rowCounters)
void kernel_init_ht(uint* rowCounters)
{
rowCounters[blockIdx.x * blockDim.x + threadIdx.x] = 0;
}
Expand Down Expand Up @@ -159,11 +159,16 @@ __device__ uint ht_store(uint round, char *ht, uint i,
xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
p = ht + row * NR_SLOTS * SLOT_LEN;
uint xcnt = atomicAdd(&rowCounters[row], 1);
uint rowIdx = row / ROWS_PER_UINT;
uint rowOffset = BITS_PER_ROW * (row & (ROWS_PER_UINT - 1));//ASSUME ROWS_PER_UINT is POWER OF 2
uint xcnt = atomicAdd(&rowCounters[rowIdx], 1 << rowOffset);
//printf("inc index %u round %u\n", rowIdx, round);
xcnt = (xcnt >> rowOffset) & ROW_MASK;
cnt = xcnt;
//printf("row %u rowOffset %u count is %u\n", rowIdx, rowOffset, cnt);
if (cnt >= NR_SLOTS) {
// avoid overflows
atomicSub(&rowCounters[row], 1);
atomicSub(&rowCounters[rowIdx], 1 << rowOffset);
return 1;
}
p += cnt * SLOT_LEN + xi_offset_for_round(round);
Expand Down Expand Up @@ -611,7 +616,9 @@ __device__ void equihash_round(uint round,
collisionsNum = 0;
__syncthreads();
p = (ht_src + tid * NR_SLOTS * SLOT_LEN);
cnt = rowCountersSrc[blockIdx.x * blockDim.x + threadIdx.x];
uint rowIdx = tid / ROWS_PER_UINT;
uint rowOffset = BITS_PER_ROW * (tid & (ROWS_PER_UINT - 1));
cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK;
cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round
if (!cnt) {
// no elements in row, no collisions
Expand Down Expand Up @@ -829,9 +836,11 @@ void __launch_bounds__(64) kernel_sols(char *ht0, char *ht1, sols_t *sols, uint
#else
#error "unsupported NR_ROWS_LOG"
#endif

a = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN;
cnt = rowCountersSrc[blockIdx.x * blockDim.x + threadIdx.x];
uint rowIdx = tid / ROWS_PER_UINT;
uint rowOffset = BITS_PER_ROW * (tid & (ROWS_PER_UINT - 1));
cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK;
cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round
coll = 0;
a += xi_offset;
Expand All @@ -853,6 +862,7 @@ exit1:
}
struct __align__(64) c_context {
char* buf_ht[2], *buf_sols, *buf_dbg;
uint *rowCounters[2];
sols_t *sols;
u32 nthreads;
size_t global_ws;
Expand Down Expand Up @@ -987,6 +997,8 @@ sa_cuda_context::sa_cuda_context(int tpb, int blocks, int id)
checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[0], HT_SIZE));
checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[1], HT_SIZE));
checkCudaErrors(cudaMalloc((void**)&eq->buf_sols, sizeof(sols_t)));
checkCudaErrors(cudaMalloc((void**)&eq->rowCounters[0], NR_ROWS));
checkCudaErrors(cudaMalloc((void**)&eq->rowCounters[1], NR_ROWS));

eq->sols = (sols_t *)malloc(sizeof(sols_t));
}
Expand All @@ -1007,13 +1019,14 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
{
checkCudaErrors(cudaSetDevice(device_id));


unsigned char context[140];
memset(context, 0, 140);
memcpy(context, tequihash_header, tequihash_header_len);
memcpy(context + tequihash_header_len, nonce, nonce_len);

c_context *miner = eq;

//FUNCTION<<<totalblocks, threadsperblock>>>(ARGUMENTS)

blake2b_state_t initialCtx;
Expand All @@ -1023,52 +1036,47 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
void* buf_blake_st;
checkCudaErrors(cudaMalloc((void**)&buf_blake_st, sizeof(blake2b_state_s)));
checkCudaErrors(cudaMemcpy(buf_blake_st, &initialCtx, sizeof(blake2b_state_s), cudaMemcpyHostToDevice));

uint* rowCounters[2] = {0};

checkCudaErrors(cudaMalloc((void**)&rowCounters[0], NR_ROWS * sizeof(uint)));
checkCudaErrors(cudaMalloc((void**)&rowCounters[1], NR_ROWS * sizeof(uint)));

const size_t blake_work_size = select_work_size_blake() / 64;
const size_t round_work_size = NR_ROWS / 64;

for (unsigned round = 0; round < PARAM_K; round++) {
// Now on every round!!!!
kernel_init_ht<<<round_work_size, 64>> >(rowCounters[round & 1]);
kernel_init_ht << <NR_ROWS / ROWS_PER_UINT / 256, 256 >> >(miner->rowCounters[round & 1]);
cudaThreadSynchronize();

switch (round) {
case 0:
kernel_round0 << <blake_work_size, 64 >> >((ulong*)buf_blake_st, miner->buf_ht[round & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
kernel_round0 << <blake_work_size, 64 >> >((ulong*)buf_blake_st, miner->buf_ht[round & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
break;
case 1:
kernel_round1 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
kernel_round1 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
break;
case 2:
kernel_round2 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
kernel_round2 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
break;
case 3:
kernel_round3 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
kernel_round3 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
break;
case 4:
kernel_round4 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
kernel_round4 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
break;
case 5:
kernel_round5 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
kernel_round5 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
break;
case 6:
kernel_round6 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
kernel_round6 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
break;
case 7:
kernel_round7 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg);
kernel_round7 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg);
break;
case 8:
kernel_round8 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], rowCounters[(round - 1) & 1], rowCounters[round & 1], (uint*)miner->buf_dbg, (sols_t*)miner->buf_sols);
kernel_round8 << <round_work_size, 64 >> >(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], miner->rowCounters[(round - 1) & 1], miner->rowCounters[round & 1], (uint*)miner->buf_dbg, (sols_t*)miner->buf_sols);
break;
}
if (cancelf()) return;
}
kernel_sols<<<round_work_size, 64>>>(miner->buf_ht[0], miner->buf_ht[1], (sols_t*)miner->buf_sols, rowCounters[0], rowCounters[1]);
kernel_sols << <round_work_size, 64 >> >(miner->buf_ht[0], miner->buf_ht[1], (sols_t*)miner->buf_sols, miner->rowCounters[0], miner->rowCounters[1]);

checkCudaErrors(cudaMemcpy(miner->sols, miner->buf_sols, sizeof(sols_t), cudaMemcpyDeviceToHost));

Expand All @@ -1079,6 +1087,8 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
verify_sol(miner->sols, sol_i);
}

checkCudaErrors(cudaFree(buf_blake_st));

uint8_t proof[COMPRESSED_PROOFSIZE * 2];
for (uint32_t i = 0; i < miner->sols->nr; i++) {
if (miner->sols->valid[i]) {
Expand Down
2 changes: 1 addition & 1 deletion cuda_silentarmy/param.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#define OPTIM_SIMPLIFY_ROUND 1

// Number of collision items to track, per thread
#define COLL_DATA_SIZE_PER_TH (NR_SLOTS * 5)
#define COLL_DATA_SIZE_PER_TH (NR_SLOTS * 1)

// Make hash tables OVERHEAD times larger than necessary to store the average
// number of elements per row. The ideal value is as small as possible to
Expand Down
1 change: 1 addition & 0 deletions cuda_silentarmy/sa_cuda_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ do { \
"CUDA error '%s' in func '%s' line %d", \
cudaGetErrorString(err), __FUNCTION__, __LINE__); \
printf("<error> %s\n", errorBuff); \
exit(0); \
} \
} while (0)

Expand Down
28 changes: 19 additions & 9 deletions ocl_silentarmy/ocl_silentarmy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ struct OclContext {
cl_kernel k_rounds[PARAM_K];
cl_kernel k_sols;

cl_mem buf_ht[2], buf_sols, buf_dbg;
cl_mem buf_ht[2], buf_sols, buf_dbg, rowCounters[2];
size_t global_ws;
size_t local_work_size = 64;

Expand All @@ -74,6 +74,8 @@ struct OclContext {
clReleaseMemObject(buf_dbg);
clReleaseMemObject(buf_ht[0]);
clReleaseMemObject(buf_ht[1]);
clReleaseMemObject(rowCounters[0]);
clReleaseMemObject(rowCounters[1]);
free(sols);
}
};
Expand Down Expand Up @@ -101,6 +103,10 @@ bool OclContext::init(
buf_ht[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, HT_SIZE, NULL);
buf_sols = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, sizeof(sols_t), NULL);

rowCounters[0] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, NR_ROWS, NULL);
rowCounters[1] = check_clCreateBuffer(_context, CL_MEM_READ_WRITE, NR_ROWS, NULL);



fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * HT_SIZE / 1e6);

Expand Down Expand Up @@ -268,10 +274,10 @@ size_t select_work_size_blake(void)
return work_size;
}

static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht)
static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht, cl_mem rowCounters)
{
size_t global_ws = NR_ROWS;
size_t local_ws = 64;
size_t global_ws = NR_ROWS / ROWS_PER_UINT;
size_t local_ws = 256;
cl_int status;
#if 0
uint32_t pat = -1;
Expand All @@ -284,6 +290,7 @@ static void init_ht(cl_command_queue queue, cl_kernel k_init_ht, cl_mem buf_ht)
fatal("clEnqueueFillBuffer (%d)\n", status);
#endif
status = clSetKernelArg(k_init_ht, 0, sizeof(buf_ht), &buf_ht);
status = clSetKernelArg(k_init_ht, 1, sizeof(rowCounters), &rowCounters);
if (status != CL_SUCCESS)
printf("clSetKernelArg (%d)\n", status);
check_clEnqueueNDRangeKernel(queue, k_init_ht,
Expand Down Expand Up @@ -494,24 +501,25 @@ void ocl_silentarmy::solve(const char *tequihash_header,

for (unsigned round = 0; round < PARAM_K; round++)
{
if (round < 2) {
init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round & 1]);
}
init_ht(miner->queue, miner->k_init_ht, miner->buf_ht[round & 1], miner->rowCounters[round & 1]);
if (!round)
{
check_clSetKernelArg(miner->k_rounds[round], 0, &buf_blake_st);
check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round & 1]);
check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[round & 2]);
miner->global_ws = select_work_size_blake();
}
else
{
check_clSetKernelArg(miner->k_rounds[round], 0, &miner->buf_ht[(round - 1) & 1]);
check_clSetKernelArg(miner->k_rounds[round], 1, &miner->buf_ht[round & 1]);
check_clSetKernelArg(miner->k_rounds[round], 2, &miner->rowCounters[(round - 1) & 1]);
check_clSetKernelArg(miner->k_rounds[round], 3, &miner->rowCounters[round & 1]);
miner->global_ws = NR_ROWS;
}
check_clSetKernelArg(miner->k_rounds[round], 2, &miner->buf_dbg);
check_clSetKernelArg(miner->k_rounds[round], round == 0 ? 3 : 4, &miner->buf_dbg);
if (round == PARAM_K - 1)
check_clSetKernelArg(miner->k_rounds[round], 3, &miner->buf_sols);
check_clSetKernelArg(miner->k_rounds[round], 5, &miner->buf_sols);
check_clEnqueueNDRangeKernel(miner->queue, miner->k_rounds[round], 1, NULL,
&miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
// cancel function
Expand All @@ -520,6 +528,8 @@ void ocl_silentarmy::solve(const char *tequihash_header,
check_clSetKernelArg(miner->k_sols, 0, &miner->buf_ht[0]);
check_clSetKernelArg(miner->k_sols, 1, &miner->buf_ht[1]);
check_clSetKernelArg(miner->k_sols, 2, &miner->buf_sols);
check_clSetKernelArg(miner->k_sols, 3, &miner->rowCounters[0]);
check_clSetKernelArg(miner->k_sols, 4, &miner->rowCounters[1]);
miner->global_ws = NR_ROWS;
check_clEnqueueNDRangeKernel(miner->queue, miner->k_sols, 1, NULL,
&miner->global_ws, &miner->local_work_size, 0, NULL, NULL);
Expand Down
56 changes: 43 additions & 13 deletions ocl_silentarmy/param.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,15 @@
#define NR_INPUTS (1 << PREFIX)
// Approximate log base 2 of number of elements in hash tables
#define APX_NR_ELMS_LOG (PREFIX + 1)
// Number of rows and slots is affected by this. 20 offers the best performance
// but occasionally misses ~1% of solutions.
// Number of rows and slots is affected by this; 20 offers the best performance
#define NR_ROWS_LOG 20

#define OPTIM_SIMPLIFY_ROUND 1
// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md
#define OPTIM_SIMPLIFY_ROUND 1

// Number of collision items to track, per thread
#define COLL_DATA_SIZE_PER_TH (NR_SLOTS * 1)

// Make hash tables OVERHEAD times larger than necessary to store the average
// number of elements per row. The ideal value is as small as possible to
// reduce memory usage, but not too small or else elements are dropped from the
Expand All @@ -21,13 +25,14 @@
// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
// performance as they cause VRAM channel conflicts.
#if NR_ROWS_LOG == 16
#error "NR_ROWS_LOG = 16 is currently broken - do not use"
#define OVERHEAD 3
#elif NR_ROWS_LOG == 18
#define OVERHEAD 3
#elif NR_ROWS_LOG == 19
#define OVERHEAD 5
#define OVERHEAD 5
#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND
#define OVERHEAD 6
#define OVERHEAD 6
#elif NR_ROWS_LOG == 20
#define OVERHEAD 9
#endif
Expand All @@ -38,17 +43,41 @@
#define SLOT_LEN 32
// Total size of hash table
#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
// Length of Zcash block header and nonce
// Length of Zcash block header, nonce (part of header)
#define ZCASH_BLOCK_HEADER_LEN 140
// Offset of nTime in header
#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
// Length of nonce
#define ZCASH_NONCE_LEN 32
// Length of encoded representation of solution size
#define ZCASH_SOLSIZE_LEN 3
// Solution size (1344 = 0x540) represented as a compact integer, in hex
#define ZCASH_SOLSIZE_HEX "fd4005"
// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
#define N_ZERO_BYTES 12
// Number of bytes Zcash needs out of Blake
#define ZCASH_HASH_LEN 50
// Number of wavefronts per SIMD for the Blake kernel.
// Blake is ALU-bound (beside the atomic counter being incremented) so we need
// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
// instructions. 10 is the max supported by the hw.
#define BLAKE_WPS 10
#define MAX_SOLS 10
// Maximum number of solutions reported by kernel to host
#define MAX_SOLS 10
// Length of SHA256 target
#define SHA256_TARGET_LEN (256 / 8)

#if (NR_SLOTS < 16)
#define BITS_PER_ROW 4
#define ROWS_PER_UINT 8
#define ROW_MASK 0x0F
#else
#define BITS_PER_ROW 8
#define ROWS_PER_UINT 4
#define ROW_MASK 0xFF
#endif

// Optional features
#undef ENABLE_DEBUG
Expand All @@ -60,10 +89,11 @@

// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
#define SOL_SIZE ((1 << PARAM_K) * 4)
typedef struct sols_s

typedef struct sols_s
{
uint nr;
uint likely_invalids;
uchar valid[MAX_SOLS];
uint values[MAX_SOLS][(1 << PARAM_K)];
} sols_t;
uint nr;
uint likely_invalids;
uchar valid[MAX_SOLS];
uint values[MAX_SOLS][(1 << PARAM_K)];
} sols_t;
Loading

0 comments on commit 4c5ba6f

Please sign in to comment.