diff --git a/cuda_silentarmy/cuda_silentarmy.cpp b/cuda_silentarmy/cuda_silentarmy.cpp index f9180d664..6ad2b4a4f 100644 --- a/cuda_silentarmy/cuda_silentarmy.cpp +++ b/cuda_silentarmy/cuda_silentarmy.cpp @@ -14,7 +14,7 @@ cuda_sa_solver::cuda_sa_solver(int platf_id, int dev_id) // todo: determine default values for various GPUs here threadsperblock = 64; - blocks = m_sm_count * 32; + blocks = m_sm_count * 7; } std::string cuda_sa_solver::getdevinfo() diff --git a/cuda_silentarmy/kernel.cu b/cuda_silentarmy/kernel.cu index cc8910d4b..25da48c9e 100644 --- a/cuda_silentarmy/kernel.cu +++ b/cuda_silentarmy/kernel.cu @@ -397,7 +397,7 @@ __device__ uint xor_and_store(uint round, char *ht_dst, uint row, uint slot_a, u ** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, ** store them in ht_dst. */ -__device__ void equihash_round(uint size, uint round, char *ht_src, char *ht_dst, uint *debug) +__device__ void equihash_round(uint round, char *ht_src, char *ht_dst, uint *debug) { uint tid = blockIdx.x * blockDim.x + threadIdx.x; char *p; @@ -406,6 +406,7 @@ __device__ void equihash_round(uint size, uint round, char *ht_src, char *ht_dst uint dropped_stor = 0; ulong *a, *b; uint xi_offset; + static uint size = NR_ROWS; static uint stride = NR_SLOTS * SLOT_LEN; xi_offset = (8 + ((round - 1) / 2) * 4); @@ -435,57 +436,49 @@ __device__ void equihash_round(uint size, uint round, char *ht_src, char *ht_dst } __global__ void -__launch_bounds__(64, 1) -kernel_round1(uint size, char *ht_src, char *ht_dst, uint *debug) +kernel_round1(char *ht_src, char *ht_dst, uint *debug) { - equihash_round(size, 1, ht_src, ht_dst, debug); + equihash_round(1, ht_src, ht_dst, debug); } __global__ void -__launch_bounds__(64, 1) -kernel_round2(uint size, char *ht_src, char *ht_dst, uint *debug) +kernel_round2(char *ht_src, char *ht_dst, uint *debug) { - equihash_round(size, 2, ht_src, ht_dst, debug); + equihash_round(2, ht_src, ht_dst, debug); } __global__ void -__launch_bounds__(64, 1) -kernel_round3(uint size, char *ht_src, char *ht_dst, uint *debug) +kernel_round3(char *ht_src, char *ht_dst, uint *debug) { - equihash_round(size, 3, ht_src, ht_dst, debug); + equihash_round(3, ht_src, ht_dst, debug); } __global__ void -__launch_bounds__(64, 1) -kernel_round4(uint size, char *ht_src, char *ht_dst, uint *debug) +kernel_round4(char *ht_src, char *ht_dst, uint *debug) { - equihash_round(size, 4, ht_src, ht_dst, debug); + equihash_round(4, ht_src, ht_dst, debug); } __global__ void -__launch_bounds__(64, 1) -kernel_round5(uint size, char *ht_src, char *ht_dst, uint *debug) +kernel_round5(char *ht_src, char *ht_dst, uint *debug) { - equihash_round(size, 5, ht_src, ht_dst, debug); + equihash_round(5, ht_src, ht_dst, debug); } __global__ void -__launch_bounds__(64, 1) -kernel_round6(uint size, char *ht_src, char *ht_dst, uint *debug) +kernel_round6(char *ht_src, char *ht_dst, uint *debug) { - equihash_round(size, 6, ht_src, ht_dst, debug); + equihash_round(6, ht_src, ht_dst, debug); } __global__ void -__launch_bounds__(64, 1) -kernel_round7(uint size, char *ht_src, char *ht_dst, uint *debug) +kernel_round7(char *ht_src, char *ht_dst, uint *debug) { - equihash_round(size, 7, ht_src, ht_dst, debug); + equihash_round(7, ht_src, ht_dst, debug); } // kernel_round8 takes an extra argument, "sols" __global__ void -__launch_bounds__(64, 1) -kernel_round8(uint size, char *ht_src, char *ht_dst, uint *debug, sols_t *sols) +kernel_round8(char *ht_src, char *ht_dst, uint *debug, sols_t *sols) { uint tid = blockIdx.x * blockDim.x + threadIdx.x; - equihash_round(size, 8, ht_src, ht_dst, debug); + equihash_round(8, ht_src, ht_dst, debug); if (!tid) sols->nr = sols->likely_invalids = 0; } @@ -789,28 +782,28 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas kernel_round0<<>>((ulong*)buf_blake_st, miner->buf_ht[round & 1], (uint*)miner->buf_dbg); break; case 1: - kernel_round1<<>>(NR_ROWS, miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); + kernel_round1<<>>(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); break; case 2: - kernel_round2<<< totalblocks, threadsperblock>>>(NR_ROWS, miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); + kernel_round2<<< totalblocks, threadsperblock>>>(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); break; case 3: - kernel_round3<<>>(NR_ROWS, miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); + kernel_round3<<>>(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); break; case 4: - kernel_round4<<>>(NR_ROWS, miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); + kernel_round4<<>>(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); break; case 5: - kernel_round5<<>>(NR_ROWS, miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); + kernel_round5<<>>(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); break; case 6: - kernel_round6<<>>(NR_ROWS, miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); + kernel_round6<<>>(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); break; case 7: - kernel_round7<<>>(NR_ROWS, miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); + kernel_round7<<>>(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg); break; case 8: - kernel_round8<<>>(NR_ROWS, miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg, (sols_t*)miner->buf_sols); + kernel_round8<<>>(miner->buf_ht[(round - 1) & 1], miner->buf_ht[round & 1], (uint*)miner->buf_dbg, (sols_t*)miner->buf_sols); break; } if (cancelf()) return; diff --git a/nheqminer/nheqminer.vcxproj b/nheqminer/nheqminer.vcxproj index 815f10fac..a15fe80a9 100644 --- a/nheqminer/nheqminer.vcxproj +++ b/nheqminer/nheqminer.vcxproj @@ -84,7 +84,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_OCL_XMP;USE_OCL_SILENTARMY;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;USE_CPU_TROMP;USE_CPU_XENONCAT;USE_CUDA_TROMP;USE_OCL_XMP;USE_OCL_SILENTARMY;CONSOLE_COLORS;%(PreprocessorDefinitions) NotSet -D_WIN32_WINNT=0x0601 %(AdditionalOptions) 4068;4996;4503;4267;4180;4290;4244;4800;4334;4251 diff --git a/ocl_silentarmy/zcash/gpu/input.cl b/ocl_silentarmy/zcash/gpu/input.cl index e52712e33..6d07bb9c3 100644 --- a/ocl_silentarmy/zcash/gpu/input.cl +++ b/ocl_silentarmy/zcash/gpu/input.cl @@ -636,8 +636,7 @@ void kernel_round8(__global char *ht_src, __global char *ht_dst, uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) { - return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + - slot * SLOT_LEN + xi_offset - 4); + return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + slot * SLOT_LEN + xi_offset - 4); } void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,