From be7a19c8b9fbe8e8565ad1d6a9e688c6c0b588d1 Mon Sep 17 00:00:00 2001 From: maztheman Date: Thu, 15 Dec 2016 10:06:19 -0700 Subject: [PATCH] some fixes and probably issues too :> --- cpu_tromp/cpu_tromp.vcxproj | 7 +- cpu_tromp/equi.h | 6 +- cpu_xenoncat/cpu_xenoncat.vcxproj | 5 +- cuda_silentarmy/cuda_silentarmy.vcxproj | 8 +- cuda_silentarmy/kernel.cu | 18 +- cuda_silentarmy/sa_blake.cpp | 2 +- .../cuda_silentarmy_sm30.vcxproj | 7 +- cuda_silentarmy_sm30/kernel.cu | 370 ++++++++++++------ cuda_tromp/cuda_tromp.vcxproj | 10 +- nheqminer/main.cpp | 5 +- nheqminer/nheqminer.vcxproj | 3 +- ocl_device_utils/ocl_device_utils.cpp | 114 ++---- ocl_device_utils/ocl_device_utils.vcxproj | 3 +- ocl_device_utils/opencl.cpp | 93 +++-- ocl_device_utils/opencl.h | 2 +- ocl_silentarmy/ocl_silentarmy.cpp | 13 +- ocl_silentarmy/ocl_silentarmy.vcxproj | 3 +- ocl_silentarmy/sa_blake.cpp | 2 +- ocl_silentarmy/zcash/gpu/kernel.cl | 6 +- ocl_xpm/ocl_xmp.cpp | 2 +- ocl_xpm/ocl_xpm.vcxproj | 3 +- 21 files changed, 399 insertions(+), 283 deletions(-) diff --git a/cpu_tromp/cpu_tromp.vcxproj b/cpu_tromp/cpu_tromp.vcxproj index f3156f386..50a3afad8 100644 --- a/cpu_tromp/cpu_tromp.vcxproj +++ b/cpu_tromp/cpu_tromp.vcxproj @@ -18,6 +18,7 @@ {6C180164-4DBE-45D7-85E0-7BDFACF3FC7B} Win32Proj cpu_tromp + 8.1 @@ -29,14 +30,14 @@ DynamicLibrary false - v140 + v120 true MultiByte DynamicLibrary false - v140 + v120 true MultiByte @@ -93,7 +94,7 @@ true true WIN32;NDEBUG;_WINDOWS;_USRDLL;CPU_TROMP_EXPORTS;%(PreprocessorDefinitions) - AdvancedVectorExtensions + NotSet 4244;4334 ..\3rdparty\include;%(AdditionalIncludeDirectories) diff --git a/cpu_tromp/equi.h b/cpu_tromp/equi.h index 2b10ce7b2..b9237c359 100644 --- a/cpu_tromp/equi.h +++ b/cpu_tromp/equi.h @@ -49,7 +49,7 @@ typedef u32 proof[PROOFSIZE]; void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen) { uint32_t le_N = WN; uint32_t le_K = WK; - uchar personal[] = "ZcashPoW01230123"; + uchar personal[] = "DeepWebCa01230123"; memcpy(personal+8, &le_N, 4); memcpy(personal+12, &le_K, 4); blake2b_param P[1]; @@ -73,7 +73,7 @@ enum verify_code { POW_OK, POW_DUPLICATE, POW_OUT_OF_ORDER, POW_NONZERO_XOR }; const char *errstr[] = { "OK", "duplicate index", "indices out of order", "nonzero xor" }; void genhash(blake2b_state *ctx, u32 idx, uchar *hash) { - constexpr int hash_size = WN / 8; + const int hash_size = WN / 8; blake2b_state state = *ctx; u32 leb = (idx / HASHESPERBLAKE); blake2b_update(&state, (uchar *)&leb, sizeof(u32)); @@ -83,7 +83,7 @@ void genhash(blake2b_state *ctx, u32 idx, uchar *hash) { } int verifyrec(blake2b_state *ctx, u32 *indices, uchar *hash, int r) { - constexpr int hash_size = WN / 8; + const int hash_size = WN / 8; if (r == 0) { genhash(ctx, *indices, hash); return POW_OK; diff --git a/cpu_xenoncat/cpu_xenoncat.vcxproj b/cpu_xenoncat/cpu_xenoncat.vcxproj index 7d761baa1..96d3a181f 100644 --- a/cpu_xenoncat/cpu_xenoncat.vcxproj +++ b/cpu_xenoncat/cpu_xenoncat.vcxproj @@ -14,6 +14,7 @@ {299E011B-5242-4EDA-B2F2-73C9B48F12FD} Win32Proj cpu_xenoncat + 8.1 @@ -25,7 +26,7 @@ StaticLibrary false - v140 + v120 true MultiByte @@ -77,7 +78,7 @@ asm\fasm.exe asm\xenoncat_AVX2.asm asm\xenoncatavx2.obj true WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) true - AdvancedVectorExtensions + NotSet Console diff --git a/cuda_silentarmy/cuda_silentarmy.vcxproj b/cuda_silentarmy/cuda_silentarmy.vcxproj index 609e5e393..76f286295 100644 --- a/cuda_silentarmy/cuda_silentarmy.vcxproj +++ b/cuda_silentarmy/cuda_silentarmy.vcxproj @@ -22,6 +22,8 @@ {76AC1E54-C6AC-465B-AF41-416B2C3874C1} cuda_silentarmy 8.1 + + @@ -48,7 +50,7 @@ false true MultiByte - v140 + v120 @@ -142,7 +144,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" true true WIN32;WIN64;_LIB;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - AdvancedVectorExtensions + NotSet true @@ -157,7 +159,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" 64 - compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30; + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30; true true true diff --git a/cuda_silentarmy/kernel.cu b/cuda_silentarmy/kernel.cu index d3ef84e37..5f773a9a9 100644 --- a/cuda_silentarmy/kernel.cu +++ b/cuda_silentarmy/kernel.cu @@ -2026,7 +2026,7 @@ __device__ void potential_sol(char **htabs, uint ref0, uint ref1) sols.valid[sol_i] = 1; } -constexpr uint c_kernel_sol_counters = 32768 * (THRD / THREADS_PER_ROW); +const uint c_kernel_sol_counters = 32768 * (THRD / THREADS_PER_ROW); __device__ uint kernel_sol_counters[c_kernel_sol_counters]; /* @@ -2354,11 +2354,11 @@ checkCudaErrors(cudaDeviceSynchronize()); static inline void solve_new(c_context *miner, unsigned round) { - constexpr uint32_t INIT_THREADS = 256; - constexpr uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / INIT_THREADS; + const uint32_t INIT_THREADS = 256; + const uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / INIT_THREADS; - constexpr uint32_t ROUND_THREADS = THRD; - constexpr uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS; + const uint32_t ROUND_THREADS = THRD; + const uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS; static uint32_t ROUND0_DIM = select_work_size_blake() / ROUND_THREADS; // Now on every round!!!! @@ -2404,11 +2404,11 @@ static inline void solve_new(c_context *miner, unsigned round) static inline void solve_old(unsigned round, c_context *miner) { - constexpr uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / 256; - constexpr uint32_t INIT_THREADS = 256; + const uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / 256; + const uint32_t INIT_THREADS = 256; - constexpr uint32_t ROUND_THREADS = THRD; - constexpr uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS; + const uint32_t ROUND_THREADS = THRD; + const uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS; static uint32_t ROUND0_DIM = select_work_size_blake() / ROUND_THREADS; switch (round) { diff --git a/cuda_silentarmy/sa_blake.cpp b/cuda_silentarmy/sa_blake.cpp index c3d6f5b78..7d0dfa13d 100644 --- a/cuda_silentarmy/sa_blake.cpp +++ b/cuda_silentarmy/sa_blake.cpp @@ -39,7 +39,7 @@ void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); for (uint32_t i = 1; i <= 5; i++) st->h[i] = blake2b_iv[i]; - st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; + st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa"; st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); st->bytes = 0; } diff --git a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj index 7d7843933..9efc78965 100644 --- a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj +++ b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj @@ -22,6 +22,8 @@ {53E62B3D-3FA6-4B53-8175-2B93753D98C4} cuda_silentarmy_sm30 8.1 + + @@ -31,7 +33,7 @@ v140 - Application + DynamicLibrary true MultiByte v140 @@ -93,7 +95,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" Level3 Disabled - WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + WIN32;WIN64;_LIB;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) true @@ -151,6 +153,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" 64 compute_30,sm_30;compute_20,sm_20 true + true diff --git a/cuda_silentarmy_sm30/kernel.cu b/cuda_silentarmy_sm30/kernel.cu index 0434f6a52..459b94b3a 100644 --- a/cuda_silentarmy_sm30/kernel.cu +++ b/cuda_silentarmy_sm30/kernel.cu @@ -63,7 +63,7 @@ void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); for (uint32_t i = 1; i <= 5; i++) st->h[i] = blake2b_iv[i]; - st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; + st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa"; st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); st->bytes = 0; } @@ -147,9 +147,9 @@ void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen) #define xi_offset_for_round(round) (8 + ((round) / 2) * 4) -constexpr uint32_t c_NR_SLOTS = NR_SLOTS; -constexpr uint32_t c_ROW_LEN = c_NR_SLOTS * SLOT_LEN; -//constexpr uint32_t c_NR_ROWS = NR_ROWS; +const uint32_t c_NR_SLOTS = NR_SLOTS; +const uint32_t c_ROW_LEN = c_NR_SLOTS * SLOT_LEN; +//const uint32_t c_NR_ROWS = NR_ROWS; #define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) @@ -191,14 +191,15 @@ __constant__ uint64_t blake_iv[] = }; -__global__ void kernel_init_0() +__global__ void kernel_init_0(int offset) { - rowCounter0[(blockDim.x * blockIdx.x) + threadIdx.x] = 0; + rowCounter0[(blockDim.x * blockIdx.x) + threadIdx.x + offset] = 0; } -__global__ void kernel_init_1() + +__global__ void kernel_init_1(int offset) { - rowCounter1[(blockDim.x * blockIdx.x) + threadIdx.x] = 0; + rowCounter1[(blockDim.x * blockIdx.x) + threadIdx.x + offset] = 0; } @@ -206,6 +207,136 @@ typedef uint64_t ulong; typedef uint32_t uint; typedef uint8_t uchar; +#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + + +/* +** Access a half-aligned long, that is a long aligned on a 4-byte boundary. +*/ +__device__ ulong half_aligned_long(ulong *p, uint offset) +{ + return + (((ulong)*(uint *)((char *)p + offset + 0)) << 0) | + (((ulong)*(uint *)((char *)p + offset + 4)) << 32); +} + +/* +** Access a well-aligned int. +*/ +__device__ uint well_aligned_int(ulong *_p, uint offset) +{ + char *p = (char *)_p; + return *(uint *)(p + offset); +} + +__device__ uint xor_and_store3(char* ht, uint tid, uint slot_a, uint slot_b, ulong* a, ulong* b, uint* rowCounters) +{ + ulong xi0, xi1, xi2, xi3; + char *p; + uint i = ENCODE_INPUTS(tid, slot_a, slot_b); + + /* + + (((ulong)*(uint *)((char *)p + offset + 0)) << 0) | + (((ulong)*(uint *)((char *)p + offset + 4)) << 32); + + */ + + /* + char *p = (char *)_p; + return *(uint *)(p + offset); + + */ + uint16_t a0, a1, a2, a3; + ulong a0h, a0l, test2, test3; + + asm volatile ("{\n\t" + //".reg .b16 a0,a1,a2,a3;\n\t" + "ld.global.b64 %0, [%1];\n\t" + //"mov.b64 {%4, %3, %2, %1}, %0;\n\t" +// "mov.b64 %1, {a0, a1, a2, a3};\n\t" + "}\n\t" : "=l"(test3) : // , "=h"(a0), "=h"(a1), "=h"(a2), "=h"(a3) : + "l"(a)); + + ulong test1 = half_aligned_long(a, 0); + + printf("test1 %lX | %lX | %02X %02X %02X %02X\n", test1, test3, a0, a1, a2, a3); + + + // xor 20 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); + xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16); + xi3 = 0; + + if (!xi0 && !xi1) + return 0; + + uint row; + + row = ((xi0 & 0xf0000) >> 0) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); + + xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); + xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); + xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); + + uint cnt = atomicAdd(&rowCounters[row], 1); + if (cnt >= c_NR_SLOTS) { + // avoid overflows + atomicSub(&rowCounters[row], 1); + return 1; + } + + + + p = ht + row * c_ROW_LEN; + p += cnt * SLOT_LEN + 12;//xi_offset is 12 + // store "i" (always 4 bytes before Xi) + *(uint *)(p - 4) = i; + + *(uint *)(p + 0) = xi0; + *(ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); + *(uint *)(p + 12) = (xi1 >> 32); + + return 0; +} + __device__ uint ht_store(uint round, char *ht, uint i, ulong xi0, ulong xi1, ulong xi2, ulong xi3, uint *rowCounters) { @@ -275,6 +406,16 @@ __device__ uint ht_store(uint round, char *ht, uint i, } else if (round == 3) { + /*ulong* p1 = (ulong*)p, *p2 = (ulong*)(p + 8); + uint xi0l, xi0h, xi1l, xi1h; + asm("{\n\t" + "mov.b64 {%0, %1}, %6\n\t" + "mov.b64 {%2, %3}, %7\n\t" + "st.global.b64 [%4], {%2, %0}\n\t" + "st.global.b64 [%5], {%1, %3}\n\t" + "}\n\t" : "=r"(xi0l), "=r"(xi0h), "=r"(xi1l), "=r"(xi1h), "=l"(p1), "=l"(p2) : + "l"(xi0), "l"(xi1) + );*/ // store 16 bytes *(uint *)(p + 0) = xi0; *(ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); @@ -319,15 +460,14 @@ vc = (vc + vd); \ vb = rotate((vb ^ vc), (ulong)64 - 63); __global__ -void kernel_round0(char *ht, uint32_t inputs_per_thread) +void kernel_round0(char *ht, uint32_t inputs_per_thread, int offset) { typedef uint64_t ulong; uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; uint64_t v[16]; - //uint32_t inputs_per_thread = c_NR_ROWS / (gridDim.x * blockDim.x); - uint32_t input = tid * inputs_per_thread; - uint32_t input_end = (tid + 1) * inputs_per_thread; + uint32_t input = (tid * inputs_per_thread) + offset; + uint32_t input_end = ((tid + 1) * inputs_per_thread) + offset; uint32_t dropped = 0; while (input < input_end) { @@ -500,60 +640,8 @@ void kernel_round0(char *ht, uint32_t inputs_per_thread) #endif } -#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) -#define DECODE_ROW(REF) (REF >> 16) -#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) -#define DECODE_SLOT0(REF) (REF & 0xff) -#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) -#define DECODE_ROW(REF) (REF >> 14) -#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) -#define DECODE_SLOT0(REF) (REF & 0x7f) - -#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ -#define DECODE_ROW(REF) (REF >> 13) -#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) -#define DECODE_ROW(REF) (REF >> 12) -#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#else -#error "unsupported NR_ROWS_LOG" -#endif - -/* -** Access a half-aligned long, that is a long aligned on a 4-byte boundary. -*/ -__device__ ulong half_aligned_long(ulong *p, uint offset) -{ - return - (((ulong)*(uint *)((char *)p + offset + 0)) << 0) | - (((ulong)*(uint *)((char *)p + offset + 4)) << 32); -} - -/* -** Access a well-aligned int. -*/ -__device__ uint well_aligned_int(ulong *_p, uint offset) -{ - char *p = (char *)_p; - return *(uint *)(p + offset); -} /* ** XOR a pair of Xi values computed at "round - 1" and store the result in the @@ -569,6 +657,10 @@ __device__ uint xor_and_store(uint round, char *ht_dst, uint row, uint slot_a, uint slot_b, ulong *a, ulong *b, uint *rowCounters) { + if (round == 3) { + return xor_and_store3(ht_dst, row, slot_a, slot_b, a, b, rowCounters); + } + ulong xi0, xi1, xi2; #if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not @@ -643,22 +735,33 @@ __device__ uint xor_and_store(uint round, char *ht_dst, uint row, xi0, xi1, xi2, 0, rowCounters); } -__device__ void equihash_round_cm3(uint round, char *ht_src, char *ht_dst, uint *rowCountersSrc, uint *rowCountersDst) + +#define addr_offset(addr, offset) ((uint*)(addr) + offset) + +__device__ void equihash_round(uint round, char *ht_src, char *ht_dst, uint *rowCountersSrc, uint *rowCountersDst, int offset) { - uint tid = blockIdx.x * blockDim.x + threadIdx.x; + uint tid = blockIdx.x * blockDim.x + threadIdx.x + offset; char *p; uint cnt; uint i, j; uint dropped_stor = 0; ulong *a, *b; + uchar xi_offsets[] = { + 8, 8, 12, 12, 16, 16, 20, 20 + }; uint xi_offset; - xi_offset = (8 + ((round - 1) / 2) * 4); - + xi_offset = xi_offsets[round - 1]; + uint* c[8]; + cnt = rowCountersSrc[tid]; cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round + if (!cnt) {// no elements in row, no collisions return; } + + ulong da[4] = { 0 }; + // find collisions p = (ht_src + tid * c_ROW_LEN) + xi_offset; for (i = 0; i < cnt; i++) { @@ -670,36 +773,33 @@ __device__ void equihash_round_cm3(uint round, char *ht_src, char *ht_dst, uint } } -#define KERNEL_ROUND_ODD_OLD(N) \ +#define KERNEL_ROUND_ODD(N) \ __global__ \ -void kernel_round_cm3_ ## N( char *ht_src, char *ht_dst) \ +void kernel_round ## N ## ( char *ht_src, char *ht_dst, int offset) \ { \ - equihash_round_cm3(N, ht_src, ht_dst, rowCounter0, rowCounter1); \ + equihash_round(N, ht_src, ht_dst, rowCounter0, rowCounter1, offset); \ } - -#define KERNEL_ROUND_EVEN_OLD(N) \ +#define KERNEL_ROUND_EVEN(N) \ __global__ \ -void kernel_round_cm3_ ## N(char *ht_src, char *ht_dst) \ +void kernel_round ## N ## (char *ht_src, char *ht_dst, int offset) \ { \ - equihash_round_cm3(N, ht_src, ht_dst, rowCounter1, rowCounter0); \ + equihash_round(N, ht_src, ht_dst, rowCounter1, rowCounter0, offset); \ } - -KERNEL_ROUND_ODD_OLD(1) -KERNEL_ROUND_EVEN_OLD(2) -KERNEL_ROUND_ODD_OLD(3) -KERNEL_ROUND_EVEN_OLD(4) -KERNEL_ROUND_ODD_OLD(5) -KERNEL_ROUND_EVEN_OLD(6) -KERNEL_ROUND_ODD_OLD(7) - +KERNEL_ROUND_ODD(1) +KERNEL_ROUND_EVEN(2) +KERNEL_ROUND_ODD(3) +KERNEL_ROUND_EVEN(4) +KERNEL_ROUND_ODD(5) +KERNEL_ROUND_EVEN(6) +KERNEL_ROUND_ODD(7) __global__ -void kernel_round_cm3_8(char *ht_src, char *ht_dst) +void kernel_round8(char *ht_src, char *ht_dst, int offset) { - uint tid = blockIdx.x * blockDim.x + threadIdx.x; - equihash_round_cm3(8, ht_src, ht_dst, rowCounter1, rowCounter0); + uint tid = blockIdx.x * blockDim.x + threadIdx.x + offset; + equihash_round(8, ht_src, ht_dst, rowCounter1, rowCounter0, offset); if (!tid) { sols.nr = sols.likely_invalids = 0; } @@ -777,11 +877,10 @@ __device__ void potential_sol(const char **htabs, uint ref0, uint ref1) ** Scan the hash tables to find Equihash solutions. */ __global__ -void kernel_sols(const char *ht0, const char *ht1) +void kernel_sols(const char *ht0, const char *ht1, int offset) { - uint tid = blockIdx.x * blockDim.x + threadIdx.x; + uint tid = blockIdx.x * blockDim.x + threadIdx.x + offset; const char *htabs[2] = { ht0, ht1 }; - //uint *hcounters[2] = { rowCounter0, rowCounter1 }; uint ht_i = (PARAM_K - 1) & 1; // table filled at last round uint cnt; uint xi_offset = xi_offset_for_round(PARAM_K - 1); @@ -791,7 +890,6 @@ void kernel_sols(const char *ht0, const char *ht1) // it's ok for the collisions array to be so small, as if it fills up // the potential solutions are likely invalid (many duplicate inputs) ulong collisions; - //uint coll; #if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 // in the final hash table, we are looking for a match on both the bits // part of the previous PREFIX colliding bits, and the last PREFIX bits. @@ -803,7 +901,7 @@ void kernel_sols(const char *ht0, const char *ht1) a = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; cnt = rowCounter0[tid]; cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round - //coll = 0; + //coll = 0; a += xi_offset; for (i = 0; i < cnt; i++, a += SLOT_LEN) { uint a_data = ((*(uint *)a) & mask); @@ -822,6 +920,7 @@ exit1: potential_sol(htabs, collisions >> 32, collisions & 0xffffffff); } + static void sort_pair(uint32_t *a, uint32_t len) { uint32_t *b = a + len; @@ -874,6 +973,9 @@ struct __align__(64) c_context { uint32_t nthreads; size_t global_ws; + cudaStream_t s1; + cudaStream_t s2; + c_context(const uint32_t n_threads) { nthreads = n_threads; } @@ -941,6 +1043,8 @@ sa_cuda_context::sa_cuda_context(int tpb, int blocks, int id) checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[0], HT_SIZE)); checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[1], HT_SIZE)); checkCudaErrors(cudaMallocHost(&eq->sols, sizeof(*eq->sols))); + checkCudaErrors(cudaStreamCreate(&eq->s1)); + checkCudaErrors(cudaStreamCreate(&eq->s2)); checkCudaErrors(cudaDeviceSynchronize()); } @@ -970,38 +1074,72 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas checkCudaErrors(cudaMemcpyToSymbol(blake, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice)); - constexpr uint32_t THREAD_SHIFT = 7; - constexpr uint32_t THREAD_COUNT = 1 << THREAD_SHIFT; - constexpr uint32_t DIM_SIZE = NR_ROWS >> THREAD_SHIFT; - - kernel_init_0 << > > (); - kernel_round0<<<1024, 64>>> (miner->buf_ht[0], 16); + //const uint32_t THREAD_SHIFT = 8; + //const uint32_t THREAD_COUNT = 1 << THREAD_SHIFT; + const uint32_t NEW_THREAD_SHIFT = 6; + const uint32_t NEW_THREAD = 1 << NEW_THREAD_SHIFT; + const uint32_t DIM_SIZE = NR_ROWS >> (NEW_THREAD_SHIFT + 1); + const uint32_t HALF_SIZE = DIM_SIZE << NEW_THREAD_SHIFT; + //const uint32_t DIM_SIZE = NR_ROWS >> THREAD_SHIFT; + //const uint32_t HALF_SIZE = DIM_SIZE * 128; + const uint32_t ROUND_0_IPT_SHIFT = 3; + const uint32_t ROUND_0_IPT = 1 << ROUND_0_IPT_SHIFT; + const uint32_t ROUND_0_DIM = NR_ROWS >> ((NEW_THREAD_SHIFT + 1) + ROUND_0_IPT_SHIFT); + const uint32_t ROUND_0_THREADS = NEW_THREAD; + + kernel_init_0 << s1 >> > (0); + kernel_init_0 << s2 >> > (HALF_SIZE); + kernel_round0<<s1>>>(miner->buf_ht[0], ROUND_0_IPT, 0); + kernel_round0<<s2>>>(miner->buf_ht[0], ROUND_0_IPT, HALF_SIZE); if (cancelf()) return; - kernel_init_1 << > > (); - kernel_round_cm3_1 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]); + kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0); + kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE); + kernel_round1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >>> (miner->buf_ht[0], miner->buf_ht[1], 0); + checkCudaErrors(cudaPeekAtLastError()); + kernel_round1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >>> (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE); + checkCudaErrors(cudaPeekAtLastError()); if (cancelf()) return; - kernel_init_0 << > > (); - kernel_round_cm3_2 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]); + kernel_init_0 << s1 >> > (0); + kernel_init_0 << s2 >> > (HALF_SIZE); + kernel_round2 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0); + kernel_round2 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE); if (cancelf()) return; - kernel_init_1 << > > (); - kernel_round_cm3_3 << > > (miner->buf_ht[0], miner->buf_ht[1]); + kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0); + kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE); + kernel_round3 << s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0); + kernel_round3 << s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE); + checkCudaErrors(cudaPeekAtLastError()); if (cancelf()) return; - kernel_init_0 << > > (); - kernel_round_cm3_4 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]); + kernel_init_0 << s1 >> > (0); + kernel_init_0 << s2 >> > (HALF_SIZE); + kernel_round4 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0); + kernel_round4 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE); if (cancelf()) return; - kernel_init_1 << > > (); - kernel_round_cm3_5 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]); + kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0); + kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE); + kernel_round5 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0); + kernel_round5 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE); if (cancelf()) return; - kernel_init_0 << > > (); - kernel_round_cm3_6 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]); + kernel_init_0 << s1 >> > (0); + kernel_init_0 << s2 >> > (HALF_SIZE); + kernel_round6 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0); + kernel_round6 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE); if (cancelf()) return; - kernel_init_1 << > > (); - kernel_round_cm3_7 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]); + kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0); + kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE); + kernel_round7 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0); + kernel_round7 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE); if (cancelf()) return; - kernel_init_0 << > > (); - kernel_round_cm3_8 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]); + kernel_init_0 << s1 >> > (0); + kernel_init_0 << s2 >> > (HALF_SIZE); + kernel_round8 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0); + kernel_round8 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE); if (cancelf()) return; - kernel_sols << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]); + kernel_sols << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0); + kernel_sols << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE); + + checkCudaErrors(cudaStreamSynchronize(miner->s1)); + checkCudaErrors(cudaStreamSynchronize(miner->s2)); checkCudaErrors(cudaMemcpyFromSymbol(miner->sols, sols, sizeof(sols_t), 0, cudaMemcpyDeviceToHost)); @@ -1022,4 +1160,4 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas } hashdonef(); -} \ No newline at end of file +} diff --git a/cuda_tromp/cuda_tromp.vcxproj b/cuda_tromp/cuda_tromp.vcxproj index df673ec3e..082c3dc46 100644 --- a/cuda_tromp/cuda_tromp.vcxproj +++ b/cuda_tromp/cuda_tromp.vcxproj @@ -30,7 +30,9 @@ {33C2B469-F025-4223-B9B6-E69D42FEA7D6} cuda_tromp - $(CUDA_PATH_V7_5) + + + 8.1 @@ -44,14 +46,14 @@ false true MultiByte - v140 + v120 DynamicLibrary false true MultiByte - v140 + v120 @@ -125,7 +127,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" 64 - compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30; + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30; true diff --git a/nheqminer/main.cpp b/nheqminer/main.cpp index 1503bb454..f4b34725a 100644 --- a/nheqminer/main.cpp +++ b/nheqminer/main.cpp @@ -265,7 +265,7 @@ int main(int argc, char* argv[]) int cuda_device_count = 0; int cuda_bc = 0; int cuda_tbpc = 0; - int opencl_platform = 0; + int opencl_platform = -1; int opencl_device_count = 0; int force_cpu_ext = -1; int opencl_t = 0; @@ -362,6 +362,9 @@ int main(int argc, char* argv[]) } } break; + case 'p': + opencl_platform = atoi(argv[++i]); + break; case 't': while (opencl_t < 8 && i + 1 < argc) { diff --git a/nheqminer/nheqminer.vcxproj b/nheqminer/nheqminer.vcxproj index a15fe80a9..627cc3f5c 100644 --- a/nheqminer/nheqminer.vcxproj +++ b/nheqminer/nheqminer.vcxproj @@ -14,6 +14,7 @@ {6FF7D209-05A3-4550-93CC-211D33503719} Win32Proj nheqminer + 8.1 @@ -25,7 +26,7 @@ Application false - v140 + v120 true MultiByte diff --git a/ocl_device_utils/ocl_device_utils.cpp b/ocl_device_utils/ocl_device_utils.cpp index d3d3d9269..5aceec21f 100644 --- a/ocl_device_utils/ocl_device_utils.cpp +++ b/ocl_device_utils/ocl_device_utils.cpp @@ -1,5 +1,5 @@ #include "ocl_device_utils.h" - +#include "opencl.h" #include #include #include @@ -14,26 +14,6 @@ std::vector ocl_device_utils::_platformNames; std::vector ocl_device_utils::_devicesPlatformsDevices; std::vector ocl_device_utils::_AllDevices; -static std::vector GetAllDevices() -{ - std::vector retval; - retval.reserve(8); - - cl_platform_id platforms[64]; - cl_uint numPlatforms; - cl_int rc = clGetPlatformIDs(sizeof(platforms) / sizeof(cl_platform_id), platforms, &numPlatforms); - - for (cl_uint i = 0; i < numPlatforms; i++) { - cl_uint numDevices = 0; - cl_device_id devices[64]; - rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices); - for (cl_uint n = 0; n < numDevices; n++) { - retval.push_back(devices[n]); - } - } - - return retval; -} vector ocl_device_utils::getPlatforms() { vector platforms; @@ -59,7 +39,10 @@ void ocl_device_utils::print_opencl_devices() { vector ocl_device_utils::getDevices(vector const& _platforms, unsigned _platformId) { vector devices; try { - _platforms[_platformId].getDevices(/*CL_DEVICE_TYPE_CPU| */CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, &devices); + auto cl_devices = GetAllDevices(); + for (auto& cl_device : cl_devices) { + devices.push_back({ cl_device }); + } } catch (Error const& err) { // if simply no devices found return empty vector @@ -79,60 +62,39 @@ bool ocl_device_utils::QueryDevices() { try { auto devices = GetAllDevices(); - for (auto& device : devices) { - _AllDevices.emplace_back(cl::Device(device)); - } - - - - - // get platforms - auto platforms = getPlatforms(); - if (platforms.empty()) { - cout << "No OpenCL platforms found" << endl; - return false; - } - else { - for (auto i_pId = 0u; i_pId < platforms.size(); ++i_pId) { - string platformName = StringnNullTerminatorFix(platforms[i_pId].getInfo()); - if (std::find(_platformNames.begin(), _platformNames.end(), platformName) == _platformNames.end()) { - PrintInfo current; - _platformNames.push_back(platformName); - // new - current.PlatformName = platformName; - current.PlatformNum = i_pId; - - auto clDevs = getDevices(platforms, i_pId); - for (auto i_devId = 0u; i_devId < clDevs.size(); ++i_devId) { - OpenCLDevice curDevice; - curDevice.DeviceID = i_devId; - curDevice._CL_DEVICE_NAME = StringnNullTerminatorFix(clDevs[i_devId].getInfo()); - switch (clDevs[i_devId].getInfo()) { - case CL_DEVICE_TYPE_CPU: - curDevice._CL_DEVICE_TYPE = "CPU"; - break; - case CL_DEVICE_TYPE_GPU: - curDevice._CL_DEVICE_TYPE = "GPU"; - break; - case CL_DEVICE_TYPE_ACCELERATOR: - curDevice._CL_DEVICE_TYPE = "ACCELERATOR"; - break; - default: - curDevice._CL_DEVICE_TYPE = "DEFAULT"; - break; - } - - - curDevice._CL_DEVICE_GLOBAL_MEM_SIZE = clDevs[i_devId].getInfo(); - curDevice._CL_DEVICE_VENDOR = StringnNullTerminatorFix(clDevs[i_devId].getInfo()); - curDevice._CL_DEVICE_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo()); - curDevice._CL_DRIVER_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo()); - - current.Devices.push_back(curDevice); - } - _devicesPlatformsDevices.push_back(current); - } + unsigned int device_num = 0; + for (auto& cldevice : devices) { + cl::Device device(cldevice); + cl::Platform platform(device.getInfo()); + _AllDevices.emplace_back(device); + PrintInfo current; + current.PlatformName = StringnNullTerminatorFix(platform.getInfo()); + current.PlatformNum = 0; + OpenCLDevice curDevice; + curDevice.DeviceID = device_num++; + curDevice._CL_DEVICE_NAME = StringnNullTerminatorFix(device.getInfo()); + + switch (device.getInfo()) { + case CL_DEVICE_TYPE_CPU: + curDevice._CL_DEVICE_TYPE = "CPU"; + break; + case CL_DEVICE_TYPE_GPU: + curDevice._CL_DEVICE_TYPE = "GPU"; + break; + case CL_DEVICE_TYPE_ACCELERATOR: + curDevice._CL_DEVICE_TYPE = "ACCELERATOR"; + break; + default: + curDevice._CL_DEVICE_TYPE = "DEFAULT"; + break; } + + curDevice._CL_DEVICE_GLOBAL_MEM_SIZE = device.getInfo(); + curDevice._CL_DEVICE_VENDOR = StringnNullTerminatorFix(device.getInfo()); + curDevice._CL_DEVICE_VERSION = StringnNullTerminatorFix(device.getInfo()); + curDevice._CL_DRIVER_VERSION = StringnNullTerminatorFix(device.getInfo()); + current.Devices.push_back(curDevice); + _devicesPlatformsDevices.push_back(current); } } catch (exception &ex) { @@ -150,7 +112,7 @@ int ocl_device_utils::GetCountForPlatform(int platformID) { for (const auto &platInfo : _devicesPlatformsDevices) { if (platformID == platInfo.PlatformNum) { - return platInfo.Devices.size(); + return (int)platInfo.Devices.size(); } } return 0; diff --git a/ocl_device_utils/ocl_device_utils.vcxproj b/ocl_device_utils/ocl_device_utils.vcxproj index 307e3f32d..b89529a1c 100644 --- a/ocl_device_utils/ocl_device_utils.vcxproj +++ b/ocl_device_utils/ocl_device_utils.vcxproj @@ -24,6 +24,7 @@ {5DBCE38A-C8D2-4498-A92A-9AF8D5196135} Win32Proj ocl_device_utils + 8.1 @@ -35,7 +36,7 @@ StaticLibrary false - v140 + v120 true Unicode diff --git a/ocl_device_utils/opencl.cpp b/ocl_device_utils/opencl.cpp index af5704adf..469188f1c 100644 --- a/ocl_device_utils/opencl.cpp +++ b/ocl_device_utils/opencl.cpp @@ -3,11 +3,15 @@ #include #include #include +#include + +#include "ocl_device_utils.h" extern cl_platform_id gPlatform; // extern cl_program gProgram; -std::vector GetAllDevices() + +/*static std::vector GetAllDevices() { std::vector retval; retval.reserve(8); @@ -19,65 +23,58 @@ std::vector GetAllDevices() for (cl_uint i = 0; i < numPlatforms; i++) { cl_uint numDevices = 0; cl_device_id devices[64]; - rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices); + rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices); for (cl_uint n = 0; n < numDevices; n++) { retval.push_back(devices[n]); } } + return retval; +}*/ + +std::vector GetAllDevices(int platform_id) +{ + + std::vector retval; + retval.reserve(8); + + cl_platform_id platforms[64]; + cl_uint numPlatforms; + cl_int rc = clGetPlatformIDs(sizeof(platforms) / sizeof(cl_platform_id), platforms, &numPlatforms); + + for (cl_uint i = 0; i < numPlatforms; i++) { + + if (platform_id != -1 && i != platform_id) { + continue; + } + cl_uint numDevices = 0; + cl_device_id devices[64]; + rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices); + for (cl_uint n = 0; n < numDevices; n++) { + cl_device_id device = devices[n]; + if (std::find_if(retval.begin(), retval.end(), [&device](cl_device_id p) { + return (p == device); + }) == retval.end()) { + retval.push_back(device); + } + } + } + return retval; } bool clInitialize(int requiredPlatform, std::vector &gpus) { - cl_platform_id platforms[64]; - cl_uint numPlatforms; - OCLR(clGetPlatformIDs(sizeof(platforms)/sizeof(cl_platform_id), platforms, &numPlatforms), false); - if (!numPlatforms) { - printf(" no OpenCL platforms found\n"); - return false; - } - - /*int platformIdx = -1; - if (requiredPlatform) { - for (decltype(numPlatforms) i = 0; i < numPlatforms; i++) { - char name[1024] = {0}; - OCLR(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, 0), false); - printf("found platform[%i] name = '%s'\n", (int)i, name); - if (strcmp(name, requiredPlatform) == 0) { - platformIdx = i; - break; - } - } - } else { - platformIdx = 0; - }*/ + gpus = GetAllDevices(); - int platformIdx = requiredPlatform; - - - if (platformIdx == -1) { - printf(" platform %s not exists\n", requiredPlatform); - return false; - } - - gPlatform = platforms[platformIdx]; - - cl_uint numDevices = 0; - cl_device_id devices[64]; - clGetDeviceIDs(gPlatform, CL_DEVICE_TYPE_GPU, sizeof(devices)/sizeof(cl_device_id), devices, &numDevices); - if (numDevices) { - printf(" found %d devices\n", numDevices); - } else { - printf(" no OpenCL GPU devices found.\n"); - return false; - } + if (gpus.empty()) { + printf(" no OpenCL platforms found\n"); + return false; + } - for (decltype(numDevices) i = 0; i < numDevices; i++) { - gpus.push_back(devices[i]); - } - - return true; + printf(" found %d devices\n", gpus.size()); + + return true; } bool clCompileKernel(cl_context gContext, diff --git a/ocl_device_utils/opencl.h b/ocl_device_utils/opencl.h index 26f01df27..fa559665b 100644 --- a/ocl_device_utils/opencl.h +++ b/ocl_device_utils/opencl.h @@ -115,7 +115,7 @@ class clBuffer { }; -std::vector GetAllDevices(); +std::vector GetAllDevices(int platform_id = -1); bool clInitialize(int requiredPlatform, std::vector &gpus); bool clCompileKernel(cl_context gContext, diff --git a/ocl_silentarmy/ocl_silentarmy.cpp b/ocl_silentarmy/ocl_silentarmy.cpp index cfb641f33..c4a2cc7b0 100644 --- a/ocl_silentarmy/ocl_silentarmy.cpp +++ b/ocl_silentarmy/ocl_silentarmy.cpp @@ -367,7 +367,7 @@ ocl_silentarmy::ocl_silentarmy(int platf_id, int dev_id) { } std::string ocl_silentarmy::getdevinfo() { - static auto devices = GetAllDevices(); + static auto devices = GetAllDevices(platform_id); auto device = devices[device_id]; std::vector name(256, 0); size_t nActualSize = 0; @@ -387,7 +387,7 @@ int ocl_silentarmy::getcount() { } void ocl_silentarmy::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) { - static auto devices = GetAllDevices(); + static auto devices = GetAllDevices(platf_id); if (devices.size() <= d_id) { return; @@ -420,20 +420,23 @@ void ocl_silentarmy::start(ocl_silentarmy& device_context) { /*TODO*/ device_context.is_init_success = false; device_context.oclc = new OclContext; - auto devices = GetAllDevices(); + auto devices = GetAllDevices(device_context.platform_id); + + printf("pid %i, size %u\n", device_context.platform_id, devices.size()); auto device = devices[device_context.device_id]; size_t nActualSize = 0; cl_platform_id platform_id = nullptr; - cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, &nActualSize); + cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, nullptr); + device_context.oclc->_dev_id = device; device_context.oclc->platform_id = platform_id; // context create cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)device_context.oclc->platform_id, 0 }; cl_int error; - device_context.oclc->_context = clCreateContext(NULL, 1, &device, 0, 0, &error); + device_context.oclc->_context = clCreateContext(props, 1, &device, 0, 0, &error); //OCLR(error, false); if (cl_int err = error) { printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__); diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj b/ocl_silentarmy/ocl_silentarmy.vcxproj index a7440fa03..77771fcb6 100644 --- a/ocl_silentarmy/ocl_silentarmy.vcxproj +++ b/ocl_silentarmy/ocl_silentarmy.vcxproj @@ -27,6 +27,7 @@ {AB01E715-795A-4089-8DF0-AE6EBDC1AB48} Win32Proj ocl_silentarmy + 8.1 @@ -38,7 +39,7 @@ StaticLibrary false - v140 + v120 true Unicode diff --git a/ocl_silentarmy/sa_blake.cpp b/ocl_silentarmy/sa_blake.cpp index c10800de8..5a3b321bb 100644 --- a/ocl_silentarmy/sa_blake.cpp +++ b/ocl_silentarmy/sa_blake.cpp @@ -39,7 +39,7 @@ void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len, st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len); for (uint32_t i = 1; i <= 5; i++) st->h[i] = blake2b_iv[i]; - st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW"; + st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa"; st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n); st->bytes = 0; } diff --git a/ocl_silentarmy/zcash/gpu/kernel.cl b/ocl_silentarmy/zcash/gpu/kernel.cl index 1b63cbb3f..8ba3d6283 100644 --- a/ocl_silentarmy/zcash/gpu/kernel.cl +++ b/ocl_silentarmy/zcash/gpu/kernel.cl @@ -7,7 +7,7 @@ #define APX_NR_ELMS_LOG (PREFIX + 1) // Number of rows and slots is affected by this. 20 offers the best performance // but occasionally misses ~1% of solutions. -#define NR_ROWS_LOG 18 +#define NR_ROWS_LOG 20 // Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md #define OPTIM_SIMPLIFY_ROUND 1 @@ -15,10 +15,10 @@ // Number of collision items to track, per thread #ifdef cl_nv_pragma_unroll // NVIDIA #define THREADS_PER_ROW 16 -#define LDS_COLL_SIZE (NR_SLOTS * 24 * (64 / THREADS_PER_ROW)) +#define LDS_COLL_SIZE (NR_SLOTS * 24 * (THRD / THREADS_PER_ROW)) #else #define THREADS_PER_ROW 8 -#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW)) +#define LDS_COLL_SIZE (NR_SLOTS * 8 * (THRD / THREADS_PER_ROW)) #endif // Ratio of time of sleeping before rechecking if task is done (0-1) diff --git a/ocl_xpm/ocl_xmp.cpp b/ocl_xpm/ocl_xmp.cpp index d0a96a2a8..4064e3626 100644 --- a/ocl_xpm/ocl_xmp.cpp +++ b/ocl_xpm/ocl_xmp.cpp @@ -105,7 +105,7 @@ static void setheader(blake2b_state *ctx, const char *header, const uint32_t hea { uint32_t le_N = WN; uint32_t le_K = WK; - char personal[] = "ZcashPoW01230123"; + char personal[] = "DeepWebCa01230123"; memcpy(personal + 8, &le_N, 4); memcpy(personal + 12, &le_K, 4); blake2b_param P[1]; diff --git a/ocl_xpm/ocl_xpm.vcxproj b/ocl_xpm/ocl_xpm.vcxproj index 8d3a4645f..a88544382 100644 --- a/ocl_xpm/ocl_xpm.vcxproj +++ b/ocl_xpm/ocl_xpm.vcxproj @@ -25,6 +25,7 @@ {5EC9EDEB-8E49-4126-9161-1560683CBC71} Win32Proj ocl_xpm + 8.1 @@ -36,7 +37,7 @@ StaticLibrary false - v140 + v120 true MultiByte