diff --git a/cpu_tromp/cpu_tromp.vcxproj b/cpu_tromp/cpu_tromp.vcxproj
index f3156f386..50a3afad8 100644
--- a/cpu_tromp/cpu_tromp.vcxproj
+++ b/cpu_tromp/cpu_tromp.vcxproj
@@ -18,6 +18,7 @@
{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}
Win32Proj
cpu_tromp
+ 8.1
@@ -29,14 +30,14 @@
DynamicLibrary
false
- v140
+ v120
true
MultiByte
DynamicLibrary
false
- v140
+ v120
true
MultiByte
@@ -93,7 +94,7 @@
true
true
WIN32;NDEBUG;_WINDOWS;_USRDLL;CPU_TROMP_EXPORTS;%(PreprocessorDefinitions)
- AdvancedVectorExtensions
+ NotSet
4244;4334
..\3rdparty\include;%(AdditionalIncludeDirectories)
diff --git a/cpu_tromp/equi.h b/cpu_tromp/equi.h
index 2b10ce7b2..b9237c359 100644
--- a/cpu_tromp/equi.h
+++ b/cpu_tromp/equi.h
@@ -49,7 +49,7 @@ typedef u32 proof[PROOFSIZE];
void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen) {
uint32_t le_N = WN;
uint32_t le_K = WK;
- uchar personal[] = "ZcashPoW01230123";
+ uchar personal[] = "DeepWebCa01230123";
memcpy(personal+8, &le_N, 4);
memcpy(personal+12, &le_K, 4);
blake2b_param P[1];
@@ -73,7 +73,7 @@ enum verify_code { POW_OK, POW_DUPLICATE, POW_OUT_OF_ORDER, POW_NONZERO_XOR };
const char *errstr[] = { "OK", "duplicate index", "indices out of order", "nonzero xor" };
void genhash(blake2b_state *ctx, u32 idx, uchar *hash) {
- constexpr int hash_size = WN / 8;
+ const int hash_size = WN / 8;
blake2b_state state = *ctx;
u32 leb = (idx / HASHESPERBLAKE);
blake2b_update(&state, (uchar *)&leb, sizeof(u32));
@@ -83,7 +83,7 @@ void genhash(blake2b_state *ctx, u32 idx, uchar *hash) {
}
int verifyrec(blake2b_state *ctx, u32 *indices, uchar *hash, int r) {
- constexpr int hash_size = WN / 8;
+ const int hash_size = WN / 8;
if (r == 0) {
genhash(ctx, *indices, hash);
return POW_OK;
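
For context, a minimal host-side sketch of how setheader() assembles the 16-byte BLAKE2b personalization after this change (assuming the stock Equihash parameters N=200, K=9 and the usual 16-byte personal field; names and printout are illustrative only). Because "DeepWebCa" is nine characters, bytes 8..15 of the template are overwritten by the little-endian N and K, so only the eight-byte prefix "DeepWebC" actually distinguishes the personalization:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    const uint32_t le_N = 200, le_K = 9;          // assumed WN/WK values
    unsigned char personal[] = "DeepWebCa01230123";
    std::memcpy(personal + 8,  &le_N, 4);         // bytes 8..11  <- N (little-endian)
    std::memcpy(personal + 12, &le_K, 4);         // bytes 12..15 <- K (little-endian)

    for (int i = 0; i < 16; i++)                  // the 16 bytes BLAKE2b will see
        std::printf("%02x ", personal[i]);
    std::printf("\n");
    return 0;
}
```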
diff --git a/cpu_xenoncat/cpu_xenoncat.vcxproj b/cpu_xenoncat/cpu_xenoncat.vcxproj
index 7d761baa1..96d3a181f 100644
--- a/cpu_xenoncat/cpu_xenoncat.vcxproj
+++ b/cpu_xenoncat/cpu_xenoncat.vcxproj
@@ -14,6 +14,7 @@
{299E011B-5242-4EDA-B2F2-73C9B48F12FD}
Win32Proj
cpu_xenoncat
+ 8.1
@@ -25,7 +26,7 @@
StaticLibrary
false
- v140
+ v120
true
MultiByte
@@ -77,7 +78,7 @@ asm\fasm.exe asm\xenoncat_AVX2.asm asm\xenoncatavx2.obj
true
WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)
true
- AdvancedVectorExtensions
+ NotSet
Console
diff --git a/cuda_silentarmy/cuda_silentarmy.vcxproj b/cuda_silentarmy/cuda_silentarmy.vcxproj
index 609e5e393..76f286295 100644
--- a/cuda_silentarmy/cuda_silentarmy.vcxproj
+++ b/cuda_silentarmy/cuda_silentarmy.vcxproj
@@ -22,6 +22,8 @@
{76AC1E54-C6AC-465B-AF41-416B2C3874C1}
cuda_silentarmy
8.1
+
+
@@ -48,7 +50,7 @@
false
true
MultiByte
- v140
+ v120
@@ -142,7 +144,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
true
true
WIN32;WIN64;_LIB;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
- AdvancedVectorExtensions
+ NotSet
true
@@ -157,7 +159,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
64
- compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;
+ compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;
true
true
true
diff --git a/cuda_silentarmy/kernel.cu b/cuda_silentarmy/kernel.cu
index d3ef84e37..5f773a9a9 100644
--- a/cuda_silentarmy/kernel.cu
+++ b/cuda_silentarmy/kernel.cu
@@ -2026,7 +2026,7 @@ __device__ void potential_sol(char **htabs, uint ref0, uint ref1)
sols.valid[sol_i] = 1;
}
-constexpr uint c_kernel_sol_counters = 32768 * (THRD / THREADS_PER_ROW);
+const uint c_kernel_sol_counters = 32768 * (THRD / THREADS_PER_ROW);
__device__ uint kernel_sol_counters[c_kernel_sol_counters];
/*
@@ -2354,11 +2354,11 @@ checkCudaErrors(cudaDeviceSynchronize());
static inline void solve_new(c_context *miner, unsigned round)
{
- constexpr uint32_t INIT_THREADS = 256;
- constexpr uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / INIT_THREADS;
+ const uint32_t INIT_THREADS = 256;
+ const uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / INIT_THREADS;
- constexpr uint32_t ROUND_THREADS = THRD;
- constexpr uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS;
+ const uint32_t ROUND_THREADS = THRD;
+ const uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS;
static uint32_t ROUND0_DIM = select_work_size_blake() / ROUND_THREADS;
// Now on every round!!!!
@@ -2404,11 +2404,11 @@ static inline void solve_new(c_context *miner, unsigned round)
static inline void solve_old(unsigned round, c_context *miner)
{
- constexpr uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / 256;
- constexpr uint32_t INIT_THREADS = 256;
+ const uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / 256;
+ const uint32_t INIT_THREADS = 256;
- constexpr uint32_t ROUND_THREADS = THRD;
- constexpr uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS;
+ const uint32_t ROUND_THREADS = THRD;
+ const uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS;
static uint32_t ROUND0_DIM = select_work_size_blake() / ROUND_THREADS;
switch (round) {
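
The constexpr-to-const substitutions in this file (and in the sm30 kernel below) line up with the toolset downgrade to v120: Visual Studio 2013 presumably does not accept constexpr, while a const integral with a constant initializer is still usable as an array bound or launch dimension. A minimal CUDA sketch of the pattern, with illustrative macros standing in for the miner's configuration:

```cpp
#include <cstdint>

#define THRD 64              // stand-ins for the real configuration macros
#define THREADS_PER_ROW 16

// A const integral initialized from a constant expression still counts as a
// compile-time constant, so the device array size below stays valid even
// without constexpr support in the compiler. Build as a .cu file with nvcc.
const uint32_t c_counters = 32768 * (THRD / THREADS_PER_ROW);
__device__ uint32_t counters[c_counters];

int main() { return 0; }
```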
diff --git a/cuda_silentarmy/sa_blake.cpp b/cuda_silentarmy/sa_blake.cpp
index c3d6f5b78..7d0dfa13d 100644
--- a/cuda_silentarmy/sa_blake.cpp
+++ b/cuda_silentarmy/sa_blake.cpp
@@ -39,7 +39,7 @@ void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
for (uint32_t i = 1; i <= 5; i++)
st->h[i] = blake2b_iv[i];
- st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
+ st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa";
st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
st->bytes = 0;
}
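
A small sketch of what the `*(uint64_t *)"DeepWebCa"` cast actually contributes to h[6]: the dereference reads only the first eight bytes, so the trailing 'a' never reaches the hash state. The memcpy form below is an alignment-safe way to inspect the same value (illustrative only):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    uint64_t tag = 0;
    // Copies 'D','e','e','p','W','e','b','C' -- the ninth character is unused.
    std::memcpy(&tag, "DeepWebCa", sizeof(tag));
    std::printf("personalization word: 0x%016llx\n", (unsigned long long)tag);
    return 0;
}
```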
diff --git a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj
index 7d7843933..9efc78965 100644
--- a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj
+++ b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj
@@ -22,6 +22,8 @@
{53E62B3D-3FA6-4B53-8175-2B93753D98C4}
cuda_silentarmy_sm30
8.1
+
+
@@ -31,7 +33,7 @@
v140
- Application
+ DynamicLibrary
true
MultiByte
v140
@@ -93,7 +95,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
Level3
Disabled
- WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
+ WIN32;WIN64;_LIB;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
@@ -151,6 +153,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
64
compute_30,sm_30;compute_20,sm_20
true
+ true
diff --git a/cuda_silentarmy_sm30/kernel.cu b/cuda_silentarmy_sm30/kernel.cu
index 0434f6a52..459b94b3a 100644
--- a/cuda_silentarmy_sm30/kernel.cu
+++ b/cuda_silentarmy_sm30/kernel.cu
@@ -63,7 +63,7 @@ void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
for (uint32_t i = 1; i <= 5; i++)
st->h[i] = blake2b_iv[i];
- st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
+ st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa";
st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
st->bytes = 0;
}
@@ -147,9 +147,9 @@ void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
#define xi_offset_for_round(round) (8 + ((round) / 2) * 4)
-constexpr uint32_t c_NR_SLOTS = NR_SLOTS;
-constexpr uint32_t c_ROW_LEN = c_NR_SLOTS * SLOT_LEN;
-//constexpr uint32_t c_NR_ROWS = NR_ROWS;
+const uint32_t c_NR_SLOTS = NR_SLOTS;
+const uint32_t c_ROW_LEN = c_NR_SLOTS * SLOT_LEN;
+//const uint32_t c_NR_ROWS = NR_ROWS;
#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
@@ -191,14 +191,15 @@ __constant__ uint64_t blake_iv[] =
};
-__global__ void kernel_init_0()
+__global__ void kernel_init_0(int offset)
{
- rowCounter0[(blockDim.x * blockIdx.x) + threadIdx.x] = 0;
+ rowCounter0[(blockDim.x * blockIdx.x) + threadIdx.x + offset] = 0;
}
-__global__ void kernel_init_1()
+
+__global__ void kernel_init_1(int offset)
{
- rowCounter1[(blockDim.x * blockIdx.x) + threadIdx.x] = 0;
+ rowCounter1[(blockDim.x * blockIdx.x) + threadIdx.x + offset] = 0;
}
@@ -206,6 +207,136 @@ typedef uint64_t ulong;
typedef uint32_t uint;
typedef uint8_t uchar;
+#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
+#define DECODE_ROW(REF) (REF >> 16)
+#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
+#define DECODE_SLOT0(REF) (REF & 0xff)
+
+#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
+#define DECODE_ROW(REF) (REF >> 14)
+#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
+#define DECODE_SLOT0(REF) (REF & 0x7f)
+
+#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
+#define DECODE_ROW(REF) (REF >> 13)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
+#define DECODE_ROW(REF) (REF >> 12)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#else
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+
+/*
+** Access a half-aligned long, that is a long aligned on a 4-byte boundary.
+*/
+__device__ ulong half_aligned_long(ulong *p, uint offset)
+{
+ return
+ (((ulong)*(uint *)((char *)p + offset + 0)) << 0) |
+ (((ulong)*(uint *)((char *)p + offset + 4)) << 32);
+}
+
+/*
+** Access a well-aligned int.
+*/
+__device__ uint well_aligned_int(ulong *_p, uint offset)
+{
+ char *p = (char *)_p;
+ return *(uint *)(p + offset);
+}
+
+__device__ uint xor_and_store3(char* ht, uint tid, uint slot_a, uint slot_b, ulong* a, ulong* b, uint* rowCounters)
+{
+ ulong xi0, xi1, xi2, xi3;
+ char *p;
+ uint i = ENCODE_INPUTS(tid, slot_a, slot_b);
+
+ /*
+
+ (((ulong)*(uint *)((char *)p + offset + 0)) << 0) |
+ (((ulong)*(uint *)((char *)p + offset + 4)) << 32);
+
+ */
+
+ /*
+ char *p = (char *)_p;
+ return *(uint *)(p + offset);
+
+ */
+ uint16_t a0, a1, a2, a3;
+ ulong a0h, a0l, test2, test3;
+
+ asm volatile ("{\n\t"
+ //".reg .b16 a0,a1,a2,a3;\n\t"
+ "ld.global.b64 %0, [%1];\n\t"
+ //"mov.b64 {%4, %3, %2, %1}, %0;\n\t"
+// "mov.b64 %1, {a0, a1, a2, a3};\n\t"
+ "}\n\t" : "=l"(test3) : // , "=h"(a0), "=h"(a1), "=h"(a2), "=h"(a3) :
+ "l"(a));
+
+ ulong test1 = half_aligned_long(a, 0);
+
+ printf("test1 %lX | %lX | %02X %02X %02X %02X\n", test1, test3, a0, a1, a2, a3);
+
+
+ // xor 20 bytes
+ xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
+ xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8);
+ xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16);
+ xi3 = 0;
+
+ if (!xi0 && !xi1)
+ return 0;
+
+ uint row;
+
+ row = ((xi0 & 0xf0000) >> 0) |
+ ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
+ ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
+
+ xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
+ xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
+ xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
+
+ uint cnt = atomicAdd(&rowCounters[row], 1);
+ if (cnt >= c_NR_SLOTS) {
+ // avoid overflows
+ atomicSub(&rowCounters[row], 1);
+ return 1;
+ }
+
+
+
+ p = ht + row * c_ROW_LEN;
+ p += cnt * SLOT_LEN + 12;//xi_offset is 12
+ // store "i" (always 4 bytes before Xi)
+ *(uint *)(p - 4) = i;
+
+ *(uint *)(p + 0) = xi0;
+ *(ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
+ *(uint *)(p + 12) = (xi1 >> 32);
+
+ return 0;
+}
+
__device__ uint ht_store(uint round, char *ht, uint i,
ulong xi0, ulong xi1, ulong xi2, ulong xi3, uint *rowCounters)
{
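
The relocated ENCODE_INPUTS/DECODE_* macros pack a row index and two slot indices into one 32-bit reference. A self-contained round-trip check for the NR_ROWS_LOG == 20, NR_SLOTS <= 64 branch, using stand-in functions rather than the kernel's macros:

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the NR_ROWS_LOG == 20 branch: 20 bits of row, two 6-bit slots.
static uint32_t encode_inputs(uint32_t row, uint32_t slot0, uint32_t slot1) {
    return (row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f);
}
static uint32_t decode_row(uint32_t ref)   { return ref >> 12; }
static uint32_t decode_slot1(uint32_t ref) { return (ref >> 6) & 0x3f; }
static uint32_t decode_slot0(uint32_t ref) { return ref & 0x3f; }

int main() {
    const uint32_t row = 0x9abcd, slot0 = 13, slot1 = 57;   // arbitrary test values
    const uint32_t ref = encode_inputs(row, slot0, slot1);
    assert(decode_row(ref) == row);
    assert(decode_slot1(ref) == slot1);
    assert(decode_slot0(ref) == slot0);
    return 0;
}
```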
@@ -275,6 +406,16 @@ __device__ uint ht_store(uint round, char *ht, uint i,
}
else if (round == 3)
{
+ /*ulong* p1 = (ulong*)p, *p2 = (ulong*)(p + 8);
+ uint xi0l, xi0h, xi1l, xi1h;
+ asm("{\n\t"
+ "mov.b64 {%0, %1}, %6\n\t"
+ "mov.b64 {%2, %3}, %7\n\t"
+ "st.global.b64 [%4], {%2, %0}\n\t"
+ "st.global.b64 [%5], {%1, %3}\n\t"
+ "}\n\t" : "=r"(xi0l), "=r"(xi0h), "=r"(xi1l), "=r"(xi1h), "=l"(p1), "=l"(p2) :
+ "l"(xi0), "l"(xi1)
+ );*/
// store 16 bytes
*(uint *)(p + 0) = xi0;
*(ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
@@ -319,15 +460,14 @@ vc = (vc + vd); \
vb = rotate((vb ^ vc), (ulong)64 - 63);
__global__
-void kernel_round0(char *ht, uint32_t inputs_per_thread)
+void kernel_round0(char *ht, uint32_t inputs_per_thread, int offset)
{
typedef uint64_t ulong;
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
uint64_t v[16];
- //uint32_t inputs_per_thread = c_NR_ROWS / (gridDim.x * blockDim.x);
- uint32_t input = tid * inputs_per_thread;
- uint32_t input_end = (tid + 1) * inputs_per_thread;
+ uint32_t input = (tid * inputs_per_thread) + offset;
+ uint32_t input_end = ((tid + 1) * inputs_per_thread) + offset;
uint32_t dropped = 0;
while (input < input_end) {
@@ -500,60 +640,8 @@ void kernel_round0(char *ht, uint32_t inputs_per_thread)
#endif
}
-#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
-#define DECODE_ROW(REF) (REF >> 16)
-#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
-#define DECODE_SLOT0(REF) (REF & 0xff)
-#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
-#define DECODE_ROW(REF) (REF >> 14)
-#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
-#define DECODE_SLOT0(REF) (REF & 0x7f)
-
-#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
-#define DECODE_ROW(REF) (REF >> 13)
-#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF) (REF & 0x3f)
-
-#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
-#define DECODE_ROW(REF) (REF >> 12)
-#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF) (REF & 0x3f)
-
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-
-/*
-** Access a half-aligned long, that is a long aligned on a 4-byte boundary.
-*/
-__device__ ulong half_aligned_long(ulong *p, uint offset)
-{
- return
- (((ulong)*(uint *)((char *)p + offset + 0)) << 0) |
- (((ulong)*(uint *)((char *)p + offset + 4)) << 32);
-}
-
-/*
-** Access a well-aligned int.
-*/
-__device__ uint well_aligned_int(ulong *_p, uint offset)
-{
- char *p = (char *)_p;
- return *(uint *)(p + offset);
-}
/*
** XOR a pair of Xi values computed at "round - 1" and store the result in the
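
half_aligned_long() and well_aligned_int() (moved earlier in the file so xor_and_store3() can reach them) rebuild a 64-bit Xi word from two aligned 32-bit loads, since the slot data is only guaranteed 4-byte alignment. A host-side sketch of the same stitching, assuming a little-endian machine:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Assemble a 64-bit little-endian value stored at a 4-byte-aligned offset
// from two 32-bit reads, as half_aligned_long() does on the device.
static uint64_t load_u64_as_u32_pair(const void *base, uint32_t offset) {
    uint32_t lo, hi;
    std::memcpy(&lo, (const char *)base + offset,     4);
    std::memcpy(&hi, (const char *)base + offset + 4, 4);
    return ((uint64_t)hi << 32) | lo;
}

int main() {
    unsigned char buf[12] = {0};                  // value lands at offset 4
    const uint64_t value = 0x1122334455667788ULL;
    std::memcpy(buf + 4, &value, 8);
    assert(load_u64_as_u32_pair(buf, 4) == value);
    return 0;
}
```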
@@ -569,6 +657,10 @@ __device__ uint xor_and_store(uint round, char *ht_dst, uint row,
uint slot_a, uint slot_b, ulong *a, ulong *b,
uint *rowCounters)
{
+ if (round == 3) {
+ return xor_and_store3(ht_dst, row, slot_a, slot_b, a, b, rowCounters);
+ }
+
ulong xi0, xi1, xi2;
#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
// Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not
@@ -643,22 +735,33 @@ __device__ uint xor_and_store(uint round, char *ht_dst, uint row,
xi0, xi1, xi2, 0, rowCounters);
}
-__device__ void equihash_round_cm3(uint round, char *ht_src, char *ht_dst, uint *rowCountersSrc, uint *rowCountersDst)
+
+#define addr_offset(addr, offset) ((uint*)(addr) + offset)
+
+__device__ void equihash_round(uint round, char *ht_src, char *ht_dst, uint *rowCountersSrc, uint *rowCountersDst, int offset)
{
- uint tid = blockIdx.x * blockDim.x + threadIdx.x;
+ uint tid = blockIdx.x * blockDim.x + threadIdx.x + offset;
char *p;
uint cnt;
uint i, j;
uint dropped_stor = 0;
ulong *a, *b;
+ uchar xi_offsets[] = {
+ 8, 8, 12, 12, 16, 16, 20, 20
+ };
uint xi_offset;
- xi_offset = (8 + ((round - 1) / 2) * 4);
-
+ xi_offset = xi_offsets[round - 1];
+ uint* c[8];
+
cnt = rowCountersSrc[tid];
cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round
+
if (!cnt) {// no elements in row, no collisions
return;
}
+
+ ulong da[4] = { 0 };
+
// find collisions
p = (ht_src + tid * c_ROW_LEN) + xi_offset;
for (i = 0; i < cnt; i++) {
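
The xi_offsets lookup table replaces the arithmetic `8 + ((round - 1) / 2) * 4`; both yield 8, 8, 12, 12, 16, 16, 20, 20 for rounds 1 through 8, which the snippet below verifies:

```cpp
#include <cassert>

int main() {
    const unsigned char xi_offsets[] = { 8, 8, 12, 12, 16, 16, 20, 20 };
    for (unsigned round = 1; round <= 8; round++) {
        unsigned formula = 8 + ((round - 1) / 2) * 4;   // expression used before the table
        assert(xi_offsets[round - 1] == formula);
    }
    return 0;
}
```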
@@ -670,36 +773,33 @@ __device__ void equihash_round_cm3(uint round, char *ht_src, char *ht_dst, uint
}
}
-#define KERNEL_ROUND_ODD_OLD(N) \
+#define KERNEL_ROUND_ODD(N) \
__global__ \
-void kernel_round_cm3_ ## N( char *ht_src, char *ht_dst) \
+void kernel_round ## N ## ( char *ht_src, char *ht_dst, int offset) \
{ \
- equihash_round_cm3(N, ht_src, ht_dst, rowCounter0, rowCounter1); \
+ equihash_round(N, ht_src, ht_dst, rowCounter0, rowCounter1, offset); \
}
-
-#define KERNEL_ROUND_EVEN_OLD(N) \
+#define KERNEL_ROUND_EVEN(N) \
__global__ \
-void kernel_round_cm3_ ## N(char *ht_src, char *ht_dst) \
+void kernel_round ## N ## (char *ht_src, char *ht_dst, int offset) \
{ \
- equihash_round_cm3(N, ht_src, ht_dst, rowCounter1, rowCounter0); \
+ equihash_round(N, ht_src, ht_dst, rowCounter1, rowCounter0, offset); \
}
-
-KERNEL_ROUND_ODD_OLD(1)
-KERNEL_ROUND_EVEN_OLD(2)
-KERNEL_ROUND_ODD_OLD(3)
-KERNEL_ROUND_EVEN_OLD(4)
-KERNEL_ROUND_ODD_OLD(5)
-KERNEL_ROUND_EVEN_OLD(6)
-KERNEL_ROUND_ODD_OLD(7)
-
+KERNEL_ROUND_ODD(1)
+KERNEL_ROUND_EVEN(2)
+KERNEL_ROUND_ODD(3)
+KERNEL_ROUND_EVEN(4)
+KERNEL_ROUND_ODD(5)
+KERNEL_ROUND_EVEN(6)
+KERNEL_ROUND_ODD(7)
__global__
-void kernel_round_cm3_8(char *ht_src, char *ht_dst)
+void kernel_round8(char *ht_src, char *ht_dst, int offset)
{
- uint tid = blockIdx.x * blockDim.x + threadIdx.x;
- equihash_round_cm3(8, ht_src, ht_dst, rowCounter1, rowCounter0);
+ uint tid = blockIdx.x * blockDim.x + threadIdx.x + offset;
+ equihash_round(8, ht_src, ht_dst, rowCounter1, rowCounter0, offset);
if (!tid) {
sols.nr = sols.likely_invalids = 0;
}
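
Expanded by hand, KERNEL_ROUND_ODD(1) is meant to produce a kernel_round1 wrapper that forwards to equihash_round(). The sketch below shows that expansion with stub symbols so it compiles on its own; it drops the trailing `##` before the parenthesis, which MSVC's traditional preprocessor tolerates but GCC/Clang reject as an invalid token paste:

```cpp
#include <cstdio>

// Stand-ins for the device-side symbols used by the real macro.
static unsigned rowCounter0[1], rowCounter1[1];
static void equihash_round(unsigned round, char *src, char *dst,
                           unsigned *cSrc, unsigned *cDst, int offset) {
    (void)src; (void)dst; (void)cSrc; (void)cDst;
    std::printf("round %u, offset %d\n", round, offset);
}

#define KERNEL_ROUND_ODD(N)                                                   \
    void kernel_round##N(char *ht_src, char *ht_dst, int offset) {            \
        equihash_round(N, ht_src, ht_dst, rowCounter0, rowCounter1, offset);  \
    }

KERNEL_ROUND_ODD(1)   // defines kernel_round1(char*, char*, int)

int main() { kernel_round1(nullptr, nullptr, 0); return 0; }
```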
@@ -777,11 +877,10 @@ __device__ void potential_sol(const char **htabs, uint ref0, uint ref1)
** Scan the hash tables to find Equihash solutions.
*/
__global__
-void kernel_sols(const char *ht0, const char *ht1)
+void kernel_sols(const char *ht0, const char *ht1, int offset)
{
- uint tid = blockIdx.x * blockDim.x + threadIdx.x;
+ uint tid = blockIdx.x * blockDim.x + threadIdx.x + offset;
const char *htabs[2] = { ht0, ht1 };
- //uint *hcounters[2] = { rowCounter0, rowCounter1 };
uint ht_i = (PARAM_K - 1) & 1; // table filled at last round
uint cnt;
uint xi_offset = xi_offset_for_round(PARAM_K - 1);
@@ -791,7 +890,6 @@ void kernel_sols(const char *ht0, const char *ht1)
// it's ok for the collisions array to be so small, as if it fills up
// the potential solutions are likely invalid (many duplicate inputs)
ulong collisions;
- //uint coll;
#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
// in the final hash table, we are looking for a match on both the bits
// part of the previous PREFIX colliding bits, and the last PREFIX bits.
@@ -803,7 +901,7 @@ void kernel_sols(const char *ht0, const char *ht1)
a = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN;
cnt = rowCounter0[tid];
cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round
- //coll = 0;
+ //coll = 0;
a += xi_offset;
for (i = 0; i < cnt; i++, a += SLOT_LEN) {
uint a_data = ((*(uint *)a) & mask);
@@ -822,6 +920,7 @@ exit1:
potential_sol(htabs, collisions >> 32, collisions & 0xffffffff);
}
+
static void sort_pair(uint32_t *a, uint32_t len)
{
uint32_t *b = a + len;
@@ -874,6 +973,9 @@ struct __align__(64) c_context {
uint32_t nthreads;
size_t global_ws;
+ cudaStream_t s1;
+ cudaStream_t s2;
+
c_context(const uint32_t n_threads) {
nthreads = n_threads;
}
@@ -941,6 +1043,8 @@ sa_cuda_context::sa_cuda_context(int tpb, int blocks, int id)
checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[0], HT_SIZE));
checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[1], HT_SIZE));
checkCudaErrors(cudaMallocHost(&eq->sols, sizeof(*eq->sols)));
+ checkCudaErrors(cudaStreamCreate(&eq->s1));
+ checkCudaErrors(cudaStreamCreate(&eq->s2));
checkCudaErrors(cudaDeviceSynchronize());
}
@@ -970,38 +1074,72 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
checkCudaErrors(cudaMemcpyToSymbol(blake, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice));
- constexpr uint32_t THREAD_SHIFT = 7;
- constexpr uint32_t THREAD_COUNT = 1 << THREAD_SHIFT;
- constexpr uint32_t DIM_SIZE = NR_ROWS >> THREAD_SHIFT;
-
- kernel_init_0 << < DIM_SIZE, THREAD_COUNT >> > ();
- kernel_round0<<<1024, 64>>> (miner->buf_ht[0], 16);
+ //const uint32_t THREAD_SHIFT = 8;
+ //const uint32_t THREAD_COUNT = 1 << THREAD_SHIFT;
+ const uint32_t NEW_THREAD_SHIFT = 6;
+ const uint32_t NEW_THREAD = 1 << NEW_THREAD_SHIFT;
+ const uint32_t DIM_SIZE = NR_ROWS >> (NEW_THREAD_SHIFT + 1);
+ const uint32_t HALF_SIZE = DIM_SIZE << NEW_THREAD_SHIFT;
+ //const uint32_t DIM_SIZE = NR_ROWS >> THREAD_SHIFT;
+ //const uint32_t HALF_SIZE = DIM_SIZE * 128;
+ const uint32_t ROUND_0_IPT_SHIFT = 3;
+ const uint32_t ROUND_0_IPT = 1 << ROUND_0_IPT_SHIFT;
+ const uint32_t ROUND_0_DIM = NR_ROWS >> ((NEW_THREAD_SHIFT + 1) + ROUND_0_IPT_SHIFT);
+ const uint32_t ROUND_0_THREADS = NEW_THREAD;
+
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+ kernel_round0<<<ROUND_0_DIM, ROUND_0_THREADS, 0, miner->s1>>>(miner->buf_ht[0], ROUND_0_IPT, 0);
+ kernel_round0<<<ROUND_0_DIM, ROUND_0_THREADS, 0, miner->s2>>>(miner->buf_ht[0], ROUND_0_IPT, HALF_SIZE);
if (cancelf()) return;
- kernel_init_1 << < DIM_SIZE, THREAD_COUNT >> > ();
- kernel_round_cm3_1 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+ kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+ kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+ kernel_round1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >>> (miner->buf_ht[0], miner->buf_ht[1], 0);
+ checkCudaErrors(cudaPeekAtLastError());
+ kernel_round1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >>> (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
+ checkCudaErrors(cudaPeekAtLastError());
if (cancelf()) return;
- kernel_init_0 << < DIM_SIZE, THREAD_COUNT >> > ();
- kernel_round_cm3_2 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]);
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+ kernel_round2 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0);
+ kernel_round2 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE);
if (cancelf()) return;
- kernel_init_1 << < DIM_SIZE, THREAD_COUNT >> > ();
- kernel_round_cm3_3 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+ kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+ kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+ kernel_round3 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0);
+ kernel_round3 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
+ checkCudaErrors(cudaPeekAtLastError());
if (cancelf()) return;
- kernel_init_0 << < DIM_SIZE, THREAD_COUNT >> > ();
- kernel_round_cm3_4 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]);
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+ kernel_round4 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0);
+ kernel_round4 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE);
if (cancelf()) return;
- kernel_init_1 << < DIM_SIZE, THREAD_COUNT >> > ();
- kernel_round_cm3_5 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+ kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+ kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+ kernel_round5 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0);
+ kernel_round5 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
if (cancelf()) return;
- kernel_init_0 << < DIM_SIZE, THREAD_COUNT >> > ();
- kernel_round_cm3_6 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]);
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+ kernel_round6 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0);
+ kernel_round6 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE);
if (cancelf()) return;
- kernel_init_1 << < DIM_SIZE, THREAD_COUNT >> > ();
- kernel_round_cm3_7 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+ kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+ kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+ kernel_round7 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0);
+ kernel_round7 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
if (cancelf()) return;
- kernel_init_0 << < DIM_SIZE, THREAD_COUNT >> > ();
- kernel_round_cm3_8 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]);
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+ kernel_init_0 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+ kernel_round8 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0);
+ kernel_round8 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE);
if (cancelf()) return;
- kernel_sols << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+ kernel_sols << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0);
+ kernel_sols << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
+
+ checkCudaErrors(cudaStreamSynchronize(miner->s1));
+ checkCudaErrors(cudaStreamSynchronize(miner->s2));
checkCudaErrors(cudaMemcpyFromSymbol(miner->sols, sols, sizeof(sols_t), 0, cudaMemcpyDeviceToHost));
@@ -1022,4 +1160,4 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
}
hashdonef();
-}
\ No newline at end of file
+}
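
The rewritten solve() issues every init/round/sols kernel twice, once per CUDA stream, with an offset selecting which half of the rows the launch covers, and only synchronizes both streams before copying the solutions back. A self-contained sketch of that host-side pattern (sizes and names are illustrative; the real code derives DIM_SIZE, NEW_THREAD and HALF_SIZE from NR_ROWS):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

__global__ void demo_clear(unsigned *counters, int offset) {
    counters[blockDim.x * blockIdx.x + threadIdx.x + offset] = 0;
}

int main() {
    const unsigned N_ROWS  = 1u << 20;
    const unsigned THREADS = 64;
    const unsigned DIM     = (N_ROWS / 2) / THREADS;   // grid for one half
    const unsigned HALF    = DIM * THREADS;            // element offset of the 2nd half

    unsigned *counters = nullptr;
    cudaMalloc((void **)&counters, N_ROWS * sizeof(unsigned));

    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);

    // The two halves may overlap on the GPU; work queued on the same stream
    // stays ordered, so a later launch on s1 sees the counters s1 cleared.
    demo_clear<<<DIM, THREADS, 0, s1>>>(counters, 0);
    demo_clear<<<DIM, THREADS, 0, s2>>>(counters, HALF);

    cudaStreamSynchronize(s1);
    cudaStreamSynchronize(s2);
    std::printf("both halves done\n");

    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
    cudaFree(counters);
    return 0;
}
```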
diff --git a/cuda_tromp/cuda_tromp.vcxproj b/cuda_tromp/cuda_tromp.vcxproj
index df673ec3e..082c3dc46 100644
--- a/cuda_tromp/cuda_tromp.vcxproj
+++ b/cuda_tromp/cuda_tromp.vcxproj
@@ -30,7 +30,9 @@
{33C2B469-F025-4223-B9B6-E69D42FEA7D6}
cuda_tromp
- $(CUDA_PATH_V7_5)
+
+
+ 8.1
@@ -44,14 +46,14 @@
false
true
MultiByte
- v140
+ v120
DynamicLibrary
false
true
MultiByte
- v140
+ v120
@@ -125,7 +127,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
64
- compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;
+ compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;
true
diff --git a/nheqminer/main.cpp b/nheqminer/main.cpp
index 1503bb454..f4b34725a 100644
--- a/nheqminer/main.cpp
+++ b/nheqminer/main.cpp
@@ -265,7 +265,7 @@ int main(int argc, char* argv[])
int cuda_device_count = 0;
int cuda_bc = 0;
int cuda_tbpc = 0;
- int opencl_platform = 0;
+ int opencl_platform = -1;
int opencl_device_count = 0;
int force_cpu_ext = -1;
int opencl_t = 0;
@@ -362,6 +362,9 @@ int main(int argc, char* argv[])
}
}
break;
+ case 'p':
+ opencl_platform = atoi(argv[++i]);
+ break;
case 't':
while (opencl_t < 8 && i + 1 < argc)
{
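
The new `-p` switch fills opencl_platform, whose default is now -1, i.e. "enumerate devices on every platform". A tiny stand-alone sketch of that convention (argument handling simplified compared to the real parser):

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main(int argc, char *argv[]) {
    int opencl_platform = -1;   // -1: scan all OpenCL platforms
    for (int i = 1; i < argc; i++) {
        if (!std::strcmp(argv[i], "-p") && i + 1 < argc)
            opencl_platform = std::atoi(argv[++i]);
    }
    if (opencl_platform < 0)
        std::printf("scanning all OpenCL platforms\n");
    else
        std::printf("restricting to OpenCL platform %d\n", opencl_platform);
    return 0;
}
```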
diff --git a/nheqminer/nheqminer.vcxproj b/nheqminer/nheqminer.vcxproj
index a15fe80a9..627cc3f5c 100644
--- a/nheqminer/nheqminer.vcxproj
+++ b/nheqminer/nheqminer.vcxproj
@@ -14,6 +14,7 @@
{6FF7D209-05A3-4550-93CC-211D33503719}
Win32Proj
nheqminer
+ 8.1
@@ -25,7 +26,7 @@
Application
false
- v140
+ v120
true
MultiByte
diff --git a/ocl_device_utils/ocl_device_utils.cpp b/ocl_device_utils/ocl_device_utils.cpp
index d3d3d9269..5aceec21f 100644
--- a/ocl_device_utils/ocl_device_utils.cpp
+++ b/ocl_device_utils/ocl_device_utils.cpp
@@ -1,5 +1,5 @@
#include "ocl_device_utils.h"
-
+#include "opencl.h"
#include
#include
#include
@@ -14,26 +14,6 @@ std::vector<std::string> ocl_device_utils::_platformNames;
std::vector<PrintInfo> ocl_device_utils::_devicesPlatformsDevices;
std::vector<cl::Device> ocl_device_utils::_AllDevices;
-static std::vector<cl_device_id> GetAllDevices()
-{
- std::vector<cl_device_id> retval;
- retval.reserve(8);
-
- cl_platform_id platforms[64];
- cl_uint numPlatforms;
- cl_int rc = clGetPlatformIDs(sizeof(platforms) / sizeof(cl_platform_id), platforms, &numPlatforms);
-
- for (cl_uint i = 0; i < numPlatforms; i++) {
- cl_uint numDevices = 0;
- cl_device_id devices[64];
- rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
- for (cl_uint n = 0; n < numDevices; n++) {
- retval.push_back(devices[n]);
- }
- }
-
- return retval;
-}
vector<cl::Platform> ocl_device_utils::getPlatforms() {
vector<cl::Platform> platforms;
@@ -59,7 +39,10 @@ void ocl_device_utils::print_opencl_devices() {
vector<cl::Device> ocl_device_utils::getDevices(vector<cl::Platform> const& _platforms, unsigned _platformId) {
vector<cl::Device> devices;
try {
- _platforms[_platformId].getDevices(/*CL_DEVICE_TYPE_CPU| */CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, &devices);
+ auto cl_devices = GetAllDevices();
+ for (auto& cl_device : cl_devices) {
+ devices.push_back({ cl_device });
+ }
}
catch (Error const& err) {
// if simply no devices found return empty vector
@@ -79,60 +62,39 @@ bool ocl_device_utils::QueryDevices() {
try {
auto devices = GetAllDevices();
- for (auto& device : devices) {
- _AllDevices.emplace_back(cl::Device(device));
- }
-
-
-
-
- // get platforms
- auto platforms = getPlatforms();
- if (platforms.empty()) {
- cout << "No OpenCL platforms found" << endl;
- return false;
- }
- else {
- for (auto i_pId = 0u; i_pId < platforms.size(); ++i_pId) {
- string platformName = StringnNullTerminatorFix(platforms[i_pId].getInfo<CL_PLATFORM_NAME>());
- if (std::find(_platformNames.begin(), _platformNames.end(), platformName) == _platformNames.end()) {
- PrintInfo current;
- _platformNames.push_back(platformName);
- // new
- current.PlatformName = platformName;
- current.PlatformNum = i_pId;
-
- auto clDevs = getDevices(platforms, i_pId);
- for (auto i_devId = 0u; i_devId < clDevs.size(); ++i_devId) {
- OpenCLDevice curDevice;
- curDevice.DeviceID = i_devId;
- curDevice._CL_DEVICE_NAME = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_NAME>());
- switch (clDevs[i_devId].getInfo<CL_DEVICE_TYPE>()) {
- case CL_DEVICE_TYPE_CPU:
- curDevice._CL_DEVICE_TYPE = "CPU";
- break;
- case CL_DEVICE_TYPE_GPU:
- curDevice._CL_DEVICE_TYPE = "GPU";
- break;
- case CL_DEVICE_TYPE_ACCELERATOR:
- curDevice._CL_DEVICE_TYPE = "ACCELERATOR";
- break;
- default:
- curDevice._CL_DEVICE_TYPE = "DEFAULT";
- break;
- }
-
-
- curDevice._CL_DEVICE_GLOBAL_MEM_SIZE = clDevs[i_devId].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
- curDevice._CL_DEVICE_VENDOR = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_VENDOR>());
- curDevice._CL_DEVICE_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_VERSION>());
- curDevice._CL_DRIVER_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DRIVER_VERSION>());
-
- current.Devices.push_back(curDevice);
- }
- _devicesPlatformsDevices.push_back(current);
- }
+ unsigned int device_num = 0;
+ for (auto& cldevice : devices) {
+ cl::Device device(cldevice);
+ cl::Platform platform(device.getInfo<CL_DEVICE_PLATFORM>());
+ _AllDevices.emplace_back(device);
+ PrintInfo current;
+ current.PlatformName = StringnNullTerminatorFix(platform.getInfo<CL_PLATFORM_NAME>());
+ current.PlatformNum = 0;
+ OpenCLDevice curDevice;
+ curDevice.DeviceID = device_num++;
+ curDevice._CL_DEVICE_NAME = StringnNullTerminatorFix(device.getInfo<CL_DEVICE_NAME>());
+
+ switch (device.getInfo<CL_DEVICE_TYPE>()) {
+ case CL_DEVICE_TYPE_CPU:
+ curDevice._CL_DEVICE_TYPE = "CPU";
+ break;
+ case CL_DEVICE_TYPE_GPU:
+ curDevice._CL_DEVICE_TYPE = "GPU";
+ break;
+ case CL_DEVICE_TYPE_ACCELERATOR:
+ curDevice._CL_DEVICE_TYPE = "ACCELERATOR";
+ break;
+ default:
+ curDevice._CL_DEVICE_TYPE = "DEFAULT";
+ break;
}
+
+ curDevice._CL_DEVICE_GLOBAL_MEM_SIZE = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
+ curDevice._CL_DEVICE_VENDOR = StringnNullTerminatorFix(device.getInfo<CL_DEVICE_VENDOR>());
+ curDevice._CL_DEVICE_VERSION = StringnNullTerminatorFix(device.getInfo<CL_DEVICE_VERSION>());
+ curDevice._CL_DRIVER_VERSION = StringnNullTerminatorFix(device.getInfo<CL_DRIVER_VERSION>());
+ current.Devices.push_back(curDevice);
+ _devicesPlatformsDevices.push_back(current);
}
}
catch (exception &ex) {
@@ -150,7 +112,7 @@ int ocl_device_utils::GetCountForPlatform(int platformID) {
for (const auto &platInfo : _devicesPlatformsDevices)
{
if (platformID == platInfo.PlatformNum) {
- return platInfo.Devices.size();
+ return (int)platInfo.Devices.size();
}
}
return 0;
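
QueryDevices() now walks the flat device list and fills one PrintInfo entry per device through the C++ wrapper's typed getInfo calls. A reduced sketch of that query pattern, independent of the miner's structs:

```cpp
#include <CL/cl.hpp>
#include <iostream>
#include <vector>

int main() {
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    for (auto &plat : platforms) {
        std::vector<cl::Device> devices;
        plat.getDevices(CL_DEVICE_TYPE_GPU, &devices);   // platforms without GPUs contribute nothing
        for (auto &dev : devices) {
            std::cout << plat.getInfo<CL_PLATFORM_NAME>() << " | "
                      << dev.getInfo<CL_DEVICE_NAME>() << " | "
                      << dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
                      << " MB" << std::endl;
        }
    }
    return 0;
}
```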
diff --git a/ocl_device_utils/ocl_device_utils.vcxproj b/ocl_device_utils/ocl_device_utils.vcxproj
index 307e3f32d..b89529a1c 100644
--- a/ocl_device_utils/ocl_device_utils.vcxproj
+++ b/ocl_device_utils/ocl_device_utils.vcxproj
@@ -24,6 +24,7 @@
{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}
Win32Proj
ocl_device_utils
+ 8.1
@@ -35,7 +36,7 @@
StaticLibrary
false
- v140
+ v120
true
Unicode
diff --git a/ocl_device_utils/opencl.cpp b/ocl_device_utils/opencl.cpp
index af5704adf..469188f1c 100644
--- a/ocl_device_utils/opencl.cpp
+++ b/ocl_device_utils/opencl.cpp
@@ -3,11 +3,15 @@
#include
#include
#include
+#include <algorithm>
+
+#include "ocl_device_utils.h"
extern cl_platform_id gPlatform;
// extern cl_program gProgram;
-std::vector<cl_device_id> GetAllDevices()
+
+/*static std::vector<cl_device_id> GetAllDevices()
{
std::vector<cl_device_id> retval;
retval.reserve(8);
@@ -19,65 +23,58 @@ std::vector GetAllDevices()
for (cl_uint i = 0; i < numPlatforms; i++) {
cl_uint numDevices = 0;
cl_device_id devices[64];
- rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
+ rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
for (cl_uint n = 0; n < numDevices; n++) {
retval.push_back(devices[n]);
}
}
+ return retval;
+}*/
+
+std::vector<cl_device_id> GetAllDevices(int platform_id)
+{
+
+ std::vector<cl_device_id> retval;
+ retval.reserve(8);
+
+ cl_platform_id platforms[64];
+ cl_uint numPlatforms;
+ cl_int rc = clGetPlatformIDs(sizeof(platforms) / sizeof(cl_platform_id), platforms, &numPlatforms);
+
+ for (cl_uint i = 0; i < numPlatforms; i++) {
+
+ if (platform_id != -1 && i != platform_id) {
+ continue;
+ }
+ cl_uint numDevices = 0;
+ cl_device_id devices[64];
+ rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
+ for (cl_uint n = 0; n < numDevices; n++) {
+ cl_device_id device = devices[n];
+ if (std::find_if(retval.begin(), retval.end(), [&device](cl_device_id p) {
+ return (p == device);
+ }) == retval.end()) {
+ retval.push_back(device);
+ }
+ }
+ }
+
return retval;
}
bool clInitialize(int requiredPlatform, std::vector<cl_device_id> &gpus)
{
- cl_platform_id platforms[64];
- cl_uint numPlatforms;
- OCLR(clGetPlatformIDs(sizeof(platforms)/sizeof(cl_platform_id), platforms, &numPlatforms), false);
- if (!numPlatforms) {
- printf(" no OpenCL platforms found\n");
- return false;
- }
-
- /*int platformIdx = -1;
- if (requiredPlatform) {
- for (decltype(numPlatforms) i = 0; i < numPlatforms; i++) {
- char name[1024] = {0};
- OCLR(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, 0), false);
- printf("found platform[%i] name = '%s'\n", (int)i, name);
- if (strcmp(name, requiredPlatform) == 0) {
- platformIdx = i;
- break;
- }
- }
- } else {
- platformIdx = 0;
- }*/
+ gpus = GetAllDevices();
- int platformIdx = requiredPlatform;
-
-
- if (platformIdx == -1) {
- printf(" platform %s not exists\n", requiredPlatform);
- return false;
- }
-
- gPlatform = platforms[platformIdx];
-
- cl_uint numDevices = 0;
- cl_device_id devices[64];
- clGetDeviceIDs(gPlatform, CL_DEVICE_TYPE_GPU, sizeof(devices)/sizeof(cl_device_id), devices, &numDevices);
- if (numDevices) {
- printf(" found %d devices\n", numDevices);
- } else {
- printf(" no OpenCL GPU devices found.\n");
- return false;
- }
+ if (gpus.empty()) {
+ printf(" no OpenCL platforms found\n");
+ return false;
+ }
- for (decltype(numDevices) i = 0; i < numDevices; i++) {
- gpus.push_back(devices[i]);
- }
-
- return true;
+ printf(" found %d devices\n", gpus.size());
+
+ return true;
}
bool clCompileKernel(cl_context gContext,
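
A usage sketch for the reworked enumeration: the default platform_id of -1 keeps the old "all platforms" behaviour, while any other value restricts the scan to that platform index (the printf formatting here is illustrative):

```cpp
#include <cstdio>
#include <vector>
#include "opencl.h"   // declares GetAllDevices(int platform_id = -1)

int main() {
    std::vector<cl_device_id> all = GetAllDevices();    // every platform
    std::vector<cl_device_id> p0  = GetAllDevices(0);   // platform index 0 only
    std::printf("%zu GPU devices total, %zu on platform 0\n", all.size(), p0.size());
    return 0;
}
```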
diff --git a/ocl_device_utils/opencl.h b/ocl_device_utils/opencl.h
index 26f01df27..fa559665b 100644
--- a/ocl_device_utils/opencl.h
+++ b/ocl_device_utils/opencl.h
@@ -115,7 +115,7 @@ class clBuffer {
};
-std::vector<cl_device_id> GetAllDevices();
+std::vector<cl_device_id> GetAllDevices(int platform_id = -1);
bool clInitialize(int requiredPlatform, std::vector<cl_device_id> &gpus);
bool clCompileKernel(cl_context gContext,
diff --git a/ocl_silentarmy/ocl_silentarmy.cpp b/ocl_silentarmy/ocl_silentarmy.cpp
index cfb641f33..c4a2cc7b0 100644
--- a/ocl_silentarmy/ocl_silentarmy.cpp
+++ b/ocl_silentarmy/ocl_silentarmy.cpp
@@ -367,7 +367,7 @@ ocl_silentarmy::ocl_silentarmy(int platf_id, int dev_id) {
}
std::string ocl_silentarmy::getdevinfo() {
- static auto devices = GetAllDevices();
+ static auto devices = GetAllDevices(platform_id);
auto device = devices[device_id];
std::vector<char> name(256, 0);
size_t nActualSize = 0;
@@ -387,7 +387,7 @@ int ocl_silentarmy::getcount() {
}
void ocl_silentarmy::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) {
- static auto devices = GetAllDevices();
+ static auto devices = GetAllDevices(platf_id);
if (devices.size() <= d_id) {
return;
@@ -420,20 +420,23 @@ void ocl_silentarmy::start(ocl_silentarmy& device_context) {
/*TODO*/
device_context.is_init_success = false;
device_context.oclc = new OclContext;
- auto devices = GetAllDevices();
+ auto devices = GetAllDevices(device_context.platform_id);
+
+ printf("pid %i, size %u\n", device_context.platform_id, devices.size());
auto device = devices[device_context.device_id];
size_t nActualSize = 0;
cl_platform_id platform_id = nullptr;
- cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, &nActualSize);
+ cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, nullptr);
+
device_context.oclc->_dev_id = device;
device_context.oclc->platform_id = platform_id;
// context create
cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)device_context.oclc->platform_id, 0 };
cl_int error;
- device_context.oclc->_context = clCreateContext(NULL, 1, &device, 0, 0, &error);
+ device_context.oclc->_context = clCreateContext(props, 1, &device, 0, 0, &error);
//OCLR(error, false);
if (cl_int err = error) {
printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj b/ocl_silentarmy/ocl_silentarmy.vcxproj
index a7440fa03..77771fcb6 100644
--- a/ocl_silentarmy/ocl_silentarmy.vcxproj
+++ b/ocl_silentarmy/ocl_silentarmy.vcxproj
@@ -27,6 +27,7 @@
{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}
Win32Proj
ocl_silentarmy
+ 8.1
@@ -38,7 +39,7 @@
StaticLibrary
false
- v140
+ v120
true
Unicode
diff --git a/ocl_silentarmy/sa_blake.cpp b/ocl_silentarmy/sa_blake.cpp
index c10800de8..5a3b321bb 100644
--- a/ocl_silentarmy/sa_blake.cpp
+++ b/ocl_silentarmy/sa_blake.cpp
@@ -39,7 +39,7 @@ void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
for (uint32_t i = 1; i <= 5; i++)
st->h[i] = blake2b_iv[i];
- st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
+ st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa";
st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
st->bytes = 0;
}
diff --git a/ocl_silentarmy/zcash/gpu/kernel.cl b/ocl_silentarmy/zcash/gpu/kernel.cl
index 1b63cbb3f..8ba3d6283 100644
--- a/ocl_silentarmy/zcash/gpu/kernel.cl
+++ b/ocl_silentarmy/zcash/gpu/kernel.cl
@@ -7,7 +7,7 @@
#define APX_NR_ELMS_LOG (PREFIX + 1)
// Number of rows and slots is affected by this. 20 offers the best performance
// but occasionally misses ~1% of solutions.
-#define NR_ROWS_LOG 18
+#define NR_ROWS_LOG 20
// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md
#define OPTIM_SIMPLIFY_ROUND 1
@@ -15,10 +15,10 @@
// Number of collision items to track, per thread
#ifdef cl_nv_pragma_unroll // NVIDIA
#define THREADS_PER_ROW 16
-#define LDS_COLL_SIZE (NR_SLOTS * 24 * (64 / THREADS_PER_ROW))
+#define LDS_COLL_SIZE (NR_SLOTS * 24 * (THRD / THREADS_PER_ROW))
#else
#define THREADS_PER_ROW 8
-#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW))
+#define LDS_COLL_SIZE (NR_SLOTS * 8 * (THRD / THREADS_PER_ROW))
#endif
// Ratio of time of sleeping before rechecking if task is done (0-1)
diff --git a/ocl_xpm/ocl_xmp.cpp b/ocl_xpm/ocl_xmp.cpp
index d0a96a2a8..4064e3626 100644
--- a/ocl_xpm/ocl_xmp.cpp
+++ b/ocl_xpm/ocl_xmp.cpp
@@ -105,7 +105,7 @@ static void setheader(blake2b_state *ctx, const char *header, const uint32_t hea
{
uint32_t le_N = WN;
uint32_t le_K = WK;
- char personal[] = "ZcashPoW01230123";
+ char personal[] = "DeepWebCa01230123";
memcpy(personal + 8, &le_N, 4);
memcpy(personal + 12, &le_K, 4);
blake2b_param P[1];
diff --git a/ocl_xpm/ocl_xpm.vcxproj b/ocl_xpm/ocl_xpm.vcxproj
index 8d3a4645f..a88544382 100644
--- a/ocl_xpm/ocl_xpm.vcxproj
+++ b/ocl_xpm/ocl_xpm.vcxproj
@@ -25,6 +25,7 @@
{5EC9EDEB-8E49-4126-9161-1560683CBC71}
Win32Proj
ocl_xpm
+ 8.1
@@ -36,7 +37,7 @@
StaticLibrary
false
- v140
+ v120
true
MultiByte