Skip to content

Commit

Permalink
More Silentarmy V4 Optimizations!!
Browse files Browse the repository at this point in the history
  • Loading branch information
maztheman committed Nov 10, 2016
1 parent a6fdc19 commit ff58f79
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 46 deletions.
41 changes: 29 additions & 12 deletions cuda_silentarmy/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ typedef struct sols_s
{
uint nr;
uint likely_invalids;
uchar valid[2000];
uint values[2000][(1 << 9)];
uchar valid[MAX_SOLS];
uint values[MAX_SOLS][(1 << PARAM_K)];
} sols_t;

__constant__ ulong blake_iv[] =
Expand Down Expand Up @@ -495,46 +495,63 @@ __device__ uint expand_ref(char *ht, uint xi_offset, uint row, uint slot)
return *(uint *)(ht + row * ((1 << (((200 / (9 + 1)) + 1) - 20)) * 6) * 32 + slot * 32 + xi_offset - 4);
}

__device__ void expand_refs(uint *ins, uint nr_inputs, char **htabs, uint round)
__device__ uint expand_refs(uint *ins, uint nr_inputs, char **htabs, uint round)
{
char *ht = htabs[round & 1];
uint i = nr_inputs - 1;
uint j = nr_inputs * 2 - 1;
uint xi_offset = (8 + ((round) / 2) * 4);
int dup_to_watch = -1;
do
{
ins[j] = expand_ref(ht, xi_offset,
(ins[i] >> 12), ((ins[i] >> 6) & 0x3f));
ins[j - 1] = expand_ref(ht, xi_offset,
(ins[i] >> 12), (ins[i] & 0x3f));
if (!round) {
if (dup_to_watch == -1) {
dup_to_watch = ins[j];
} else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) {
return 0;
}
}
if (!i)
break;
i--;
j -= 2;
} while (1);
return 1;
}

/*
** Verify if a potential solution is in fact valid.
*/
__device__ void potential_sol(char **htabs, sols_t *sols, uint ref0, uint ref1)
{
uint sol_i;
uint nr_values;
sol_i = atomicAdd(&sols->nr, 1);
if (sol_i >= 2000)
return;
sols->valid[sol_i] = 0;
uint values_tmp[(1 << PARAM_K)];
uint sol_i;
uint i;
nr_values = 0;
sols->values[sol_i][nr_values++] = ref0;
sols->values[sol_i][nr_values++] = ref1;
uint round = 9 - 1;
values_tmp[nr_values++] = ref0;
values_tmp[nr_values++] = ref1;
uint round = PARAM_K - 1;
do
{
round--;
expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
if (!expand_refs(values_tmp, nr_values, htabs, round)) {
return;
}
nr_values *= 2;
} while (round > 0);
//solution looks valid
sol_i = atomicAdd(&sols->nr, 1);
if (sol_i >= MAX_SOLS) {
return;
}
for (i = 0; i < (1 << PARAM_K); i++) {
sols->values[sol_i][i] = values_tmp[i];
}
sols->valid[sol_i] = 1;
}

Expand Down
2 changes: 1 addition & 1 deletion cuda_silentarmy/param.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
// instructions. 10 is the max supported by the hw.
#define BLAKE_WPS 10
#define MAX_SOLS 2000
#define MAX_SOLS 10

// Optional features
#undef ENABLE_DEBUG
Expand Down
2 changes: 1 addition & 1 deletion ocl_silentarmy/param.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
// instructions. 10 is the max supported by the hw.
#define BLAKE_WPS 10
#define MAX_SOLS 2000
#define MAX_SOLS 10

// Optional features
#undef ENABLE_DEBUG
Expand Down
78 changes: 46 additions & 32 deletions ocl_silentarmy/zcash/gpu/kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ typedef struct sols_s
{
uint nr;
uint likely_invalids;
uchar valid[2000];
uint values[2000][(1 << 9)];
uchar valid[10];
uint values[10][(1 << 9)];
} sols_t;
__constant ulong blake_iv[] =
{
Expand Down Expand Up @@ -516,25 +516,34 @@ uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
slot * 32 + xi_offset - 4);
}

void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs,
uint round)
{
__global char *ht = htabs[round & 1];
uint i = nr_inputs - 1;
uint j = nr_inputs * 2 - 1;
uint xi_offset = (8 + ((round) / 2) * 4);
do
{
ins[j] = expand_ref(ht, xi_offset,
(ins[i] >> 12), ((ins[i] >> 6) & 0x3f));
ins[j - 1] = expand_ref(ht, xi_offset,
(ins[i] >> 12), (ins[i] & 0x3f));
if (!i)
break ;
i--;
j -= 2;
}
int dup_to_watch = -1;
do
{
ins[j] = expand_ref(ht, xi_offset,
(ins[i] >> 12), ((ins[i] >> 6) & 0x3f));
ins[j - 1] = expand_ref(ht, xi_offset,
(ins[i] >> 12), (ins[i] & 0x3f));
if (!round) {
if (dup_to_watch == -1) {
dup_to_watch = ins[j];
} else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) {
return 0;
}
}
if (!i)
break;
i--;
j -= 2;
}
while (1);
return 1;
}

/*
Expand All @@ -543,24 +552,29 @@ void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
void potential_sol(__global char **htabs, __global sols_t *sols,
uint ref0, uint ref1)
{
uint sol_i;
uint nr_values;
sol_i = atomic_inc(&sols->nr);
if (sol_i >= 2000)
return ;
sols->valid[sol_i] = 0;
nr_values = 0;
sols->values[sol_i][nr_values++] = ref0;
sols->values[sol_i][nr_values++] = ref1;
uint round = 9 - 1;
do
{
round--;
expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
nr_values *= 2;
}
while (round > 0);
sols->valid[sol_i] = 1;
uint nr_values;
uint values_tmp[(1 << 9)];
uint sol_i;
uint i;
nr_values = 0;
values_tmp[nr_values++] = ref0;
values_tmp[nr_values++] = ref1;
uint round = 9 - 1;
do
{
round--;
if (!expand_refs(values_tmp, nr_values, htabs, round)) {
return;
}
nr_values *= 2;
} while (round > 0);
sol_i = atomic_inc(&sols->nr);
if (sol_i >= 10)
return;
for (i = 0; i < (1 << 9); i++) {
sols->values[sol_i][i] = values_tmp[i];
}
sols->valid[sol_i] = 1;
}

/*
Expand Down

0 comments on commit ff58f79

Please sign in to comment.