From be7a19c8b9fbe8e8565ad1d6a9e688c6c0b588d1 Mon Sep 17 00:00:00 2001
From: maztheman <maztheman@example.com>
Date: Thu, 15 Dec 2016 10:06:19 -0700
Subject: [PATCH] some fixes and probably issues too :>

---
 cpu_tromp/cpu_tromp.vcxproj                   |   7 +-
 cpu_tromp/equi.h                              |   6 +-
 cpu_xenoncat/cpu_xenoncat.vcxproj             |   5 +-
 cuda_silentarmy/cuda_silentarmy.vcxproj       |   8 +-
 cuda_silentarmy/kernel.cu                     |  18 +-
 cuda_silentarmy/sa_blake.cpp                  |   2 +-
 .../cuda_silentarmy_sm30.vcxproj              |   7 +-
 cuda_silentarmy_sm30/kernel.cu                | 370 ++++++++++++------
 cuda_tromp/cuda_tromp.vcxproj                 |  10 +-
 nheqminer/main.cpp                            |   5 +-
 nheqminer/nheqminer.vcxproj                   |   3 +-
 ocl_device_utils/ocl_device_utils.cpp         | 114 ++----
 ocl_device_utils/ocl_device_utils.vcxproj     |   3 +-
 ocl_device_utils/opencl.cpp                   |  93 +++--
 ocl_device_utils/opencl.h                     |   2 +-
 ocl_silentarmy/ocl_silentarmy.cpp             |  13 +-
 ocl_silentarmy/ocl_silentarmy.vcxproj         |   3 +-
 ocl_silentarmy/sa_blake.cpp                   |   2 +-
 ocl_silentarmy/zcash/gpu/kernel.cl            |   6 +-
 ocl_xpm/ocl_xmp.cpp                           |   2 +-
 ocl_xpm/ocl_xpm.vcxproj                       |   3 +-
 21 files changed, 399 insertions(+), 283 deletions(-)
diff --git a/cpu_tromp/cpu_tromp.vcxproj b/cpu_tromp/cpu_tromp.vcxproj
index f3156f386..50a3afad8 100644
--- a/cpu_tromp/cpu_tromp.vcxproj
+++ b/cpu_tromp/cpu_tromp.vcxproj
@@ -18,6 +18,7 @@
     <ProjectGuid>{6C180164-4DBE-45D7-85E0-7BDFACF3FC7B}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>cpu_tromp</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@@ -29,14 +30,14 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseSSE2|x64'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
@@ -93,7 +94,7 @@
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;CPU_TROMP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
       <DisableSpecificWarnings>4244;4334</DisableSpecificWarnings>
       <AdditionalIncludeDirectories>..\3rdparty\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     </ClCompile>
diff --git a/cpu_tromp/equi.h b/cpu_tromp/equi.h
index 2b10ce7b2..b9237c359 100644
--- a/cpu_tromp/equi.h
+++ b/cpu_tromp/equi.h
@@ -49,7 +49,7 @@ typedef u32 proof[PROOFSIZE];
 void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen) {
   uint32_t le_N = WN;
   uint32_t le_K = WK;
-  uchar personal[] = "ZcashPoW01230123";
+  uchar personal[] = "DeepWebCa01230123";
   memcpy(personal+8,  &le_N, 4);
   memcpy(personal+12, &le_K, 4);
   blake2b_param P[1];
@@ -73,7 +73,7 @@ enum verify_code { POW_OK, POW_DUPLICATE, POW_OUT_OF_ORDER, POW_NONZERO_XOR };
 const char *errstr[] = { "OK", "duplicate index", "indices out of order", "nonzero xor" };
 
 void genhash(blake2b_state *ctx, u32 idx, uchar *hash) {
-	constexpr int hash_size = WN / 8;
+  const int hash_size = WN / 8;
   blake2b_state state = *ctx;
   u32 leb = (idx / HASHESPERBLAKE);
   blake2b_update(&state, (uchar *)&leb, sizeof(u32));
@@ -83,7 +83,7 @@ void genhash(blake2b_state *ctx, u32 idx, uchar *hash) {
 }
 
 int verifyrec(blake2b_state *ctx, u32 *indices, uchar *hash, int r) {
-  constexpr int hash_size = WN / 8;
+  const int hash_size = WN / 8;
   if (r == 0) {
     genhash(ctx, *indices, hash);
     return POW_OK;
diff --git a/cpu_xenoncat/cpu_xenoncat.vcxproj b/cpu_xenoncat/cpu_xenoncat.vcxproj
index 7d761baa1..96d3a181f 100644
--- a/cpu_xenoncat/cpu_xenoncat.vcxproj
+++ b/cpu_xenoncat/cpu_xenoncat.vcxproj
@@ -14,6 +14,7 @@
     <ProjectGuid>{299E011B-5242-4EDA-B2F2-73C9B48F12FD}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>cpu_xenoncat</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@@ -25,7 +26,7 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
@@ -77,7 +78,7 @@ asm\fasm.exe asm\xenoncat_AVX2.asm asm\xenoncatavx2.obj</Command>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
-      <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
diff --git a/cuda_silentarmy/cuda_silentarmy.vcxproj b/cuda_silentarmy/cuda_silentarmy.vcxproj
index 609e5e393..76f286295 100644
--- a/cuda_silentarmy/cuda_silentarmy.vcxproj
+++ b/cuda_silentarmy/cuda_silentarmy.vcxproj
@@ -22,6 +22,8 @@
     <ProjectGuid>{76AC1E54-C6AC-465B-AF41-416B2C3874C1}</ProjectGuid>
     <RootNamespace>cuda_silentarmy</RootNamespace>
     <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
+    <CudaToolkitCustomDir>
+    </CudaToolkitCustomDir>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
@@ -48,7 +50,7 @@
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
@@ -142,7 +144,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <PreprocessorDefinitions>WIN32;WIN64;_LIB;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
@@ -157,7 +159,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
     </PostBuildEvent>
     <CudaCompile>
       <TargetMachinePlatform>64</TargetMachinePlatform>
-      <CodeGeneration>compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;</CodeGeneration>
+      <CodeGeneration>compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;</CodeGeneration>
       <PtxAsOptionV>true</PtxAsOptionV>
       <FastMath>true</FastMath>
       <Keep>true</Keep>
diff --git a/cuda_silentarmy/kernel.cu b/cuda_silentarmy/kernel.cu
index d3ef84e37..5f773a9a9 100644
--- a/cuda_silentarmy/kernel.cu
+++ b/cuda_silentarmy/kernel.cu
@@ -2026,7 +2026,7 @@ __device__ void potential_sol(char **htabs, uint ref0, uint ref1)
 	sols.valid[sol_i] = 1;
 }
 
-constexpr uint c_kernel_sol_counters = 32768 * (THRD / THREADS_PER_ROW);
+const uint c_kernel_sol_counters = 32768 * (THRD / THREADS_PER_ROW);
 __device__ uint kernel_sol_counters[c_kernel_sol_counters];
 
 /*
@@ -2354,11 +2354,11 @@ checkCudaErrors(cudaDeviceSynchronize());
 
 static inline void solve_new(c_context *miner, unsigned round)
 {
-	constexpr uint32_t INIT_THREADS = 256;
-	constexpr uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / INIT_THREADS;
+	const uint32_t INIT_THREADS = 256;
+	const uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / INIT_THREADS;
 
-	constexpr uint32_t ROUND_THREADS = THRD;
-	constexpr uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS;
+	const uint32_t ROUND_THREADS = THRD;
+	const uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS;
 	static uint32_t ROUND0_DIM = select_work_size_blake() / ROUND_THREADS;
 
 	// Now on every round!!!!
@@ -2404,11 +2404,11 @@ static inline void solve_new(c_context *miner, unsigned round)
 
 static inline void solve_old(unsigned round, c_context *miner)
 {
-	constexpr uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / 256;
-	constexpr uint32_t INIT_THREADS = 256;
+	const uint32_t INIT_DIM = NR_ROWS / ROWS_PER_UINT / 256;
+	const uint32_t INIT_THREADS = 256;
 	
-	constexpr uint32_t ROUND_THREADS = THRD;
-	constexpr uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS;
+	const uint32_t ROUND_THREADS = THRD;
+	const uint32_t ROUND_DIM = NR_ROWS / ROUND_THREADS;
 	static uint32_t ROUND0_DIM = select_work_size_blake() / ROUND_THREADS;
 	
 	switch (round) {
diff --git a/cuda_silentarmy/sa_blake.cpp b/cuda_silentarmy/sa_blake.cpp
index c3d6f5b78..7d0dfa13d 100644
--- a/cuda_silentarmy/sa_blake.cpp
+++ b/cuda_silentarmy/sa_blake.cpp
@@ -39,7 +39,7 @@ void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
 	st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
 	for (uint32_t i = 1; i <= 5; i++)
 		st->h[i] = blake2b_iv[i];
-	st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
+	st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa";
 	st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
 	st->bytes = 0;
 }
diff --git a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj
index 7d7843933..9efc78965 100644
--- a/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj
+++ b/cuda_silentarmy_sm30/cuda_silentarmy_sm30.vcxproj
@@ -22,6 +22,8 @@
     <ProjectGuid>{53E62B3D-3FA6-4B53-8175-2B93753D98C4}</ProjectGuid>
     <RootNamespace>cuda_silentarmy_sm30</RootNamespace>
     <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
+    <CudaToolkitCustomDir>
+    </CudaToolkitCustomDir>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
@@ -31,7 +33,7 @@
     <PlatformToolset>v140</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>MultiByte</CharacterSet>
     <PlatformToolset>v140</PlatformToolset>
@@ -93,7 +95,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
     <ClCompile>
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;WIN64;_LIB;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
@@ -151,6 +153,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
       <TargetMachinePlatform>64</TargetMachinePlatform>
       <CodeGeneration>compute_30,sm_30;compute_20,sm_20</CodeGeneration>
       <PtxAsOptionV>true</PtxAsOptionV>
+      <Keep>true</Keep>
     </CudaCompile>
   </ItemDefinitionGroup>
   <ItemGroup>
diff --git a/cuda_silentarmy_sm30/kernel.cu b/cuda_silentarmy_sm30/kernel.cu
index 0434f6a52..459b94b3a 100644
--- a/cuda_silentarmy_sm30/kernel.cu
+++ b/cuda_silentarmy_sm30/kernel.cu
@@ -63,7 +63,7 @@ void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
 	st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
 	for (uint32_t i = 1; i <= 5; i++)
 		st->h[i] = blake2b_iv[i];
-	st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
+	st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa";
 	st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
 	st->bytes = 0;
 }
@@ -147,9 +147,9 @@ void zcash_blake2b_final(blake2b_state_t *st, uint8_t *out, uint8_t outlen)
 #define xi_offset_for_round(round)	(8 + ((round) / 2) * 4)
 
 
-constexpr uint32_t c_NR_SLOTS = NR_SLOTS;
-constexpr uint32_t c_ROW_LEN = c_NR_SLOTS * SLOT_LEN;
-//constexpr uint32_t c_NR_ROWS = NR_ROWS;
+const uint32_t c_NR_SLOTS = NR_SLOTS;
+const uint32_t c_ROW_LEN = c_NR_SLOTS * SLOT_LEN;
+//const uint32_t c_NR_ROWS = NR_ROWS;
 
 #define HT_SIZE				(NR_ROWS * NR_SLOTS * SLOT_LEN)
 
@@ -191,14 +191,15 @@ __constant__ uint64_t blake_iv[] =
 };
 
 
-__global__ void kernel_init_0()
+__global__ void kernel_init_0(int offset)
 {
-	rowCounter0[(blockDim.x * blockIdx.x) + threadIdx.x] = 0;
+	rowCounter0[(blockDim.x * blockIdx.x) + threadIdx.x + offset] = 0;
 }
 
-__global__ void kernel_init_1()
+
+__global__ void kernel_init_1(int offset)
 {
-	rowCounter1[(blockDim.x * blockIdx.x) + threadIdx.x] = 0;
+	rowCounter1[(blockDim.x * blockIdx.x) + threadIdx.x + offset] = 0;
 }
 
 
@@ -206,6 +207,136 @@ typedef uint64_t ulong;
 typedef uint32_t uint;
 typedef uint8_t uchar;
 
+#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+    ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
+#define DECODE_ROW(REF)   (REF >> 16)
+#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
+#define DECODE_SLOT0(REF) (REF & 0xff)
+
+#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+    ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
+#define DECODE_ROW(REF)   (REF >> 14)
+#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
+#define DECODE_SLOT0(REF) (REF & 0x7f)
+
+#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+    ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
+#define DECODE_ROW(REF)   (REF >> 13)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+    ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
+#define DECODE_ROW(REF)   (REF >> 12)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#else
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+
+/*
+** Access a half-aligned long, that is a long aligned on a 4-byte boundary.
+*/
+__device__ ulong half_aligned_long(ulong *p, uint offset)
+{
+	return
+		(((ulong)*(uint *)((char *)p + offset + 0)) << 0) |
+		(((ulong)*(uint *)((char *)p + offset + 4)) << 32);
+}
+
+/*
+** Access a well-aligned int.
+*/
+__device__ uint well_aligned_int(ulong *_p, uint offset)
+{
+	char *p = (char *)_p;
+	return *(uint *)(p + offset);
+}
+
+__device__ uint xor_and_store3(char* ht, uint tid, uint slot_a, uint slot_b, ulong* a, ulong* b, uint* rowCounters)
+{
+	ulong xi0, xi1, xi2, xi3;
+	char       *p;
+	uint i = ENCODE_INPUTS(tid, slot_a, slot_b);
+
+	/*
+	
+	(((ulong)*(uint *)((char *)p + offset + 0)) << 0) |
+	(((ulong)*(uint *)((char *)p + offset + 4)) << 32);
+	
+	*/
+
+	/*
+	char *p = (char *)_p;
+	return *(uint *)(p + offset);
+	
+	*/
+	uint16_t a0, a1, a2, a3;
+	ulong a0h, a0l, test2, test3;
+
+	asm volatile ("{\n\t"
+		//".reg .b16 a0,a1,a2,a3;\n\t"
+		"ld.global.b64 %0, [%1];\n\t"
+		//"mov.b64 {%4, %3, %2, %1}, %0;\n\t"
+//		"mov.b64 %1, {a0, a1, a2, a3};\n\t"
+		"}\n\t" : "=l"(test3) : // , "=h"(a0), "=h"(a1), "=h"(a2), "=h"(a3) :
+		"l"(a));
+
+	ulong test1 = half_aligned_long(a, 0);
+
+	printf("test1 %lX | %lX | %02X %02X %02X %02X\n", test1, test3, a0, a1, a2, a3);
+
+
+	// xor 20 bytes
+	xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
+	xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8);
+	xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16);
+	xi3 = 0;
+
+	if (!xi0 && !xi1)
+		return 0;
+
+	uint row;
+
+	row = ((xi0 & 0xf0000) >> 0) |
+		((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
+		((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
+
+	xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
+	xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
+	xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
+
+	uint cnt = atomicAdd(&rowCounters[row], 1);
+	if (cnt >= c_NR_SLOTS) {
+		// avoid overflows
+		atomicSub(&rowCounters[row], 1);
+		return 1;
+	}
+
+
+
+	p = ht + row * c_ROW_LEN;
+	p += cnt * SLOT_LEN + 12;//xi_offset is 12
+	// store "i" (always 4 bytes before Xi)
+	*(uint *)(p - 4) = i;
+
+	*(uint *)(p + 0) = xi0;
+	*(ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
+	*(uint *)(p + 12) = (xi1 >> 32);
+
+	return 0;
+}
+
 __device__ uint ht_store(uint round, char *ht, uint i,
 	ulong xi0, ulong xi1, ulong xi2, ulong xi3, uint *rowCounters)
 {
@@ -275,6 +406,16 @@ __device__ uint ht_store(uint round, char *ht, uint i,
 	}
 	else if (round == 3)
 	{
+		/*ulong* p1 = (ulong*)p, *p2 = (ulong*)(p + 8);
+		uint xi0l, xi0h, xi1l, xi1h;
+		asm("{\n\t"
+			"mov.b64 {%0, %1}, %6\n\t"
+			"mov.b64 {%2, %3}, %7\n\t"
+			"st.global.b64 [%4], {%2, %0}\n\t"
+			"st.global.b64 [%5], {%1, %3}\n\t"
+			"}\n\t" : "=r"(xi0l), "=r"(xi0h), "=r"(xi1l), "=r"(xi1h), "=l"(p1), "=l"(p2) :
+			"l"(xi0), "l"(xi1)
+		);*/
 		// store 16 bytes
 		*(uint *)(p + 0) = xi0;
 		*(ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
@@ -319,15 +460,14 @@ vc = (vc + vd); \
 vb = rotate((vb ^ vc), (ulong)64 - 63);
 
 __global__
-void kernel_round0(char *ht, uint32_t inputs_per_thread)
+void kernel_round0(char *ht, uint32_t inputs_per_thread, int offset)
 {
 	typedef uint64_t ulong;
 
 	uint32_t                tid = blockIdx.x * blockDim.x + threadIdx.x;
 	uint64_t				v[16];
-	//uint32_t                inputs_per_thread = c_NR_ROWS / (gridDim.x * blockDim.x);
-	uint32_t                input = tid * inputs_per_thread;
-	uint32_t                input_end = (tid + 1) * inputs_per_thread;
+	uint32_t                input = (tid * inputs_per_thread) + offset;
+	uint32_t                input_end = ((tid + 1) * inputs_per_thread) + offset;
 	uint32_t                dropped = 0;
 
 	while (input < input_end) {
@@ -500,60 +640,8 @@ void kernel_round0(char *ht, uint32_t inputs_per_thread)
 #endif
 }
 
-#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
 
-#define ENCODE_INPUTS(row, slot0, slot1) \
-    ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
-#define DECODE_ROW(REF)   (REF >> 16)
-#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
-#define DECODE_SLOT0(REF) (REF & 0xff)
 
-#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
-    ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
-#define DECODE_ROW(REF)   (REF >> 14)
-#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
-#define DECODE_SLOT0(REF) (REF & 0x7f)
-
-#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
-    ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
-#define DECODE_ROW(REF)   (REF >> 13)
-#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF) (REF & 0x3f)
-
-#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
-    ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
-#define DECODE_ROW(REF)   (REF >> 12)
-#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF) (REF & 0x3f)
-
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-
-/*
-** Access a half-aligned long, that is a long aligned on a 4-byte boundary.
-*/
-__device__ ulong half_aligned_long(ulong *p, uint offset)
-{
-	return
-		(((ulong)*(uint *)((char *)p + offset + 0)) << 0) |
-		(((ulong)*(uint *)((char *)p + offset + 4)) << 32);
-}
-
-/*
-** Access a well-aligned int.
-*/
-__device__ uint well_aligned_int(ulong *_p, uint offset)
-{
-	char *p = (char *)_p;
-	return *(uint *)(p + offset);
-}
 
 /*
 ** XOR a pair of Xi values computed at "round - 1" and store the result in the
@@ -569,6 +657,10 @@ __device__ uint xor_and_store(uint round, char *ht_dst, uint row,
 	uint slot_a, uint slot_b, ulong *a, ulong *b,
 	uint *rowCounters)
 {
+	if (round == 3) {
+		return xor_and_store3(ht_dst, row, slot_a, slot_b, a, b, rowCounters);
+	}
+
 	ulong xi0, xi1, xi2;
 #if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
 	// Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not
@@ -643,22 +735,33 @@ __device__ uint xor_and_store(uint round, char *ht_dst, uint row,
 		xi0, xi1, xi2, 0, rowCounters);
 }
 
-__device__ void equihash_round_cm3(uint round, char *ht_src, char *ht_dst, uint *rowCountersSrc, uint *rowCountersDst)
+
+#define addr_offset(addr, offset) ((uint*)(addr) + offset)
+
+__device__ void equihash_round(uint round, char *ht_src, char *ht_dst, uint *rowCountersSrc, uint *rowCountersDst, int offset)
 {
-	uint                tid = blockIdx.x * blockDim.x + threadIdx.x;
+	uint                tid = blockIdx.x * blockDim.x + threadIdx.x + offset;
 	char				*p;
 	uint				cnt;
 	uint                i, j;
 	uint				dropped_stor = 0;
 	ulong				*a, *b;
+	uchar				xi_offsets[] = {
+		8, 8, 12, 12, 16, 16, 20, 20
+	};
 	uint				xi_offset;
-	xi_offset = (8 + ((round - 1) / 2) * 4);
-
+	xi_offset = xi_offsets[round - 1];
+	uint*				c[8];
+	
 	cnt = rowCountersSrc[tid];
 	cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round
+	
 	if (!cnt) {// no elements in row, no collisions
 		return;
 	}
+
+	ulong da[4] = { 0 };
+
 	// find collisions
 	p = (ht_src + tid * c_ROW_LEN) + xi_offset;
 	for (i = 0; i < cnt; i++) {
@@ -670,36 +773,33 @@ __device__ void equihash_round_cm3(uint round, char *ht_src, char *ht_dst, uint
 	}
 }
 
-#define KERNEL_ROUND_ODD_OLD(N) \
+#define KERNEL_ROUND_ODD(N) \
 __global__  \
-void kernel_round_cm3_ ## N( char *ht_src,  char *ht_dst) \
+void kernel_round ## N ## ( char *ht_src,  char *ht_dst, int offset) \
 { \
-    equihash_round_cm3(N, ht_src, ht_dst, rowCounter0, rowCounter1); \
+    equihash_round(N, ht_src, ht_dst, rowCounter0, rowCounter1, offset); \
 }
 
-
-#define KERNEL_ROUND_EVEN_OLD(N) \
+#define KERNEL_ROUND_EVEN(N) \
 __global__  \
-void kernel_round_cm3_ ## N(char *ht_src,  char *ht_dst) \
+void kernel_round ## N ## (char *ht_src,  char *ht_dst, int offset) \
 { \
-    equihash_round_cm3(N, ht_src, ht_dst, rowCounter1, rowCounter0); \
+    equihash_round(N, ht_src, ht_dst, rowCounter1, rowCounter0, offset); \
 }
 
-
-KERNEL_ROUND_ODD_OLD(1)
-KERNEL_ROUND_EVEN_OLD(2)
-KERNEL_ROUND_ODD_OLD(3)
-KERNEL_ROUND_EVEN_OLD(4)
-KERNEL_ROUND_ODD_OLD(5)
-KERNEL_ROUND_EVEN_OLD(6)
-KERNEL_ROUND_ODD_OLD(7)
-
+KERNEL_ROUND_ODD(1)
+KERNEL_ROUND_EVEN(2)
+KERNEL_ROUND_ODD(3)
+KERNEL_ROUND_EVEN(4)
+KERNEL_ROUND_ODD(5)
+KERNEL_ROUND_EVEN(6)
+KERNEL_ROUND_ODD(7)
 
 __global__
-void kernel_round_cm3_8(char *ht_src, char *ht_dst)
+void kernel_round8(char *ht_src, char *ht_dst, int offset)
 {
-	uint tid = blockIdx.x * blockDim.x + threadIdx.x;
-	equihash_round_cm3(8, ht_src, ht_dst, rowCounter1, rowCounter0);
+	uint tid = blockIdx.x * blockDim.x + threadIdx.x + offset;
+	equihash_round(8, ht_src, ht_dst, rowCounter1, rowCounter0, offset);
 	if (!tid) {
 		sols.nr = sols.likely_invalids = 0;
 	}
@@ -777,11 +877,10 @@ __device__ void potential_sol(const char **htabs, uint ref0, uint ref1)
 ** Scan the hash tables to find Equihash solutions.
 */
 __global__
-void kernel_sols(const char *ht0, const char *ht1)
+void kernel_sols(const char *ht0, const char *ht1, int offset)
 {
-	uint		tid = blockIdx.x * blockDim.x + threadIdx.x;
+	uint		tid = blockIdx.x * blockDim.x + threadIdx.x + offset;
 	const char	*htabs[2] = { ht0, ht1 };
-	//uint	*hcounters[2] = { rowCounter0, rowCounter1 };
 	uint		ht_i = (PARAM_K - 1) & 1; // table filled at last round
 	uint		cnt;
 	uint		xi_offset = xi_offset_for_round(PARAM_K - 1);
@@ -791,7 +890,6 @@ void kernel_sols(const char *ht0, const char *ht1)
 	// it's ok for the collisions array to be so small, as if it fills up
 	// the potential solutions are likely invalid (many duplicate inputs)
 	ulong		collisions;
-	//uint		coll;
 #if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
 	// in the final hash table, we are looking for a match on both the bits
 	// part of the previous PREFIX colliding bits, and the last PREFIX bits.
@@ -803,7 +901,7 @@ void kernel_sols(const char *ht0, const char *ht1)
 	a = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN;
 	cnt = rowCounter0[tid];
 	cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round
-	//coll = 0;
+									//coll = 0;
 	a += xi_offset;
 	for (i = 0; i < cnt; i++, a += SLOT_LEN) {
 		uint a_data = ((*(uint *)a) & mask);
@@ -822,6 +920,7 @@ exit1:
 	potential_sol(htabs, collisions >> 32, collisions & 0xffffffff);
 }
 
+
 static void sort_pair(uint32_t *a, uint32_t len)
 {
 	uint32_t    *b = a + len;
@@ -874,6 +973,9 @@ struct __align__(64) c_context {
 	uint32_t nthreads;
 	size_t global_ws;
 
+	cudaStream_t s1;
+	cudaStream_t s2;
+
 	c_context(const uint32_t n_threads) {
 		nthreads = n_threads;
 	}
@@ -941,6 +1043,8 @@ sa_cuda_context::sa_cuda_context(int tpb, int blocks, int id)
 	checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[0], HT_SIZE));
 	checkCudaErrors(cudaMalloc((void**)&eq->buf_ht[1], HT_SIZE));
 	checkCudaErrors(cudaMallocHost(&eq->sols, sizeof(*eq->sols)));
+	checkCudaErrors(cudaStreamCreate(&eq->s1));
+	checkCudaErrors(cudaStreamCreate(&eq->s2));
 	checkCudaErrors(cudaDeviceSynchronize());
 }
 
@@ -970,38 +1074,72 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
 
 	checkCudaErrors(cudaMemcpyToSymbol(blake, &initialCtx, sizeof(blake2b_state_s), 0, cudaMemcpyHostToDevice));
 
-	constexpr uint32_t THREAD_SHIFT = 7;
-	constexpr uint32_t THREAD_COUNT = 1 << THREAD_SHIFT;
-	constexpr uint32_t DIM_SIZE = NR_ROWS >> THREAD_SHIFT;
-
-	kernel_init_0 << <DIM_SIZE, THREAD_COUNT >> > ();
-	kernel_round0<<<1024, 64>>> (miner->buf_ht[0], 16);
+	//const uint32_t THREAD_SHIFT = 8;
+	//const uint32_t THREAD_COUNT = 1 << THREAD_SHIFT;
+	const uint32_t NEW_THREAD_SHIFT = 6;
+	const uint32_t NEW_THREAD = 1 << NEW_THREAD_SHIFT;
+	const uint32_t DIM_SIZE = NR_ROWS >> (NEW_THREAD_SHIFT + 1);
+	const uint32_t HALF_SIZE = DIM_SIZE << NEW_THREAD_SHIFT;
+	//const uint32_t DIM_SIZE = NR_ROWS >> THREAD_SHIFT;
+	//const uint32_t HALF_SIZE = DIM_SIZE * 128;
+	const uint32_t ROUND_0_IPT_SHIFT = 3;
+	const uint32_t ROUND_0_IPT = 1 << ROUND_0_IPT_SHIFT;
+	const uint32_t ROUND_0_DIM = NR_ROWS >> ((NEW_THREAD_SHIFT + 1) + ROUND_0_IPT_SHIFT);
+	const uint32_t ROUND_0_THREADS = NEW_THREAD;
+
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+	kernel_round0<<<ROUND_0_DIM, ROUND_0_THREADS, 0, miner->s1>>>(miner->buf_ht[0], ROUND_0_IPT, 0);
+	kernel_round0<<<ROUND_0_DIM, ROUND_0_THREADS, 0, miner->s2>>>(miner->buf_ht[0], ROUND_0_IPT, HALF_SIZE);
 	if (cancelf()) return;
-	kernel_init_1 << <DIM_SIZE, THREAD_COUNT >> > ();
-	kernel_round_cm3_1 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+	kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+	kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+	kernel_round1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >>> (miner->buf_ht[0], miner->buf_ht[1], 0);
+	checkCudaErrors(cudaPeekAtLastError());
+	kernel_round1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >>> (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
+	checkCudaErrors(cudaPeekAtLastError());
 	if (cancelf()) return;
-	kernel_init_0 << <DIM_SIZE, THREAD_COUNT >> > ();
-	kernel_round_cm3_2 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]);
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+	kernel_round2 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0);
+	kernel_round2 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE);
 	if (cancelf()) return;
-	kernel_init_1 << <DIM_SIZE, THREAD_COUNT >> > ();
-	kernel_round_cm3_3 << <DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+	kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+	kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+	kernel_round3 << <DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0);
+	kernel_round3 << <DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
+	checkCudaErrors(cudaPeekAtLastError());
 	if (cancelf()) return;
-	kernel_init_0 << <DIM_SIZE, THREAD_COUNT >> > ();
-	kernel_round_cm3_4 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]);
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+	kernel_round4 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0);
+	kernel_round4 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE);
 	if (cancelf()) return;
-	kernel_init_1 << <DIM_SIZE, THREAD_COUNT >> > ();
-	kernel_round_cm3_5 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+	kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+	kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+	kernel_round5 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0);
+	kernel_round5 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
 	if (cancelf()) return;
-	kernel_init_0 << <DIM_SIZE, THREAD_COUNT >> > ();
-	kernel_round_cm3_6 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]);
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+	kernel_round6 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0);
+	kernel_round6 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE);
 	if (cancelf()) return;
-	kernel_init_1 << <DIM_SIZE, THREAD_COUNT >> > ();
-	kernel_round_cm3_7 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+	kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+	kernel_init_1 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+	kernel_round7 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0);
+	kernel_round7 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
 	if (cancelf()) return;
-	kernel_init_0 << <DIM_SIZE, THREAD_COUNT >> > ();
-	kernel_round_cm3_8 << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[1], miner->buf_ht[0]);
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (0);
+	kernel_init_0 << <DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (HALF_SIZE);
+	kernel_round8 << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[1], miner->buf_ht[0], 0);
+	kernel_round8 << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[1], miner->buf_ht[0], HALF_SIZE);
 	if (cancelf()) return;
-	kernel_sols << < DIM_SIZE, THREAD_COUNT >> > (miner->buf_ht[0], miner->buf_ht[1]);
+	kernel_sols << < DIM_SIZE, NEW_THREAD, 0, miner->s1 >> > (miner->buf_ht[0], miner->buf_ht[1], 0);
+	kernel_sols << < DIM_SIZE, NEW_THREAD, 0, miner->s2 >> > (miner->buf_ht[0], miner->buf_ht[1], HALF_SIZE);
+
+	checkCudaErrors(cudaStreamSynchronize(miner->s1));
+	checkCudaErrors(cudaStreamSynchronize(miner->s2));
 
 	checkCudaErrors(cudaMemcpyFromSymbol(miner->sols, sols, sizeof(sols_t), 0, cudaMemcpyDeviceToHost));
 
@@ -1022,4 +1160,4 @@ void sa_cuda_context::solve(const char * tequihash_header, unsigned int tequihas
 	}
 	hashdonef();
 
-}
\ No newline at end of file
+}
diff --git a/cuda_tromp/cuda_tromp.vcxproj b/cuda_tromp/cuda_tromp.vcxproj
index df673ec3e..082c3dc46 100644
--- a/cuda_tromp/cuda_tromp.vcxproj
+++ b/cuda_tromp/cuda_tromp.vcxproj
@@ -30,7 +30,9 @@
   <PropertyGroup Label="Globals">
     <ProjectGuid>{33C2B469-F025-4223-B9B6-E69D42FEA7D6}</ProjectGuid>
     <RootNamespace>cuda_tromp</RootNamespace>
-    <CudaToolkitCustomDir>$(CUDA_PATH_V7_5)</CudaToolkitCustomDir>
+    <CudaToolkitCustomDir>
+    </CudaToolkitCustomDir>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@@ -44,14 +46,14 @@
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release7.5|x64'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
@@ -125,7 +127,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
     </PostBuildEvent>
     <CudaCompile>
       <TargetMachinePlatform>64</TargetMachinePlatform>
-      <CodeGeneration>compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;</CodeGeneration>
+      <CodeGeneration>compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;</CodeGeneration>
       <PtxAsOptionV>true</PtxAsOptionV>
     </CudaCompile>
   </ItemDefinitionGroup>
diff --git a/nheqminer/main.cpp b/nheqminer/main.cpp
index 1503bb454..f4b34725a 100644
--- a/nheqminer/main.cpp
+++ b/nheqminer/main.cpp
@@ -265,7 +265,7 @@ int main(int argc, char* argv[])
 	int cuda_device_count = 0;
 	int cuda_bc = 0;
 	int cuda_tbpc = 0;
-	int opencl_platform = 0;
+	int opencl_platform = -1;
 	int opencl_device_count = 0;
 	int force_cpu_ext = -1;
 	int opencl_t = 0;
@@ -362,6 +362,9 @@ int main(int argc, char* argv[])
 					}
 				}
 				break;
+			case 'p':
+				opencl_platform = atoi(argv[++i]);
+				break;
 			case 't':
 				while (opencl_t < 8 && i + 1 < argc)
 				{
diff --git a/nheqminer/nheqminer.vcxproj b/nheqminer/nheqminer.vcxproj
index a15fe80a9..627cc3f5c 100644
--- a/nheqminer/nheqminer.vcxproj
+++ b/nheqminer/nheqminer.vcxproj
@@ -14,6 +14,7 @@
     <ProjectGuid>{6FF7D209-05A3-4550-93CC-211D33503719}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>nheqminer</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@@ -25,7 +26,7 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
diff --git a/ocl_device_utils/ocl_device_utils.cpp b/ocl_device_utils/ocl_device_utils.cpp
index d3d3d9269..5aceec21f 100644
--- a/ocl_device_utils/ocl_device_utils.cpp
+++ b/ocl_device_utils/ocl_device_utils.cpp
@@ -1,5 +1,5 @@
 #include "ocl_device_utils.h"
-
+#include "opencl.h"
 #include <iostream>
 #include <stdexcept>
 #include <utility>
@@ -14,26 +14,6 @@ std::vector<std::string> ocl_device_utils::_platformNames;
 std::vector<PrintInfo> ocl_device_utils::_devicesPlatformsDevices;
 std::vector<cl::Device> ocl_device_utils::_AllDevices;
 
-static std::vector<cl_device_id> GetAllDevices()
-{
-	std::vector<cl_device_id> retval;
-	retval.reserve(8);
-
-	cl_platform_id platforms[64];
-	cl_uint numPlatforms;
-	cl_int rc = clGetPlatformIDs(sizeof(platforms) / sizeof(cl_platform_id), platforms, &numPlatforms);
-
-	for (cl_uint i = 0; i < numPlatforms; i++) {
-		cl_uint numDevices = 0;
-		cl_device_id devices[64];
-		rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
-		for (cl_uint n = 0; n < numDevices; n++) {
-			retval.push_back(devices[n]);
-		}
-	}
-
-	return retval;
-}
 
 vector<Platform> ocl_device_utils::getPlatforms() {
 	vector<Platform> platforms;
@@ -59,7 +39,10 @@ void ocl_device_utils::print_opencl_devices() {
 vector<Device> ocl_device_utils::getDevices(vector<Platform> const& _platforms, unsigned _platformId) {
 	vector<Device> devices;
 	try {
-		_platforms[_platformId].getDevices(/*CL_DEVICE_TYPE_CPU| */CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, &devices);
+		auto cl_devices = GetAllDevices();
+		for (auto& cl_device : cl_devices) {
+			devices.push_back({ cl_device });
+		}
 	}
 	catch (Error const& err) {
 		// if simply no devices found return empty vector
@@ -79,60 +62,39 @@ bool ocl_device_utils::QueryDevices() {
 		try {
 			auto devices = GetAllDevices();
 
-			for (auto& device : devices) {
-				_AllDevices.emplace_back(cl::Device(device));
-			}
-
-			
-
-
-			// get platforms
-			auto platforms = getPlatforms();
-			if (platforms.empty()) {
-				cout << "No OpenCL platforms found" << endl;
-				return false;
-			}
-			else {
-				for (auto i_pId = 0u; i_pId < platforms.size(); ++i_pId) {
-					string platformName = StringnNullTerminatorFix(platforms[i_pId].getInfo<CL_PLATFORM_NAME>());
-					if (std::find(_platformNames.begin(), _platformNames.end(), platformName) == _platformNames.end()) {
-						PrintInfo current;
-						_platformNames.push_back(platformName);
-						// new
-						current.PlatformName = platformName;
-						current.PlatformNum = i_pId;
-
-						auto clDevs = getDevices(platforms, i_pId);
-						for (auto i_devId = 0u; i_devId < clDevs.size(); ++i_devId) {
-							OpenCLDevice curDevice;
-							curDevice.DeviceID = i_devId;
-							curDevice._CL_DEVICE_NAME = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_NAME>());
-							switch (clDevs[i_devId].getInfo<CL_DEVICE_TYPE>()) {
-							case CL_DEVICE_TYPE_CPU:
-								curDevice._CL_DEVICE_TYPE = "CPU";
-								break;
-							case CL_DEVICE_TYPE_GPU:
-								curDevice._CL_DEVICE_TYPE = "GPU";
-								break;
-							case CL_DEVICE_TYPE_ACCELERATOR:
-								curDevice._CL_DEVICE_TYPE = "ACCELERATOR";
-								break;
-							default:
-								curDevice._CL_DEVICE_TYPE = "DEFAULT";
-								break;
-							}
-
-
-							curDevice._CL_DEVICE_GLOBAL_MEM_SIZE = clDevs[i_devId].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
-							curDevice._CL_DEVICE_VENDOR = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_VENDOR>());
-							curDevice._CL_DEVICE_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DEVICE_VERSION>());
-							curDevice._CL_DRIVER_VERSION = StringnNullTerminatorFix(clDevs[i_devId].getInfo<CL_DRIVER_VERSION>());
-
-							current.Devices.push_back(curDevice);
-						}
-						_devicesPlatformsDevices.push_back(current);
-					}
+			unsigned int device_num = 0;
+			for (auto& cldevice : devices) {
+				cl::Device device(cldevice);
+				cl::Platform platform(device.getInfo<CL_DEVICE_PLATFORM>());
+				_AllDevices.emplace_back(device);
+				PrintInfo current;
+				current.PlatformName = StringnNullTerminatorFix(platform.getInfo<CL_PLATFORM_NAME>());
+				current.PlatformNum = 0;
+				OpenCLDevice curDevice;
+				curDevice.DeviceID = device_num++;
+				curDevice._CL_DEVICE_NAME = StringnNullTerminatorFix(device.getInfo<CL_DEVICE_NAME>());
+
+				switch (device.getInfo<CL_DEVICE_TYPE>()) {
+					case CL_DEVICE_TYPE_CPU:
+						curDevice._CL_DEVICE_TYPE = "CPU";
+						break;
+					case CL_DEVICE_TYPE_GPU:
+						curDevice._CL_DEVICE_TYPE = "GPU";
+						break;
+					case CL_DEVICE_TYPE_ACCELERATOR:
+						curDevice._CL_DEVICE_TYPE = "ACCELERATOR";
+						break;
+					default:
+						curDevice._CL_DEVICE_TYPE = "DEFAULT";
+						break;
 				}
+
+				curDevice._CL_DEVICE_GLOBAL_MEM_SIZE = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
+				curDevice._CL_DEVICE_VENDOR = StringnNullTerminatorFix(device.getInfo<CL_DEVICE_VENDOR>());
+				curDevice._CL_DEVICE_VERSION = StringnNullTerminatorFix(device.getInfo<CL_DEVICE_VERSION>());
+				curDevice._CL_DRIVER_VERSION = StringnNullTerminatorFix(device.getInfo<CL_DRIVER_VERSION>());
+				current.Devices.push_back(curDevice);
+				_devicesPlatformsDevices.push_back(current);
 			}
 		}
 		catch (exception &ex) {
@@ -150,7 +112,7 @@ int ocl_device_utils::GetCountForPlatform(int platformID) {
 	for (const auto &platInfo : _devicesPlatformsDevices)
 	{
 		if (platformID == platInfo.PlatformNum) {
-			return platInfo.Devices.size();
+			return (int)platInfo.Devices.size();
 		}
 	}
 	return 0;
diff --git a/ocl_device_utils/ocl_device_utils.vcxproj b/ocl_device_utils/ocl_device_utils.vcxproj
index 307e3f32d..b89529a1c 100644
--- a/ocl_device_utils/ocl_device_utils.vcxproj
+++ b/ocl_device_utils/ocl_device_utils.vcxproj
@@ -24,6 +24,7 @@
     <ProjectGuid>{5DBCE38A-C8D2-4498-A92A-9AF8D5196135}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>ocl_device_utils</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@@ -35,7 +36,7 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
diff --git a/ocl_device_utils/opencl.cpp b/ocl_device_utils/opencl.cpp
index af5704adf..469188f1c 100644
--- a/ocl_device_utils/opencl.cpp
+++ b/ocl_device_utils/opencl.cpp
@@ -3,11 +3,15 @@
 #include <vector>
 #include <memory>
 #include <stdio.h>
+#include <algorithm>
+
+#include "ocl_device_utils.h"
 
 extern cl_platform_id gPlatform;
 // extern cl_program gProgram;
 
-std::vector<cl_device_id> GetAllDevices()
+
+/*static std::vector<cl_device_id> GetAllDevices()
 {
 	std::vector<cl_device_id> retval;
 	retval.reserve(8);
@@ -19,65 +23,58 @@ std::vector<cl_device_id> GetAllDevices()
 	for (cl_uint i = 0; i < numPlatforms; i++) {
 		cl_uint numDevices = 0;
 		cl_device_id devices[64];
-		rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
+		rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
 		for (cl_uint n = 0; n < numDevices; n++) {
 			retval.push_back(devices[n]);
 		}
 	}
 
+	return retval;
+}*/
+
+std::vector<cl_device_id> GetAllDevices(int platform_id)
+{
+
+	std::vector<cl_device_id> retval;
+	retval.reserve(8);
+
+	cl_platform_id platforms[64];
+	cl_uint numPlatforms;
+	cl_int rc = clGetPlatformIDs(sizeof(platforms) / sizeof(cl_platform_id), platforms, &numPlatforms);
+
+	for (cl_uint i = 0; i < numPlatforms; i++) {
+		
+		if (platform_id != -1 && i != platform_id) {
+			continue;
+		}
+		cl_uint numDevices = 0;
+		cl_device_id devices[64];
+		rc = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, sizeof(devices) / sizeof(cl_device_id), devices, &numDevices);
+		for (cl_uint n = 0; n < numDevices; n++) {
+			cl_device_id device = devices[n];
+			if (std::find_if(retval.begin(), retval.end(), [&device](cl_device_id p) {
+				return (p == device);
+			}) == retval.end()) {
+				retval.push_back(device);
+			}
+		}
+	}
+
 	return retval;
 }
 
 bool clInitialize(int requiredPlatform, std::vector<cl_device_id> &gpus)
 {
-  cl_platform_id platforms[64];
-  cl_uint numPlatforms;
-  OCLR(clGetPlatformIDs(sizeof(platforms)/sizeof(cl_platform_id), platforms, &numPlatforms), false);
-  if (!numPlatforms) {
-    printf("<error> no OpenCL platforms found\n");
-    return false;
-  }
-  
-  /*int platformIdx = -1;
-  if (requiredPlatform) {
-    for (decltype(numPlatforms) i = 0; i < numPlatforms; i++) {
-      char name[1024] = {0};
-      OCLR(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, 0), false);
-      printf("found platform[%i] name = '%s'\n", (int)i, name);
-      if (strcmp(name, requiredPlatform) == 0) {
-        platformIdx = i;
-        break;
-      }
-    }
-  } else {
-    platformIdx = 0;
-  }*/
+	gpus = GetAllDevices();
 
-  int platformIdx = requiredPlatform;
-  
-  
-  if (platformIdx == -1) {
-    printf("<error> platform %s not exists\n", requiredPlatform);
-    return false;
-  }
-  
-  gPlatform = platforms[platformIdx];
-  
-  cl_uint numDevices = 0;
-  cl_device_id devices[64];
-  clGetDeviceIDs(gPlatform, CL_DEVICE_TYPE_GPU, sizeof(devices)/sizeof(cl_device_id), devices, &numDevices);
-  if (numDevices) {
-    printf("<info> found %d devices\n", numDevices);
-  } else {
-    printf("<error> no OpenCL GPU devices found.\n");
-    return false;
-  }
+	if (gpus.empty()) {
+		printf("<error> no OpenCL platforms found\n");
+		return false;
+	}
 
-  for (decltype(numDevices) i = 0; i < numDevices; i++) {
-    gpus.push_back(devices[i]);
-  }
-  
-  return true;
+	printf("<info> found %d devices\n", gpus.size());
+
+	return true;
 }
 
 bool clCompileKernel(cl_context gContext,
diff --git a/ocl_device_utils/opencl.h b/ocl_device_utils/opencl.h
index 26f01df27..fa559665b 100644
--- a/ocl_device_utils/opencl.h
+++ b/ocl_device_utils/opencl.h
@@ -115,7 +115,7 @@ class clBuffer {
 };
 
 
-std::vector<cl_device_id> GetAllDevices();
+std::vector<cl_device_id> GetAllDevices(int platform_id = -1);
 
 bool clInitialize(int requiredPlatform, std::vector<cl_device_id> &gpus);
 bool clCompileKernel(cl_context gContext,
diff --git a/ocl_silentarmy/ocl_silentarmy.cpp b/ocl_silentarmy/ocl_silentarmy.cpp
index cfb641f33..c4a2cc7b0 100644
--- a/ocl_silentarmy/ocl_silentarmy.cpp
+++ b/ocl_silentarmy/ocl_silentarmy.cpp
@@ -367,7 +367,7 @@ ocl_silentarmy::ocl_silentarmy(int platf_id, int dev_id) {
 }
 
 std::string ocl_silentarmy::getdevinfo() {
-	static auto devices = GetAllDevices();
+	static auto devices = GetAllDevices(platform_id);
 	auto device = devices[device_id];
 	std::vector<char> name(256, 0);
 	size_t nActualSize = 0;
@@ -387,7 +387,7 @@ int ocl_silentarmy::getcount() {
 }
 
 void ocl_silentarmy::getinfo(int platf_id, int d_id, std::string& gpu_name, int& sm_count, std::string& version) { 
-	static auto devices = GetAllDevices();
+	static auto devices = GetAllDevices(platf_id);
 
 	if (devices.size() <= d_id) {
 		return;
@@ -420,20 +420,23 @@ void ocl_silentarmy::start(ocl_silentarmy& device_context) {
 	/*TODO*/
 	device_context.is_init_success = false;
 	device_context.oclc = new OclContext;
-	auto devices = GetAllDevices();
+	auto devices = GetAllDevices(device_context.platform_id);
+
+	printf("pid %i, size %u\n", device_context.platform_id, devices.size());
 	auto device = devices[device_context.device_id];
 
 	size_t nActualSize = 0;
 	cl_platform_id platform_id = nullptr;
-	cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, &nActualSize);
+	cl_int rc = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, nullptr);
 	
+
 	device_context.oclc->_dev_id = device;
 	device_context.oclc->platform_id = platform_id;
 
 	// context create
 	cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)device_context.oclc->platform_id, 0 };
 	cl_int error;
-	device_context.oclc->_context = clCreateContext(NULL, 1, &device, 0, 0, &error);
+	device_context.oclc->_context = clCreateContext(props, 1, &device, 0, 0, &error);
 	//OCLR(error, false);
 	if (cl_int err = error) {
 		printf("OpenCL error: %d at %s:%d\n", err, __FILE__, __LINE__);
diff --git a/ocl_silentarmy/ocl_silentarmy.vcxproj b/ocl_silentarmy/ocl_silentarmy.vcxproj
index a7440fa03..77771fcb6 100644
--- a/ocl_silentarmy/ocl_silentarmy.vcxproj
+++ b/ocl_silentarmy/ocl_silentarmy.vcxproj
@@ -27,6 +27,7 @@
     <ProjectGuid>{AB01E715-795A-4089-8DF0-AE6EBDC1AB48}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>ocl_silentarmy</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@@ -38,7 +39,7 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
diff --git a/ocl_silentarmy/sa_blake.cpp b/ocl_silentarmy/sa_blake.cpp
index c10800de8..5a3b321bb 100644
--- a/ocl_silentarmy/sa_blake.cpp
+++ b/ocl_silentarmy/sa_blake.cpp
@@ -39,7 +39,7 @@ void zcash_blake2b_init(blake2b_state_t *st, uint8_t hash_len,
     st->h[0] = blake2b_iv[0] ^ (0x01010000 | hash_len);
     for (uint32_t i = 1; i <= 5; i++)
         st->h[i] = blake2b_iv[i];
-    st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"ZcashPoW";
+    st->h[6] = blake2b_iv[6] ^ *(uint64_t *)"DeepWebCa";
     st->h[7] = blake2b_iv[7] ^ (((uint64_t)k << 32) | n);
     st->bytes = 0;
 }
diff --git a/ocl_silentarmy/zcash/gpu/kernel.cl b/ocl_silentarmy/zcash/gpu/kernel.cl
index 1b63cbb3f..8ba3d6283 100644
--- a/ocl_silentarmy/zcash/gpu/kernel.cl
+++ b/ocl_silentarmy/zcash/gpu/kernel.cl
@@ -7,7 +7,7 @@
 #define APX_NR_ELMS_LOG                 (PREFIX + 1)
 // Number of rows and slots is affected by this. 20 offers the best performance
 // but occasionally misses ~1% of solutions.
-#define NR_ROWS_LOG                     18
+#define NR_ROWS_LOG                     20
 
 // Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md
 #define OPTIM_SIMPLIFY_ROUND		1
@@ -15,10 +15,10 @@
 // Number of collision items to track, per thread
 #ifdef cl_nv_pragma_unroll // NVIDIA
 #define THREADS_PER_ROW 16
-#define LDS_COLL_SIZE (NR_SLOTS * 24 * (64 / THREADS_PER_ROW))
+#define LDS_COLL_SIZE (NR_SLOTS * 24 * (THRD / THREADS_PER_ROW))
 #else
 #define THREADS_PER_ROW 8
-#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW))
+#define LDS_COLL_SIZE (NR_SLOTS * 8 * (THRD / THREADS_PER_ROW))
 #endif
 
 // Ratio of time of sleeping before rechecking if task is done (0-1)
diff --git a/ocl_xpm/ocl_xmp.cpp b/ocl_xpm/ocl_xmp.cpp
index d0a96a2a8..4064e3626 100644
--- a/ocl_xpm/ocl_xmp.cpp
+++ b/ocl_xpm/ocl_xmp.cpp
@@ -105,7 +105,7 @@ static void setheader(blake2b_state *ctx, const char *header, const uint32_t hea
 {
 	uint32_t le_N = WN;
 	uint32_t le_K = WK;
-	char personal[] = "ZcashPoW01230123";
+	char personal[] = "DeepWebCa01230123";
 	memcpy(personal + 8, &le_N, 4);
 	memcpy(personal + 12, &le_K, 4);
 	blake2b_param P[1];
diff --git a/ocl_xpm/ocl_xpm.vcxproj b/ocl_xpm/ocl_xpm.vcxproj
index 8d3a4645f..a88544382 100644
--- a/ocl_xpm/ocl_xpm.vcxproj
+++ b/ocl_xpm/ocl_xpm.vcxproj
@@ -25,6 +25,7 @@
     <ProjectGuid>{5EC9EDEB-8E49-4126-9161-1560683CBC71}</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>ocl_xpm</RootNamespace>
+    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@@ -36,7 +37,7 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>