From e0c4392445a75b867daf7a0bbb7b08f266211631 Mon Sep 17 00:00:00 2001
From: Markus Battarbee <markus.battarbee@gmail.com>
Date: Tue, 6 Feb 2024 11:34:17 +0200
Subject: [PATCH 01/10] Remove nodiscard options which are breaking compilation
 on lumi

---
 include/common.h               | 2 +-
 include/splitvector/splitvec.h | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/include/common.h b/include/common.h
index 70c792a..83abefe 100644
--- a/include/common.h
+++ b/include/common.h
@@ -50,7 +50,7 @@ constexpr inline size_t nextPow2(size_t v) noexcept {
  * @brief Computes the next optimal overflow for the hasher kernels
  */
 HASHINATOR_HOSTDEVICE
-[[nodiscard]]
+//[[nodiscard]]
 constexpr inline size_t nextOverflow(size_t currentOverflow, size_t virtualWarp) noexcept {
     size_t remainder = currentOverflow % virtualWarp;
     return ((remainder)==0)?currentOverflow: currentOverflow + (virtualWarp - remainder);
diff --git a/include/splitvector/splitvec.h b/include/splitvector/splitvec.h
index 3803fa2..10290a5 100644
--- a/include/splitvector/splitvec.h
+++ b/include/splitvector/splitvec.h
@@ -524,10 +524,10 @@ class SplitVector {
     * @brief  Returns the residency information of this
     *         Splitvector
     */
-   HOSTDEVICE
-   [[nodiscard]] inline Residency getResidency()const noexcept{
-      return _location;
-   }
+   // HOSTDEVICE
+   // [[nodiscard]] inline Residency getResidency()const noexcept{
+   //    return _location;
+   // }
 
    /**
     * @brief Copies metadata to a provided destination SplitInfo structure.

From f266573c61f7bed57528c867e9be4861cdd26817 Mon Sep 17 00:00:00 2001
From: kstppd <kpapadakis@protonmail.com>
Date: Wed, 7 Feb 2024 13:12:05 +0200
Subject: [PATCH 02/10] Reduce stream syncs in split tools

---
 include/splitvector/split_tools.h     | 23 ++-----
 meson.build                           |  2 +
 unit_tests/stream_compaction/bench.cu | 93 +++++++++++++++++++++++++++
 3 files changed, 102 insertions(+), 16 deletions(-)
 create mode 100644 unit_tests/stream_compaction/bench.cu

diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index e754965..c319826 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -622,14 +622,12 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
    uint32_t* d_counts;
    uint32_t* d_offsets;
    d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));
 
    // Phase 1 -- Calculate per warp workload
    size_t _size = input.size();
    split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
    d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));
 
    // Step 2 -- Exclusive Prefix Scan on offsets
@@ -638,17 +636,14 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
    } else {
       split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
    }
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
 
    // Step 3 -- Compaction
    uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
    split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP>
        <<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
            input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    uint32_t numel;
    SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return numel;
 }
 
@@ -659,13 +654,11 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
    uint32_t* d_counts;
    uint32_t* d_offsets;
    d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s));
 
    // Phase 1 -- Calculate per warp workload
    split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input, d_counts, rule, size);
    d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));
 
    // Step 2 -- Exclusive Prefix Scan on offsets
@@ -674,17 +667,14 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
    } else {
       split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
    }
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
 
    // Step 3 -- Compaction
    uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
    split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP>
       <<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
          input, d_counts, d_offsets, output, rule, size, nBlocks, retval);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    uint32_t numel;
    SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return numel;
 }
 
@@ -697,16 +687,13 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>
 
    uint32_t* d_counts;
    uint32_t* d_offsets;
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));
 
    // Phase 1 -- Calculate per warp workload
    size_t _size = input.size();
    split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
    d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));
 
    // Step 2 -- Exclusive Prefix Scan on offsets
@@ -715,17 +702,14 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>
    } else {
       split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
    }
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
 
    // Step 3 -- Compaction
    uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
    split::tools::split_compact_keys_raw<T, U, Rule, BLOCKSIZE, WARP>
        <<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
            input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    uint32_t numel;
    SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return numel;
 }
 
@@ -779,6 +763,7 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
    const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t);
    Cuda_mempool mPool(memory_for_pool, s);
    auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -812,6 +797,7 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
    const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t);
    Cuda_mempool mPool(memory_for_pool, s);
    auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -827,6 +813,7 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
       nBlocks += 1;
    }
    auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward<Cuda_mempool>(mPool), s);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -842,6 +829,7 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
       nBlocks += 1;
    }
    auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -859,6 +847,7 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
    assert(stack && "Invalid stack!");
    Cuda_mempool mPool(stack, max_size);
    auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -876,6 +865,7 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
    assert(stack && "Invalid stack!");
    Cuda_mempool mPool(stack, max_size);
    auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -892,6 +882,7 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t
    assert(stack && "Invalid stack!");
    Cuda_mempool mPool(stack, max_size);
    auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return len;
 }
 
diff --git a/meson.build b/meson.build
index e48b066..5c10eb6 100644
--- a/meson.build
+++ b/meson.build
@@ -24,6 +24,7 @@ compaction3_unit = executable('compaction3_test', 'unit_tests/stream_compaction/
 pointer_unit = executable('pointer_test', 'unit_tests/pointer_test/main.cu',dependencies :gtest_dep )
 hybridCPU = executable('hybrid_cpu', 'unit_tests/hybrid/main.cu',cpp_args:'-DHASHINATOR_CPU_ONLY_MODE',dependencies :gtest_dep )
 hashinator_bench = executable('bench', 'unit_tests/benchmark/main.cu', dependencies :gtest_dep,link_args:'-lnvToolsExt')
+compaction_bench = executable('streamBench', 'unit_tests/stream_compaction/bench.cu' ,link_args:'-lnvToolsExt')
 deletion_mechanism = executable('deletion', 'unit_tests/delete_by_compaction/main.cu', dependencies :gtest_dep)
 insertion_mechanism = executable('insertion', 'unit_tests/insertion_mechanism/main.cu', dependencies :gtest_dep)
 tombstoneTest = executable('tbPerf', 'unit_tests/benchmark/tbPerf.cu', dependencies :gtest_dep)
@@ -39,6 +40,7 @@ test('CompactionTest',  compaction_unit)
 test('CompactionTest2',  compaction2_unit)
 test('CompactionTest3',  compaction3_unit)
 test('HashinatorBench',  hashinator_bench)
+test('CompactionBench',  compaction_bench)
 test('Insertion',  insertion_mechanism)
 test('Deletion',  deletion_mechanism)
 test('PointerTest',  pointer_unit)
diff --git a/unit_tests/stream_compaction/bench.cu b/unit_tests/stream_compaction/bench.cu
new file mode 100644
index 0000000..74b8ba6
--- /dev/null
+++ b/unit_tests/stream_compaction/bench.cu
@@ -0,0 +1,93 @@
+#include <iostream>
+#include <stdlib.h>
+#include <chrono>
+#include <limits>
+#include <random>
+#include "../../include/splitvector/splitvec.h"
+#include "../../include/splitvector/split_tools.h"
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+#include <nvToolsExt.h>
+using namespace std::chrono;
+using type_t = uint32_t;
+using  splitvector = split::SplitVector<type_t> ;
+using  thrustvector = thrust::device_vector<type_t> ;
+constexpr int R = 100;
+#define PROFILE_START(msg)   nvtxRangePushA((msg))
+#define PROFILE_END() nvtxRangePop()
+
+template <class Fn, class ... Args>
+auto timeMe(Fn fn, Args && ... args){
+   std::chrono::time_point<std::chrono::_V2::system_clock, std::chrono::_V2::system_clock::duration> start,stop;
+   double total_time=0;
+   start = std::chrono::high_resolution_clock::now();
+   fn(args...);
+   stop = std::chrono::high_resolution_clock::now();
+   auto duration = duration_cast<microseconds>(stop- start).count();
+   total_time+=duration;
+   return total_time;
+}
+
+template <typename T>
+void fillVec(T& vec,size_t sz){
+   std::random_device dev;
+   std::mt19937 rng(dev());
+   std::uniform_int_distribution<std::mt19937::result_type> dist(0,std::numeric_limits<type_t>::max());
+   for (size_t i=0; i< sz;++i){
+      vec[i]=i;//dist(rng);
+   }
+   return;
+}
+
+
+void stream_compaction_split(splitvector& v,splitvector& output,  type_t* stack, size_t sz){
+   auto pred =[]__host__ __device__ (type_t  element)->bool{ return (element%2)==0 ;};
+   split::tools::copy_if(v.data(),output.data(),sz,pred,(void*)stack,sz);
+}
+
+void stream_compaction_thrust(thrustvector& v,thrustvector& output){
+   auto pred =[]__host__ __device__ (type_t  element)->bool{ return (element%2)==0 ;};
+   thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), pred);
+}
+
+int main(int argc, char* argv[]){
+
+
+   int sz=10;
+   if (argc>=2){
+      sz=atoi(argv[1]);
+   }
+   size_t N = 1<<sz;
+   srand(1);
+   splitvector v0(N),v0_out(N);
+   thrustvector v1(N),v1_out(N);
+   fillVec(v0,N);
+   fillVec(v1,N);
+   splitvector stack(N);
+   v0.optimizeGPU();
+   v0_out.optimizeGPU();
+   stack.optimizeGPU();
+   split_gpuDeviceSynchronize();
+
+   
+   double t_split={0};
+   double t_thrust={0};
+   for (int i =0; i<R; i++){
+
+   v0.optimizeGPU();
+   v0_out.optimizeGPU();
+   stack.optimizeGPU();
+      PROFILE_START("THRUST");
+      stream_compaction_thrust(v1,v1_out);
+      t_thrust+=timeMe(stream_compaction_thrust,v1,v1_out);
+      PROFILE_END();
+
+      PROFILE_START("SPLIT");
+      stream_compaction_split(v0,v0_out,stack.data(),N);
+      t_split+=timeMe(stream_compaction_split,v0,v0_out,stack.data(),N);
+      PROFILE_END();
+   }
+   printf("%d\t%f\t%f\n",sz,t_split/R,t_thrust/R);;
+   return 0;
+}

From 4b360ae326303e605c6ca64825f6025aefef62b0 Mon Sep 17 00:00:00 2001
From: kstppd <kpapadakis@protonmail.com>
Date: Mon, 19 Feb 2024 11:48:50 +0200
Subject: [PATCH 03/10] WIP in faster stream compactions

---
 include/splitvector/gpu_wrappers.h    | 21 +++++++++
 include/splitvector/split_tools.h     | 63 +++++++++++++++++++++++++++
 unit_tests/benchmark/main.cu          | 24 +++++-----
 unit_tests/stream_compaction/bench.cu | 53 +++++++++++++---------
 4 files changed, 129 insertions(+), 32 deletions(-)

diff --git a/include/splitvector/gpu_wrappers.h b/include/splitvector/gpu_wrappers.h
index b7d7a6e..fa00d3d 100644
--- a/include/splitvector/gpu_wrappers.h
+++ b/include/splitvector/gpu_wrappers.h
@@ -261,5 +261,26 @@ __device__ __forceinline__ T s_shuffle_down(T variable, unsigned int delta, U ma
    return __shfl_down(variable, delta);
 #endif
 }
+
+/**
+ * @brief Wrapper for performing an up register shuffle operation.
+ *
+ * @tparam T The data type of the variable.
+ * @tparam U The data type of the mask.
+ * @param variable The variable to shuffle.
+ * @param delta The offset.
+ * @param mask Voting mask.
+ * @return The shuffled variable.
+ */
+template <typename T, typename U>
+__device__ __forceinline__ T s_shuffle_up(T variable, unsigned int delta, U mask = 0) noexcept {
+   static_assert(std::is_integral<T>::value && "Only integers supported");
+#ifdef __NVCC__
+   return __shfl_up_sync(mask, variable, delta);
+#endif
+#ifdef __HIP__
+   return __shfl_up(variable, delta);
+#endif
+}
 } // namespace split
 #endif
diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index c319826..58d01c3 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -886,5 +886,68 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t
    return len;
 }
 
+template <typename T, typename Rule> 
+__global__ void blockCompact(T* input,T* output,size_t inputSize,Rule rule) 
+{
+   __shared__ uint32_t warpSums[WARPLENGTH];
+   constexpr int VIRTUALWARP = WARPLENGTH;
+   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+   const size_t wid = tid /WARPLENGTH;
+   const size_t w_tid = tid % WARPLENGTH;
+   //full warp votes for rule-> mask = [01010101010101010101010101010101] 
+   const int active=rule(input[tid]); 
+   const auto mask = split::s_warpVote(active==1,SPLIT_VOTING_MASK);
+   const auto warpCount=s_pop_count(mask);
+   if (w_tid==0){
+      warpSums[wid]= (wid==0)?0:warpCount;
+   }
+   __syncthreads();
+   //Prefix scan WarpSums on the first warp
+   if (wid==0){
+      auto value = warpSums[w_tid];
+       for (int d=1; d<32; d=2*d) {
+         int res= __shfl_up_sync(SPLIT_VOTING_MASK, value,d);
+         if (tid%32 >= d) value+= res;
+      }
+   warpSums[w_tid]=value;
+   }
+   __syncthreads();
+   auto offset= (wid==0)?0:warpSums[wid];
+   auto pp=s_pop_count(mask>>(static_cast<uint32_t>(std::min(WARPLENGTH,(int)inputSize))-w_tid));
+   const auto warpTidWriteIndex  =offset + pp;
+   if(active){
+      output[warpTidWriteIndex]=input[tid];
+   }
+}
+
+
+template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
+size_t copy_if_small(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size,
+               split_gpuStream_t s = 0) {
+
+   // Figure out Blocks to use
+   size_t _s = std::ceil((float(size)) / (float)BLOCKSIZE);
+   size_t nBlocks = nextPow2(_s);
+   if (nBlocks == 0) {
+      nBlocks += 1;
+   }
+   assert(stack && "Invalid stack!");
+   Cuda_mempool mPool(stack, max_size);
+   auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   return len;
+}
+
+template <typename T, typename Rule>
+size_t copy_if_small2(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size,
+               split_gpuStream_t s = 0) {
+   (void)stack;
+   (void)max_size;
+   // Figure out Blocks to use
+   split::tools::blockCompact<<<1,size>>>(input,output,size,rule);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   return 0;
+}
+
 } // namespace tools
 } // namespace split
diff --git a/unit_tests/benchmark/main.cu b/unit_tests/benchmark/main.cu
index f4591df..bd8bada 100644
--- a/unit_tests/benchmark/main.cu
+++ b/unit_tests/benchmark/main.cu
@@ -112,28 +112,28 @@ int main(int argc, char* argv[]){
    hmap.memAdvise(cudaMemAdviseSetAccessedBy,device);
    hmap.optimizeGPU();
    hmap.optimizeGPU();
+   double t_insert={0};
+   double t_retrieve={0};
+   double t_extract={0};
+   double t_erase={0};
+
+   for (int i =0; i<R; i++){
    vector cpu_src;
    key_vec cpu_keys;
    val_vec cpu_vals;
    generateNonDuplicatePairs(cpu_keys,cpu_vals,1<<sz);
    //std::cout<<"Generated "<<cpu_keys.size()<<" unique keys!"<<std::endl;
 
-   key_type* gpuKeys;
+
    key_vec  spare;
+   spare.resize(1<<sz);
+   key_type* gpuKeys;
    val_type* gpuVals;
    split_gpuMalloc((void **) &gpuKeys, (1<<sz)*sizeof(key_type));
    split_gpuMalloc((void **) &stack, bytes);
    split_gpuMalloc((void **) &gpuVals, (1<<sz)*sizeof(val_type));
    split_gpuMemcpy(gpuKeys,cpu_keys.data(),(1<<sz)*sizeof(key_type),split_gpuMemcpyHostToDevice);
    split_gpuMemcpy(gpuVals,cpu_vals.data(),(1<<sz)*sizeof(key_type),split_gpuMemcpyHostToDevice);
-   spare.resize(1<<sz);
-
-   double t_insert={0};
-   double t_retrieve={0};
-   double t_extract={0};
-   double t_erase={0};
-
-   for (int i =0; i<R; i++){
       hmap.optimizeGPU();
 
       PROFILE_START("insert");
@@ -157,6 +157,9 @@ int main(int argc, char* argv[]){
       PROFILE_END();
       
       hmap.clear();
+   split_gpuFree(gpuKeys);
+   split_gpuFree(gpuVals);
+   split_gpuFree(stack);
    }
    t_insert/=(float)R;
    t_retrieve/=(float)R;
@@ -164,9 +167,6 @@ int main(int argc, char* argv[]){
    t_erase/=(float)R;
 
    printf("%d %d %d %d %d\n",sz,(int)t_insert,(int)t_retrieve,(int)t_extract,(int)t_erase);
-   split_gpuFree(gpuKeys);
-   split_gpuFree(gpuVals);
-   split_gpuFree(stack);
    return 0;
 
 }
diff --git a/unit_tests/stream_compaction/bench.cu b/unit_tests/stream_compaction/bench.cu
index 74b8ba6..78e159b 100644
--- a/unit_tests/stream_compaction/bench.cu
+++ b/unit_tests/stream_compaction/bench.cu
@@ -41,11 +41,17 @@ void fillVec(T& vec,size_t sz){
 }
 
 
-void stream_compaction_split(splitvector& v,splitvector& output,  type_t* stack, size_t sz){
+void stream_compaction_split_old(splitvector& v,splitvector& output,  type_t* stack, size_t sz){
    auto pred =[]__host__ __device__ (type_t  element)->bool{ return (element%2)==0 ;};
-   split::tools::copy_if(v.data(),output.data(),sz,pred,(void*)stack,sz);
+   auto len = split::tools::copy_if_small(v.data(),output.data(),sz,pred,(void*)stack,sz);
 }
 
+
+
+void stream_compaction_split(splitvector& v,splitvector& output,  type_t* stack, size_t sz){
+   auto pred =[]__host__ __device__ (type_t  element)->bool{ return (element%2)==0 ;};
+   auto len = split::tools::copy_if_small2(v.data(),output.data(),sz,pred,(void*)stack,sz);
+}
 void stream_compaction_thrust(thrustvector& v,thrustvector& output){
    auto pred =[]__host__ __device__ (type_t  element)->bool{ return (element%2)==0 ;};
    thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), pred);
@@ -54,40 +60,47 @@ void stream_compaction_thrust(thrustvector& v,thrustvector& output){
 int main(int argc, char* argv[]){
 
 
-   int sz=10;
+
+   int sz=6;
    if (argc>=2){
       sz=atoi(argv[1]);
    }
-   size_t N = 1<<sz;
-   srand(1);
+   size_t N = 64;
    splitvector v0(N),v0_out(N);
-   thrustvector v1(N),v1_out(N);
+   srand(1);
    fillVec(v0,N);
-   fillVec(v1,N);
    splitvector stack(N);
+
    v0.optimizeGPU();
    v0_out.optimizeGPU();
    stack.optimizeGPU();
    split_gpuDeviceSynchronize();
-
    
    double t_split={0};
-   double t_thrust={0};
-   for (int i =0; i<R; i++){
-
-   v0.optimizeGPU();
-   v0_out.optimizeGPU();
-   stack.optimizeGPU();
-      PROFILE_START("THRUST");
-      stream_compaction_thrust(v1,v1_out);
-      t_thrust+=timeMe(stream_compaction_thrust,v1,v1_out);
-      PROFILE_END();
+   double t_split_old={0};
 
+   for (size_t i =0 ; i < R ; ++i){
       PROFILE_START("SPLIT");
-      stream_compaction_split(v0,v0_out,stack.data(),N);
       t_split+=timeMe(stream_compaction_split,v0,v0_out,stack.data(),N);
       PROFILE_END();
+      splitvector vv(v0_out);
+
+      PROFILE_START("SPLITOLD");
+      t_split_old+=timeMe(stream_compaction_split_old,v0,v0_out,stack.data(),N);
+      PROFILE_END();
+      assert(vv==v0_out);
    }
-   printf("%d\t%f\t%f\n",sz,t_split/R,t_thrust/R);;
+
+   printf("%d\t%f,%f\n",sz,t_split_old/R,t_split/R);
+
    return 0;
 }
+
+
+
+
+
+
+
+
+

From 73d152c433921ef30d0839497bba5e0d9e8e1b70 Mon Sep 17 00:00:00 2001
From: kstppd <kpapadakis@protonmail.com>
Date: Mon, 19 Feb 2024 12:10:22 +0200
Subject: [PATCH 04/10] Revert a few stream syncs that are actually needed

---
 include/splitvector/split_tools.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index 58d01c3..12597ad 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -644,6 +644,7 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
            input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
    uint32_t numel;
    SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return numel;
 }
 
@@ -675,6 +676,7 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
          input, d_counts, d_offsets, output, rule, size, nBlocks, retval);
    uint32_t numel;
    SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return numel;
 }
 
@@ -710,6 +712,7 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>
            input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
    uint32_t numel;
    SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return numel;
 }
 
@@ -797,7 +800,6 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
    const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t);
    Cuda_mempool mPool(memory_for_pool, s);
    auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -813,7 +815,6 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
       nBlocks += 1;
    }
    auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward<Cuda_mempool>(mPool), s);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -829,7 +830,6 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
       nBlocks += 1;
    }
    auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -847,7 +847,6 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
    assert(stack && "Invalid stack!");
    Cuda_mempool mPool(stack, max_size);
    auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -865,7 +864,6 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
    assert(stack && "Invalid stack!");
    Cuda_mempool mPool(stack, max_size);
    auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 
@@ -882,7 +880,6 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t
    assert(stack && "Invalid stack!");
    Cuda_mempool mPool(stack, max_size);
    auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return len;
 }
 

From e6f1ad3a67a27eadb926976730c9241bdd4e0711 Mon Sep 17 00:00:00 2001
From: kstppd <kpapadakis@protonmail.com>
Date: Fri, 1 Mar 2024 21:48:13 +0200
Subject: [PATCH 05/10] Block compaction with one kernel up to 1024 elements

---
 include/splitvector/split_tools.h     | 312 ++++++++++++++++++++------
 unit_tests/stream_compaction/bench.cu |  66 ++----
 2 files changed, 268 insertions(+), 110 deletions(-)

diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index 12597ad..f4614e2 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -433,6 +433,7 @@ void split_prefix_scan(split::SplitVector<T, split::split_unified_allocator<T>>&
  * Modified from (http://www-graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2) to support 64-bit uints
  * Included here as well for standalone use of splitvec outside of hashintor
  */
+__host__ __device__ 
 constexpr inline size_t nextPow2(size_t v) noexcept {
    v--;
    v |= v >> 1;
@@ -612,6 +613,240 @@ void split_prefix_scan_raw(T* input, T* output, Cuda_mempool& mPool, const size_
    }
 }
 
+
+
+#ifdef __NVCC__
+template <typename T, typename Rule> 
+__global__ void block_compact(T* input,T* output,size_t inputSize,Rule rule,uint32_t *retval) 
+{
+   __shared__ uint32_t warpSums[WARPLENGTH];
+   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+   const size_t wid = tid /WARPLENGTH;
+   const size_t w_tid = tid % WARPLENGTH;
+   //full warp votes for rule-> mask = [01010101010101010101010101010101] 
+   const int active=(tid<inputSize)?rule(input[tid]):false; 
+   const auto mask = split::s_warpVote(active==1,SPLIT_VOTING_MASK);
+   const auto warpCount=s_pop_count(mask);
+   if (w_tid==0){
+      warpSums[wid]=warpCount;
+   }
+   __syncthreads();
+   //Figure out the total here because we overwrite shared mem later
+   if (wid==0){
+      int activeWARPS=nextPow2( static_cast<int>( ceilf(static_cast<float>(inputSize)/WARPLENGTH) ));
+      auto reduceCounts=[activeWARPS](int localCount)->int{
+         for (int i = activeWARPS / 2; i > 0; i = i / 2) {
+            localCount += split::s_shuffle_down(localCount, i, SPLIT_VOTING_MASK);
+         }
+         return localCount;
+      };
+      auto localCount=warpSums[w_tid];
+      int totalCount = reduceCounts(localCount);
+      if (w_tid==0){
+         *retval=(uint32_t)(totalCount);
+      }
+   }
+   //Prefix scan WarpSums on the first warp
+   if (wid==0){
+      auto value = warpSums[w_tid];
+       for (int d=1; d<32; d=2*d) {
+         int res= __shfl_up_sync(SPLIT_VOTING_MASK, value,d);
+         if (tid%32 >= d) value+= res;
+      }
+   warpSums[w_tid]=value;
+   }
+   __syncthreads();
+   auto offset= (wid==0)?0:warpSums[wid-1];
+   auto pp=s_pop_count(mask&((1<<w_tid) -1 ));
+   const auto warpTidWriteIndex  =offset + pp;
+   if(active){
+      output[warpTidWriteIndex]=input[tid];
+   }
+}
+
+//Single kernel block compaction. Only works for powers of 2 inputs
+template <typename T,typename U, typename Rule> 
+__global__ void block_compact_keys(T* input,U* output,size_t inputSize,Rule rule,uint32_t *retval) 
+{
+   __shared__ uint32_t warpSums[WARPLENGTH];
+   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+   const size_t wid = tid /WARPLENGTH;
+   const size_t w_tid = tid % WARPLENGTH;
+   //full warp votes for rule-> mask = [01010101010101010101010101010101] 
+   const int active=rule((input[tid])); 
+   const auto mask = split::s_warpVote(active==1,SPLIT_VOTING_MASK);
+   const auto warpCount=s_pop_count(mask);
+   if (w_tid==0){
+      warpSums[wid]=warpCount;
+   }
+   __syncthreads();
+   //Figure out the total here because we overwrite shared mem later
+   if (wid==0){
+      int activeWARPS=nextPow2( static_cast<int>( ceilf(static_cast<float>(inputSize)/WARPLENGTH) ));
+      auto reduceCounts=[activeWARPS](int localCount)->int{
+         for (int i = activeWARPS / 2; i > 0; i = i / 2) {
+            localCount += split::s_shuffle_down(localCount, i, SPLIT_VOTING_MASK);
+         }
+         return localCount;
+      };
+      auto localCount=warpSums[w_tid];
+      int totalCount = reduceCounts(localCount);
+      if (w_tid==0){
+         *retval=(uint32_t)(totalCount);
+      }
+   }
+   //Prefix scan WarpSums on the first warp
+   if (wid==0){
+      auto value = warpSums[w_tid];
+       for (int d=1; d<32; d=2*d) {
+         int res= __shfl_up_sync(SPLIT_VOTING_MASK, value,d);
+         if (tid%32 >= d) value+= res;
+      }
+   warpSums[w_tid]=value;
+   }
+   __syncthreads();
+   auto offset=(wid==0)?0:warpSums[wid-1];
+   auto pp=s_pop_count(mask&((1<<w_tid) -1 ));
+   const auto warpTidWriteIndex  =offset + pp;
+   if(active){
+      output[warpTidWriteIndex]=input[tid].first;
+   }
+}
+#endif
+
+#ifdef __HIP__
+template <typename T, typename Rule> 
+__global__ void block_compact(T* input,T* output,size_t inputSize,Rule rule,uint32_t *retval) 
+{
+   __shared__ uint32_t warpSums[WARPLENGTH/2];
+   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+   const size_t wid = tid /WARPLENGTH;
+   const size_t w_tid = tid % WARPLENGTH;
+   //full warp votes for rule-> mask = [01010101010101010101010101010101] 
+   const int active=(tid<inputSize)?rule(input[tid]):false; 
+   const auto mask = split::s_warpVote(active==1,SPLIT_VOTING_MASK);
+   const auto warpCount=s_pop_count(mask);
+   if (w_tid==0){
+      warpSums[wid]=warpCount;
+   }
+   __syncthreads();
+   //Figure out the total here because we overwrite shared mem later
+   if (wid==0){
+      int activeWARPS=nextPow2( static_cast<int>( ceilf(static_cast<float>(inputSize)/WARPLENGTH) ));
+      auto reduceCounts=[activeWARPS](int localCount)->int{
+         for (int i = activeWARPS / 2; i > 0; i = i / 2) {
+            localCount += split::s_shuffle_down(localCount, i, SPLIT_VOTING_MASK);
+         }
+         return localCount;
+      };
+      auto localCount=warpSums[w_tid];
+      int totalCount = reduceCounts(localCount);
+      if (w_tid==0){
+         *retval=(uint32_t)(totalCount);
+      }
+   }
+   //Prefix scan WarpSums on the first warp
+   if (wid==0){
+      auto value = warpSums[w_tid];
+       for (int d=1; d<16; d=2*d) {
+         int res = split::s_shuffle_up(value,d,SPLIT_VOTING_MASK); 
+         if (tid%16 >= d) value+= res;
+      }
+   warpSums[w_tid]=value;
+   }
+   __syncthreads();
+   auto offset= (wid==0)?0:warpSums[wid-1];
+   auto pp=s_pop_count(mask&((1ul<<w_tid) - 1ul ));
+   const auto warpTidWriteIndex  =offset + pp;
+   if(active){
+      output[warpTidWriteIndex]=input[tid];
+   }
+}
+
+template <typename T,typename U,  typename Rule> 
+__global__ void block_compact_keys(T* input,U* output,size_t inputSize,Rule rule,uint32_t *retval) 
+{
+   __shared__ uint32_t warpSums[WARPLENGTH/2];
+   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+   const size_t wid = tid /WARPLENGTH;
+   const size_t w_tid = tid % WARPLENGTH;
+   //full warp votes for rule-> mask = [01010101010101010101010101010101] 
+   const int active=(tid<inputSize)?rule(input[tid]):false; 
+   const auto mask = split::s_warpVote(active==1,SPLIT_VOTING_MASK);
+   const auto warpCount=s_pop_count(mask);
+   if (w_tid==0){
+      warpSums[wid]=warpCount;
+   }
+   __syncthreads();
+   //Figure out the total here because we overwrite shared mem later
+   if (wid==0){
+      int activeWARPS=nextPow2( static_cast<int>( ceilf(static_cast<float>(inputSize)/WARPLENGTH) ));
+      auto reduceCounts=[activeWARPS](int localCount)->int{
+         for (int i = activeWARPS / 2; i > 0; i = i / 2) {
+            localCount += split::s_shuffle_down(localCount, i, SPLIT_VOTING_MASK);
+         }
+         return localCount;
+      };
+      auto localCount=warpSums[w_tid];
+      int totalCount = reduceCounts(localCount);
+      if (w_tid==0){
+         *retval=(uint32_t)(totalCount);
+      }
+   }
+   //Prefix scan WarpSums on the first warp
+   if (wid==0){
+      auto value = warpSums[w_tid];
+       for (int d=1; d<16; d=2*d) {
+         int res = split::s_shuffle_up(value,d,SPLIT_VOTING_MASK); 
+         if (tid%16 >= d) value+= res;
+      }
+   warpSums[w_tid]=value;
+   }
+   __syncthreads();
+   auto offset= (wid==0)?0:warpSums[wid-1];
+   auto pp=s_pop_count(mask&((1ul<<w_tid) - 1ul ));
+   const auto warpTidWriteIndex  =offset + pp;
+   if(active){
+      output[warpTidWriteIndex]=input[tid].first;
+   }
+}
+#endif
+
+template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
+size_t copy_if_block(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size,
+               split_gpuStream_t s = 0) {
+   assert(stack && "Invalid stack!");
+   Cuda_mempool mPool(stack, max_size);
+   uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t));
+   split::tools::block_compact<<<1,std::min(BLOCKSIZE,nextPow2(size))>>>(input,output,size,rule,dlen);
+   uint32_t len=0;
+   SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&len, dlen, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   return len;
+}
+
+template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
+size_t copy_if_block(T* input, T* output, size_t size, Rule rule, Cuda_mempool& mPool,
+               split_gpuStream_t s = 0) {
+   uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t));
+   split::tools::block_compact<<<1,std::min(BLOCKSIZE,nextPow2(size)),0,s>>>(input,output,size,rule,dlen);
+   uint32_t len=0;
+   SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&len, dlen, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   return len;
+}
+
+template <typename T,typename U, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
+size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, Cuda_mempool& mPool,
+               split_gpuStream_t s = 0) {
+   uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t));
+   split::tools::block_compact_keys<<<1,std::min(BLOCKSIZE,nextPow2(size)),0,s>>>(input,output,size,rule,dlen);
+   uint32_t len=0;
+   SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&len, dlen, sizeof(uint32_t), split_gpuMemcpyDeviceToHost,s));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   return len;
+}
+
 /**
  * @brief Same as copy_if but using raw memory
  */
@@ -619,13 +854,16 @@ template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARP
 uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& input, T* output, Rule rule,
                      size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) {
 
+   size_t _size = input.size();
+   if (_size<=BLOCKSIZE){
+      return copy_if_block(input.data(),output,_size,rule,mPool,s);
+   }
    uint32_t* d_counts;
    uint32_t* d_offsets;
    d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));
 
    // Phase 1 -- Calculate per warp workload
-   size_t _size = input.size();
    split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
    d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));
@@ -652,6 +890,9 @@ template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARP
 uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
                      size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) {
 
+   if (size<=BLOCKSIZE){
+      return copy_if_block(input,output,size,rule,mPool,s);
+   }
    uint32_t* d_counts;
    uint32_t* d_offsets;
    d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
@@ -687,13 +928,16 @@ template <typename T, typename U, typename Rule, size_t BLOCKSIZE = 1024, size_t
 size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& input, U* output, Rule rule,
                         size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) {
 
+   size_t _size = input.size();
+   if (_size<=BLOCKSIZE){
+      return copy_if_keys_block(input.data(),output,_size,rule,mPool,s);
+   }
    uint32_t* d_counts;
    uint32_t* d_offsets;
    d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));
 
    // Phase 1 -- Calculate per warp workload
-   size_t _size = input.size();
    split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
    d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));
@@ -882,69 +1126,5 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t
    auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
    return len;
 }
-
-template <typename T, typename Rule> 
-__global__ void blockCompact(T* input,T* output,size_t inputSize,Rule rule) 
-{
-   __shared__ uint32_t warpSums[WARPLENGTH];
-   constexpr int VIRTUALWARP = WARPLENGTH;
-   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-   const size_t wid = tid /WARPLENGTH;
-   const size_t w_tid = tid % WARPLENGTH;
-   //full warp votes for rule-> mask = [01010101010101010101010101010101] 
-   const int active=rule(input[tid]); 
-   const auto mask = split::s_warpVote(active==1,SPLIT_VOTING_MASK);
-   const auto warpCount=s_pop_count(mask);
-   if (w_tid==0){
-      warpSums[wid]= (wid==0)?0:warpCount;
-   }
-   __syncthreads();
-   //Prefix scan WarpSums on the first warp
-   if (wid==0){
-      auto value = warpSums[w_tid];
-       for (int d=1; d<32; d=2*d) {
-         int res= __shfl_up_sync(SPLIT_VOTING_MASK, value,d);
-         if (tid%32 >= d) value+= res;
-      }
-   warpSums[w_tid]=value;
-   }
-   __syncthreads();
-   auto offset= (wid==0)?0:warpSums[wid];
-   auto pp=s_pop_count(mask>>(static_cast<uint32_t>(std::min(WARPLENGTH,(int)inputSize))-w_tid));
-   const auto warpTidWriteIndex  =offset + pp;
-   if(active){
-      output[warpTidWriteIndex]=input[tid];
-   }
-}
-
-
-template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
-size_t copy_if_small(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size,
-               split_gpuStream_t s = 0) {
-
-   // Figure out Blocks to use
-   size_t _s = std::ceil((float(size)) / (float)BLOCKSIZE);
-   size_t nBlocks = nextPow2(_s);
-   if (nBlocks == 0) {
-      nBlocks += 1;
-   }
-   assert(stack && "Invalid stack!");
-   Cuda_mempool mPool(stack, max_size);
-   auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
-   return len;
-}
-
-template <typename T, typename Rule>
-size_t copy_if_small2(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size,
-               split_gpuStream_t s = 0) {
-   (void)stack;
-   (void)max_size;
-   // Figure out Blocks to use
-   split::tools::blockCompact<<<1,size>>>(input,output,size,rule);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
-   return 0;
-}
-
 } // namespace tools
 } // namespace split
diff --git a/unit_tests/stream_compaction/bench.cu b/unit_tests/stream_compaction/bench.cu
index 78e159b..303083d 100644
--- a/unit_tests/stream_compaction/bench.cu
+++ b/unit_tests/stream_compaction/bench.cu
@@ -5,17 +5,12 @@
 #include <random>
 #include "../../include/splitvector/splitvec.h"
 #include "../../include/splitvector/split_tools.h"
-#include <thrust/device_vector.h>
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <nvToolsExt.h>
 using namespace std::chrono;
 using type_t = uint32_t;
 using  splitvector = split::SplitVector<type_t> ;
-using  thrustvector = thrust::device_vector<type_t> ;
-constexpr int R = 100;
-#define PROFILE_START(msg)   nvtxRangePushA((msg))
-#define PROFILE_END() nvtxRangePop()
+constexpr int R = 1000;
+#define PROFILE_START(msg)  
+#define PROFILE_END() 
 
 template <class Fn, class ... Args>
 auto timeMe(Fn fn, Args && ... args){
@@ -40,59 +35,42 @@ void fillVec(T& vec,size_t sz){
    return;
 }
 
-
-void stream_compaction_split_old(splitvector& v,splitvector& output,  type_t* stack, size_t sz){
-   auto pred =[]__host__ __device__ (type_t  element)->bool{ return (element%2)==0 ;};
-   auto len = split::tools::copy_if_small(v.data(),output.data(),sz,pred,(void*)stack,sz);
+template <typename T>
+void printVec(const T& vec){
+   for (size_t i=0; i< vec.size();++i){
+      std::cout<<vec[i]<<",";
+   }
+   std::cout<<std::endl;
+   return;
 }
 
 
-
-void stream_compaction_split(splitvector& v,splitvector& output,  type_t* stack, size_t sz){
-   auto pred =[]__host__ __device__ (type_t  element)->bool{ return (element%2)==0 ;};
-   auto len = split::tools::copy_if_small2(v.data(),output.data(),sz,pred,(void*)stack,sz);
-}
-void stream_compaction_thrust(thrustvector& v,thrustvector& output){
-   auto pred =[]__host__ __device__ (type_t  element)->bool{ return (element%2)==0 ;};
-   thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), pred);
-}
-
 int main(int argc, char* argv[]){
 
-
-
-   int sz=6;
+   int sz=10;
    if (argc>=2){
       sz=atoi(argv[1]);
    }
-   size_t N = 64;
+   void* mem;
+   size_t N = sz;
    splitvector v0(N),v0_out(N);
    srand(1);
    fillVec(v0,N);
-   splitvector stack(N);
-
    v0.optimizeGPU();
    v0_out.optimizeGPU();
-   stack.optimizeGPU();
    split_gpuDeviceSynchronize();
-   
-   double t_split={0};
-   double t_split_old={0};
-
    for (size_t i =0 ; i < R ; ++i){
-      PROFILE_START("SPLIT");
-      t_split+=timeMe(stream_compaction_split,v0,v0_out,stack.data(),N);
+      PROFILE_START("Stream_Compaction");
+      auto pred =[]__host__ __device__ (type_t  element)->bool{ return (element>10) ;};
+      auto pred_ =[]__host__ __device__ (type_t  element)->bool{ return (element%2)==0 ;};
+#if 0 
+      split::tools::copy_if(v0,v0_out,pred);
+#else
+      split::tools::copy_if(v0,v0_out,pred_);
+#endif
       PROFILE_END();
-      splitvector vv(v0_out);
-
-      PROFILE_START("SPLITOLD");
-      t_split_old+=timeMe(stream_compaction_split_old,v0,v0_out,stack.data(),N);
-      PROFILE_END();
-      assert(vv==v0_out);
+      break;
    }
-
-   printf("%d\t%f,%f\n",sz,t_split_old/R,t_split/R);
-
    return 0;
 }
 

From 99f0c52fa9f3f2e21611b2285b929af6a9c825a3 Mon Sep 17 00:00:00 2001
From: kstppd <kpapadakis@protonmail.com>
Date: Fri, 1 Mar 2024 21:53:53 +0200
Subject: [PATCH 06/10] Remove sync

---
 include/splitvector/split_tools.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index f4614e2..a63cad3 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -1010,7 +1010,6 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
    const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t);
    Cuda_mempool mPool(memory_for_pool, s);
    auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s);
-   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    output.erase(&output[len], output.end());
 }
 

From 602108c0f412e5f84ee9b033568aa74a141771e6 Mon Sep 17 00:00:00 2001
From: kstppd <kpapadakis@protonmail.com>
Date: Fri, 1 Mar 2024 22:07:45 +0200
Subject: [PATCH 07/10] Rename Cuda_mempool to SplitStackArena

---
 include/hashinator/hashinator.h              |  2 +-
 include/splitvector/split_tools.h            | 46 ++++++++++----------
 unit_tests/benchmark/main.cu                 |  2 +-
 unit_tests/stream_compaction/preallocated.cu | 12 ++---
 unit_tests/stream_compaction/race.cu         |  2 +-
 5 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index cb71f17..4a8048e 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -1110,7 +1110,7 @@ class Hashmap {
 
       // Allocate with Mempool
       const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t);
-      split::tools::Cuda_mempool mPool(memory_for_pool, s);
+      split::tools::splitStackArena mPool(memory_for_pool, s);
       size_t retval =
           split::tools::copy_if_raw<hash_pair<KEY_TYPE, VAL_TYPE>, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>(
               buckets, elements, rule, nBlocks, mPool, s);
diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index a63cad3..fbd1153 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -3,7 +3,7 @@
  * Description: Set of tools used by SplitVector
  *
  * This file defines the following classes or functions:
- *    --split::tools::Cuda_mempool
+ *    --split::tools::splitStackArena
  *    --split::tools::copy_if_raw
  *    --split::tools::copy_if
  *    --split::tools::scan_reduce_raw
@@ -501,7 +501,7 @@ __global__ void split_compact_raw(T* input, uint32_t* counts, uint32_t* offsets,
  * It uses async mallocs
  *
  */
-class Cuda_mempool {
+class splitStackArena {
 private:
    size_t total_bytes;
    size_t bytes_used;
@@ -510,24 +510,24 @@ class Cuda_mempool {
    bool isOwner;
 
 public:
-   explicit Cuda_mempool(size_t bytes, split_gpuStream_t str) {
+   explicit splitStackArena(size_t bytes, split_gpuStream_t str) {
       s = str;
       SPLIT_CHECK_ERR(split_gpuMallocAsync(&_data, bytes, s));
       total_bytes = bytes;
       bytes_used = 0;
       isOwner = true;
    }
-   explicit Cuda_mempool(void* ptr, size_t bytes) {
+   explicit splitStackArena(void* ptr, size_t bytes) {
       total_bytes = bytes;
       bytes_used = 0;
       isOwner = false;
       _data = ptr;
    }
 
-   Cuda_mempool() = delete;
-   Cuda_mempool(const Cuda_mempool& other) = delete;
-   Cuda_mempool(Cuda_mempool&& other) = delete;
-   ~Cuda_mempool() {
+   splitStackArena() = delete;
+   splitStackArena(const splitStackArena& other) = delete;
+   splitStackArena(splitStackArena&& other) = delete;
+   ~splitStackArena() {
       if (isOwner) {
          SPLIT_CHECK_ERR(split_gpuFreeAsync(_data, s));
       }
@@ -566,7 +566,7 @@ __global__ void scan_reduce_raw(T* input, uint32_t* output, Rule rule, size_t si
  * @brief Same as split_prefix_scan but with raw memory
  */
 template <typename T, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
-void split_prefix_scan_raw(T* input, T* output, Cuda_mempool& mPool, const size_t input_size, split_gpuStream_t s = 0) {
+void split_prefix_scan_raw(T* input, T* output, splitStackArena& mPool, const size_t input_size, split_gpuStream_t s = 0) {
 
    // Scan is performed in half Blocksizes
    size_t scanBlocksize = BLOCKSIZE / 2;
@@ -816,7 +816,7 @@ template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARP
 size_t copy_if_block(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size,
                split_gpuStream_t s = 0) {
    assert(stack && "Invalid stack!");
-   Cuda_mempool mPool(stack, max_size);
+   splitStackArena mPool(stack, max_size);
    uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t));
    split::tools::block_compact<<<1,std::min(BLOCKSIZE,nextPow2(size))>>>(input,output,size,rule,dlen);
    uint32_t len=0;
@@ -826,7 +826,7 @@ size_t copy_if_block(T* input, T* output, size_t size, Rule rule, void* stack, s
 }
 
 template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
-size_t copy_if_block(T* input, T* output, size_t size, Rule rule, Cuda_mempool& mPool,
+size_t copy_if_block(T* input, T* output, size_t size, Rule rule, splitStackArena& mPool,
                split_gpuStream_t s = 0) {
    uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t));
    split::tools::block_compact<<<1,std::min(BLOCKSIZE,nextPow2(size)),0,s>>>(input,output,size,rule,dlen);
@@ -837,7 +837,7 @@ size_t copy_if_block(T* input, T* output, size_t size, Rule rule, Cuda_mempool&
 }
 
 template <typename T,typename U, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
-size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, Cuda_mempool& mPool,
+size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, splitStackArena& mPool,
                split_gpuStream_t s = 0) {
    uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t));
    split::tools::block_compact_keys<<<1,std::min(BLOCKSIZE,nextPow2(size)),0,s>>>(input,output,size,rule,dlen);
@@ -852,7 +852,7 @@ size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, Cuda_memp
  */
 template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
 uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& input, T* output, Rule rule,
-                     size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) {
+                     size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) {
 
    size_t _size = input.size();
    if (_size<=BLOCKSIZE){
@@ -888,7 +888,7 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
 
 template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
 uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
-                     size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) {
+                     size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) {
 
    if (size<=BLOCKSIZE){
       return copy_if_block(input,output,size,rule,mPool,s);
@@ -926,7 +926,7 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
  */
 template <typename T, typename U, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
 size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& input, U* output, Rule rule,
-                        size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) {
+                        size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) {
 
    size_t _size = input.size();
    if (_size<=BLOCKSIZE){
@@ -1008,7 +1008,7 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
 
    // Allocate with Mempool
    const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t);
-   Cuda_mempool mPool(memory_for_pool, s);
+   splitStackArena mPool(memory_for_pool, s);
    auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s);
    output.erase(&output[len], output.end());
 }
@@ -1041,14 +1041,14 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
 
    // Allocate with Mempool
    const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t);
-   Cuda_mempool mPool(memory_for_pool, s);
+   splitStackArena mPool(memory_for_pool, s);
    auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s);
    output.erase(&output[len], output.end());
 }
 
 template <typename T, typename U, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
 void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
-                  split::SplitVector<U, split::split_unified_allocator<U>>& output, Rule rule, Cuda_mempool&& mPool,
+                  split::SplitVector<U, split::split_unified_allocator<U>>& output, Rule rule, splitStackArena&& mPool,
                   split_gpuStream_t s = 0) {
 
    // Figure out Blocks to use
@@ -1057,13 +1057,13 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
    if (nBlocks == 0) {
       nBlocks += 1;
    }
-   auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward<Cuda_mempool>(mPool), s);
+   auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward<splitStackArena>(mPool), s);
    output.erase(&output[len], output.end());
 }
 
 template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
 void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
-             split::SplitVector<T, split::split_unified_allocator<T>>& output, Rule rule, Cuda_mempool&& mPool,
+             split::SplitVector<T, split::split_unified_allocator<T>>& output, Rule rule, splitStackArena&& mPool,
              split_gpuStream_t s = 0) {
 
    // Figure out Blocks to use
@@ -1088,7 +1088,7 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
       nBlocks += 1;
    }
    assert(stack && "Invalid stack!");
-   Cuda_mempool mPool(stack, max_size);
+   splitStackArena mPool(stack, max_size);
    auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s);
    output.erase(&output[len], output.end());
 }
@@ -1105,7 +1105,7 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
       nBlocks += 1;
    }
    assert(stack && "Invalid stack!");
-   Cuda_mempool mPool(stack, max_size);
+   splitStackArena mPool(stack, max_size);
    auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s);
    output.erase(&output[len], output.end());
 }
@@ -1121,7 +1121,7 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t
       nBlocks += 1;
    }
    assert(stack && "Invalid stack!");
-   Cuda_mempool mPool(stack, max_size);
+   splitStackArena mPool(stack, max_size);
    auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
    return len;
 }
diff --git a/unit_tests/benchmark/main.cu b/unit_tests/benchmark/main.cu
index bd8bada..2e465f0 100644
--- a/unit_tests/benchmark/main.cu
+++ b/unit_tests/benchmark/main.cu
@@ -101,7 +101,7 @@ void benchErase(hashmap& hmap,key_type* gpuKeys, val_type* gpuVals,int sz){
 
 int main(int argc, char* argv[]){
 
-   int sz= 24;
+   int sz= 18;
    if (argc>=2){
       sz=atoi(argv[1]);
    }
diff --git a/unit_tests/stream_compaction/preallocated.cu b/unit_tests/stream_compaction/preallocated.cu
index 20b01af..971a723 100644
--- a/unit_tests/stream_compaction/preallocated.cu
+++ b/unit_tests/stream_compaction/preallocated.cu
@@ -101,10 +101,10 @@ bool preallocated_compactions_basic(int power){
    //These mempools are now allocation free. They essentially just manage the buffer correclty!
    //Please !!ALWAYS!! forwarding here to preserve move semantics as the pool might change later on
    split::tools::copy_if(v,output1,predicate_on,
-                           std::forward<split::tools::Cuda_mempool>(split::tools::Cuda_mempool{buffer,bytesNeeded}));
+                           std::forward<split::tools::splitStackArena>(split::tools::splitStackArena{buffer,bytesNeeded}));
 
    split::tools::copy_if(v,output2,predicate_off,
-                           std::forward<split::tools::Cuda_mempool>(split::tools::Cuda_mempool{buffer,bytesNeeded}));
+                           std::forward<split::tools::splitStackArena>(split::tools::splitStackArena{buffer,bytesNeeded}));
    //Deallocate our good buffer
    SPLIT_CHECK_ERR (split_gpuFree(buffer));
    
@@ -189,7 +189,7 @@ bool preallocated_compactions_medium(int power){
    
    //This guy goes on stream 1
    split::tools::copy_if(v,output1,predicate_on,
-                           std::forward<split::tools::Cuda_mempool>(split::tools::Cuda_mempool{buffer,bytesNeeded}),streams[0]);
+                           std::forward<split::tools::splitStackArena>(split::tools::splitStackArena{buffer,bytesNeeded}),streams[0]);
 
    //This guy goes on stream 2
    /*
@@ -197,7 +197,7 @@ bool preallocated_compactions_medium(int power){
    */
    void* start = reinterpret_cast<void*> ( reinterpret_cast<char*>(buffer)+bytesNeeded);
    split::tools::copy_if(v,output2,predicate_off,
-                           std::forward<split::tools::Cuda_mempool>(split::tools::Cuda_mempool{start,bytesNeeded}),streams[1]);
+                           std::forward<split::tools::splitStackArena>(split::tools::splitStackArena{start,bytesNeeded}),streams[1]);
 
 
    //Wait for them!
@@ -274,10 +274,10 @@ bool preallocated_compactions_HAM(int power){
       void* tidIndex_1 = reinterpret_cast<void*> ( reinterpret_cast<char*>(buffer)+tid*streamsPerThread*bytesNeeded);
       void* tidIndex_2 = reinterpret_cast<void*> ( reinterpret_cast<char*>(buffer)+tid*streamsPerThread*bytesNeeded+bytesNeeded);
       split::tools::copy_if(vecs[i],out1[i],predicate_on,
-                              std::forward<split::tools::Cuda_mempool>(split::tools::Cuda_mempool{tidIndex_1,bytesNeeded}));
+                              std::forward<split::tools::splitStackArena>(split::tools::splitStackArena{tidIndex_1,bytesNeeded}));
 
       split::tools::copy_if(vecs[i],out2[i],predicate_off,
-                              std::forward<split::tools::Cuda_mempool>(split::tools::Cuda_mempool{tidIndex_2,bytesNeeded}));
+                              std::forward<split::tools::splitStackArena>(split::tools::splitStackArena{tidIndex_2,bytesNeeded}));
    }
 
 
diff --git a/unit_tests/stream_compaction/race.cu b/unit_tests/stream_compaction/race.cu
index 5f9f4ea..f0fe82f 100644
--- a/unit_tests/stream_compaction/race.cu
+++ b/unit_tests/stream_compaction/race.cu
@@ -67,7 +67,7 @@ void split_test_prefix(split_vector& input_split,split_vector& output_split,size
       input_split[i]=i;//tmp;
    }
 
-   //split::tools::Cuda_mempool mPool(1024*64);
+   //split::tools::splitStackArena mPool(1024*64);
 
 
    input_split.optimizeGPU();

From 9e0a83c83b60c4ccb42b9d4ffb48168cb7ab1172 Mon Sep 17 00:00:00 2001
From: kstppd <kpapadakis@protonmail.com>
Date: Fri, 15 Mar 2024 16:52:21 +0200
Subject: [PATCH 08/10] Remove forgotten optimize call and add conditional
 prefetches to retrieve and erase methods

---
 include/hashinator/hashinator.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index 4a8048e..ddd5f55 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -1275,29 +1275,35 @@ class Hashmap {
       if (neededPowerSize > _mapInfo->sizePower) {
          resize(neededPowerSize, targets::device, s);
       }
-      buckets.optimizeGPU(s);
       DeviceHasher::insert(src, buckets.data(), _mapInfo->sizePower, _mapInfo->currentMaxBucketOverflow,
                            &_mapInfo->currentMaxBucketOverflow, &_mapInfo->fill, len, &_mapInfo->err, s);
       return;
    }
 
    // Uses Hasher's retrieve_kernel to read all elements
-   void retrieve(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, split_gpuStream_t s = 0) {
-      buckets.optimizeGPU(s);
+   void retrieve(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, split_gpuStream_t s = 0,bool prefetches=true) {
+      if (prefetches){
+         buckets.optimizeGPU(s);
+      }
       DeviceHasher::retrieve(keys, vals, buckets.data(), _mapInfo->sizePower, _mapInfo->currentMaxBucketOverflow, len,
                              s);
       return;
    }
 
    // Uses Hasher's retrieve_kernel to read all elements
-   void retrieve(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, split_gpuStream_t s = 0) {
-      buckets.optimizeGPU(s);
+   void retrieve(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, split_gpuStream_t s = 0, bool prefetches=true) {
+      if (prefetches){
+         buckets.optimizeGPU(s);
+      }
       DeviceHasher::retrieve(src, buckets.data(), _mapInfo->sizePower, _mapInfo->currentMaxBucketOverflow, len, s);
       return;
    }
 
    // Uses Hasher's erase_kernel to delete all elements
-   void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) {
+   void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0,bool prefetches=true) {
+      if (prefetches){
+         buckets.optimizeGPU(s);
+      }
       // Remember the last numeber of tombstones
       size_t tbStore = tombstone_count();
       DeviceHasher::erase(keys, buckets.data(), &_mapInfo->tombstoneCounter, _mapInfo->sizePower,

From 376e9f62930d646cd58af32e9ee1a974bf6d1a25 Mon Sep 17 00:00:00 2001
From: kstppd <kpapadakis@protonmail.com>
Date: Tue, 26 Mar 2024 17:06:38 +0200
Subject: [PATCH 09/10] Add methods that return emptybucket and tombstone
 values

---
 include/hashinator/hashinator.h      |  6 +++
 unit_tests/hashmap_unit_test/main.cu | 73 +++++++++++++++-------------
 2 files changed, 46 insertions(+), 33 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index ddd5f55..b1b6811 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -416,6 +416,12 @@ class Hashmap {
    HASHINATOR_HOSTDEVICE
    size_t bucket_count() const { return buckets.size(); }
 
+   HASHINATOR_HOSTDEVICE
+   constexpr KEY_TYPE get_emptybucket() const { return EMPTYBUCKET; }
+
+   HASHINATOR_HOSTDEVICE
+   constexpr KEY_TYPE get_tombstone() const { return TOMBSTONE; }
+
    HASHINATOR_HOSTDEVICE
    float load_factor() const { return (float)size() / bucket_count(); }
 
diff --git a/unit_tests/hashmap_unit_test/main.cu b/unit_tests/hashmap_unit_test/main.cu
index e9cfa86..9727f8b 100644
--- a/unit_tests/hashmap_unit_test/main.cu
+++ b/unit_tests/hashmap_unit_test/main.cu
@@ -27,9 +27,9 @@ typedef Hashmap<key_type,val_type> hashmap;
 
 struct Predicate{
    HASHINATOR_HOSTDEVICE
-   inline bool operator()( hash_pair<key_type,val_type>& element)const{
-      return element.second%2==0;
-   }
+      inline bool operator()( hash_pair<key_type,val_type>& element)const{
+         return element.second%2==0;
+      }
 };
 
 template <class Fn, class ... Args>
@@ -165,7 +165,7 @@ bool recover_all_elements(const hashmap& hmap, vector& src){
       if (!sane){ 
          return false; 
       }
-   //std::cout<<"Key validated "<<retval->first<<" "<<retval->second<<std::endl;
+      //std::cout<<"Key validated "<<retval->first<<" "<<retval->second<<std::endl;
    }
    return true;
 }
@@ -316,7 +316,7 @@ bool testWarpInsert(int power){
    size_t warpsize     =  Hashinator::defaults::WARPSIZE;
    size_t threadsNeeded  =  N*warpsize; 
    blocks = threadsNeeded/BLOCKSIZE;
- 
+
    bool cpuOK=true;
 
    //Create some input data
@@ -364,7 +364,7 @@ bool testWarpInsertUnorderedSet(int power){
    size_t warpsize     =  Hashinator::defaults::WARPSIZE;
    size_t threadsNeeded  =  N*warpsize; 
    blocks = threadsNeeded/BLOCKSIZE;
- 
+
    bool cpuOK=true;
 
    //Create some input data
@@ -383,7 +383,7 @@ bool testWarpInsertUnorderedSet(int power){
       return false;
    }
 
-   
+
    //Now we change the key values and increment them by 1 and we expect the same keys back because we are not supposed to overwrite
    vector src2(src);
    for (auto& i:src2){
@@ -429,7 +429,7 @@ bool testWarpInsert_V(int power){
    size_t warpsize     =  Hashinator::defaults::WARPSIZE;
    size_t threadsNeeded  =  N*warpsize; 
    blocks = threadsNeeded/BLOCKSIZE;
- 
+
    bool cpuOK=true;
 
    //Create some input data
@@ -457,7 +457,7 @@ bool testWarpErase(int power){
    size_t warpsize     =  Hashinator::defaults::WARPSIZE;
    size_t threadsNeeded  =  N*warpsize; 
    blocks = threadsNeeded/BLOCKSIZE;
- 
+
 
    //Create some input data
    vector src(N);
@@ -508,7 +508,7 @@ bool testWarpFind(int power){
    if (!cpuOK){
       return false;
    }
-   
+
    size_t warpsize     =  Hashinator::defaults::WARPSIZE;
    size_t threadsNeeded  =  N*warpsize; 
    blocks = threadsNeeded/BLOCKSIZE;
@@ -564,7 +564,7 @@ bool test_hashmap_1(int power){
          return false;
       }
    }
-   
+
    //Verify odd elements;
    cpuOK=recover_odd_elements(hmap,src);
    gpu_recover_odd_elements<<<blocks,blocksize>>>(d_hmap,src.data(),src.size());
@@ -645,7 +645,7 @@ bool test_hashmap_2(int power){
          return false;
       }
    }
-   
+
    //Verify odd elements;
    cpuOK=recover_odd_elements(hmap,src);
    gpu_recover_odd_elements<<<blocks,blocksize>>>(hmap,src.data(),src.size());
@@ -973,11 +973,11 @@ TEST(HashmapUnitTets ,Test_Resize_Perf_Device){
 
 template <typename T, typename U>
 struct Rule{
-Rule(){}
+   Rule(){}
    __host__ __device__
-   inline bool operator()( hash_pair<T,U>& element)const{
-      return element.first<1000;
-   }
+      inline bool operator()( hash_pair<T,U>& element)const{
+         return element.first<1000;
+      }
 };
 
 
@@ -1013,7 +1013,7 @@ TEST(HashmapUnitTets ,Test_ErrorCodes_ExtractKeysByPatternNoAllocations){
    size_t mem=2*sizeof(key_type)*(1<<sz);
    SPLIT_CHECK_ERR (split_gpuMallocManaged( (void**)&buffer ,mem));
    hmap.extractKeysByPattern(out2, Rule<key_type,key_type>(),buffer,mem);
-   
+
    expect_true(out1==out2);
    SPLIT_CHECK_ERR (split_gpuFree(buffer));
 }
@@ -1037,25 +1037,25 @@ TEST(HashmapUnitTets ,Test_Copy_Metadata){
 }
 
 std::vector<key_type> generateUniqueRandomKeys(size_t size, size_t range=std::numeric_limits<int>::max()) {
-    std::vector<key_type> elements;
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<int> dist(1, range);
-
-    for (size_t i = 0; i < size; ++i) {
-        key_type randomNum = i;//dist(gen);
-        if (std::find(elements.begin(), elements.end(), randomNum) == elements.end()) {
-            elements.push_back(randomNum);
-        } else {
-            --i;  
-        }
-    }
-    return elements;
+   std::vector<key_type> elements;
+   std::random_device rd;
+   std::mt19937 gen(rd());
+   std::uniform_int_distribution<int> dist(1, range);
+
+   for (size_t i = 0; i < size; ++i) {
+      key_type randomNum = i;//dist(gen);
+      if (std::find(elements.begin(), elements.end(), randomNum) == elements.end()) {
+         elements.push_back(randomNum);
+      } else {
+         --i;  
+      }
+   }
+   return elements;
 }
 
 void insertDuplicates(std::vector<key_type>& vec, key_type element, size_t count) {
    if (count>0){
-    vec.insert(vec.end(), count, element);
+      vec.insert(vec.end(), count, element);
    }
    srand(time(NULL));
    std::random_shuffle(vec.begin(),vec.end());
@@ -1084,7 +1084,14 @@ TEST(HashmapUnitTets ,Test_Duplicate_Insertion){
    }
 }
 
-
+TEST(HashmapUnitTets ,EMPTY_TOMBSTONE_values){
+   Hashmap<key_type,val_type,42,42> hmap;
+   const auto tombstone = hmap.get_tombstone();
+   const auto emptybucket = hmap.get_emptybucket();
+   expect_true(tombstone==emptybucket);
+   //These can be also checked during compile time 
+   static_assert(tombstone==emptybucket);
+}
 
 int main(int argc, char* argv[]){
    srand(time(NULL));

From 3f214f520618e258c4d8d8e4f68ecd97f2d03ea6 Mon Sep 17 00:00:00 2001
From: Markus Battarbee <markus.battarbee@gmail.com>
Date: Thu, 28 Mar 2024 20:57:02 +0200
Subject: [PATCH 10/10] Fix signed-unsigned error which was causing erroneous
 outputs from warpInsert_V

---
 include/hashinator/hashinator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index b1b6811..7516a17 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -962,7 +962,7 @@ class Hashmap {
       }
 
       auto res = split::s_warpVote(localCount > 0, submask);
-      return (res > 0);
+      return (res != 0);
    }
 
    HASHINATOR_DEVICEONLY