diff --git a/include/common.h b/include/common.h index 70c792a..cb48536 100644 --- a/include/common.h +++ b/include/common.h @@ -50,10 +50,9 @@ constexpr inline size_t nextPow2(size_t v) noexcept { * @brief Computes the next optimal overflow for the hasher kernels */ HASHINATOR_HOSTDEVICE -[[nodiscard]] -constexpr inline size_t nextOverflow(size_t currentOverflow, size_t virtualWarp) noexcept { - size_t remainder = currentOverflow % virtualWarp; - return ((remainder)==0)?currentOverflow: currentOverflow + (virtualWarp - remainder); +[[nodiscard]] constexpr inline size_t nextOverflow(size_t currentOverflow, size_t virtualWarp) noexcept { + size_t remainder = currentOverflow % virtualWarp; + return ((remainder) == 0) ? currentOverflow : currentOverflow + (virtualWarp - remainder); } /** diff --git a/include/hashinator/hashers.h b/include/hashinator/hashers.h index f7552be..fdefc87 100644 --- a/include/hashinator/hashers.h +++ b/include/hashinator/hashers.h @@ -23,6 +23,7 @@ #include "../splitvector/gpu_wrappers.h" #include "defaults.h" #include "hashfunctions.h" +#include "hash_pair.h" #ifdef __NVCC__ #include "kernels_NVIDIA.h" #endif @@ -160,6 +161,68 @@ class Hasher { SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); } + + /* ----------------------------------- Members used by Hashinator::Unordered_Set -----------------------------------*/ + + + static void insert_set(KEY_TYPE* keys, KEY_TYPE* buckets, int sizePower,size_t maxoverflow, size_t* d_overflow, + size_t* d_fill, size_t len, status* err,split_gpuStream_t s = 0) { + //Make sure this is being used by Unordered_Set + static_assert(std::is_same::value); + size_t blocks, blockSize; + *err = status::success; + launchParams(len, blocks, blockSize); + Hashinator::Hashers::insert_set_kernel + <<>>(keys, buckets, sizePower, maxoverflow, d_overflow, d_fill, len, err); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); +#ifndef NDEBUG + if (*err == status::fail) { + std::cerr << "***** Hashinator Runtime Warning ********" << std::endl; + std::cerr << "Warning: Hashmap completely overflown in Device Insert.\nNot all ellements were " + "inserted!\nConsider resizing before calling insert" + << std::endl; + std::cerr << "******************************" << std::endl; + } +#endif + } + + // Delete wrapper + static void erase_set(KEY_TYPE* keys, KEY_TYPE* buckets, size_t* d_tombstoneCounter, int sizePower, + size_t maxoverflow, size_t len, split_gpuStream_t s = 0) { + + //Make sure this is being used by Unordered_Set + static_assert(std::is_same::value); + size_t blocks, blockSize; + launchParams(len, blocks, blockSize); + Hashinator::Hashers::delete_set_kernel + <<>>(keys, buckets, d_tombstoneCounter, sizePower, maxoverflow, len); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + } + + // Reset wrapper + static void reset_set(KEY_TYPE* src, KEY_TYPE* dst, const int sizePower,size_t maxoverflow, + size_t len, split_gpuStream_t s = 0) { + //Make sure this is being used by Unordered_Set + static_assert(std::is_same::value); + size_t blocks, blockSize; + launchParams(len, blocks, blockSize); + Hashinator::Hashers::reset_to_empty_set + <<>>(src, dst, sizePower, maxoverflow, len); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + } + // Reset wrapper for all elements + static void reset_all_set(KEY_TYPE* dst, size_t len, split_gpuStream_t s = 0) { + //Make sure this is being used by Unordered_Set + static_assert(std::is_same::value); + size_t blocksNeeded = len / defaults::MAX_BLOCKSIZE; + blocksNeeded = blocksNeeded + (blocksNeeded == 0); + 
reset_all_to_empty_set<<>>(dst, len); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + } + + private: static void launchParams(size_t N, size_t& blocks, size_t& blockSize) { // fast ceil for positive ints diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashmap/hashmap.h similarity index 97% rename from include/hashinator/hashinator.h rename to include/hashinator/hashmap/hashmap.h index cb71f17..5fba42b 100644 --- a/include/hashinator/hashinator.h +++ b/include/hashinator/hashmap/hashmap.h @@ -1,4 +1,4 @@ -/* File: hashinator.h +/* File: hashmap.h * Authors: Kostis Papadakis, Urs Ganse and Markus Battarbee (2023) * Description: A hybrid hashmap that can operate on both * CPUs and GPUs using CUDA unified memory. @@ -24,20 +24,20 @@ #ifdef HASHINATOR_CPU_ONLY_MODE #define SPLIT_CPU_ONLY_MODE #endif -#include "../common.h" -#include "../splitvector/gpu_wrappers.h" -#include "../splitvector/split_allocators.h" -#include "../splitvector/splitvec.h" -#include "defaults.h" -#include "hash_pair.h" -#include "hashfunctions.h" +#include "../../common.h" +#include "../../splitvector/gpu_wrappers.h" +#include "../../splitvector/split_allocators.h" +#include "../../splitvector/splitvec.h" +#include "../defaults.h" +#include "../hash_pair.h" +#include "../hashfunctions.h" #include #include #include #include #ifndef HASHINATOR_CPU_ONLY_MODE -#include "../splitvector/split_tools.h" -#include "hashers.h" +#include "../../splitvector/split_tools.h" +#include "../hashers.h" #endif namespace Hashinator { @@ -99,7 +99,7 @@ class Hashmap { // Deallocates the bookeepping info and the device pointer void deallocate_device_handles() { - if (device_map==nullptr){ + if (device_map == nullptr) { return; } #ifndef HASHINATOR_CPU_ONLY_MODE @@ -138,11 +138,11 @@ class Hashmap { Hashmap(Hashmap&& other) { preallocate_device_handles(); _mapInfo = other._mapInfo; - other._mapInfo=nullptr; + other._mapInfo = nullptr; buckets = std::move(other.buckets); }; - Hashmap& operator=(const Hashmap& other) { + Hashmap& operator=(const Hashmap& other) { if (this == &other) { return *this; } @@ -167,8 +167,8 @@ class Hashmap { } _metaAllocator.deallocate(_mapInfo, 1); _mapInfo = other._mapInfo; - other._mapInfo=nullptr; - buckets =std::move(other.buckets); + other._mapInfo = nullptr; + buckets = std::move(other.buckets); return *this; } @@ -847,13 +847,13 @@ class Hashmap { if (w_tid == winner) { KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey); if (old == EMPTYBUCKET) { - threadOverflow =(probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex+1); + threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1); split::s_atomicExch(&buckets[probingindex].second, candidateVal); warpDone = 1; split::s_atomicAdd(&_mapInfo->fill, 1); if (threadOverflow > _mapInfo->currentMaxBucketOverflow) { split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow), - (unsigned long long)nextOverflow(threadOverflow,defaults::WARPSIZE)); + (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE)); } } else if (old == candidateKey) { // Parallel stuff are fun. Major edge case! @@ -931,14 +931,14 @@ class Hashmap { if (w_tid == winner) { KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey); if (old == EMPTYBUCKET) { - threadOverflow = (probingindex < optimalindex) ? 
(1 << sizePower) : (probingindex - optimalindex+1); + threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1); split::s_atomicExch(&buckets[probingindex].second, candidateVal); warpDone = 1; localCount = 1; split::s_atomicAdd(&_mapInfo->fill, 1); if (threadOverflow > _mapInfo->currentMaxBucketOverflow) { split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow), - (unsigned long long)nextOverflow(threadOverflow,defaults::WARPSIZE)); + (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE)); } } else if (old == candidateKey) { // Parallel stuff are fun. Major edge case! @@ -1113,7 +1113,7 @@ class Hashmap { split::tools::Cuda_mempool mPool(memory_for_pool, s); size_t retval = split::tools::copy_if_raw, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( - buckets, elements, rule, nBlocks, mPool, s); + buckets.data(), elements, buckets.size(), rule, nBlocks, mPool, s); return retval; } @@ -1133,8 +1133,8 @@ class Hashmap { return elements.size(); } template - size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, void *stack, size_t max_size, split_gpuStream_t s = 0, - bool prefetches = true) { + size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, void* stack, size_t max_size, + split_gpuStream_t s = 0, bool prefetches = true) { elements.resize(_mapInfo->fill + 1, true); if (prefetches) { elements.optimizeGPU(s); @@ -1152,7 +1152,8 @@ class Hashmap { }; return extractKeysByPattern(elements, rule, s, prefetches); } - size_t extractAllKeys(split::SplitVector& elements, void *stack, size_t max_size, split_gpuStream_t s = 0, bool prefetches = true) { + size_t extractAllKeys(split::SplitVector& elements, void* stack, size_t max_size, split_gpuStream_t s = 0, + bool prefetches = true) { // Extract all keys auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE; @@ -1371,7 +1372,7 @@ class Hashmap { public: HASHINATOR_DEVICEONLY - device_iterator(Hashmap& hashtable, size_t index) : index(index),hashtable(&hashtable) {} + device_iterator(Hashmap& hashtable, size_t index) : index(index), hashtable(&hashtable) {} HASHINATOR_DEVICEONLY size_t getIndex() { return index; } @@ -1418,7 +1419,7 @@ class Hashmap { public: HASHINATOR_DEVICEONLY explicit const_device_iterator(const Hashmap& hashtable, size_t index) - : index(index), hashtable(&hashtable){} + : index(index), hashtable(&hashtable) {} HASHINATOR_DEVICEONLY size_t getIndex() { return index; } @@ -1629,7 +1630,8 @@ class Hashmap { void set_element(const KEY_TYPE& key, VAL_TYPE val) { size_t thread_overflowLookup = 0; insert_element(key, val, thread_overflowLookup); - atomicMax((unsigned long long*)&(_mapInfo->currentMaxBucketOverflow), nextOverflow(thread_overflowLookup,defaults::WARPSIZE/defaults::elementsPerWarp)); + atomicMax((unsigned long long*)&(_mapInfo->currentMaxBucketOverflow), + nextOverflow(thread_overflowLookup, defaults::WARPSIZE / defaults::elementsPerWarp)); } HASHINATOR_DEVICEONLY diff --git a/include/hashinator/kernels_AMD.h b/include/hashinator/kernels_AMD.h index fc46534..1caa1d9 100644 --- a/include/hashinator/kernels_AMD.h +++ b/include/hashinator/kernels_AMD.h @@ -203,7 +203,7 @@ __global__ void insert_kernel(hash_pair* src, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidate.second); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -243,7 +243,8 @@ __global__ 
void insert_kernel(hash_pair* src, hash_pair *d_overflow) { - split::s_atomicExch((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + split::s_atomicExch((unsigned long long*)d_overflow, + (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); } split::s_atomicAdd(d_fill, blockTotal); } @@ -334,7 +335,7 @@ __global__ void insert_kernel(KEY_TYPE* keys, VAL_TYPE* vals, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidateVal); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -374,7 +375,8 @@ __global__ void insert_kernel(KEY_TYPE* keys, VAL_TYPE* vals, hash_pair *d_overflow) { - split::s_atomicExch((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + split::s_atomicExch((unsigned long long*)d_overflow, + (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); } split::s_atomicAdd(d_fill, blockTotal); } @@ -561,7 +563,7 @@ __global__ void insert_index_kernel(KEY_TYPE* keys, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidateVal); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -601,7 +603,8 @@ __global__ void insert_index_kernel(KEY_TYPE* keys, hash_pair *d_overflow) { - split::s_atomicExch((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + split::s_atomicExch((unsigned long long*)d_overflow, + (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); } split::s_atomicAdd(d_fill, blockTotal); } @@ -716,5 +719,309 @@ __global__ void retrieve_kernel(hash_pair* src, hash_pair::max()> +__global__ void reset_all_to_empty_set(KEY_TYPE* dst, const size_t len) { + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + // Early exit here + if (tid >= len) { + return; + } + + if (dst[tid] != EMPTYBUCKET) { + dst[tid] = EMPTYBUCKET; + } + return; +} + +template ::max(), + class HashFunction = HashFunctions::Fibonacci, int WARPSIZE = defaults::WARPSIZE,int elementsPerWarp> +__global__ void insert_set_kernel(KEY_TYPE* keys, KEY_TYPE* buckets, int sizePower,size_t maxoverflow, size_t* d_overflow, + size_t* d_fill, size_t len, status* err) { + + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + const size_t proper_w_tid = tid % WARPSIZE; // the proper WID as if we had no Virtual warps + const size_t proper_wid = tid / WARPSIZE; + const size_t blockWid = proper_wid % (WARPSIZE / 4); // we have twice the warpsize and half the warps per block + + __shared__ uint32_t addMask[WARPSIZE / 2]; + __shared__ uint64_t warpOverflow[WARPSIZE / 2]; + // Early quit if we have more warps than elements to insert + if (wid >= len) { + return; + } + + // Zero out shared count; + if (proper_w_tid == 0 && blockWid == 0) { + for (int i = 0; i < WARPSIZE; i++) { + addMask[i] = 0; + warpOverflow[i] = 0; + } + } + __syncthreads(); + + uint64_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint64_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_AMD(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidateKey = keys[wid]; + const int bitMask = (1 << (sizePower)) - 1; + 
const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + uint32_t localCount = 0; + uint64_t vWarpDone = 0; // state of virtual warp + uint64_t threadOverflow = 0; + + for (size_t i = 0; i < (1 << sizePower); i += VIRTUALWARP) { + + // Check if this virtual warp is done. + if (vWarpDone) { + break; + } + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = buckets[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == EMPTYBUCKET, submask) & submask; + + // Check if this elements already exists + auto already_exists = split::s_warpVote(target == candidateKey, submask) & submask; + if (already_exists) {vWarpDone = 1;} + + // If any duplicate was there now is the time for the whole Virtual warp to find out! + vWarpDone = split::s_warpVote(vWarpDone > 0, submask) & submask; + + while (mask && !vWarpDone) { + int winner = split::s_findFirstSig(mask) - 1; + int sub_winner = winner - (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == sub_winner) { + KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex], EMPTYBUCKET, candidateKey); + if (old == EMPTYBUCKET) { + threadOverflow = std::min(i + w_tid, static_cast(1 << sizePower)) + 1; + vWarpDone = 1; + // Flip the bit which corresponds to the thread that added an element + localCount++; + } else if (old == candidateKey) { + vWarpDone = 1; + } + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + vWarpDone = split::s_warpVote(vWarpDone > 0, submask) & submask; + mask ^= (1UL << winner); + } + } + + // Update fill and overflow + __syncthreads(); + // Per warp reduction + int warpTotals = warpReduce(localCount); + uint64_t perWarpOverflow = warpReduceMax(threadOverflow); + __syncthreads(); + + // Store to shmem minding Bank Conflicts + if (proper_w_tid == 0) { + // Write the count to the same place + addMask[(blockWid)] = warpTotals; + warpOverflow[(blockWid)] = perWarpOverflow; + } + + __syncthreads(); + // First warp in block reductions + if (blockWid == 0) { + uint64_t blockOverflow = warpReduceMax(warpOverflow[(proper_w_tid)]); + int blockTotal = warpReduce(addMask[(proper_w_tid)]); + // First thread updates fill and overlfow (1 update per block) + if (proper_w_tid == 0) { + if (blockOverflow > *d_overflow) { + split::s_atomicExch((unsigned long long*)d_overflow, + (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); + } + split::s_atomicAdd(d_fill, blockTotal); + } + } + return; +} + + +/* + * In a similar way to the insert and retrieve kernels we + * delete keys in "keys" if they do exist in the set. + * If the keys do not exist we do nothing. 
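+ * Found keys are overwritten with TOMBSTONE and the number of deletions is
+ * accumulated into d_tombstoneCounter (one atomic update per block).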
+ * */ +template ::max(), + KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1, class HashFunction = HashFunctions::Fibonacci, + int WARPSIZE = defaults::WARPSIZE, int elementsPerWarp> +__global__ void delete_set_kernel(KEY_TYPE* keys, KEY_TYPE* buckets, size_t* d_tombstoneCounter, + int sizePower, size_t maxoverflow, size_t len) { + + + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + const size_t proper_w_tid = tid % WARPSIZE; // the proper WID as if we had no Virtual warps + const size_t proper_wid = tid / WARPSIZE; + const size_t blockWid = proper_wid % (WARPSIZE / 4); // we have twice the warpsize and half the warps per block + + __shared__ uint32_t deleteMask[WARPSIZE / 2]; + + // Early quit if we have more warps than elements to handle + if (wid >= len) { + return; + } + + // Zero out shared count; + if (proper_w_tid == 0 && blockWid == 0) { + for (int i = 0; i < WARPSIZE; i++) { + deleteMask[i] = 0; + } + } + __syncthreads(); + + uint64_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint64_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_AMD(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidateKey = keys[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + uint32_t localCount = 0; + + for (size_t i = 0; i < maxoverflow; i += VIRTUALWARP) { + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + const auto maskExists = + split::s_warpVote(buckets[probingindex] == candidateKey, SPLIT_VOTING_MASK) & submask; + const auto emptyFound = + split::s_warpVote(buckets[probingindex] == EMPTYBUCKET, SPLIT_VOTING_MASK) & submask; + // If we encountered empty and the key is not in the range of this warp that means the key is not in hashmap. 
+ if (!maskExists && emptyFound) { + return; + } + if (maskExists) { + int winner = split::s_findFirstSig(maskExists) - 1; + winner -= (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == winner) { + split::s_atomicExch(&buckets[probingindex], TOMBSTONE); + localCount++; + } + break; + } + } + + // Update tombstone counter + __syncthreads(); + // Per warp reduction + int warpTotals = warpReduce(localCount); + __syncthreads(); + + // Store to shmem minding Bank Conflicts + if (proper_w_tid == 0) { + // Write the count to the same place + deleteMask[(blockWid)] = warpTotals; + } + + __syncthreads(); + // First warp in block reductions + if (blockWid == 0) { + int blockTotal = warpReduce(deleteMask[(proper_w_tid)]); + // First thread updates fill and overlfow (1 update per block) + if (proper_w_tid == 0) { + split::s_atomicAdd(d_tombstoneCounter, blockTotal); + } + } + return; +} + +/* + * Resets all elements pointed by src to EMPTY in dst + * If an elements in src is not found this will assert(false) + * */ +template ::max(), + class HashFunction = HashFunctions::Fibonacci, int WARPSIZE = defaults::WARPSIZE, + int elementsPerWarp> +__global__ void reset_to_empty_set(KEY_TYPE* src, KEY_TYPE* dst,const int sizePower, size_t maxoverflow, size_t len) + + +{ + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + + // Early quit if we have more warps than elements to insert + if (wid >= len) { + return; + } + + uint64_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint64_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_AMD(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidate = src[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidate, sizePower); + uint64_t vWarpDone = 0; // state of virtual warp + + for (size_t i = 0; i < (1 << sizePower); i += VIRTUALWARP) { + + // Check if this virtual warp is done. 
+ if (vWarpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = dst[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == candidate, submask) & submask; + + while (mask && !vWarpDone) { + int winner = split::s_findFirstSig(mask) - 1; + int sub_winner = winner - (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == sub_winner) { + dst[probingindex] = EMPTYBUCKET; + vWarpDone = 1; + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + vWarpDone = split::s_warpVote(vWarpDone > 0, submask) & submask; + mask ^= (1UL << winner); + } + } + return; +} + } // namespace Hashers } // namespace Hashinator diff --git a/include/hashinator/kernels_NVIDIA.h b/include/hashinator/kernels_NVIDIA.h index 4f009c5..7d9e256 100644 --- a/include/hashinator/kernels_NVIDIA.h +++ b/include/hashinator/kernels_NVIDIA.h @@ -208,7 +208,7 @@ __global__ void insert_kernel(hash_pair* src, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidate.second); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -252,7 +252,7 @@ __global__ void insert_kernel(hash_pair* src, hash_pair(addMask[(proper_w_tid)]); // First thread updates fill and overlfow (1 update per block) if (proper_w_tid == 0) { - atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); split::s_atomicAdd(d_fill, blockTotal); ; } @@ -368,7 +368,7 @@ __global__ void insert_kernel(KEY_TYPE* keys, VAL_TYPE* vals, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidateVal); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -412,7 +412,7 @@ __global__ void insert_kernel(KEY_TYPE* keys, VAL_TYPE* vals, hash_pair(addMask[(proper_w_tid)]); // First thread updates fill and overlfow (1 update per block) if (proper_w_tid == 0) { - atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); split::s_atomicAdd(d_fill, blockTotal); ; } @@ -528,7 +528,7 @@ __global__ void insert_index_kernel(KEY_TYPE* keys, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidateVal); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -572,7 +572,7 @@ __global__ void insert_index_kernel(KEY_TYPE* keys, hash_pair(addMask[(proper_w_tid)]); // First thread updates fill and overlfow (1 update per block) if (proper_w_tid == 0) { - atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); split::s_atomicAdd(d_fill, blockTotal); ; } @@ -796,5 +796,333 @@ __global__ void retrieve_kernel(hash_pair* src, hash_pair::max()> +__global__ void reset_all_to_empty_set(KEY_TYPE* dst, const size_t len) { + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + // Early exit here + if (tid >= len) { + return; + } + if (dst[tid]!= EMPTYBUCKET) { + dst[tid]= 
EMPTYBUCKET; + } + return; +} + + +/*Warp Synchronous hashing kernel for hashinator's internal use: + * Similar to insert kernel used by Hashinator::hashmap. + * */ +template ::max(), + class HashFunction = HashFunctions::Fibonacci, int WARPSIZE = defaults::WARPSIZE,int elementsPerWarp> +__global__ void insert_set_kernel(KEY_TYPE* keys, KEY_TYPE* buckets, int sizePower,size_t maxoverflow, size_t* d_overflow, + size_t* d_fill, size_t len, status* err) { + + __shared__ uint32_t addMask[WARPSIZE]; + __shared__ uint64_t warpOverflow[WARPSIZE]; + + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + const size_t proper_w_tid = tid % WARPSIZE; // the proper WID as if we had no Virtual warps + const size_t proper_wid = tid / WARPSIZE; + const size_t blockWid = proper_wid % WARPSIZE; + + // Early quit if we have more warps than elements to insert + if (wid >= len) { + return; + } + + // Zero out shared count; + if (proper_w_tid == 0 && blockWid == 0) { + for (int i = 0; i < WARPSIZE; i++) { + addMask[i] = 0; + warpOverflow[i] = 0; + } + } + __syncthreads(); + + uint32_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint32_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_CUDA(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidateKey = keys[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + uint32_t vWarpDone = 0; // state of virtual warp + uint32_t localCount = 0; + uint64_t threadOverflow = 0; + + for (size_t i = 0; i < (1 << sizePower); i += VIRTUALWARP) { + + // Check if this virtual warp is done. + if (vWarpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = buckets[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == EMPTYBUCKET, submask); + + // Check if this elements already exists + auto already_exists = split::s_warpVote(target == candidateKey, submask); + if (already_exists) {vWarpDone=1;} + + + // If any duplicate was there now is the time for the whole Virtual warp to find out! + vWarpDone = split::s_warpVoteAny(vWarpDone, submask); + + while (mask && !vWarpDone) { + int winner = split::s_findFirstSig(mask) - 1; + int sub_winner = winner - (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == sub_winner) { + KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex], EMPTYBUCKET, candidateKey); + if (old == EMPTYBUCKET) { + //Key added + threadOverflow = std::min(i + w_tid, static_cast(1 << sizePower)) + 1; + vWarpDone = 1; + localCount++; + } else if (old == candidateKey) { + // Parallel stuff are fun. Major edge case! 
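+ // The CAS returned our own key: another virtual warp inserted it between our vote and the CAS, so treat it as already present.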
+ vWarpDone = 1; + } + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + vWarpDone = split::s_warpVoteAny(vWarpDone, submask); + mask ^= (1UL << winner); + } + } + + /* + Update fill and overflow in 2 steps: + Step 1--> First thread per warp reduces the total elements added (per Warp) + Step 2--> Reduce the blockTotal from the warpTotals but do it in registers using the first warp in the block + */ + + // Per warp reduction + __syncwarp(); + int warpTotals = warpReduce(localCount); + uint64_t perWarpOverflow = warpReduceMax(threadOverflow); + __syncwarp(); + + // Store to shmem minding Bank Conflicts + if (proper_w_tid == 0) { + // Write the count to the same place + addMask[(blockWid)] = warpTotals; + warpOverflow[(blockWid)] = perWarpOverflow; + } + + __syncthreads(); + // First warp in block reductions + if (blockWid == 0) { + uint64_t blockOverflow = warpReduceMax(warpOverflow[(proper_w_tid)]); + int blockTotal = warpReduce(addMask[(proper_w_tid)]); + // First thread updates fill and overlfow (1 update per block) + if (proper_w_tid == 0) { + atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); + split::s_atomicAdd(d_fill, blockTotal); + ; + } + } + + // Make sure everyone actually made it otherwise raise the error flag. + if (split::s_warpVote(vWarpDone, SPLIT_VOTING_MASK) != __activemask()) { + split::s_atomicExch((uint32_t*)err, (uint32_t)status::fail); + } + return; +} + +/* + * In a similar way to the insert and retrieve kernels we + * delete keys in "keys" if they do exist in the set. + * If the keys do not exist we do nothing. + * */ +template ::max(), + KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1, class HashFunction = HashFunctions::Fibonacci, + int WARPSIZE = defaults::WARPSIZE, int elementsPerWarp> +__global__ void delete_set_kernel(KEY_TYPE* keys, KEY_TYPE* buckets, size_t* d_tombstoneCounter, + int sizePower, size_t maxoverflow, size_t len) { + + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + const size_t proper_w_tid = tid % WARPSIZE; // the proper WID as if we had no Virtual warps + const size_t proper_wid = tid / WARPSIZE; + const size_t blockWid = proper_wid % WARPSIZE; + + __shared__ uint32_t deleteMask[WARPSIZE]; + // Early quit if we have more warps than elements to handle + if (wid >= len) { + return; + } + + // Zero out shmem; + if (proper_w_tid == 0 && blockWid == 0) { + for (int i = 0; i < WARPSIZE; i++) { + deleteMask[i] = 0; + } + } + __syncthreads(); + + uint32_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint32_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_CUDA(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidateKey = keys[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + uint32_t localCount = 0; + uint32_t vWarpDone = 0; // state of virtual warp + + for (size_t i = 0; i < maxoverflow; i += VIRTUALWARP) { + if (vWarpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + const auto maskExists = split::s_warpVote(buckets[probingindex] == candidateKey, submask); + 
const auto emptyFound = split::s_warpVote(buckets[probingindex] == EMPTYBUCKET, submask); + // If we encountered empty and the key is not in the range of this warp that means the key is not in hashmap. + if (!maskExists && emptyFound) { + vWarpDone = 1; + } + if (maskExists) { + int winner = split::s_findFirstSig(maskExists) - 1; + winner -= (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == winner) { + split::s_atomicExch(&buckets[probingindex], TOMBSTONE); + localCount++; + vWarpDone = 1; + } + } + vWarpDone = split::s_warpVoteAny(vWarpDone, submask); + } + + /* + Update tombstone counter and overflow in 2 steps: + Step 1--> First thread per warp reduces the total deleted elements (per Warp) + Step 2--> Reduce the blockTotal from the warpTotals but do it in registers using the first warp in the block + */ + + // Per warp reduction + __syncwarp(); + int warpTotals = warpReduce(localCount); + __syncwarp(); + + // Store to shmem minding Bank Conflicts + if (proper_w_tid == 0) { + // Write the count to the same place + deleteMask[(blockWid)] = warpTotals; + } + + __syncthreads(); + // First warp in block reductions + if (blockWid == 0) { + int blockTotal = warpReduce(deleteMask[(proper_w_tid)]); + // First thread updates the tombstone counter (1 update per block) + if (proper_w_tid == 0) { + split::s_atomicAdd(d_tombstoneCounter, blockTotal); + } + } + return; +} + +/* + * Resets all elements pointed by src to EMPTY in dst + * If an elements in src is not found this will assert(false) + * */ +template ::max(), + class HashFunction = HashFunctions::Fibonacci, int WARPSIZE = defaults::WARPSIZE, + int elementsPerWarp> +__global__ void reset_to_empty_set(KEY_TYPE* src, KEY_TYPE* dst,const int sizePower, size_t maxoverflow, size_t len) + +{ + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + + // Early quit if we have more warps than elements to insert + if (wid >= len) { + return; + } + + uint32_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint32_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_CUDA(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidate = src[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidate, sizePower); + uint32_t vWarpDone = 0; // state of virtual warp + + for (size_t i = 0; i < (1 << sizePower); i += VIRTUALWARP) { + + // Check if this virtual warp is done. 
+ if (vWarpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = dst[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == candidate, submask); + + while (mask && !vWarpDone) { + int winner = split::s_findFirstSig(mask) - 1; + int sub_winner = winner - (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == sub_winner) { + dst[probingindex] = EMPTYBUCKET; + vWarpDone = 1; + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + vWarpDone = split::s_warpVoteAny(vWarpDone, submask); + mask ^= (1UL << winner); + } + } + + if (split::s_warpVote(vWarpDone, SPLIT_VOTING_MASK) != __activemask()) { + assert(0); + } + return; +} + } // namespace Hashers } // namespace Hashinator diff --git a/include/hashinator/unordered_set/unordered_set.h b/include/hashinator/unordered_set/unordered_set.h new file mode 100644 index 0000000..5669f43 --- /dev/null +++ b/include/hashinator/unordered_set/unordered_set.h @@ -0,0 +1,1260 @@ +#pragma once +/* File: unordered_set.h + * Authors: Kostis Papadakis, Urs Ganse and Markus Battarbee (2023) + * + * This file defines the following classes: + * --Hashinator::Unordered_Set; + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * */ +#pragma once +#ifdef HASHINATOR_CPU_ONLY_MODE +#define SPLIT_CPU_ONLY_MODE +#endif +#include "../../common.h" +#include "../../splitvector/gpu_wrappers.h" +#include "../../splitvector/split_allocators.h" +#include "../../splitvector/splitvec.h" +#include "../defaults.h" +#include "../hash_pair.h" +#include "../hashfunctions.h" +#include +#include +#ifndef HASHINATOR_CPU_ONLY_MODE +#include "../../splitvector/split_tools.h" +#include "../hashers.h" +#endif +#define UNUSED(x) (void)(x) +namespace Hashinator { + +#ifndef HASHINATOR_CPU_ONLY_MODE +template +using DefaultMetaAllocator = split::split_unified_allocator; +#define DefaultHasher \ + Hashers::Hasher +#else +template +using DefaultMetaAllocator = split::split_host_allocator; +#define DefaultHasher void +#endif + +typedef struct Info { + Info(){}; + Info(int sz) + : sizePower(sz), fill(0), currentMaxBucketOverflow(defaults::BUCKET_OVERFLOW), tombstoneCounter(0), + err(status::invalid) {} + int sizePower; + size_t fill; + size_t currentMaxBucketOverflow; + size_t tombstoneCounter; + status err; +} SetInfo; + +template ::max(), + KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1, class HashFunction = HashFunctions::Fibonacci, + class DeviceHasher = DefaultHasher, class Meta_Allocator = DefaultMetaAllocator> + +class Unordered_Set { + + // members +private: + SetInfo* _setInfo; + split::SplitVector buckets; + Meta_Allocator _metaAllocator; + Unordered_Set* device_set; + + HASHINATOR_HOSTDEVICE + uint32_t hash(KEY_TYPE in) const { + static_assert(std::is_arithmetic::value); + return HashFunction::_hash(in, _setInfo->sizePower); + } + + HASHINATOR_HOSTDEVICE + inline void set_status(status code) noexcept { _setInfo->err = code; } + + void addKey(KEY_TYPE key) noexcept { + int bitMask = (1 << _setInfo->sizePower) - 1; + auto hashIndex = hash(key); + // Try to find the matching bucket. 
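+ // Linear probing: skip tombstones and claim the first empty bucket within the allowed overflow window; if none is free, grow the table and retry.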
+ for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + if (candidate == EMPTYBUCKET) { + candidate = key; + _setInfo->fill++; + return; + } + if (candidate == TOMBSTONE) { + continue; + } + } + rehash(_setInfo->sizePower + 1); + return addKey(key); + } + + void preallocate_device_handles() { + #ifndef HASHINATOR_CPU_ONLY_MODE + SPLIT_CHECK_ERR(split_gpuMalloc((void**)&device_set, sizeof(Unordered_Set))); + #endif + } + + // Deallocates the bookeepping info and the device pointer + void deallocate_device_handles() { + if (device_set == nullptr) { + return; + } + #ifndef HASHINATOR_CPU_ONLY_MODE + SPLIT_CHECK_ERR(split_gpuFree(device_set)); + device_set= nullptr; + #endif + } + +public: + // Constructors Destructors and = Operators with move/cpy semantics + Unordered_Set(uint32_t sizePower = 5) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = SetInfo(sizePower); + buckets = split::SplitVector(1 << _setInfo->sizePower, EMPTYBUCKET); + } + + Unordered_Set(const Unordered_Set& other) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = *other._setInfo; + buckets = other.buckets; + } + + Unordered_Set(const std::initializer_list& list) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = SetInfo(5); + buckets = split::SplitVector(1 << _setInfo->sizePower, EMPTYBUCKET); + for (size_t i = 0; i < list.size(); i++) { + insert(list.begin()[i]); + } + } + + Unordered_Set(const std::vector& vec) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = SetInfo(5); + buckets = split::SplitVector(1 << _setInfo->sizePower, EMPTYBUCKET); + for (size_t i = 0; i < vec.size(); i++) { + insert(vec.begin()[i]); + } + } + + Unordered_Set(std::initializer_list&& list) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = SetInfo(5); + buckets = split::SplitVector(1 << _setInfo->sizePower, EMPTYBUCKET); + for (size_t i = 0; i < list.size(); i++) { + insert(list.begin()[i]); + } + } + + Unordered_Set(Unordered_Set&& other) noexcept { + preallocate_device_handles(); + *_setInfo = other.SetInfo; + other._setInfo = nullptr; + buckets = std::move(other.buckets); + } + + ~Unordered_Set() { + deallocate_device_handles(); + _metaAllocator.deallocate(_setInfo, 1); + } + + Unordered_Set& operator=(const Unordered_Set& other) { + if (this == &other) { + return *this; + } + *_setInfo = *(other._setInfo); + buckets = other.buckets; + return *this; + } + + Unordered_Set& operator=(Unordered_Set&& other) noexcept { + if (this == &other) { + return *this; + } + _metaAllocator.deallocate(_setInfo, 1); + _setInfo = other._setInfo; + other._setInfo = nullptr; + buckets = std::move(other.buckets); + return *this; + } + + HASHINATOR_HOSTDEVICE + inline status peek_status(void) noexcept { + status retval = _setInfo->err; + _setInfo->err = status::invalid; + return retval; + } + +#ifdef HASHINATOR_CPU_ONLY_MODE + void* operator new(size_t len) { + void* ptr = (void*)malloc(len); + return ptr; + } + + void operator delete(void* ptr) { free(ptr); } + + void* operator new[](size_t len) { + void* ptr = (void*)malloc(len); + return ptr; + } + + void operator delete[](void* ptr) { free(ptr); } + +#else + void* operator new(size_t len) { + void* ptr; + SPLIT_CHECK_ERR(split_gpuMallocManaged(&ptr, len)); + return ptr; + } + + void operator delete(void* ptr) { SPLIT_CHECK_ERR(split_gpuFree(ptr)); 
} + + void* operator new[](size_t len) { + void* ptr; + SPLIT_CHECK_ERR(split_gpuMallocManaged(&ptr, len)); + return ptr; + } + + void operator delete[](void* ptr) { split_gpuFree(ptr); } + + void copyMetadata(SetInfo* dst, split_gpuStream_t s = 0) { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(dst, _setInfo, sizeof(SetInfo), split_gpuMemcpyDeviceToHost, s)); + } + +#endif + +#ifndef HASHINATOR_CPU_ONLY_MODE + Unordered_Set* upload(split_gpuStream_t stream = 0) { + optimizeGPU(stream); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_set, this, sizeof(Unordered_Set), split_gpuMemcpyHostToDevice, stream)); + return device_set; + } + + void download(split_gpuStream_t stream = 0) { + // Copy over fill as it might have changed + optimizeCPU(stream); + if (_setInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) { + rehash(_setInfo->sizePower + 1); + } else { + if (tombstone_count() > 0) { + clean_tombstones(stream); + } + } + } + + void optimizeGPU(split_gpuStream_t stream = 0) noexcept { + int device; + SPLIT_CHECK_ERR(split_gpuGetDevice(&device)); + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_setInfo, sizeof(SetInfo), device, stream)); + buckets.optimizeGPU(stream); + } + + /*Manually prefetch data on Host*/ + void optimizeCPU(split_gpuStream_t stream = 0) noexcept { + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_setInfo, sizeof(SetInfo), split_gpuCpuDeviceId, stream)); + buckets.optimizeCPU(stream); + } + +#endif + + void rehash(uint32_t newSizePower) { + if (newSizePower > 32) { + throw std::out_of_range("Hashmap ran into rehashing catastrophe and exceeded 32bit buckets."); + } + split::SplitVector newBuckets(1 << newSizePower, EMPTYBUCKET); + _setInfo->sizePower = newSizePower; + int bitMask = (1 << _setInfo->sizePower) - 1; // For efficient modulo of the array size + + // Iterate through all old elements and rehash them into the new array. + for (auto& e : buckets) { + // Skip empty buckets ; We also check for TOMBSTONE elements + // as we might be coming off a kernel that overflew the hashmap + if (e == EMPTYBUCKET || e == TOMBSTONE) { + continue; + } + + uint32_t newHash = hash(e); + bool found = false; + for (int i = 0; i < Hashinator::defaults::BUCKET_OVERFLOW; i++) { + KEY_TYPE& candidate = newBuckets[(newHash + i) & bitMask]; + if (candidate == EMPTYBUCKET) { + // Found an empty bucket, assign that one. + candidate = e; + found = true; + break; + } + } + + if (!found) { + // Having arrived here means that we unsuccessfully rehashed and + // are *still* overflowing our buckets. So we need to try again with a bigger one. 
+ return rehash(newSizePower + 1); + } + } + + // Replace our buckets with the new ones + buckets = newBuckets; + _setInfo->currentMaxBucketOverflow = Hashinator::defaults::BUCKET_OVERFLOW; + _setInfo->tombstoneCounter = 0; + } + + +#ifndef HASHINATOR_CPU_ONLY_MODE + template + size_t extractPattern(KEY_TYPE* elements, Rule rule, split_gpuStream_t s = 0) { + // Figure out Blocks to use + size_t _s = std::ceil((float(buckets.size())) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + nBlocks+=(nBlocks==0); + + // Allocate with Mempool + const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); + split::tools::Cuda_mempool mPool(memory_for_pool, s); + size_t retval = + split::tools::copy_if_raw( + buckets, elements, rule, nBlocks, mPool, s); + return retval; + } + + size_t extractAllKeys(split::SplitVector& elements, split_gpuStream_t s = 0, bool prefetches = true) { + // Extract all keys + if (prefetches){ + elements.optimizeGPU(s); + } + auto rule = [] __host__ __device__(const KEY_TYPE& kval) -> bool { + return kval != EMPTYBUCKET && kval!= TOMBSTONE; + }; + return extractPattern(elements.data(), rule, s); + } + + void device_rehash(int newSizePower, split_gpuStream_t s = 0) { + if (newSizePower > 32) { + throw std::out_of_range("Hashmap ran into rehashing catastrophe and exceeded 32bit buckets."); + } + + size_t priorFill = _setInfo->fill; + // Extract all valid elements + KEY_TYPE* validElements; + SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&validElements, + (_setInfo->fill + 1) * sizeof(KEY_TYPE), s)); + optimizeGPU(s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + + auto isValidKey = [] __host__ __device__(KEY_TYPE& element) { + return ( (element !=TOMBSTONE) && (element!=EMPTYBUCKET) ); + }; + + uint32_t nValidElements = extractPattern(validElements, isValidKey, s); + + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + assert(nValidElements == _setInfo->fill && "Something really bad happened during rehashing! Ask Kostis!"); + // We can now clear our buckets + // Easy optimization: If our bucket had no valid elements and the same size was requested + // we can just clear it + if (newSizePower == _setInfo->sizePower && nValidElements == 0) { + clear(targets::device, s, true); + set_status((priorFill == _setInfo->fill) ? status::success : status::fail); + split_gpuFreeAsync(validElements, s); + return; + } + optimizeCPU(s); + buckets = std::move(split::SplitVector(1 << newSizePower, KEY_TYPE(EMPTYBUCKET))); + optimizeGPU(s); + *_setInfo = SetInfo(newSizePower); + // Insert valid elements to now larger buckets + insert(validElements, nValidElements, 1, s); + set_status((priorFill == _setInfo->fill) ? status::success : status::fail); + split_gpuFreeAsync(validElements, s); + return; + } + + + void clean_tombstones(split_gpuStream_t s = 0, bool prefetches = false) { + + if (_setInfo->tombstoneCounter == 0) { + return; + } + + // Reset the tomstone counter + _setInfo->tombstoneCounter = 0; + // Allocate memory for overflown elements. 
So far this is the same size as our buckets but we can be better than + // this + + KEY_TYPE* overflownElements; + SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&overflownElements, + (1 << _setInfo->sizePower) * sizeof(KEY_TYPE), s)); + + if (prefetches) { + optimizeGPU(s); + } + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + + int currentSizePower = _setInfo->sizePower; + KEY_TYPE* bck_ptr = buckets.data(); + + auto isOverflown = [bck_ptr, currentSizePower] __host__ __device__(KEY_TYPE & element)->bool { + if (element == TOMBSTONE) { + element = EMPTYBUCKET; + return false; + } + if (element == EMPTYBUCKET) { + return false; + } + const size_t hashIndex = HashFunction::_hash(element, currentSizePower); + const int bitMask = (1 << (currentSizePower)) - 1; + bool isOverflown = (bck_ptr[hashIndex & bitMask] != element); + return isOverflown; + }; + + // Extract overflown elements and reset overflow + uint32_t nOverflownElements = extractPattern(overflownElements, isOverflown, s); + _setInfo->currentMaxBucketOverflow = defaults::BUCKET_OVERFLOW; + + if (nOverflownElements == 0) { + SPLIT_CHECK_ERR(split_gpuFreeAsync(overflownElements, s)); + return; + } + // If we do have overflown elements we put them back in the buckets + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + DeviceHasher::reset_set(overflownElements, buckets.data(), _setInfo->sizePower, _setInfo->currentMaxBucketOverflow, + nOverflownElements, s); + _setInfo->fill -= nOverflownElements; + DeviceHasher::insert_set(overflownElements, buckets.data(), _setInfo->sizePower, _setInfo->currentMaxBucketOverflow, + &_setInfo->currentMaxBucketOverflow, &_setInfo->fill, nOverflownElements, &_setInfo->err, s); + + SPLIT_CHECK_ERR(split_gpuFreeAsync(overflownElements, s)); + return; + } +#else + void clean_tombstones() { + rehash(); + } +#endif + +#ifdef HASHINATOR_CPU_ONLY_MODE + // Try to get the overflow back to the original one + void performCleanupTasks() { + while (_setInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) { + rehash(_setInfo->sizePower + 1); + } + // When operating in CPU only mode we rehash to get rid of tombstones + if (tombstone_ratio() > 0.025) { + rehash(_setInfo->sizePower); + } + } +#else + // Try to get the overflow back to the original one + void performCleanupTasks(split_gpuStream_t s = 0) { + while (_setInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) { + device_rehash(_setInfo->sizePower + 1, s); + } + if (tombstone_ratio() > 0.025) { + clean_tombstones(s); + } + } + +#endif + + void rehash() { rehash(_setInfo->sizePower); } + + // Iterators + class iterator { + Unordered_Set* set; + size_t index; + + public: + iterator(Unordered_Set& set, size_t index) : set(&set), index(index) {} + + iterator& operator++() { + index++; + while (index < set->buckets.size()) { + if (set->buckets[index] != EMPTYBUCKET && set->buckets[index] != TOMBSTONE) { + break; + } + index++; + } + return *this; + } + + iterator operator++(int) { // Postfix version + iterator temp = *this; + ++(*this); + return temp; + } + bool operator==(iterator other) const { return &set->buckets[index] == &other.set->buckets[other.index]; } + bool operator!=(iterator other) const { return &set->buckets[index] != &other.set->buckets[other.index]; } + KEY_TYPE& operator*() const { return set->buckets[index]; } + KEY_TYPE* operator->() const { return &set->buckets[index]; } + size_t getIndex() { return index; } + }; + + // Const iterator. 
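+ // Same traversal as iterator above but yields const references; both skip EMPTYBUCKET and TOMBSTONE slots.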
+ class const_iterator { + const Unordered_Set* set; + size_t index; + + public: + explicit const_iterator(const Unordered_Set& set, size_t index) : set(&set), index(index) {} + const_iterator& operator++() { + index++; + while (index < set->buckets.size()) { + if (set->buckets[index] != EMPTYBUCKET && set->buckets[index] != TOMBSTONE) { + break; + } + index++; + } + return *this; + } + const_iterator operator++(int) { // Postfix version + const_iterator temp = *this; + ++(*this); + return temp; + } + bool operator==(const_iterator other) const { return &set->buckets[index] == &other.set->buckets[other.index]; } + bool operator!=(const_iterator other) const { return &set->buckets[index] != &other.set->buckets[other.index]; } + const KEY_TYPE& operator*() const { return set->buckets[index]; } + const KEY_TYPE* operator->() const { return &set->buckets[index]; } + size_t getIndex() { return index; } + }; + + iterator begin() { + for (size_t i = 0; i < buckets.size(); i++) { + if (buckets[i] != EMPTYBUCKET && buckets[i] != TOMBSTONE) { + return iterator(*this, i); + } + } + return end(); + } + + const_iterator begin() const { + for (size_t i = 0; i < buckets.size(); i++) { + if (buckets[i] != EMPTYBUCKET && buckets[i] != TOMBSTONE) { + return const_iterator(*this, i); + } + } + return end(); + } + + iterator end() { return iterator(*this, buckets.size()); } + + const_iterator end() const { return const_iterator(*this, buckets.size()); } + + +#ifndef HASHINATOR_CPU_ONLY_MODE + // Device Iterator type. Iterates through all non-empty buckets. + class device_iterator { + private: + size_t index; + Unordered_Set* set; + + public: + HASHINATOR_DEVICEONLY + device_iterator(Unordered_Set& set, size_t index) : index(index), set(&set) {} + + HASHINATOR_DEVICEONLY + size_t getIndex() { return index; } + + HASHINATOR_DEVICEONLY + device_iterator& operator++() { + index++; + while (index < set->buckets.size()) { + if (set->buckets[index] != EMPTYBUCKET && set->buckets[index] != TOMBSTONE) { + break; + } + index++; + } + return *this; + } + + HASHINATOR_DEVICEONLY + device_iterator operator++(int) { + device_iterator temp = *this; + ++(*this); + return temp; + } + + HASHINATOR_DEVICEONLY + bool operator==(device_iterator other) const { return &set->buckets[index] == &other.set->buckets[other.index]; } + HASHINATOR_DEVICEONLY + bool operator!=(device_iterator other) const { return &set->buckets[index] != &other.set->buckets[other.index]; } + + HASHINATOR_DEVICEONLY + KEY_TYPE& operator*() const { return set->buckets[index]; } + HASHINATOR_DEVICEONLY + KEY_TYPE* operator->() const { return &set->buckets[index]; } + }; + + class const_device_iterator { + private: + size_t index; + const Unordered_Set* set; + + public: + HASHINATOR_DEVICEONLY + explicit const_device_iterator(const Unordered_Set& set, size_t index) : index(index), set(&set) {} + + HASHINATOR_DEVICEONLY + size_t getIndex() { return index; } + + HASHINATOR_DEVICEONLY + const_device_iterator& operator++() { + index++; + while (index < set->buckets.size()) { + if (set->buckets[index] != EMPTYBUCKET && set->buckets[index] != TOMBSTONE) { + break; + } + index++; + } + return *this; + } + + HASHINATOR_DEVICEONLY + const_device_iterator operator++(int) { + const_device_iterator temp = *this; + ++(*this); + return temp; + } + + HASHINATOR_DEVICEONLY + bool operator==(const_device_iterator other) const { + return &set->buckets[index] == &other.set->buckets[other.index]; + } + HASHINATOR_DEVICEONLY + bool operator!=(const_device_iterator other) const { + 
return &set->buckets[index] != &other.set->buckets[other.index]; + } + + HASHINATOR_DEVICEONLY + const KEY_TYPE& operator*() const { return set->buckets[index]; } + HASHINATOR_DEVICEONLY + const KEY_TYPE* operator->() const { return &set->buckets[index]; } + }; + + HASHINATOR_DEVICEONLY + device_iterator device_end() { return device_iterator(*this, buckets.size()); } + + HASHINATOR_DEVICEONLY + const_device_iterator device_end() const { return const_device_iterator(*this, buckets.size()); } + + HASHINATOR_DEVICEONLY + device_iterator device_begin() { + for (size_t i = 0; i < buckets.size(); i++) { + if (buckets[i] != EMPTYBUCKET && buckets[i] != TOMBSTONE) { + return device_iterator(*this, i); + } + } + return device_end(); + } + + HASHINATOR_DEVICEONLY + const_device_iterator device_begin() const noexcept { + for (size_t i = 0; i < buckets.size(); i++) { + if (buckets[i] != EMPTYBUCKET && buckets[i] != TOMBSTONE) { + return const_device_iterator(*this, i); + } + } + return device_end(); + } + + // Element access by iterator + HASHINATOR_DEVICEONLY + device_iterator device_find(KEY_TYPE key) { + int bitMask = (1 << _setInfo->sizePower) - 1; // For efficient modulo of the array size + auto hashIndex = hash(key); + + // Try to find the matching bucket. + for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + const KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + + if (candidate == TOMBSTONE) { + continue; + } + + if (candidate == key) { + // Found a match, return that + return device_iterator(*this, (hashIndex + i) & bitMask); + } + + if (candidate.first == EMPTYBUCKET) { + // Found an empty bucket. Return empty. + return device_end(); + } + } + // Not found + return device_end(); + } + + HASHINATOR_DEVICEONLY + const const_device_iterator device_find(KEY_TYPE key) const { + int bitMask = (1 << _setInfo->sizePower) - 1; // For efficient modulo of the array size + auto hashIndex = hash(key); + + // Try to find the matching bucket. + for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + const KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + + if (candidate == TOMBSTONE) { + continue; + } + + if (candidate == key) { + // Found a match, return that + return const_device_iterator(*this, (hashIndex + i) & bitMask); + } + + if (candidate == EMPTYBUCKET) { + // Found an empty bucket. Return empty. + return device_end(); + } + } + // Not found + return device_end(); + } + + HASHINATOR_DEVICEONLY + void insert_element(const KEY_TYPE& key, size_t& thread_overflowLookup) { + int bitMask = (1 << _setInfo->sizePower) - 1; // For efficient modulo of the array size + auto hashIndex = hash(key); + size_t i = 0; + while (i < buckets.size()) { + uint32_t vecindex = (hashIndex + i) & bitMask; + KEY_TYPE old = split::s_atomicCAS(&buckets[vecindex], EMPTYBUCKET, key); + // Key does not exist so we create it and incerement fill + if (old == EMPTYBUCKET) { + split::s_atomicAdd((unsigned int*)(&_setInfo->fill), 1); + thread_overflowLookup = i + 1; + return; + } + // Key exists so we overwrite it. 
Fill stays the same + if (old == key) { + thread_overflowLookup = i + 1; + return; + } + i++; + } + assert(false && "Hashmap completely overflown"); + } + + HASHINATOR_DEVICEONLY + hash_pair device_insert(KEY_TYPE newEntry) { + bool found = device_find(newEntry) != device_end(); + if (!found) { + add_element(newEntry); + } + return hash_pair(device_find(newEntry.first), !found); + } + + HASHINATOR_DEVICEONLY + void add_element(const KEY_TYPE& key) { + size_t thread_overflowLookup = 0; + insert_element(key, thread_overflowLookup); + atomicMax((unsigned long long*)&(_setInfo->currentMaxBucketOverflow), + nextOverflow(thread_overflowLookup, defaults::WARPSIZE / defaults::elementsPerWarp)); + } +#endif + + void print_pair(const KEY_TYPE& i) const noexcept { + size_t currentSizePower = _setInfo->sizePower; + const size_t hashIndex = HashFunction::_hash(i, currentSizePower); + const int bitMask = (1 << (currentSizePower)) - 1; + size_t optimalIndex = hashIndex & bitMask; + const_iterator it = find(i); + int64_t overflow = llabs(it.getIndex() - optimalIndex); + if (i == TOMBSTONE) { + std::cout << "[â•€] "; + } else if (i == EMPTYBUCKET) { + std::cout << "[â–¢] "; + } else { + if (overflow > 0) { + printf("[%d,\033[1;31m%li\033[0m] ", i, overflow); + } else { + printf("[%d,%zu] ", i, overflow); + } + } + } + + void dump_buckets() const noexcept { + printf("Hashinator Stats \n"); + printf("Fill= %zu, LoadFactor=%f \n", _setInfo->fill, load_factor()); + printf("Tombstones= %zu\n", _setInfo->tombstoneCounter); + for (size_t i = 0; i < buckets.size(); ++i) { + print_pair(buckets[i]); + } + printf("\n"); + } + + HASHINATOR_HOSTDEVICE + void stats() const noexcept{ + printf("Hashinator Stats \n"); + printf("Bucket size= %lu\n", buckets.size()); + printf("Fill= %lu, LoadFactor=%f \n", _setInfo->fill, load_factor()); + printf("Tombstones= %lu\n", _setInfo->tombstoneCounter); + printf("Overflow= %lu\n", _setInfo->currentMaxBucketOverflow); + } + + HASHINATOR_HOSTDEVICE + inline int getSizePower(void) const noexcept { return _setInfo->sizePower; } + + // For STL compatibility: size(), bucket_count(), count(KEY_TYPE), clear() + HASHINATOR_HOSTDEVICE + size_t size() const noexcept { return _setInfo->fill; } + + HASHINATOR_HOSTDEVICE + size_t bucket_count() const noexcept { return buckets.size(); } + + HASHINATOR_HOSTDEVICE + float load_factor() const noexcept { return (float)size() / bucket_count(); } + + HASHINATOR_HOSTDEVICE + size_t tombstone_count() const noexcept { return _setInfo->tombstoneCounter; } + + HASHINATOR_HOSTDEVICE + float tombstone_ratio() const noexcept { + if (tombstone_count() == 0) { + return 0.0; + } + return (float)_setInfo->tombstoneCounter / (float)buckets.size(); + } + + bool contains(const KEY_TYPE& key) const noexcept { return (find(key) != end()) ; } + + bool empty() const noexcept { return begin() == end(); } + + size_t count(const KEY_TYPE& key) const noexcept { return contains(key) ? 
1 : 0; } + +#ifdef HASHINATOR_CPU_ONLY_MODE + void clear(targets t= targets::host){ + UNUSED(t); + buckets = split::SplitVector(1 << _setInfo->sizePower, {EMPTYBUCKET}); + *_setInfo = SetInfo(_setInfo->sizePower); + return; + } +#else + void clear(targets t = targets::host, split_gpuStream_t s = 0, bool prefetches = true) { + switch (t) { + case targets::host: + buckets = split::SplitVector(1 << _setInfo->sizePower, {EMPTYBUCKET}); + *_setInfo = SetInfo(_setInfo->sizePower); + break; + case targets::device: + if (prefetches) { + buckets.optimizeGPU(s); + } + DeviceHasher::reset_all_set(buckets.data(), buckets.size(), s); + _setInfo->fill = 0; + set_status((_setInfo->fill == 0) ? success : fail); + break; + default: + clear(targets::host); + break; + } + return; + } +#endif + + + iterator find(const KEY_TYPE& key) noexcept { + const int bitMask = (1 << _setInfo->sizePower) - 1; + const auto hashIndex = hash(key); + + for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + const KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + if (candidate == key) { + auto index = (hashIndex + i) & bitMask; + return iterator(*this, index); + } + if (candidate == TOMBSTONE) { + continue; + } + if (candidate == EMPTYBUCKET) { + return end(); + } + } + return end(); + } + + const const_iterator find(const KEY_TYPE& key) const noexcept { + const int bitMask = (1 << _setInfo->sizePower) - 1; + const auto hashIndex = hash(key); + + for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + const KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + if (candidate == key) { + auto index = (hashIndex + i) & bitMask; + return const_iterator(*this, index); + } + if (candidate == TOMBSTONE) { + continue; + } + if (candidate == EMPTYBUCKET) { + return end(); + } + } + return end(); + } + + hash_pair insert(const KEY_TYPE& key) noexcept { + // try to find key + performCleanupTasks(); + iterator it = find(key); + + // if the key already exists we mutate it + if (it != end()) { + *it = key; + return {it, it != end()}; + } + // otherwise we add it + addKey(key); + iterator retval = find(key); + return {retval, retval != end()}; + } + + hash_pair insert(KEY_TYPE&& key) noexcept { + // try to find key + iterator it = find(key); + + // if the key already exists we mutate it + if (it != end()) { + *it = key; + return {it, it != end()}; + } + // otherwise we add it + addKey(key); + iterator retval = find(key); + return {retval, retval != end()}; + } + + iterator erase(iterator pos) { + auto index = pos.getIndex(); + assert(index ( 1 << _setInfo->sizePower )); + KEY_TYPE& key = buckets[index]; + if (key != EMPTYBUCKET && key != TOMBSTONE) { + key = TOMBSTONE; + _setInfo->fill--; + _setInfo->tombstoneCounter++; + } + return ++pos; // return next valid element; + } + + iterator erase(const_iterator pos) { + auto index = pos.getIndex(); + assert(index < 1 << _setInfo->sizePower); + KEY_TYPE& key = buckets[index]; + if (key != EMPTYBUCKET && key != TOMBSTONE) { + key = TOMBSTONE; + _setInfo->fill--; + _setInfo->tombstoneCounter++; + } + return ++pos; // return next valid element; + } + + bool erase(const KEY_TYPE& key) { + auto it = find(key); + if (it!=end()){ + erase(it); + return true; + } + return false; + } + + +#ifdef HASHINATOR_CPU_ONLY_MODE + void resize(int newSizePower,targets t = targets::host) { + UNUSED(t); + rehash(newSizePower); + } + + void insert(KEY_TYPE* keys,size_t len,float targetLF = 0.5) { + UNUSED(targetLF); + for (size_t i =0 ; i < len; ++i){ + insert(keys[i]); + + } + } 
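// Hypothetical host-side usage sketch for the members above. Assumptions (not taken
// from this patch): the include path, a constructor taking an initial sizePower, and
// uint32_t with the default sentinel values as the key type. Only member functions
// defined in this class are called.
#include <cassert>
#include <cstdint>
#include "hashinator/unordered_set/unordered_set.h" // assumed include path

inline void unordered_set_host_example() {
   Hashinator::Unordered_Set<uint32_t> s(10); // assumed ctor: 2^10 buckets
   uint32_t keys[4] = {10, 20, 30, 40};
   s.insert(keys, 4);                         // bulk host insert: loops over insert(key)
   assert(s.contains(30) && s.size() == 4);
   s.erase(20);                               // single-key erase, true if the key was found
   assert(!s.contains(20) && s.size() == 3);
   s.clear();                                 // every bucket back to EMPTYBUCKET
}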
+ + void erase(KEY_TYPE* keys,size_t len,float targetLF = 0.5) { + UNUSED(targetLF); + for (size_t i =0 ; i < len; ++i){ + erase(keys[i]); + + } + } +#else + void resize(int newSizePower, targets t = targets::host, split_gpuStream_t s = 0) { + switch (t) { + case targets::host: + rehash(newSizePower); + break; + case targets::device: + device_rehash(newSizePower, s); + break; + default: + resize(newSizePower, targets::host); + break; + } + return; + } + + + void insert(KEY_TYPE* keys,size_t len,float targetLF = 0.5, split_gpuStream_t s = 0, bool prefetches = true) { + // TODO fix these if paths or at least annotate them . + if (len == 0) { + set_status(status::success); + return; + } + if (prefetches) { + buckets.optimizeGPU(s); + } + int64_t neededPowerSize = std::ceil(std::log2((_setInfo->fill + len) * (1.0 / targetLF))); + if (neededPowerSize > _setInfo->sizePower) { + resize(neededPowerSize, targets::device, s); + } + _setInfo->currentMaxBucketOverflow = _setInfo->currentMaxBucketOverflow; + DeviceHasher::insert_set(keys, buckets.data(), _setInfo->sizePower, _setInfo->currentMaxBucketOverflow, + &_setInfo->currentMaxBucketOverflow, &_setInfo->fill, len, &_setInfo->err, s); + return; + } + + // Uses Hasher's erase_kernel to delete elements + void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) { + if (len == 0) { + set_status(status::success); + return; + } + buckets.optimizeGPU(s); + // Remember the last number of tombstones + size_t tbStore = tombstone_count(); + DeviceHasher::erase_set(keys, buckets.data(), &_setInfo->tombstoneCounter, _setInfo->sizePower, + _setInfo->currentMaxBucketOverflow, len, s); + size_t tombstonesAdded = tombstone_count() - tbStore; + // Fill should be decremented by the number of tombstones added; + _setInfo->fill -= tombstonesAdded; + return; + } + +#endif + +#ifndef HASHINATOR_CPU_ONLY_MODE + template + HASHINATOR_DEVICEONLY void warpInsert(const KEY_TYPE& candidateKey, const size_t w_tid) noexcept { + + const int sizePower = _setInfo->sizePower; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + const size_t optimalindex = (hashIndex)&bitMask; + const auto submask = SPLIT_VOTING_MASK; + bool warpDone = false; + uint64_t threadOverflow = 1; + +#ifdef HASHINATOR_DEBUG +// Safety check: make sure everyone has the same key/val and all threads are here. +#ifdef __CUDACC__ + assert(__activemask() == SPLIT_VOTING_MASK && "Tried to warpInsert with part of warp predicated off"); +#endif + KEY_TYPE storeKey = split::s_shuffle(candidateKey, 0, SPLIT_VOTING_MASK); + bool isSafe = (split::s_warpVote(candidateKey == storeKey, SPLIT_VOTING_MASK) == SPLIT_VOTING_MASK); + assert(isSafe && "Tried to warpInsert with different keys in the same warp"); +#endif + + for (size_t i = 0; i < (1 << sizePower); i += defaults::WARPSIZE) { + // Check if this virtual warp is done. 
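         // Probing strategy used below: every lane of the virtual warp inspects one of the
         // consecutive buckets starting at the hashed position, the lanes ballot on empty
         // slots and on an already-present key, and the lowest set bit of the ballot
         // (s_findFirstSig - 1) selects the single lane that attempts the atomicCAS.
         // warpInsert is meant to be called by all lanes with the same candidateKey,
         // each passing its own lane index within the virtual warp as w_tid.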
+ if (warpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = buckets[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == EMPTYBUCKET, submask); + + // Check if this elements already exists + auto already_exists = split::s_warpVote(target == candidateKey, submask); + if (already_exists) { + int winner = split::s_findFirstSig(already_exists) - 1; + if (w_tid == winner) { + warpDone = 1; + } + } + + // If any duplicate was there now is the time for the whole Virtual warp to find out! + warpDone = split::s_warpVote(warpDone > 0, submask) & submask; + + while (mask && !warpDone) { + int winner = split::s_findFirstSig(mask) - 1; + if (w_tid == winner) { + KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex], EMPTYBUCKET, candidateKey); + if (old == EMPTYBUCKET) { + threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1); + warpDone = 1; + split::s_atomicAdd(&_setInfo->fill, 1); + if (threadOverflow > _setInfo->currentMaxBucketOverflow) { + split::s_atomicExch((unsigned long long*)(&_setInfo->currentMaxBucketOverflow), + (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE)); + } + } else if (old == candidateKey) { + warpDone = 1; + } + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + warpDone = split::s_warpVote(warpDone > 0, submask); + mask ^= (1UL << winner); + } + } + } + + template + HASHINATOR_DEVICEONLY bool warpInsert_V(const KEY_TYPE& candidateKey,const size_t w_tid) noexcept { + + const int sizePower = _setInfo->sizePower; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + const size_t optimalindex = (hashIndex)&bitMask; + const auto submask = SPLIT_VOTING_MASK; + bool warpDone = false; + uint64_t threadOverflow = 1; + int localCount = 0; + +#ifdef HASHINATOR_DEBUG +// Safety check: make sure everyone has the same key/val and all threads are here. +#ifdef __CUDACC__ + assert(__activemask() == SPLIT_VOTING_MASK && "Tried to warpInsert_V with part of warp predicated off"); +#endif + KEY_TYPE storeKey = split::s_shuffle(candidateKey, 0, SPLIT_VOTING_MASK); + KEY_TYPE storeVal = split::s_shuffle(candidateVal, 0, SPLIT_VOTING_MASK); + bool isSafe = (split::s_warpVote(candidateKey == storeKey, SPLIT_VOTING_MASDK) == SPLIT_VOTING_MASK; + assert(isSafe && "Tried to warpInsert_V with different keys in the same warp"); +#endif + + for (size_t i = 0; i < (1 << sizePower); i += defaults::WARPSIZE) { + // Check if this virtual warp is done. + if (warpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = buckets[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == EMPTYBUCKET, submask); + + // Check if this elements already exists + auto already_exists = split::s_warpVote(target == candidateKey, submask); + if (already_exists) { + int winner = split::s_findFirstSig(already_exists) - 1; + if (w_tid == winner) { + warpDone = 1; + } + } + + // If any duplicate was there now is the time for the whole Virtual warp to find out! 
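         // The vote below broadcasts any lane's warpDone flag to every lane of the virtual
         // warp so that all lanes leave the probing loop together. Unlike warpInsert, this
         // variant also records localCount on a successful CAS, so the boolean return value
         // reports whether a new key was actually added rather than already present.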
+ warpDone = split::s_warpVote(warpDone > 0, submask) & submask; + + while (mask && !warpDone) { + int winner = split::s_findFirstSig(mask) - 1; + if (w_tid == winner) { + KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex], EMPTYBUCKET, candidateKey); + if (old == EMPTYBUCKET) { + threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1); + warpDone = 1; + localCount = 1; + split::s_atomicAdd(&_setInfo->fill, 1); + if (threadOverflow > _setInfo->currentMaxBucketOverflow) { + split::s_atomicExch((unsigned long long*)(&_setInfo->currentMaxBucketOverflow), + (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE)); + } + } else if (old == candidateKey) { + warpDone = 1; + } + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + warpDone = split::s_warpVote(warpDone > 0, submask); + mask ^= (1UL << winner); + } + } + + auto res = split::s_warpVote(localCount > 0, submask); + return (res > 0); + } + + HASHINATOR_DEVICEONLY + void warpErase(const KEY_TYPE& candidateKey, const size_t w_tid) noexcept { + + const int sizePower = _setInfo->sizePower; + const size_t maxoverflow = _setInfo->currentMaxBucketOverflow; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + const auto submask = SPLIT_VOTING_MASK; + bool warpDone = false; + int winner = 0; + +#ifdef HASHINATOR_DEBUG +// Safety check: make sure everyone has the same key/val and all threads are here. +#ifdef __CUDACC__ + assert(__activemask() == SPLIT_VOTING_MASK && "Tried to warpFind with part of warp predicated off"); +#endif + KEY_TYPE storeKey = split::s_shuffle(candidateKey, 0, SPLIT_VOTING_MASK); + bool isSafe = split::s_warpVote(candidateKey == storeKey, SPLIT_VOTING_MASK) == SPLIT_VOTING_MASK; + assert(isSafe && "Tried to warpFind with different keys/vals in the same warp"); +#endif + + for (size_t i = 0; i < maxoverflow; i += defaults::WARPSIZE) { + + if (warpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + const auto maskExists = + split::s_warpVote(buckets[probingindex] == candidateKey, SPLIT_VOTING_MASK) & submask; + const auto emptyFound = + split::s_warpVote(buckets[probingindex] == EMPTYBUCKET, SPLIT_VOTING_MASK) & submask; + // If we encountered empty and the key is not in the range of this warp that means the key is not in hashmap. 
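         // Erasure never compacts the probe chain: the winning lane overwrites the matching
         // bucket with TOMBSTONE and atomically updates tombstoneCounter and fill, and later
         // lookups simply probe past the tombstone until a later cleanup/rehash reclaims it.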
+ if (!maskExists && emptyFound) { + warpDone = true; + } + if (maskExists) { + winner = split::s_findFirstSig(maskExists) - 1; + if (w_tid == winner) { + buckets[probingindex] = TOMBSTONE; + split::s_atomicAdd(&_setInfo->tombstoneCounter, 1); + split::s_atomicSub((unsigned int*)&_setInfo->fill, 1); + } + warpDone = true; + } + } + return; + } +#endif + +}; // Unordered_Set + +} // namespace Hashinator diff --git a/include/splitvector/devicevec.h b/include/splitvector/devicevec.h new file mode 100644 index 0000000..a543f95 --- /dev/null +++ b/include/splitvector/devicevec.h @@ -0,0 +1,783 @@ +#pragma once +#include "archMacros.h" +#include "gpu_wrappers.h" +#include +#ifdef __NVCC__ +#include +#else +#include +#endif +#define HOSTONLY __host__ +#define DEVICEONLY __device__ +#define HOSTDEVICE __host__ __device__ + +namespace split { + +template > +class DeviceVector { + static_assert(std::is_trivially_copyable::value && "DeviceVector only works for POD types"); + +private: + enum MEMBER { SIZE, CAPACITY }; + + // packed to 1 cache line + struct __attribute__((__packed__)) Meta { + size_t size; + size_t capacity; + char padding[64 - 2 * sizeof(size_t)]={0}; // pad up to cache line + HOSTDEVICE + inline size_t& operator[](MEMBER member) noexcept { + switch (member) { + case MEMBER::SIZE: + return *reinterpret_cast(this); + case MEMBER::CAPACITY: + return *(reinterpret_cast(this) + 1); + default: + assert(false && "Invalid case"); + } + } + }; + + // Members + Meta* _meta = nullptr; + T* _data = nullptr; + Allocator _allocator; + mutable split_gpuStream_t _stream; + + void setupSpace(void* ptr) noexcept { + _meta = reinterpret_cast(ptr); + _data = reinterpret_cast(reinterpret_cast(ptr) + sizeof(Meta)); + } + + [[nodiscard]] void* _allocate(const size_t sz) { + void* _ptr =nullptr; if (_stream == NULL) { + _ptr = _allocator.allocate_raw( sizeof(Meta) + sz*sizeof(T)); + } else { + _ptr = _allocator.allocate_raw( sizeof(Meta) + sz*sizeof(T),_stream); + } + return _ptr; + } + + void _deallocate(void* _ptr) { + if (_ptr == nullptr) { + return; + } + if (_stream == NULL) { + _allocator.deallocate((_ptr)); + } else { + _allocator.deallocate(_ptr,_stream); + } + _ptr = nullptr; + } + + DEVICEONLY + inline void _rangeCheckDevice(size_t index) const noexcept { + if (index >= _meta->size) { + assert(true && " out of range "); + } + } + + HOSTONLY + inline void _rangeCheckHost(size_t index) const noexcept { + Meta currentMeta = getMeta(); + if (index >= currentMeta.size) { + assert(true && " out of range "); + } + } + + HOSTONLY + inline Meta getMeta() const noexcept { + Meta buffer; + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(&buffer, _meta, sizeof(Meta), split_gpuMemcpyDeviceToHost)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&buffer, _meta, sizeof(Meta), split_gpuMemcpyDeviceToHost, _stream)); + } + return buffer; + } + + HOSTONLY + inline void getMeta(Meta& buffer) const noexcept { + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(&buffer, _meta, sizeof(Meta), split_gpuMemcpyDeviceToHost)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&buffer, _meta, sizeof(Meta), split_gpuMemcpyDeviceToHost, _stream)); + } + } + + HOSTONLY + inline void setMeta(const Meta& buffer) noexcept { + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(_meta, &buffer, sizeof(Meta), split_gpuMemcpyHostToDevice)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_meta, &buffer, sizeof(Meta), split_gpuMemcpyHostToDevice, _stream)); + } + } + + HOSTONLY + inline T 
getElementFromDevice(const size_t index) const noexcept { + T retval; + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(&retval, &_data[index], sizeof(T), split_gpuMemcpyDeviceToHost)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&retval, &_data[index], sizeof(T), split_gpuMemcpyDeviceToHost, _stream)); + } + return retval; + } + + HOSTONLY + inline void setElementFromHost(const size_t index, const T& val) noexcept { + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(&_data[index], &val, sizeof(T), split_gpuMemcpyHostToDevice)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&_data[index], &val, sizeof(T), split_gpuMemcpyHostToDevice, _stream)); + } + return; + } + +public: + DeviceVector(size_t sz = 0) : _stream(NULL) { + void* ptr = _allocate(sz); + setupSpace(ptr); + size_t s = sz; + SPLIT_CHECK_ERR(split_gpuMemcpy(&_meta->size, &s, sizeof(size_t), split_gpuMemcpyHostToDevice)); + SPLIT_CHECK_ERR(split_gpuMemcpy(&_meta->capacity, &sz, sizeof(size_t), split_gpuMemcpyHostToDevice)); + } + + DeviceVector(size_t sz, split_gpuStream_t s) : _stream(s) { + void* ptr = _allocate(sz); + setupSpace(ptr); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&_meta->size, &sz, sizeof(size_t), split_gpuMemcpyHostToDevice, s)); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&_meta->capacity, &sz, sizeof(size_t), split_gpuMemcpyHostToDevice, s)); + } + + DeviceVector(const DeviceVector& other) : _stream(NULL) { + Meta otherMeta = other.getMeta(); + void* ptr = _allocate(otherMeta.size); + setupSpace(ptr); + Meta newMeta{.size = otherMeta.size, .capacity = otherMeta.size}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpy(_data, other._data, otherMeta.size * sizeof(T), split_gpuMemcpyDeviceToDevice)); + return; + } + + DeviceVector(const DeviceVector& other, split_gpuStream_t s) : _stream(s) { + Meta otherMeta = other.getMeta(); + void* ptr = _allocate(otherMeta.size); + setupSpace(ptr); + Meta newMeta{.size = otherMeta.size, .capacity = otherMeta.size}; + setMeta(newMeta); + SPLIT_CHECK_ERR( + split_gpuMemcpyAsync(_data, other._data, otherMeta.size * sizeof(T), split_gpuMemcpyDeviceToDevice, s)); + return; + } + + DeviceVector(const DeviceVector&& other) : _stream(NULL) { + _meta = other._meta; + _data = other._data; + other._meta = nullptr; + other._data = nullptr; + } + + DeviceVector(const DeviceVector&& other, split_gpuStream_t s) : _stream(s) { + _meta = other._meta; + _data = other._data; + other._meta = nullptr; + other._data = nullptr; + } + + DeviceVector(const SplitVector& vec) : _stream(NULL) { + void* ptr = _allocate(vec.size()); + setupSpace(ptr); + Meta newMeta{.size = vec.size(), .capacity = vec.size()}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpy(_data, vec.data(), vec.size() * sizeof(T), split_gpuMemcpyHostToDevice)); + return; + } + + DeviceVector(const SplitVector& vec, split_gpuStream_t s) : _stream(s) { + void* ptr = _allocate(vec.size()); + setupSpace(ptr); + Meta newMeta{.size = vec.size(), .capacity = vec.size()}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_data, vec.data(), vec.size() * sizeof(T), split_gpuMemcpyHostToDevice, s)); + return; + } + + DeviceVector(const std::vector& vec) : _stream(NULL) { + void* ptr = _allocate(vec.size()); + setupSpace(ptr); + Meta newMeta{.size = vec.size(), .capacity = vec.size()}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpy(_data, vec.data(), vec.size() * sizeof(T), split_gpuMemcpyHostToDevice)); + return; + } + + DeviceVector(const std::vector& vec, split_gpuStream_t s) : _stream(s) { + void* ptr 
= _allocate(vec.size()); + setupSpace(ptr); + Meta newMeta{.size = vec.size(), .capacity = vec.size()}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_data, vec.data(), vec.size() * sizeof(T), split_gpuMemcpyHostToDevice, s)); + return; + } + + ~DeviceVector() { + if (_meta == nullptr) { + return; + } + _deallocate(_meta); + } + + HOSTONLY + void setStream(split_gpuStream_t s) const noexcept {_stream=s;} + + HOSTONLY + DeviceVector& operator=(const DeviceVector& other) { + Meta otherMeta = other.getMeta(); + resize(otherMeta.size); + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(_meta, other._meta, sizeof(Meta) + otherMeta.size * sizeof(T), + split_gpuMemcpyDeviceToDevice)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_meta, other._meta, sizeof(Meta) + otherMeta.size * sizeof(T), + split_gpuMemcpyDeviceToDevice, _stream)); + } + return *this; + } + + HOSTONLY + DeviceVector& operator=(DeviceVector&& other) noexcept { + if (this == &other) { + return *this; + } + _deallocate(_meta); + _meta = other._meta; + _data = other._data; + other._meta = nullptr; + other._data = nullptr; + return *this; + } + + HOSTONLY + void* operator new(size_t len) { + void* _ptr = nullptr; + SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&_ptr, len)); + return _ptr; + } + + HOSTONLY + void operator delete(void* _ptr) { SPLIT_CHECK_ERR(split_gpuFree(_ptr)); } + + HOSTONLY + void* operator new[](size_t len) { + void* _ptr = nullptr; + SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&_ptr, len)); + return _ptr; + } + HOSTONLY + void operator delete[](void* _ptr) { SPLIT_CHECK_ERR(split_gpuFree(_ptr)); } + + DEVICEONLY + size_t device_size() const noexcept { return _meta->size; } + + DEVICEONLY + size_t device_capacity() const noexcept { return _meta->capacity; } + + HOSTONLY + size_t size() const noexcept { return (getMeta()).size; } + + HOSTONLY + size_t capacity() const noexcept { return (getMeta()).capacity; } + + HOSTONLY + T get(size_t index) const { + _rangeCheckHost(index); + return getElementFromDevice(index); + } + + HOSTONLY + void set(size_t index, const T& val) { + _rangeCheckHost(index); + return setElementFromHost(index, val); + } + + DEVICEONLY + T device_get(size_t index) const { + _rangeCheckDevice(index); + return _data[index]; + } + + DEVICEONLY + void device_set(size_t index, const T& val) { + _rangeCheckDevice(index); + split::s_atomicExch(&_data[index], val); + return; + } + + DEVICEONLY T& at(size_t index) { + _rangeCheckDevice(index); + return _data[index]; + } + + DEVICEONLY const T& at(size_t index) const { + _rangeCheckDevice(index); + return _data[index]; + } + + DEVICEONLY T& operator[](size_t index) noexcept { return _data[index]; } + + DEVICEONLY const T& operator[](size_t index) const noexcept { return _data[index]; } + + HOSTDEVICE T* data() noexcept { return &(_data[0]); } + + HOSTDEVICE const T* data() const noexcept { return &(_data[0]); } + + HOSTONLY void reallocate(size_t requested_space) { + if (requested_space == 0) { + _deallocate(_meta); + _meta = nullptr; + return; + } + + void* _new_data = _allocate(requested_space); + const auto currentMeta = getMeta(); + size_t currentSize = currentMeta.size; + if (_stream == NULL) { + SPLIT_CHECK_ERR( + split_gpuMemcpy(_new_data, _meta, sizeof(Meta) + currentSize * sizeof(T), split_gpuMemcpyDeviceToDevice)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_new_data, _meta, sizeof(Meta) + currentSize * sizeof(T), + split_gpuMemcpyDeviceToDevice, _stream)); + } + _deallocate(_meta); + setupSpace(_new_data); 
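      // The Meta header travels with the payload in the single device-to-device copy above,
      // so after re-pointing _meta/_data only the capacity field has to be refreshed to
      // reflect the newly requested space.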
+ auto newMeta = currentMeta; + newMeta.capacity = requested_space; + setMeta(newMeta); + return; + } + + HOSTONLY + void clear() noexcept { + Meta currentMeta = getMeta(); + currentMeta.size = 0; + setMeta(currentMeta); + return; + } + + DEVICEONLY + void device_clear() noexcept { + _meta[MEMBER::SIZE] = 0; + return; + } + + HOSTONLY + void reserve(size_t requested_space) { + Meta currentMeta = getMeta(); + if (requested_space <= currentMeta.capacity) { + return; + } + reallocate(1.5 * requested_space); + return; + } + + HOSTONLY + void resize(size_t newSize) { + Meta currentMeta = getMeta(); + if (newSize <= currentMeta.size) { + currentMeta.size = newSize; + setMeta(currentMeta); + return; + } + reserve(newSize); + currentMeta = getMeta(); + currentMeta.size = newSize; + setMeta(currentMeta); + return; + } + + HOSTONLY + void grow() { reserve(capacity() + 1); } + + DEVICEONLY + void device_resize(size_t newSize) { + if (newSize > capacity()) { + return; + } + _meta[MEMBER::SIZE] = newSize; + } + + HOSTONLY + void push_back(const T& val) { + Meta currentMeta = getMeta(); + resize(currentMeta.size + 1); + currentMeta.size++; + setElementFromHost(currentMeta.size - 1, val); + return; + } + + DEVICEONLY + bool device_push_back(const T& val) { + size_t old = split::s_atomicAdd((unsigned long*)&_meta[MEMBER::SIZE], 1); + // TODO relpace this > in splitvec as well + if (old > (_meta->operator[](MEMBER::CAPACITY)) - 1) { + atomicSub((unsigned*)&_meta[MEMBER::SIZE], 1); + return false; + } + split::s_atomicCAS(&(_data[old]), _data[old], val); + return true; + } + + class iterator { + + private: + const T* _data; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = int64_t; + using pointer = const T*; + using reference = const T&; + + iterator(pointer data) : _data(data) {} + pointer data() const { return _data; } + pointer operator->() const { return _data; } + reference operator*() const { + assert(false); + return *_data; + } + bool operator==(const iterator& other) const { return _data == other._data; } + bool operator!=(const iterator& other) const { return _data != other._data; } + iterator& operator++() { + _data += 1; + return *this; + } + iterator operator++(int) { return iterator(_data + 1); } + iterator operator--(int) { return iterator(_data - 1); } + iterator operator--() { + _data -= 1; + return *this; + } + iterator& operator+=(int64_t offset) { + _data += offset; + return *this; + } + iterator& operator-=(int64_t offset) { + _data -= offset; + return *this; + } + iterator operator+(int64_t offset) const { + iterator itt(*this); + return itt += offset; + } + iterator operator-(int64_t offset) const { + iterator itt(*this); + return itt -= offset; + } + }; + + // Device Iterators + class device_iterator { + + private: + T* _data; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = int64_t; + using pointer = T*; + using reference = T&; + + // device_iterator(){} + DEVICEONLY + device_iterator(pointer data) : _data(data) {} + + DEVICEONLY + pointer data() { return _data; } + DEVICEONLY + pointer operator->() { return _data; } + DEVICEONLY + reference operator*() { return *_data; } + + DEVICEONLY + bool operator==(const device_iterator& other) const { return _data == other._data; } + DEVICEONLY + bool operator!=(const device_iterator& other) const { return _data != other._data; } + DEVICEONLY + device_iterator& operator++() { + _data += 1; + return *this; + } 
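      // Note on the postfix forms below: they return an iterator advanced (or moved back)
      // by one position without modifying this iterator itself, mirroring the host-side
      // iterator above.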
+ DEVICEONLY + device_iterator operator++(int) { return device_iterator(_data + 1); } + DEVICEONLY + device_iterator operator--(int) { return device_iterator(_data - 1); } + DEVICEONLY + device_iterator operator--() { + _data -= 1; + return *this; + } + DEVICEONLY + device_iterator& operator+=(int64_t offset) { + _data += offset; + return *this; + } + DEVICEONLY + device_iterator& operator-=(int64_t offset) { + _data -= offset; + return *this; + } + DEVICEONLY + device_iterator operator+(int64_t offset) const { + device_iterator itt(*this); + return itt += offset; + } + DEVICEONLY + device_iterator operator-(int64_t offset) const { + device_iterator itt(*this); + return itt -= offset; + } + }; + + class const_device_iterator { + + private: + const T* _data; + + public: + using device_iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = int64_t; + using pointer = const T*; + using reference = const T&; + + DEVICEONLY + const_device_iterator(pointer data) : _data(data) {} + + DEVICEONLY + pointer data() const { return _data; } + DEVICEONLY + pointer operator->() const { return _data; } + DEVICEONLY + reference operator*() const { return *_data; } + + DEVICEONLY + bool operator==(const const_device_iterator& other) const { return _data == other._data; } + DEVICEONLY + bool operator!=(const const_device_iterator& other) const { return _data != other._data; } + DEVICEONLY + const_device_iterator& operator++() { + _data += 1; + return *this; + } + DEVICEONLY + const_device_iterator operator++(int) { return const_iterator(_data + 1); } + DEVICEONLY + const_device_iterator operator--(int) { return const_iterator(_data - 1); } + DEVICEONLY + const_device_iterator operator--() { + _data -= 1; + return *this; + } + DEVICEONLY + const_device_iterator& operator+=(int64_t offset) { + _data += offset; + return *this; + } + DEVICEONLY + const_device_iterator& operator-=(int64_t offset) { + _data -= offset; + return *this; + } + DEVICEONLY + const_device_iterator operator+(int64_t offset) const { + const_device_iterator itt(*this); + return itt += offset; + } + DEVICEONLY + const_device_iterator operator-(int64_t offset) const { + const_device_iterator itt(*this); + return itt -= offset; + } + }; + + HOSTONLY + iterator begin() const noexcept { return iterator(_data); } + + HOSTONLY + iterator end() const noexcept { return iterator(_data + size()); } + + DEVICEONLY + device_iterator device_begin() noexcept { return device_iterator(_data); } + + DEVICEONLY + const_device_iterator device_begin() const noexcept { return const_device_iterator(_data); } + + DEVICEONLY + device_iterator device_end() noexcept { return device_iterator(_data + device_size()); } + + DEVICEONLY + const_device_iterator device_end() const noexcept { return const_device_iterator(_data + device_size()); } + + HOSTONLY + T back() const noexcept { return get(size() - 1); } + + HOSTONLY + T front() const noexcept { return get(0); } + + DEVICEONLY + T& device_back() noexcept { return _data[size() - 1]; } + + DEVICEONLY + T& device_front() noexcept { return _data[0]; } + + DEVICEONLY + const T& device_back() const noexcept { return _data[size() - 1]; } + + DEVICEONLY + const T& device_front() const noexcept { return _data[0]; } + + HOSTONLY + void set(const iterator& it, T val) { + size_t index = it.data() - _data; + return setElementFromHost(index, val); + } + + T get(const iterator& it) { + size_t index = it.data() - _data; + return getElementFromDevice(index); + } + + HOSTONLY + void 
remove_from_back(size_t n) noexcept { + const size_t end = size() - n; + Meta currentMeta = getMeta(); + currentMeta.size = end; + setMeta(currentMeta); + } + + DEVICEONLY + void device_remove_from_back(size_t n) noexcept { + const size_t end = device_size() - n; + _meta[MEMBER::SIZE] = end; + } + + HOSTONLY + void pop_back() noexcept { remove_from_back(1); } + + DEVICEONLY + void device_pop_back() noexcept { device_remove_from_back(1); } + + HOSTONLY + iterator erase(iterator it) noexcept { + const int64_t index = it.data() - begin().data(); + Meta currentMeta = getMeta(); + for (auto i = index; i < size() - 1; i++) { + set(i, get(i + 1)); + } + currentMeta.size -= 1; + setMeta(currentMeta); + iterator retval = &_data[index]; + return retval; + } + + HOSTONLY + iterator erase(iterator p0, iterator p1) noexcept { + const int64_t start = p0.data() - begin().data(); + const int64_t end = p1.data() - begin().data(); + const int64_t offset = end - start; + Meta currentMeta = getMeta(); + for (auto i = start; i < size() - offset; ++i) { + set(i, get(i + offset)); + } + currentMeta.size -= end - start; + setMeta(currentMeta); + iterator it = &_data[start]; + return it; + } + + HOSTONLY + iterator insert(iterator it, const T& val) { + + // If empty or inserting at the end no relocating is needed + if (it == end()) { + push_back(val); + return end()--; + } + + int64_t index = it.data() - begin().data(); + if (index < 0 || index > size()) { + throw std::out_of_range("Insert"); + } + + // Do we do need to increase our capacity? + if (size() == capacity()) { + grow(); + } + + for (int64_t i = size() - 1; i >= index; i--) { + set(i + 1, get(i)); + } + + set(index, val); + Meta currentMeta = getMeta(); + currentMeta.size++; + setMeta(currentMeta); + return iterator(_data + index); + } + + template ::value>::type> + HOSTONLY iterator insert(iterator it, InputIterator p0, InputIterator p1) { + + const int64_t count = std::distance(p0, p1); + const int64_t index = it.data() - begin().data(); + + if (index < 0 || index > size()) { + throw std::out_of_range("Insert"); + } + + size_t old_size = size(); + resize(size() + count); + + iterator retval = &_data[index]; + + // Copy + for (int64_t i = old_size - 1; i >= index; i--) { + set(count + i, get(i)); + } + + // Overwrite + size_t i = index; + for (auto p = p0; p != p1; ++p) { + set(i, get(p)); + i++; + } + return retval; + } + +}; // DeviceVector + +/*Equal operator*/ +template +static inline HOSTONLY bool operator==(const DeviceVector& lhs, const DeviceVector& rhs) noexcept { + if (lhs.size() != rhs.size()) { + return false; + } + for (size_t i = 0; i < lhs.size(); i++) { + if (!(lhs.get(i) == rhs.get(i))) { + return false; + } + } + // if we end up here the vectors are equal + return true; +} + +/*Not-Equal operator*/ +template +static inline HOSTONLY bool operator!=(const DeviceVector& lhs, const DeviceVector& rhs) noexcept { + return !(rhs == lhs); +} +} // namespace split diff --git a/include/splitvector/split_allocators.h b/include/splitvector/split_allocators.h index ccff591..64284ea 100644 --- a/include/splitvector/split_allocators.h +++ b/include/splitvector/split_allocators.h @@ -77,7 +77,6 @@ class split_unified_allocator { */ split_unified_allocator() throw() {} - /** * @brief Copy constructor with different type. */ @@ -129,6 +128,91 @@ class split_unified_allocator { void destroy(pointer p) { p->~value_type(); } }; +/** + * @brief Custom allocator for device only memory + * @tparam T Type of the allocated objects. 
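 * @note Memory comes from split_gpuMalloc and is device-only, i.e. not dereferenceable
 *       on the host. allocate, allocate_raw and deallocate additionally provide
 *       split_gpuStream_t overloads that use the stream-ordered asynchronous
 *       malloc/free paths.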
+ */ +template +class split_device_allocator { +public: + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + template + struct rebind { + typedef split_device_allocator other; + }; + /** + * @brief Default constructor. + */ + split_device_allocator() throw() {} + + /** + * @brief Copy constructor. + */ + split_device_allocator(split_device_allocator const&) throw() {} + + /** + * @brief Copy constructor with different type. + */ + template + split_device_allocator(split_device_allocator const&) throw() {} + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } + + pointer allocate(size_type n, const void* /*hint*/ = 0) { + T* ret; + assert(n && "allocate 0"); + SPLIT_CHECK_ERR(split_gpuMalloc((void**)&ret, n * sizeof(value_type))); + if (ret == nullptr) { + throw std::bad_alloc(); + } + return ret; + } + + pointer allocate(size_type n, split_gpuStream_t stream, const void* /*hint*/ = 0) { + T* ret; + SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&ret, n * sizeof(value_type),stream)); + return ret; + } + + + static void* allocate_raw(size_type n, const void* /*hint*/ = 0) { + void* ret; + SPLIT_CHECK_ERR(split_gpuMalloc((void**)&ret, n)); + if (ret == nullptr) { + throw std::bad_alloc(); + } + return ret; + } + + static void* allocate_raw(size_type n, split_gpuStream_t stream, const void* /*hint*/ = 0) { + void* ret; + SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&ret, n,stream)); + return ret; + } + + void deallocate(void* p) { SPLIT_CHECK_ERR(split_gpuFree(p)); } + + void deallocate(void* p,split_gpuStream_t stream) { SPLIT_CHECK_ERR(split_gpuFreeAsync(p,stream)); } + + size_type max_size() const throw() { + size_type max = static_cast(-1) / sizeof(value_type); + return (max > 0 ? max : 1); + } + + template + __host__ __device__ void construct(U* p, Args&&... 
args) { + ::new (p) U(std::forward(args)...); + } + + void destroy(pointer p) { p->~value_type(); } +}; + #endif /** diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index e754965..05abaca 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -3,7 +3,7 @@ * Description: Set of tools used by SplitVector * * This file defines the following classes or functions: - * --split::tools::Cuda_mempool + * --split::tools::GPU_Mempool * --split::tools::copy_if_raw * --split::tools::copy_if * --split::tools::scan_reduce_raw @@ -31,6 +31,9 @@ * */ #pragma once #include "gpu_wrappers.h" +#ifndef SPLIT_CPU_ONLY_MODE +#include "devicevec.h" +#endif #define NUM_BANKS 32 // TODO depends on device #define LOG_NUM_BANKS 5 #define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS) @@ -500,7 +503,7 @@ __global__ void split_compact_raw(T* input, uint32_t* counts, uint32_t* offsets, * It uses async mallocs * */ -class Cuda_mempool { +class GPU_Mempool { private: size_t total_bytes; size_t bytes_used; @@ -509,24 +512,24 @@ class Cuda_mempool { bool isOwner; public: - explicit Cuda_mempool(size_t bytes, split_gpuStream_t str) { + explicit GPU_Mempool(size_t bytes, split_gpuStream_t str) { s = str; SPLIT_CHECK_ERR(split_gpuMallocAsync(&_data, bytes, s)); total_bytes = bytes; bytes_used = 0; isOwner = true; } - explicit Cuda_mempool(void* ptr, size_t bytes) { + explicit GPU_Mempool(void* ptr, size_t bytes) { total_bytes = bytes; bytes_used = 0; isOwner = false; _data = ptr; } - Cuda_mempool() = delete; - Cuda_mempool(const Cuda_mempool& other) = delete; - Cuda_mempool(Cuda_mempool&& other) = delete; - ~Cuda_mempool() { + GPU_Mempool() = delete; + GPU_Mempool(const GPU_Mempool& other) = delete; + GPU_Mempool(GPU_Mempool&& other) = delete; + ~GPU_Mempool() { if (isOwner) { SPLIT_CHECK_ERR(split_gpuFreeAsync(_data, s)); } @@ -565,7 +568,7 @@ __global__ void scan_reduce_raw(T* input, uint32_t* output, Rule rule, size_t si * @brief Same as split_prefix_scan but with raw memory */ template -void split_prefix_scan_raw(T* input, T* output, Cuda_mempool& mPool, const size_t input_size, split_gpuStream_t s = 0) { +void split_prefix_scan_raw(T* input, T* output, GPU_Mempool& mPool, const size_t input_size, split_gpuStream_t s = 0) { // Scan is performed in half Blocksizes size_t scanBlocksize = BLOCKSIZE / 2; @@ -612,49 +615,9 @@ void split_prefix_scan_raw(T* input, T* output, Cuda_mempool& mPool, const size_ } } -/** - * @brief Same as copy_if but using raw memory - */ -template -uint32_t copy_if_raw(split::SplitVector>& input, T* output, Rule rule, - size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { - - uint32_t* d_counts; - uint32_t* d_offsets; - d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); - - // Phase 1 -- Calculate per warp workload - size_t _size = input.size(); - split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); - d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); - - // Step 2 -- Exclusive Prefix Scan on offsets - if (nBlocks == 1) { - split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); - } else { - split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); - } - 
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - - // Step 3 -- Compaction - uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); - split::tools::split_compact_raw - <<>>( - input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - uint32_t numel; - SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - return numel; -} - template uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, - size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { + size_t nBlocks, GPU_Mempool& mPool, split_gpuStream_t s = 0) { uint32_t* d_counts; uint32_t* d_offsets; @@ -688,47 +651,6 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, return numel; } -/** - * @brief Same as copy_keys_if but using raw memory - */ -template -size_t copy_keys_if_raw(split::SplitVector>& input, U* output, Rule rule, - size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { - - uint32_t* d_counts; - uint32_t* d_offsets; - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); - - // Phase 1 -- Calculate per warp workload - size_t _size = input.size(); - split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); - d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); - - // Step 2 -- Exclusive Prefix Scan on offsets - if (nBlocks == 1) { - split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); - } else { - split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); - } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - - // Step 3 -- Compaction - uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); - split::tools::split_compact_keys_raw - <<>>( - input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - uint32_t numel; - SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - return numel; -} - /** * @brief Estimates memory needed for compacting the input splitvector */ @@ -746,9 +668,11 @@ estimateMemoryForCompaction(const split::SplitVector -[[nodiscard]] size_t -estimateMemoryForCompaction(const size_t inputSize) noexcept { +[[nodiscard]] size_t estimateMemoryForCompaction(const size_t inputSize) noexcept { // Figure out Blocks to use size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE); size_t nBlocks = nextPow2(_s); @@ -777,8 +701,8 @@ void copy_keys_if(split::SplitVector>& inpu // Allocate with Mempool const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); - Cuda_mempool mPool(memory_for_pool, s); - auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + GPU_Mempool mPool(memory_for_pool, s); + auto len = copy_keys_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } @@ -810,14 +734,14 @@ void copy_if(split::SplitVector>& input, // Allocate with Mempool const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); - Cuda_mempool mPool(memory_for_pool, s); - auto 
len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + GPU_Mempool mPool(memory_for_pool, s); + auto len = copy_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } template void copy_keys_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, Cuda_mempool&& mPool, + split::SplitVector>& output, Rule rule, GPU_Mempool&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -826,13 +750,14 @@ void copy_keys_if(split::SplitVector>& inpu if (nBlocks == 0) { nBlocks += 1; } - auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward(mPool), s); + auto len = + copy_keys_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, std::forward(mPool), s); output.erase(&output[len], output.end()); } template void copy_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, Cuda_mempool&& mPool, + split::SplitVector>& output, Rule rule, GPU_Mempool&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -841,7 +766,7 @@ void copy_if(split::SplitVector>& input, if (nBlocks == 0) { nBlocks += 1; } - auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + auto len = copy_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } @@ -857,8 +782,8 @@ void copy_keys_if(split::SplitVector>& inpu nBlocks += 1; } assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); - auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + GPU_Mempool mPool(stack, max_size); + auto len = copy_keys_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } @@ -874,25 +799,69 @@ void copy_if(split::SplitVector>& input, nBlocks += 1; } assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); - auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + GPU_Mempool mPool(stack, max_size); + auto len = copy_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } +#ifndef SPLIT_CPU_ONLY_MODE template -size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size, - split_gpuStream_t s = 0) { +void copy_if(split::DeviceVector& input, + split::DeviceVector& output, Rule rule, void* stack, size_t max_size, + split_gpuStream_t s = 0) { // Figure out Blocks to use - size_t _s = std::ceil((float(size)) / (float)BLOCKSIZE); + size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); size_t nBlocks = nextPow2(_s); if (nBlocks == 0) { nBlocks += 1; } assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); - auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); - return len; + GPU_Mempool mPool(stack, max_size); + auto len = copy_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); + output.erase(output.data()+len, output.end()); +} +#endif + +template +[[nodiscard]] size_t copy_if(T* input, T* output, size_t inputSize, Rule rule, GPU_Mempool& mPool, + split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + return copy_if_raw(input, output, inputSize, rule, nBlocks, mPool, s); +} + +template +[[nodiscard]] size_t copy_if(T* input, T* output, size_t inputSize, Rule rule, void* 
stack, size_t max_size,split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + assert(stack && "Invalid stack!"); + GPU_Mempool mPool(stack, max_size); + return copy_if_raw(input, output, inputSize, rule, nBlocks, mPool, s); +} + +template +[[nodiscard]] size_t copy_if(T* input, T* output, size_t inputSize, Rule rule, split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + size_t mem = estimateMemoryForCompaction(inputSize); + GPU_Mempool mPool(mem, s); + return copy_if_raw(input, output, inputSize, rule, nBlocks, mPool, s); } } // namespace tools diff --git a/include/splitvector/splitvec.h b/include/splitvector/splitvec.h index 3803fa2..bbdf375 100644 --- a/include/splitvector/splitvec.h +++ b/include/splitvector/splitvec.h @@ -481,7 +481,7 @@ class SplitVector { // This is done because _capacity would page-fault otherwise as pointed by Markus SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); - if (*_capacity==0){ + if (*_capacity == 0) { return; } @@ -501,7 +501,7 @@ class SplitVector { SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_size, sizeof(size_t), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); - if (*_capacity==0){ + if (*_capacity == 0) { return; } SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_data, capacity() * sizeof(T), split_gpuCpuDeviceId, stream)); @@ -525,9 +525,7 @@ class SplitVector { * Splitvector */ HOSTDEVICE - [[nodiscard]] inline Residency getResidency()const noexcept{ - return _location; - } + [[nodiscard]] inline Residency getResidency() const noexcept { return _location; } /** * @brief Copies metadata to a provided destination SplitInfo structure. 
@@ -1476,7 +1474,7 @@ class SplitVector { const int64_t end = p1.data() - begin().data(); const int64_t range = end - start; - const size_t sz=size(); + const size_t sz = size(); if constexpr (!std::is_trivial::value) { for (int64_t i = start; i < end; i++) { _data[i].~T(); diff --git a/meson.build b/meson.build index e48b066..5b59965 100644 --- a/meson.build +++ b/meson.build @@ -18,6 +18,7 @@ endif hashinator_unit = executable('hashmap_test', 'unit_tests/hashmap_unit_test/main.cu',dependencies :gtest_dep ) splitvector_device_unit = executable('splitvector_device_test', 'unit_tests/gtest_vec_device/vec_test.cu',dependencies :gtest_dep ) splitvector_host_unit = executable('splitvector_host_test', 'unit_tests/gtest_vec_host/vec_test.cu',dependencies :gtest_dep ) +splitvector_device_vector_unit = executable('splitvector_device_vector_test', 'unit_tests/gtest_dev_vec/vec_test.cu',dependencies :gtest_dep ) compaction_unit = executable('compaction_test', 'unit_tests/stream_compaction/race.cu',dependencies :gtest_dep ) compaction2_unit = executable('compaction2_test', 'unit_tests/stream_compaction/preallocated.cu', cuda_args:['--default-stream=per-thread','-Xcompiler','-fopenmp'],link_args : ['-fopenmp'],dependencies :gtest_dep) compaction3_unit = executable('compaction3_test', 'unit_tests/stream_compaction/unit.cu', cuda_args:'--default-stream=per-thread',link_args : ['-fopenmp'],dependencies :gtest_dep) @@ -29,12 +30,17 @@ insertion_mechanism = executable('insertion', 'unit_tests/insertion_mechanism/ma tombstoneTest = executable('tbPerf', 'unit_tests/benchmark/tbPerf.cu', dependencies :gtest_dep) realisticTest = executable('realistic', 'unit_tests/benchmark/realistic.cu', dependencies :gtest_dep) hybridGPU = executable('hybrid_gpu', 'unit_tests/hybrid/main.cu',dependencies :gtest_dep ) +unordered_set_unit = executable('unordered_set_test', 'unit_tests/unordered_set_unit_test/main.cu',dependencies :gtest_dep ) +unordered_set_unit_cpu = executable('unordered_set_test_cpu', 'unit_tests/unordered_set_unit_test/main.cpp',dependencies :gtest_dep ) #Test-Runner test('HashinatorTest', hashinator_unit) +test('UnorderedSetTest', unordered_set_unit) +test('UnorderedSetTestCPU', unordered_set_unit_cpu) test('SplitVectorDeviceTest', splitvector_device_unit) test('SplitVectorHostTest', splitvector_host_unit) +test('SplitVectorDeviceVectorTest',splitvector_device_vector_unit) test('CompactionTest', compaction_unit) test('CompactionTest2', compaction2_unit) test('CompactionTest3', compaction3_unit) diff --git a/unit_tests/Makefile b/unit_tests/Makefile index 2cf0177..14a74c4 100644 --- a/unit_tests/Makefile +++ b/unit_tests/Makefile @@ -7,7 +7,7 @@ EXTRA= --std=c++17 EXTRA+= -gencode arch=compute_60,code=sm_60 EXTRA+= -DHASHMAPDEBUG --expt-relaxed-constexpr --expt-extended-lambda -lpthread GTEST= -L/home/kstppd/libs/googletest/build/lib -I/home/kstppd/libs/googletest/googletest/include -lgtest -lgtest_main -lpthread -OBJ= gtest_vec_host.o gtest_vec_device.o gtest_hashmap.o stream_compaction.o stream_compaction2.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o +OBJ= gtest_vec_host.o gtest_vec_device.o gtest_device_vec.o gtest_hashmap.o stream_compaction.o stream_compaction2.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o default: tests @@ -19,6 +19,7 @@ allclean: rm ${OBJ} & rm gtestvechost & rm gtestvecdevice & + rm 
 	rm gtest_hashmap &
 	rm compaction &
 	rm compaction2 &
@@ -54,6 +55,9 @@ gtest_vec_host.o: gtest_vec_host/vec_test.cu
 gtest_vec_device.o: gtest_vec_device/vec_test.cu
 	${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o gtestvecdevice gtest_vec_device/vec_test.cu
 
+gtest_device_vec.o: gtest_dev_vec/vec_test.cu
+	${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o gtestdevicevec gtest_dev_vec/vec_test.cu
+
 stream_compaction.o: stream_compaction/race.cu
 	${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o compaction stream_compaction/race.cu
diff --git a/unit_tests/benchmark/loadFactor.cu b/unit_tests/benchmark/loadFactor.cu
index 01f4510..675f696 100644
--- a/unit_tests/benchmark/loadFactor.cu
+++ b/unit_tests/benchmark/loadFactor.cu
@@ -2,8 +2,7 @@
 #include
 #include
 #include
-#include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 constexpr int R = 2;
 using namespace std::chrono;
diff --git a/unit_tests/benchmark/main.cu b/unit_tests/benchmark/main.cu
index f4591df..4c36a2d 100644
--- a/unit_tests/benchmark/main.cu
+++ b/unit_tests/benchmark/main.cu
@@ -3,11 +3,11 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
+constexpr int R = 10;
 #include
 #define PROFILE_START(msg) nvtxRangePushA((msg))
 #define PROFILE_END() nvtxRangePop()
-constexpr int R = 50;
 using namespace std::chrono;
 using namespace Hashinator;
diff --git a/unit_tests/benchmark/realistic.cu b/unit_tests/benchmark/realistic.cu
index 310c135..64f257f 100644
--- a/unit_tests/benchmark/realistic.cu
+++ b/unit_tests/benchmark/realistic.cu
@@ -3,7 +3,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 constexpr int R = 10;
 using namespace std::chrono;
diff --git a/unit_tests/benchmark/tbPerf.cu b/unit_tests/benchmark/tbPerf.cu
index 5d8872a..f8c204c 100644
--- a/unit_tests/benchmark/tbPerf.cu
+++ b/unit_tests/benchmark/tbPerf.cu
@@ -3,7 +3,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 static constexpr int R = 10;
 using namespace std::chrono;
diff --git a/unit_tests/delete_by_compaction/main.cu b/unit_tests/delete_by_compaction/main.cu
index 6cc3e03..fb063e7 100644
--- a/unit_tests/delete_by_compaction/main.cu
+++ b/unit_tests/delete_by_compaction/main.cu
@@ -2,7 +2,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 #include
 #define BLOCKSIZE 32
diff --git a/unit_tests/gtest_dev_vec/vec_test.cu b/unit_tests/gtest_dev_vec/vec_test.cu
new file mode 100644
index 0000000..723d05b
--- /dev/null
+++ b/unit_tests/gtest_dev_vec/vec_test.cu
@@ -0,0 +1,389 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include "../../include/splitvector/splitvec.h"
+#include "../../include/splitvector/devicevec.h"
+#include "../../include/splitvector/split_tools.h"
+#define expect_true EXPECT_TRUE
+
+using vec_type_t = int;
+using vector = split::DeviceVector<vec_type_t>;
+
+
+void fill_vec(vector* v, size_t targetSize){
+   std::random_device rd;
+   std::mt19937 gen(rd());
+   std::uniform_int_distribution<vec_type_t> dist(1, std::numeric_limits<vec_type_t>::max());
+   while (v->size() < targetSize) {
+      vec_type_t val = dist(gen);
+      v->push_back(val);
+   }
+}
+
+void printVecStats(vector* v){
+   std::cout<<v->size()<<std::endl;
+   std::cout<<v->capacity()<<std::endl;
+   std::cout<<v->size()<<std::endl;
+   std::cout<<v->capacity()<<std::endl;
+   for (size_t i=0; i<v->size(); i++){
+      std::cout<<v->get(i)<<", ";
+   }
+   std::cout<<"\n-------------\n";
+}
+
+
+TEST(SpDeviceVector,Construction){
+
+   constexpr size_t N=1<<10;
+   vector* a=new vector;
+   expect_true(a->size()==0);
+   expect_true(a->capacity()==0);
+
+   vector* b=new vector(N);
+   expect_true(b->size()==N);
+   expect_true(b->capacity()==N);
+
+   vector* c =new vector(N);
+   vector* d =new vector(*c);
+   expect_true(d->size()==c->size());
+
+
+   std::vector<vec_type_t> s{1,2,3,4,5};
+   vector* e = new vector(s);
+   expect_true(e->size()==s.size());
+
+
+   split::SplitVector<vec_type_t> k{1,2,3,4,5};
+   vector* f = new vector(k);
+   expect_true(f->size()==k.size());
+   delete a;
+   delete b;
+   delete c;
+   delete d;
+   delete e;
+   delete f;
+}
+
+TEST(SpDeviceVector,AssignmentOperator){
+
+   constexpr size_t N=1<<10;
+   vector* a=new vector;
+   expect_true(a->size()==0);
+   expect_true(a->capacity()==0);
+
+   vector* b=new vector;
+   *b=*a;
+   expect_true(b->size()==0);
+   expect_true(b->capacity()==0);
+
+   vector* c =new vector(N);
+   c->reserve(10*N);
+   vector* d =new vector;
+   *d=*c;
+
+   expect_true(d->size()==c->size());
+
+   for (size_t i =0; i<d->size(); i++){
+      expect_true(d->get(i)==c->get(i));
+   }
+   expect_true((*d==*c));
+   expect_true(!(*d!=*c));
+   delete a;
+   delete b;
+   delete c;
+   delete d;
+}
+
+
+TEST(SpDeviceVector,SizeModifiers){
+
+   constexpr size_t N=(1<<10);
+   vector* a = new vector;
+   expect_true(a->size()==0);
+   expect_true(a->capacity()==0);
+   a->reserve(N);
+   expect_true(a->size()==0);
+   expect_true(a->capacity()>=N);
+   auto cap =a->capacity();
+   a->resize(N);
+   expect_true(a->size()==N);
+   expect_true(a->capacity()==cap);
+   delete a;
+}
+
+TEST(SpDeviceVector,Clear){
+
+   constexpr size_t N=(1<<10);
+   vector* a = new vector;
+   expect_true(a->size()==0);
+   expect_true(a->capacity()==0);
+   a->reserve(N);
+   expect_true(a->size()==0);
+   expect_true(a->capacity()>=N);
+   auto cap =a->capacity();
+   a->resize(N);
+   expect_true(a->size()==N);
+   expect_true(a->capacity()==cap);
+   a->clear();
+   expect_true(a->size()==0);
+   delete a;
+}
+
+TEST(SpDeviceVector,HostPushBack){
+
+   constexpr size_t N=(1<<10);
+   vector* a=new vector;
+   for (size_t i =0; i<N; i++){
+      a->push_back(i);
+   }
+
+   for (size_t i =0; i<N; i++){
+      expect_true(a->get(i)==i);
+   }
+
+   vector* b=new vector;
+   for (size_t i =0; i<N; i++){
+      b->push_back(i);
+   }
+
+   for (size_t i =0; i<N; i++){
+      expect_true(b->get(i)==i);
+   }
+
+   vector* c=new vector;
+   for (size_t i =0;i<10*N;i++){
+      c->push_back(i);
+   }
+
+   for (size_t i =0;i<10*N;i++){
+      expect_true(c->get(i)==i);
+   }
+   delete a;
+   delete b;
+   delete c;
+}
+
+__global__
+void kernel_set(vector* a){
+   int index = blockIdx.x * blockDim.x + threadIdx.x;
+   a->device_set(index,index);
+}
+
+__global__
+void kernel_at(vector* a){
+   int index = blockIdx.x * blockDim.x + threadIdx.x;
+   a->at(index)=index;
+}
+
+__global__
+void kernel_pushback(vector* a){
+   int index = blockIdx.x * blockDim.x + threadIdx.x;
+   a->device_push_back(index);
+}
+
+TEST(SpDeviceVector,DeviceSet){
+
+   constexpr size_t N=(1<<10);
+   vector* a=new vector;
+   a->resize(N);
+   kernel_set<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+
+   for (size_t i =0; i<N; i++){
+      expect_true(a->get(i)==i);
+   }
+   delete a;
+}
+
+TEST(SpDeviceVector,DeviceAt){
+
+   constexpr size_t N=(1<<10);
+   vector* a=new vector;
+   a->resize(N);
+   kernel_at<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+
+   for (size_t i =0; i<N; i++){
+      expect_true(a->get(i)==i);
+   }
+   delete a;
+}
+
+TEST(SpDeviceVector,DevicePushBack){
+
+   constexpr size_t N=(1<<10);
+   vector* a=new vector;
+   a->reserve(10*N);
+   kernel_pushback<<<10,N>>>(a);
+   split_gpuDeviceSynchronize();
+   expect_true(a->size()==10*N);
+   delete a;
+}
+
+TEST(SpDeviceVector,RemoveFromBack_PopBack_Host){
+   constexpr size_t N=32;
+   vector* a=new vector;
+   a->reserve(N);
+   kernel_pushback<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+   for (auto i= a->begin(); i!=a->end();++i){
+      a->set(i,a->get(i)*2);
+   }
+   a->remove_from_back(2);
+   expect_true(a->size()==N-2);
+   a->remove_from_back(1);
+   expect_true(a->size()==N-3);
+   a->pop_back();
+   expect_true(a->size()==N-4);
+   delete a;
+}
+
+TEST(SpDeviceVector,HostErase){
+   constexpr size_t N=32;
+   vector* a=new vector;
+   a->reserve(N);
+   kernel_pushback<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+   for (auto i= a->begin(); i!=a->end();++i){
+      a->set(i,a->get(i)*2);
+   }
+   auto it=a->begin();
+   a->erase(it);
+   expect_true(a->get(a->begin())==2);
+   delete a;
+}
+
+TEST(SpDeviceVector,HostInsert){
+   constexpr size_t N=32;
+   vector* a=new vector;
+   a->reserve(N);
+   kernel_pushback<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+   for (auto i= a->begin(); i!=a->end();++i){
+      a->set(i,a->get(i)*2);
+   }
+   a->insert(a->end(),63);
+   expect_true(a->back()==63);
+   a->insert(a->begin(),42);
+   expect_true(a->front()==42);
+   expect_true(a->size()==N+2);
+   delete a;
+}
+
+
+TEST(SpDeviceVector,HostInsertRange){
+   constexpr size_t N=32;
+   vector* a=new vector;
+   a->reserve(N);
+   kernel_pushback<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+   for (auto i= a->begin(); i!=a->end();++i){
+      a->set(i,a->get(i)*2);
+   }
+   vector* b=new vector(N);
+   for (auto i= b->begin(); i!=b->end();++i){
+      b->set(i,1);
+   }
+
+   a->insert(a->end(),b->begin(),b->end());
+   a->insert(a->begin(),b->begin(),b->end());
+   for (int i=0;i<32;i++){
+      expect_true(a->get(i)==1);
+   }
+   for (int i=32;i<64;i++){
+      expect_true(a->get(i)>=0 && a->get(i)<63);
+   }
+   for (int i=64;i<96;i++){
+      expect_true(a->get(i)==1);
+   }
+   delete a;
+   delete b;
+}
+
+bool run_compaction_test(size_t sz){
+   vector* v=new vector;
+   fill_vec(v,sz);
+   auto predicate_on =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 == 0 ;};
+   auto predicate_off =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 != 0 ;};
+   vector* output1 = new vector(v->size());
+   vector* output2 = new vector(v->size());
+   const size_t len1 = split::tools::copy_if(v->data(),output1->data(),v->size(),predicate_on);
+   const size_t len2 = split::tools::copy_if(v->data(),output2->data(),v->size(),predicate_off);
+   auto r=v->size();
+   delete v;
+   delete output1;
+   delete output2;
+   return len1+len2==r;
+}
+
+bool run_compaction_test2(size_t sz){
+   vector* v=new vector;
+   void* stack=nullptr;
+   size_t maxBytes = split::tools::estimateMemoryForCompaction(sz);
+   SPLIT_CHECK_ERR (split_gpuMalloc( (void**)&stack ,maxBytes));
+   fill_vec(v,sz);
+   auto predicate_on =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 == 0 ;};
+   auto predicate_off =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 != 0 ;};
+   vector* output1 = new vector(v->size());
+   vector* output2 = new vector(v->size());
+   const size_t len1 = split::tools::copy_if(v->data(),output1->data(),v->size(),predicate_on,stack,maxBytes);
+   const size_t len2 = split::tools::copy_if(v->data(),output2->data(),v->size(),predicate_off,stack,maxBytes);
+   auto r=v->size();
+   delete v;
+   delete output1;
+   delete output2;
+   SPLIT_CHECK_ERR (split_gpuFree(stack));
+   return len1+len2==r;
+}
+
+bool run_compaction_test2_streams(size_t sz){
+   vector* v=new vector;
+   void* stack=nullptr;
+   size_t maxBytes = split::tools::estimateMemoryForCompaction(sz);
+   SPLIT_CHECK_ERR (split_gpuMalloc( (void**)&stack ,maxBytes));
+   fill_vec(v,sz);
+   auto predicate_on =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 == 0 ;};
+   auto predicate_off =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 != 0 ;};
+   split_gpuStream_t s1,s2;
+   SPLIT_CHECK_ERR (split_gpuStreamCreate(&s1));
+   SPLIT_CHECK_ERR (split_gpuStreamCreate(&s2));
+   vector* output1 = new vector(v->size());
+   vector* output2 = new vector(v->size());
+   output1->setStream(s1);
+   output2->setStream(s2);
+   v->setStream(s1);
+   const size_t len1 = split::tools::copy_if(v->data(),output1->data(),v->size(),predicate_on,stack,maxBytes,s1);
+   v->setStream(s2);
+   const size_t len2 = split::tools::copy_if(v->data(),output2->data(),v->size(),predicate_off,stack,maxBytes,s2);
+   auto r=v->size();
+   delete v;
+   delete output1;
+   delete output2;
+
+   SPLIT_CHECK_ERR (split_gpuStreamDestroy(s1));
+   SPLIT_CHECK_ERR (split_gpuStreamDestroy(s2));
+   SPLIT_CHECK_ERR (split_gpuFree(stack));
+   return len1+len2==r;
+}
+
+TEST(SpDeviceVector,StreamCompaction){
+   for (size_t i = 100; i< 50000; i*=4){
+      expect_true(run_compaction_test(i));
+      expect_true(run_compaction_test2(i));
+      expect_true(run_compaction_test2_streams(i));
+   }
+}
+
+
+int main(int argc, char* argv[]){
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}
diff --git a/unit_tests/gtest_vec_device/vec_test.cu b/unit_tests/gtest_vec_device/vec_test.cu
index c6e663a..2f3336e 100644
--- a/unit_tests/gtest_vec_device/vec_test.cu
+++ b/unit_tests/gtest_vec_device/vec_test.cu
@@ -2,8 +2,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
diff --git a/unit_tests/gtest_vec_host/vec_test.cu b/unit_tests/gtest_vec_host/vec_test.cu
index efd65a6..3cc7ea0 100644
--- a/unit_tests/gtest_vec_host/vec_test.cu
+++ b/unit_tests/gtest_vec_host/vec_test.cu
@@ -6,7 +6,7 @@
 #ifndef SPLIT_CPU_ONLY_MODE
 #define SPLIT_CPU_ONLY_MODE
 #endif
-#include "../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/splitvec.h"
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
diff --git a/unit_tests/hashmap_unit_test/main.cu b/unit_tests/hashmap_unit_test/main.cu
index e9cfa86..f6e30b6 100644
--- a/unit_tests/hashmap_unit_test/main.cu
+++ b/unit_tests/hashmap_unit_test/main.cu
@@ -2,7 +2,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 #include
 #include
 #include
diff --git a/unit_tests/hybrid/main.cu b/unit_tests/hybrid/main.cu
index 83d09dd..732d284 100644
--- a/unit_tests/hybrid/main.cu
+++ b/unit_tests/hybrid/main.cu
@@ -2,7 +2,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 #include
diff --git a/unit_tests/insertion_mechanism/main.cu b/unit_tests/insertion_mechanism/main.cu
index d270fca..d186481 100644
--- a/unit_tests/insertion_mechanism/main.cu
+++ b/unit_tests/insertion_mechanism/main.cu
@@ -4,7 +4,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 #include
 #define BLOCKSIZE 1024
diff --git a/unit_tests/pointer_test/main.cu b/unit_tests/pointer_test/main.cu
index bd777c6..56745b9 100644
--- a/unit_tests/pointer_test/main.cu
+++ b/unit_tests/pointer_test/main.cu
@@ -2,8 +2,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #define N 1024
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
diff --git a/unit_tests/stream_compaction/preallocated.cu b/unit_tests/stream_compaction/preallocated.cu
index 20b01af..ddb9895 100644
--- a/unit_tests/stream_compaction/preallocated.cu
+++ b/unit_tests/stream_compaction/preallocated.cu
@@ -5,8 +5,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
 #define expect_eq EXPECT_EQ
diff --git a/unit_tests/stream_compaction/race.cu b/unit_tests/stream_compaction/race.cu
index 5f9f4ea..a5b8e34 100644
--- a/unit_tests/stream_compaction/race.cu
+++ b/unit_tests/stream_compaction/race.cu
@@ -4,8 +4,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #include
 #include
 #include
diff --git a/unit_tests/stream_compaction/unit.cu b/unit_tests/stream_compaction/unit.cu
index 92f5568..0706bbd 100644
--- a/unit_tests/stream_compaction/unit.cu
+++ b/unit_tests/stream_compaction/unit.cu
@@ -4,8 +4,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
 #define expect_eq EXPECT_EQ
diff --git a/unit_tests/unordered_set_unit_test/main.cpp b/unit_tests/unordered_set_unit_test/main.cpp
new file mode 100644
index 0000000..7325c07
--- /dev/null
+++ b/unit_tests/unordered_set_unit_test/main.cpp
@@ -0,0 +1,268 @@
+#include
+
+#define SPLIT_CPU_ONLY_MODE
+#define HASHINATOR_CPU_ONLY_MODE
+
+#include "../../include/hashinator/unordered_set/unordered_set.h"
+#include
+#include
+
+#define expect_true EXPECT_TRUE
+#define expect_false EXPECT_FALSE
+#define expect_eq EXPECT_EQ
+#define SMALL_SIZE (1<<10)
+#define LARGE_SIZE ( 1<<20 )
+
+using namespace Hashinator;
+typedef uint32_t key_type;
+typedef split::SplitVector<key_type> vector ;
+typedef Unordered_Set<key_type> UnorderedSet;
+
+
+bool isFreeOfDuplicates(const vector& v){
+
+   for (const auto & it : v){
+      auto cnt = std::count( v.begin() ,v.end(),it);
+      if (cnt>1 ){return false;}
+   }
+   return true ;
+}
+
+bool isFreeOfDuplicates( UnorderedSet* s){
+   vector out;
+   for (auto& k:*s ){
+      out.push_back(k);
+   }
+   expect_true(out.size()==s->size());
+   expect_true( isFreeOfDuplicates(out));
+   return true;
+}
+TEST(Unordered_UnitTest , Construction){
+   UnorderedSet s(12);
+   UnorderedSet s2=s;
+   expect_true(s2.bucket_count()==s.bucket_count());
+   UnorderedSet s3= UnorderedSet(12);
+   expect_true(s3.bucket_count()==1<<12);
+   expect_true(true);
+}
+
+TEST(Unordered_UnitTest , Construction_InitializerList){
+   UnorderedSet s{std::initializer_list<key_type>{1,2,3,4,1,2,3,4}};
+   expect_true(s.size()==4);
+}
+
+TEST(Unordered_UnitTest , Empty){
+   UnorderedSet s;
+   expect_true(s.size()==0);
+   expect_true(s.empty());
+   s.insert(1);
+   expect_false(s.empty());
+}
+
+TEST(Unordered_UnitTest , InsertFindHost){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      auto it = s.find(i);
+      expect_true(*it==i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+}
+
+TEST(Unordered_UnitTest , InsertHost){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==LARGE_SIZE);
+}
+
+
+TEST(Unordered_UnitTest , InsertKernel){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet s;
+   s.insert(v.data(),v.size()) ;
+   expect_true(s.size()==LARGE_SIZE);
+}
+
+
+TEST(Unordered_UnitTest , InsertEraseHost){
+   std::unordered_set<key_type> s;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==LARGE_SIZE);
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.erase(i);
+   }
+   expect_true(s.size()==0);
+}
+
+TEST(Unordered_UnitTest , Insert_Erase_Kernel){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet s;
+   s.insert(v.data(),v.size()) ;
+   expect_true(s.size()==LARGE_SIZE);
+   s.erase(v.data(),v.size()) ;
+   expect_true(s.size()==0);
+}
+
+TEST(Unordered_UnitTest , NewOverloadManagedMemory){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size()) ;
+   expect_true(s->size()==LARGE_SIZE);
+   s->erase(v.data(),v.size()) ;
+   expect_true(s->size()==0);
+   delete s;
+}
+
+TEST(Unordered_UnitTest , Contains_Count){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      expect_true(s.contains(i));
+      expect_false(s.contains(SMALL_SIZE+i));
+      expect_true(s.count(i)==1);
+   }
+}
+
+TEST(Unordered_UnitTest , InsertEraseHostSmall){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      auto it = s.find(i);
+      expect_true(*it==i);
+      s.erase(it);
+      auto it2 = s.find(i);
+      expect_true(it2==s.end());
+   }
+   expect_true(s.size()==0);
+   expect_true(s.tombstone_count()==SMALL_SIZE);
+
+   s.rehash();
+   expect_true(s.size()==0);
+   expect_true(s.tombstone_count()==0);
+
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      auto it = s.find(i);
+      expect_true(*it==i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+}
+
+
+TEST(Unordered_UnitTest , Clear){
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+
+      s->clear();
+      expect_true(s->size()==0);
+      expect_true(s->empty());
+      delete s;
+   }
+
+
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+
+      s->clear(targets::device);
+      expect_true(s->size()==0);
+      expect_true(s->empty());
+      delete s;
+   }
+}
+
+TEST(Unordered_UnitTest , Resize){
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size()) ;
+   expect_true(s->size()==SMALL_SIZE);
+   auto priorFill=s->size();
+
+   auto sizePower = s->getSizePower();
+   s->resize(sizePower+1,targets::host);
+   expect_true(s->size()==priorFill);
+   delete s;
+}
+
+TEST(Unordered_UnitTest , LoadFactorReduction){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   //At this point we are heavily overflown
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Let's clean up to get back to a proper overflow
+   s->performCleanupTasks();
+   expect_true( isFreeOfDuplicates(s) );
+   delete s;
+}
+
+TEST(Unordered_UnitTest , TombstoneCleaning){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   //At this point we are heavily overflown
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Erase half of the keys; this leaves tombstones behind
+   s->erase(v.data(),v.size()/2);
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   expect_true( isFreeOfDuplicates(s) );
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   s->performCleanupTasks();
+   expect_true( s->tombstone_count()==0);
+   delete s;
+}
+
+
+int main(int argc, char* argv[]){
+   srand(time(NULL));
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}
diff --git a/unit_tests/unordered_set_unit_test/main.cu b/unit_tests/unordered_set_unit_test/main.cu
new file mode 100644
index 0000000..5f551d9
--- /dev/null
+++ b/unit_tests/unordered_set_unit_test/main.cu
@@ -0,0 +1,415 @@
+#include
+#include "../../include/hashinator/unordered_set/unordered_set.h"
+#include
+#include
+
+#define expect_true EXPECT_TRUE
+#define expect_false EXPECT_FALSE
+#define expect_eq EXPECT_EQ
+#define SMALL 10
+#define LARGE 20
+#define SMALL_SIZE (1<<SMALL)
+#define LARGE_SIZE (1<<LARGE)
+
+using namespace Hashinator;
+typedef uint32_t key_type;
+typedef split::SplitVector<key_type> vector ;
+typedef Unordered_Set<key_type> UnorderedSet;
+
+
+bool isFreeOfDuplicates(const vector& v){
+
+   for (const auto & it : v){
+      auto cnt = std::count( v.begin() ,v.end(),it);
+      if (cnt>1 ){return false;}
+   }
+   return true ;
+}
+
+
+bool isFreeOfDuplicates( UnorderedSet* s){
+   vector out(s->size());
+   size_t count = s->extractAllKeys(out);
+   expect_true(count==s->size());
+   expect_true( isFreeOfDuplicates(out) );
+   return true;
+}
+
+__global__
+void gpu_write(UnorderedSet* s, key_type* src, size_t N){
+   size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   if (index < N ){
+      s->add_element(src[index]);
+   }
+}
+
+
+TEST(Unordered_UnitTest , Construction){
+   UnorderedSet s(12);
+   UnorderedSet s2=s;
+   expect_true(s2.bucket_count()==s.bucket_count());
+   UnorderedSet s3= UnorderedSet(12);
+   expect_true(s3.bucket_count()==1<<12);
+   expect_true(true);
+}
+
+TEST(Unordered_UnitTest , Construction_InitializerList){
+   UnorderedSet s{std::initializer_list<key_type>{1,2,3,4,1,2,3,4}};
+   expect_true(s.size()==4);
+}
+
+TEST(Unordered_UnitTest , Empty){
+   UnorderedSet s;
+   expect_true(s.size()==0);
+   expect_true(s.empty());
+   s.insert(1);
+   expect_false(s.empty());
+}
+
+TEST(Unordered_UnitTest , InsertFindHost){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      auto it = s.find(i);
+      expect_true(*it==i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+}
+
+TEST(Unordered_UnitTest , InsertHost){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==LARGE_SIZE);
+}
+
+
+TEST(Unordered_UnitTest , InsertKernel){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet s;
+   s.insert(v.data(),v.size()) ;
+   expect_true(s.size()==LARGE_SIZE);
+}
+
+
+TEST(Unordered_UnitTest , InsertEraseHost){
+   std::unordered_set<key_type> s;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==LARGE_SIZE);
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.erase(i);
+   }
+   expect_true(s.size()==0);
+}
+
+TEST(Unordered_UnitTest , Insert_Erase_Kernel){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet s;
+   s.insert(v.data(),v.size()) ;
+   expect_true(s.size()==LARGE_SIZE);
+   for (const auto& key:v){
+      expect_true(s.contains(key));
+   }
+   s.erase(v.data(),v.size()) ;
+   expect_true(s.size()==0);
+}
+
+TEST(Unordered_UnitTest , NewOverloadManagedMemory){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size()) ;
+   expect_true(s->size()==LARGE_SIZE);
+   s->erase(v.data(),v.size()) ;
+   expect_true(s->size()==0);
+   delete s;
+}
+
+TEST(Unordered_UnitTest , Contains_Count){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      expect_true(s.contains(i));
+      expect_false(s.contains(SMALL_SIZE+i));
+      expect_true(s.count(i)==1);
+   }
+}
+
+TEST(Unordered_UnitTest , InsertEraseHostSmall){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      auto it = s.find(i);
+      expect_true(*it==i);
+      s.erase(it);
+      auto it2 = s.find(i);
+      expect_true(it2==s.end());
+   }
+   expect_true(s.size()==0);
+   expect_true(s.tombstone_count()==SMALL_SIZE);
+
+   s.rehash();
+   expect_true(s.size()==0);
+   expect_true(s.tombstone_count()==0);
+
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      auto it = s.find(i);
+      expect_true(*it==i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+}
+
+TEST(Unordered_UnitTest , ExtractPattern){
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size()) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   vector out(SMALL_SIZE);
+   size_t count = s->extractAllKeys(out);
+   expect_true(count==s->size());
+   expect_true( isFreeOfDuplicates(out) );
+   delete s;
+}
+
+TEST(Unordered_UnitTest , Clear){
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+
+      s->clear();
+      expect_true(s->size()==0);
+      expect_true(s->empty());
+      delete s;
+   }
+
+
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+
+      s->clear(targets::device);
+      expect_true(s->size()==0);
+      expect_true(s->empty());
+      delete s;
+   }
+}
+
+TEST(Unordered_UnitTest , Resize){
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+      auto priorFill=s->size();
+
+      auto sizePower = s->getSizePower();
+      s->resize(sizePower+1,targets::host);
+      expect_true(s->size()==priorFill);
+   }
+
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+      auto priorFill=s->size();
+
+      auto sizePower = s->getSizePower();
+      s->resize(sizePower+1,targets::device);
+      expect_true(s->size()==priorFill);
+
+      vector out(SMALL_SIZE);
+      size_t count = s->extractAllKeys(out);
+      expect_true(count==s->size());
+      expect_true( isFreeOfDuplicates(out) );
+
+      delete s;
+   }
+}
+
+TEST(Unordered_UnitTest , LoadFactorReduction){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   //At this point we are heavily overflown
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Let's clean up to get back to a proper overflow
+   s->performCleanupTasks();
+   expect_true( isFreeOfDuplicates(s) );
+   delete s;
+}
+
+TEST(Unordered_UnitTest , TombstoneCleaning){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   //At this point we are heavily overflown
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Erase half of the keys; this leaves tombstones behind
+   s->erase(v.data(),v.size()/2);
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   expect_true( isFreeOfDuplicates(s) );
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   s->performCleanupTasks();
+   expect_true( s->tombstone_count()==0);
+   delete s;
+}
+
+TEST(Unordered_UnitTest , DeviceKernelWrite){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+
+   auto* ds= s->upload();
+   gpu_write<<>>(ds,v.data(),v.size());
+   split_gpuDeviceSynchronize();
+   s->download();
+   expect_true(s->size()==SMALL_SIZE);
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Erase half of the keys; this leaves tombstones behind
+   s->erase(v.data(),v.size()/2);
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   expect_true( isFreeOfDuplicates(s) );
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   s->performCleanupTasks();
+   expect_true( s->tombstone_count()==0);
+   delete s;
+}
+
+__global__
+void gpu_write_warpWide(UnorderedSet* set,key_type* src,size_t N ){
+   size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t wid = index / Hashinator::defaults::WARPSIZE;
+   const size_t w_tid = index % defaults::WARPSIZE;
+   if (wid < N ){
+      key_type key= src[wid];
+      set->warpInsert(key,w_tid);
+   }
+}
+
+__global__
+void gpu_erase_warpWide(UnorderedSet* set,key_type* src,size_t N){
+   size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t wid = index / Hashinator::defaults::WARPSIZE;
+   const size_t w_tid = index % defaults::WARPSIZE;
+   if (wid < N ){
+      key_type key= src[wid];
+      set->warpErase(key,w_tid);
+   }
+}
+
+TEST(Unordered_UnitTest ,WarpInsert){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   size_t N = v.size();
+   size_t blocksize=1024;
+   size_t blocks=N/blocksize;
+   size_t warpsize = Hashinator::defaults::WARPSIZE;
+   size_t threadsNeeded = N*warpsize;
+   blocks = threadsNeeded/blocksize;
+   UnorderedSet* s = new UnorderedSet(LARGE+1);
+   gpu_write_warpWide<<<blocks,blocksize>>>(s,v.data(),v.size());
+   split_gpuDeviceSynchronize();
+   for (const auto& key:v){
+      expect_true(s->contains(key));
+   }
+   delete s;
+}
+
+
+TEST(Unordered_UnitTest ,WarpInsertErase){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   size_t N = v.size();
+   size_t blocksize=1024;
+   size_t blocks=N/blocksize;
+   size_t warpsize = Hashinator::defaults::WARPSIZE;
+   size_t threadsNeeded = N*warpsize;
+   blocks = threadsNeeded/blocksize;
+   UnorderedSet* s = new UnorderedSet;
+   s->resize(LARGE+1);
+   gpu_write_warpWide<<<blocks,blocksize>>>(s,v.data(),v.size());
+   split_gpuDeviceSynchronize();
+   for (const auto& key:v){
+      expect_true(s->contains(key));
+   }
+
+   gpu_erase_warpWide<<<blocks,blocksize>>>(s,v.data(),v.size());
+   split_gpuDeviceSynchronize();
+   for (const auto& key:v){
+      expect_false(s->contains(key));
+   }
+   expect_true(s->size()==0);
+   split_gpuDeviceSynchronize();
+
+
+   delete s;
+}
+
+int main(int argc, char* argv[]){
+   srand(time(NULL));
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}