From 788cc7ad03c64332d40d4c1bfd77f65ed291c617 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Thu, 30 May 2024 17:39:13 +0300
Subject: [PATCH 1/6] Update README.md with Zenodo DOI

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index f3af556..fa69fab 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+[![DOI](https://zenodo.org/badge/805378098.svg)](https://zenodo.org/doi/10.5281/zenodo.11396296)
 [![C/C++ CI](https://github.com/kstppd/hashinator/actions/workflows/tests.yml/badge.svg?branch=master)](https://github.com/kstppd/hashinator/actions/workflows/tests.yml)
 
 ## Hashinator: A hybrid hashmap designed for heterogeneous computing.

From 475f6867cbe9d12a9a6b71e806af49023ae0ee98 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Mon, 10 Jun 2024 22:40:29 +0300
Subject: [PATCH 2/6] Add accessors to internals

---
 include/hashinator/hashinator.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index b5fe8a0..27194d3 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -463,6 +463,28 @@ class Hashmap {
       }
    }
 
+   // Dangerous methods for exposing internals
+   template <bool warn = true>
+   HASHINATOR_HOSTDEVICE MapInfo* expose_mapinfo() noexcept {
+      if constexpr (warn) {
+         printf("Warning, exposing Hashmap internal info struct!\n");
+      }
+      return _mapInfo;
+   }
+   template <bool warn = true>
+   HASHINATOR_HOSTDEVICE hash_pair<KEY_TYPE, VAL_TYPE>* expose_bucketdata() noexcept {
+      if constexpr (warn) {
+         printf("Warning, exposing Hashmap internal bucket data!\n");
+      }
+      return buckets.data();
+   }
+   HASHINATOR_HOSTDEVICE inline KEY_TYPE expose_emptybucket() const noexcept {
+      return EMPTYBUCKET;
+   }
+   HASHINATOR_HOSTDEVICE inline KEY_TYPE expose_tombstone() const noexcept {
+      return TOMBSTONE;
+   }
+
 #ifdef HASHINATOR_CPU_ONLY_MODE
    void clear() {
       buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(1 << _mapInfo->sizePower, {EMPTYBUCKET, VAL_TYPE()});

From 4f0a2d82728f0023b37063ce6bb3ac5d82dc5c86 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Tue, 25 Jun 2024 18:07:57 +0300
Subject: [PATCH 3/6] Add split wrappers for atomic max and min

---
 include/splitvector/gpu_wrappers.h | 54 ++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/include/splitvector/gpu_wrappers.h b/include/splitvector/gpu_wrappers.h
index fa00d3d..dd02673 100644
--- a/include/splitvector/gpu_wrappers.h
+++ b/include/splitvector/gpu_wrappers.h
@@ -116,6 +116,60 @@ __device__ __forceinline__ T s_atomicSub(T* address, U val) noexcept {
       } else {
          return atomicSub((unsigned int*)address, static_cast<unsigned int>(val));
       }
+   } else if constexpr (sizeof(T) == 8) {
+      return atomicAdd((unsigned long long*)address, static_cast<unsigned long long>(-val));
+   } else {
+      // Cannot be static_assert(false...);
+      static_assert(!sizeof(T*), "Not supported");
+   }
+}
+
+/**
+ * @brief Wrapper for atomic maximum operation.
+ *
+ * @tparam T The data type of the value being maximized.
+ * @tparam U The data type of the value to maximize against.
+ * @param address Pointer to the memory location.
+ * @param val The value to maximize against.
+ * @return The original value at the memory location.
+ */
+template <typename T, typename U>
+__device__ __forceinline__ T s_atomicMax(T* address, U val) noexcept {
+   static_assert(std::is_integral<T>::value && "Only integers supported");
+   if constexpr (sizeof(T) == 4) {
+      if constexpr (std::is_signed<T>::value) {
+         return atomicMax((int*)address, static_cast<int>(val));
+      } else {
+         return atomicMax((unsigned int*)address, static_cast<unsigned int>(val));
+      }
+   } else if constexpr (sizeof(T) == 8) {
+      return atomicMax((unsigned long long*)address, static_cast<unsigned long long>(val));
+   } else {
+      // Cannot be static_assert(false...);
+      static_assert(!sizeof(T*), "Not supported");
+   }
+}
+
+/**
+ * @brief Wrapper for atomic minimum operation.
+ *
+ * @tparam T The data type of the value being minimized.
+ * @tparam U The data type of the value to minimize against.
+ * @param address Pointer to the memory location.
+ * @param val The value to minimize against.
+ * @return The original value at the memory location.
+ */
+template <typename T, typename U>
+__device__ __forceinline__ T s_atomicMin(T* address, U val) noexcept {
+   static_assert(std::is_integral<T>::value && "Only integers supported");
+   if constexpr (sizeof(T) == 4) {
+      if constexpr (std::is_signed<T>::value) {
+         return atomicMin((int*)address, static_cast<int>(val));
+      } else {
+         return atomicMin((unsigned int*)address, static_cast<unsigned int>(val));
+      }
+   } else if constexpr (sizeof(T) == 8) {
+      return atomicMin((unsigned long long*)address, static_cast<unsigned long long>(val));
    } else {
       // Cannot be static_assert(false...);
       static_assert(!sizeof(T*), "Not supported");
    }
 }
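The wrappers added above dispatch to the correctly typed atomicMax/atomicMin intrinsic at compile time, mirroring the existing s_atomicAdd/s_atomicSub wrappers. A minimal device-side usage sketch follows; it is illustrative only and not part of the series: the kernel and variable names are hypothetical, and a CUDA build with gpu_wrappers.h included is assumed.

   __global__ void trackExtrema(size_t* maxSeen, size_t* minSeen, const size_t* data, size_t n) {
      const size_t tid = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
      if (tid < n) {
         // Both wrappers return the previous value at the address; only the
         // final extrema matter here, so the return values are discarded.
         split::s_atomicMax(maxSeen, data[tid]);
         split::s_atomicMin(minSeen, data[tid]);
      }
   }

Because size_t is an 8-byte unsigned integer on the targeted platforms, both calls resolve to the unsigned long long intrinsic overloads.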
From a2a6d9e8734c0c057428eefcbdcb7b47ee6d6a3a Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Tue, 25 Jun 2024 18:09:48 +0300
Subject: [PATCH 4/6] Remove expose emptybucket and tombstone as duplicates;
 improve threadsafety

---
 include/hashinator/hashinator.h | 145 +++++++++++++++++---------------
 1 file changed, 75 insertions(+), 70 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index 27194d3..bd15ea8 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -342,7 +342,8 @@ class Hashmap {
       auto hashIndex = hash(key);
 
       // Try to find the matching bucket.
-      for (size_t i = 0; i < _mapInfo->currentMaxBucketOverflow; i++) {
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
          hash_pair<KEY_TYPE, VAL_TYPE>& candidate = buckets[(hashIndex + i) & bitMask];
 
@@ -368,7 +369,8 @@ class Hashmap {
             // We look ahead in case candidate was already in the hashmap
             // If we find it then we swap the duplicate with empty and do not increment fill
             // but we only reduce the tombstone count
-            for (size_t j = i + 1; j < _mapInfo->currentMaxBucketOverflow; ++j) {
+            const size_t bsize = buckets.size();
+            for (size_t j = i + 1; j < bsize; ++j) {
                hash_pair<KEY_TYPE, VAL_TYPE>& duplicate = buckets[(hashIndex + j) & bitMask];
                if (duplicate.first == candidate.first) {
                   alreadyExists = true;
@@ -478,12 +480,6 @@ class Hashmap {
       }
       return buckets.data();
    }
-   HASHINATOR_HOSTDEVICE inline KEY_TYPE expose_emptybucket() const noexcept {
-      return EMPTYBUCKET;
-   }
-   HASHINATOR_HOSTDEVICE inline KEY_TYPE expose_tombstone() const noexcept {
-      return TOMBSTONE;
-   }
 
 #ifdef HASHINATOR_CPU_ONLY_MODE
    void clear() {
@@ -748,7 +744,8 @@ class Hashmap {
       auto hashIndex = hash(key);
 
       // Try to find the matching bucket.
-      for (size_t i = 0; i < _mapInfo->currentMaxBucketOverflow; i++) {
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
          const hash_pair<KEY_TYPE, VAL_TYPE>& candidate = buckets[(hashIndex + i) & bitMask];
 
          if (candidate.first == TOMBSTONE) {
@@ -776,7 +773,8 @@ class Hashmap {
       auto hashIndex = hash(key);
 
       // Try to find the matching bucket.
-      for (size_t i = 0; i < _mapInfo->currentMaxBucketOverflow; i++) {
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
          const hash_pair<KEY_TYPE, VAL_TYPE>& candidate = buckets[(hashIndex + i) & bitMask];
 
          if (candidate.first == TOMBSTONE) {
@@ -893,10 +891,10 @@ class Hashmap {
             // Check if this element already exists
             auto already_exists = split::s_warpVote(target.first == candidateKey, submask);
             if (already_exists) {
-               int winner = split::s_findFirstSig(already_exists) - 1;
-               if (w_tid == winner) {
+               int winner = split::s_findFirstSig(already_exists);
+               if (w_tid == winner-1) {
                   if constexpr (!skipOverWrites) {
-                     split::s_atomicExch(&buckets[probingindex].second, candidateVal);
+                     split::s_atomicExch(&(buckets[probingindex].second), candidateVal);
                   }
                   // This virtual warp is now done.
                   warpDone = 1;
@@ -909,23 +907,25 @@ class Hashmap {
             while (mask && !warpDone) {
                int winner = split::s_findFirstSig(mask) - 1;
                if (w_tid == winner) {
-                  KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey);
+                  KEY_TYPE old = split::s_atomicCAS(&(buckets[probingindex].first), EMPTYBUCKET, candidateKey);
                   if (old == EMPTYBUCKET) {
                      threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1);
-                     split::s_atomicExch(&buckets[probingindex].second, candidateVal);
+                     split::s_atomicExch(&(buckets[probingindex].second), candidateVal);
                      warpDone = 1;
-                     split::s_atomicAdd(&_mapInfo->fill, 1);
+                     split::s_atomicAdd(&(_mapInfo->fill), 1);
+                     // Minor optimization to get rid of some unnecessary atomic calls
                      if (threadOverflow > _mapInfo->currentMaxBucketOverflow) {
-                        split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow),
-                                            (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE));
+                        split::s_atomicMax(&(_mapInfo->currentMaxBucketOverflow),
+                                           nextOverflow(threadOverflow, defaults::WARPSIZE));
                      }
+
                   } else if (old == candidateKey) {
-                     // Parallel stuff are fun. Major edge case!
+                     // Parallel insertion already added this key.
                      if constexpr (!skipOverWrites) {
-                        split::s_atomicExch(&buckets[probingindex].second, candidateVal);
+                        split::s_atomicExch(&(buckets[probingindex].second), candidateVal);
                      }
                      warpDone = 1;
-                  }
+                  } // else some other key+value was written here.
               }
               // If any of the virtual warp threads are done then the whole
               // Virtual warp is done
@@ -977,10 +977,10 @@ class Hashmap {
             // Check if this element already exists
             auto already_exists = split::s_warpVote(target.first == candidateKey, submask);
             if (already_exists) {
-               int winner = split::s_findFirstSig(already_exists) - 1;
-               if (w_tid == winner) {
+               int winner = split::s_findFirstSig(already_exists);
+               if (w_tid == winner-1) {
                   if constexpr (!skipOverWrites) {
-                     split::s_atomicExch(&buckets[probingindex].second, candidateVal);
+                     split::s_atomicExch(&(buckets[probingindex].second), candidateVal);
                   }
                   // This virtual warp is now done.
                  warpDone = 1;
@@ -993,21 +993,22 @@ class Hashmap {
             while (mask && !warpDone) {
                int winner = split::s_findFirstSig(mask) - 1;
                if (w_tid == winner) {
-                  KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey);
+                  KEY_TYPE old = split::s_atomicCAS(&(buckets[probingindex].first), EMPTYBUCKET, candidateKey);
                   if (old == EMPTYBUCKET) {
                      threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1);
-                     split::s_atomicExch(&buckets[probingindex].second, candidateVal);
+                     split::s_atomicExch(&(buckets[probingindex].second), candidateVal);
                      warpDone = 1;
                      localCount = 1;
-                     split::s_atomicAdd(&_mapInfo->fill, 1);
+                     split::s_atomicAdd(&(_mapInfo->fill), 1);
+                     // Minor optimization to get rid of some unnecessary atomic calls
                      if (threadOverflow > _mapInfo->currentMaxBucketOverflow) {
-                        split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow),
-                                            (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE));
+                        split::s_atomicMax(&(_mapInfo->currentMaxBucketOverflow),
+                                           nextOverflow(threadOverflow, defaults::WARPSIZE));
                      }
                   } else if (old == candidateKey) {
                      // Parallel stuff are fun. Major edge case!
                      if constexpr (!skipOverWrites) {
-                        split::s_atomicExch(&buckets[probingindex].second, candidateVal);
+                        split::s_atomicExch(&(buckets[probingindex].second), candidateVal);
                      }
                      warpDone = 1;
                  }
@@ -1027,7 +1028,7 @@ class Hashmap {
    void warpFind(const KEY_TYPE& candidateKey, VAL_TYPE& candidateVal, const size_t w_tid) const noexcept {
 
       const int sizePower = _mapInfo->sizePower;
-      const size_t maxoverflow = _mapInfo->currentMaxBucketOverflow;
+      //const size_t maxoverflow = _mapInfo->currentMaxBucketOverflow;
       const int bitMask = (1 << (sizePower)) - 1;
       const auto hashIndex = HashFunction::_hash(candidateKey, sizePower);
       const auto submask = SPLIT_VOTING_MASK;
@@ -1044,8 +1045,7 @@ class Hashmap {
       assert(isSafe && "Tried to warpFind with different keys/vals in the same warp");
 #endif
 
-      for (size_t i = 0; i < maxoverflow; i += defaults::WARPSIZE) {
-
+      for (size_t i = 0; i < (1 << sizePower); i += defaults::WARPSIZE) {
          if (warpDone) {
             break;
          }
@@ -1061,14 +1061,16 @@ class Hashmap {
             warpDone = true;
          }
          if (maskExists) {
-            winner = split::s_findFirstSig(maskExists) - 1;
-            if (w_tid == winner) {
+            winner = split::s_findFirstSig(maskExists);
+            if (w_tid == winner-1) {
                candidateVal = buckets[probingindex].second;
            }
            warpDone = true;
         }
      }
-      candidateVal = split::s_shuffle(candidateVal, winner, SPLIT_VOTING_MASK);
+      if (winner!=0) {
+         candidateVal = split::s_shuffle(candidateVal, winner-1, SPLIT_VOTING_MASK);
+      }
       return;
    }
 
@@ -1076,7 +1078,7 @@ class Hashmap {
    void warpErase(const KEY_TYPE& candidateKey, const size_t w_tid) noexcept {
 
       const int sizePower = _mapInfo->sizePower;
-      const size_t maxoverflow = _mapInfo->currentMaxBucketOverflow;
+      //const size_t maxoverflow = _mapInfo->currentMaxBucketOverflow;
       const int bitMask = (1 << (sizePower)) - 1;
       const auto hashIndex = HashFunction::_hash(candidateKey, sizePower);
       const auto submask = SPLIT_VOTING_MASK;
@@ -1093,8 +1095,7 @@ class Hashmap {
       assert(isSafe && "Tried to warpFind with different keys/vals in the same warp");
 #endif
 
-      for (size_t i = 0; i < maxoverflow; i += defaults::WARPSIZE) {
-
+      for (size_t i = 0; i < (1 << sizePower); i += defaults::WARPSIZE) {
         if (warpDone) {
            break;
         }
@@ -1110,11 +1111,13 @@ class Hashmap {
            warpDone = true;
         }
         if (maskExists) {
-            winner = split::s_findFirstSig(maskExists) - 1;
-            if (w_tid == winner) {
-               buckets[probingindex].first = TOMBSTONE;
-               split::s_atomicAdd(&_mapInfo->tombstoneCounter, 1);
-               split::s_atomicSub((unsigned int*)&_mapInfo->fill, 1);
+            winner = split::s_findFirstSig(maskExists);
+            if (w_tid == winner-1) {
+               KEY_TYPE old = split::s_atomicCAS(&(buckets[probingindex].first), candidateKey, TOMBSTONE);
+               if (old==candidateKey) {
+                  split::s_atomicSub(&(_mapInfo->fill), 1);
+                  split::s_atomicAdd(&(_mapInfo->tombstoneCounter), 1);
+               }
            }
            warpDone = true;
         }
@@ -1327,7 +1330,6 @@ class Hashmap {
       if (neededPowerSize > _mapInfo->sizePower) {
          resize(neededPowerSize, targets::device, s);
       }
-      _mapInfo->currentMaxBucketOverflow = _mapInfo->currentMaxBucketOverflow;
       DeviceHasher::insert(keys, vals, buckets.data(), _mapInfo, len, s);
       return;
    }
@@ -1348,7 +1350,6 @@ class Hashmap {
       if (neededPowerSize > _mapInfo->sizePower) {
          resize(neededPowerSize, targets::device, s);
       }
-      _mapInfo->currentMaxBucketOverflow = _mapInfo->currentMaxBucketOverflow;
       DeviceHasher::insertIndex(keys, buckets.data(), _mapInfo, len, s);
       return;
    }
@@ -1457,11 +1458,9 @@ class Hashmap {
       // Copy over fill as it might have changed
       optimizeCPU(stream);
       if (_mapInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) {
-         std::cout << "Device Overflow" << std::endl;
          rehash(_mapInfo->sizePower + 1);
       } else {
          if (tombstone_count() > 0) {
-            std::cout << "Cleaning Tombstones" << std::endl;
            clean_tombstones(stream);
         }
      }
@@ -1568,7 +1567,9 @@ class Hashmap {
       auto hashIndex = hash(key);
 
       // Try to find the matching bucket.
-      for (size_t i = 0; i < _mapInfo->currentMaxBucketOverflow; i++) {
+      //for (size_t i = 0; i < _mapInfo->currentMaxBucketOverflow; i++) {
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
          const hash_pair<KEY_TYPE, VAL_TYPE>& candidate = buckets[(hashIndex + i) & bitMask];
 
         if (candidate.first == TOMBSTONE) {
@@ -1596,7 +1597,8 @@ class Hashmap {
       auto hashIndex = hash(key);
 
       // Try to find the matching bucket.
-      for (size_t i = 0; i < _mapInfo->currentMaxBucketOverflow; i++) {
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
          const hash_pair<KEY_TYPE, VAL_TYPE>& candidate = buckets[(hashIndex + i) & bitMask];
 
          if (candidate.first == TOMBSTONE) {
@@ -1626,7 +1628,8 @@ class Hashmap {
 
    HASHINATOR_DEVICEONLY
    device_iterator device_begin() {
-      for (size_t i = 0; i < buckets.size(); i++) {
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
          if (buckets[i].first != EMPTYBUCKET && buckets[i].first != TOMBSTONE) {
            return device_iterator(*this, i);
         }
@@ -1636,7 +1639,8 @@ class Hashmap {
 
    HASHINATOR_DEVICEONLY
    const_device_iterator device_begin() const {
-      for (size_t i = 0; i < buckets.size(); i++) {
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
          if (buckets[i].first != EMPTYBUCKET && buckets[i].first != TOMBSTONE) {
            return const_device_iterator(*this, i);
         }
@@ -1667,23 +1671,23 @@ class Hashmap {
    // Remove with tombstones on device
    HASHINATOR_DEVICEONLY
    device_iterator device_erase(device_iterator keyPos) {
-      // Get the index of this entry
-      size_t index = keyPos.getIndex();
+      const size_t index = keyPos.getIndex();
 
       // If this is an empty bucket or a tombstone we can return already
-      // TODO Use CAS here for safety
-      KEY_TYPE& item = buckets[index].first;
+      // NOTE: thread safety requires reading the value, not holding a reference
+      const KEY_TYPE item = buckets[index].first;
       if (item == EMPTYBUCKET || item == TOMBSTONE) {
          return ++keyPos;
       }
 
       // Let's simply add a tombstone here
-      split::s_atomicExch(&buckets[index].first, TOMBSTONE);
-      split::s_atomicSub((unsigned int*)(&_mapInfo->fill), 1);
-      split::s_atomicAdd((unsigned int*)(&_mapInfo->tombstoneCounter), 1);
-      ++keyPos;
-      return keyPos;
+      KEY_TYPE old = split::s_atomicCAS(&(buckets[index].first), item, TOMBSTONE);
+      if (old==item) {
+         split::s_atomicSub(&(_mapInfo->fill), 1);
+         split::s_atomicAdd(&(_mapInfo->tombstoneCounter), 1);
+      }
+      return ++keyPos;
    }
 
 private:
@@ -1695,21 +1699,21 @@ class Hashmap {
       int bitMask = (1 << _mapInfo->sizePower) - 1; // For efficient modulo of the array size
       auto hashIndex = hash(key);
       size_t i = 0;
-      while (i < buckets.size()) {
+      const size_t bsize = buckets.size();
+      while (i < bsize) {
          uint32_t vecindex = (hashIndex + i) & bitMask;
-         KEY_TYPE old = split::s_atomicCAS(&buckets[vecindex].first, EMPTYBUCKET, key);
+         KEY_TYPE old = split::s_atomicCAS(&(buckets[vecindex].first), EMPTYBUCKET, key);
          // Key does not exist so we create it and increment fill
          if (old == EMPTYBUCKET) {
-            split::s_atomicExch(&buckets[vecindex].first, key);
-            split::s_atomicExch(&buckets[vecindex].second, value);
-            split::s_atomicAdd((unsigned int*)(&_mapInfo->fill), 1);
+            split::s_atomicExch(&(buckets[vecindex].second), value);
+            split::s_atomicAdd(&(_mapInfo->fill), 1);
            thread_overflowLookup = i + 1;
            return;
         }
         // Key exists so we overwrite it. Fill stays the same
         if (old == key) {
-            split::s_atomicExch(&buckets[vecindex].second, value);
+            split::s_atomicExch(&(buckets[vecindex].second), value);
            thread_overflowLookup = i + 1;
            return;
         }
@@ -1733,8 +1737,8 @@ class Hashmap {
    void set_element(const KEY_TYPE& key, VAL_TYPE val) {
       size_t thread_overflowLookup = 0;
       insert_element(key, val, thread_overflowLookup);
-      atomicMax((unsigned long long*)&(_mapInfo->currentMaxBucketOverflow),
-                nextOverflow(thread_overflowLookup, defaults::WARPSIZE / defaults::elementsPerWarp));
+      split::s_atomicMax(&(_mapInfo->currentMaxBucketOverflow),
+                         nextOverflow(thread_overflowLookup, defaults::WARPSIZE / defaults::elementsPerWarp));
    }
 
    HASHINATOR_DEVICEONLY
@@ -1743,7 +1747,8 @@ class Hashmap {
       auto hashIndex = hash(key);
 
       // Try to find the matching bucket.
-      for (size_t i = 0; i < _mapInfo->currentMaxBucketOverflow; i++) {
+      const size_t bsize = buckets.size();
+      for (size_t i = 0; i < bsize; i++) {
          uint32_t vecindex = (hashIndex + i) & bitMask;
          const hash_pair<KEY_TYPE, VAL_TYPE>& candidate = buckets[vecindex];
         if (candidate.first == key) {
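Patch 4's warp-cooperative paths expect every lane of a (virtual) warp to present the same key, and they now probe up to the full bucket array rather than trusting currentMaxBucketOverflow. A calling sketch, illustrative only and not part of the series: it assumes a device-resident Hashinator::Hashmap<uint32_t, uint32_t>* executing under a full warp, and the kernel name is hypothetical.

   __global__ void warpAccess(Hashinator::Hashmap<uint32_t, uint32_t>* hmap, uint32_t key, uint32_t val) {
      const size_t w_tid = threadIdx.x % Hashinator::defaults::WARPSIZE;
      hmap->warpInsert(key, val, w_tid);   // all lanes pass the same key and value
      uint32_t found = 0;
      hmap->warpFind(key, found, w_tid);   // the hit, if any, is broadcast to all lanes
      hmap->warpErase(key, w_tid);         // the winning lane CASes the slot to TOMBSTONE
   }

The CAS-based tombstoning above mirrors the device_erase change in this patch: only the lane whose compare-and-swap actually observes the expected key updates the fill and tombstone counters, so concurrent erases of the same slot cannot double-count.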
From bb02249def190f6a08bb52d215dec19635d104c0 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Wed, 26 Jun 2024 13:24:51 +0300
Subject: [PATCH 5/6] Templated single-thread device insertions for skipping
 overwrites and returning insertion status, streamlined device_insert

---
 include/hashinator/hashinator.h | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index bd15ea8..c333bf2 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -1694,8 +1694,9 @@ class Hashmap {
    /**Device code for inserting elements. Nonexistent elements get created.
       Tombstones are accounted for.
    */
+   template <bool skipOverWrites = false>
    HASHINATOR_DEVICEONLY
-   void insert_element(const KEY_TYPE& key, VAL_TYPE value, size_t& thread_overflowLookup) {
+   bool insert_element(const KEY_TYPE& key, VAL_TYPE value, size_t& thread_overflowLookup) {
       int bitMask = (1 << _mapInfo->sizePower) - 1; // For efficient modulo of the array size
       auto hashIndex = hash(key);
       size_t i = 0;
@@ -1708,14 +1709,16 @@ class Hashmap {
          uint32_t vecindex = (hashIndex + i) & bitMask;
          KEY_TYPE old = split::s_atomicCAS(&(buckets[vecindex].first), EMPTYBUCKET, key);
          // Key does not exist so we create it and increment fill
          if (old == EMPTYBUCKET) {
            split::s_atomicExch(&(buckets[vecindex].second), value);
            split::s_atomicAdd(&(_mapInfo->fill), 1);
            thread_overflowLookup = i + 1;
-            return;
+            return true;
         }
         // Key exists so we overwrite it. Fill stays the same
         if (old == key) {
-            split::s_atomicExch(&(buckets[vecindex].second), value);
+            if constexpr (!skipOverWrites) {
+               split::s_atomicExch(&(buckets[vecindex].second), value);
+            }
            thread_overflowLookup = i + 1;
-            return;
+            return false;
         }
 
         i++;
@@ -1727,21 +1730,22 @@ class Hashmap {
    }
 
 public:
+   template <bool skipOverWrites = false>
    HASHINATOR_DEVICEONLY
    hash_pair<device_iterator, bool> device_insert(hash_pair<KEY_TYPE, VAL_TYPE> newEntry) {
-      bool found = device_find(newEntry.first) != device_end();
-      if (!found) {
-         set_element(newEntry.first, newEntry.second);
-      }
-      return hash_pair<device_iterator, bool>(device_find(newEntry.first), !found);
+      bool newentry = set_element<skipOverWrites>(newEntry.first, newEntry.second);
+      return hash_pair<device_iterator, bool>(device_find(newEntry.first), newentry);
    }
 
+   template <bool skipOverWrites = false>
    HASHINATOR_DEVICEONLY
-   void set_element(const KEY_TYPE& key, VAL_TYPE val) {
+   bool set_element(const KEY_TYPE& key, VAL_TYPE val) {
+      bool newentry = false;
       size_t thread_overflowLookup = 0;
-      insert_element(key, val, thread_overflowLookup);
+      newentry = insert_element<skipOverWrites>(key, val, thread_overflowLookup);
       split::s_atomicMax(&(_mapInfo->currentMaxBucketOverflow),
                          nextOverflow(thread_overflowLookup, defaults::WARPSIZE / defaults::elementsPerWarp));
+      return newentry;
    }
 
    HASHINATOR_DEVICEONLY
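With patch 5 a single device thread can both skip overwrites and learn whether its insertion claimed a fresh slot. A usage sketch, again illustrative only, reusing the hypothetical map pointer from the previous example:

   __global__ void insertOnce(Hashinator::Hashmap<uint32_t, uint32_t>* hmap) {
      // Default behaviour overwrites an existing value; true means a new entry was created.
      bool created = hmap->set_element(42u, 7u);
      // With skipOverWrites == true an existing value is left untouched.
      hmap->set_element<true>(42u, 9u);
      (void)created; // only silences an unused-variable warning in this sketch
   }

Routing device_insert through set_element removes the extra device_find round trip the old implementation performed before every insertion.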
From d25dd963310ff1702aedf8cfa830bbcef594306b Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Wed, 26 Jun 2024 13:49:43 +0300
Subject: [PATCH 6/6] compiler warning removal

---
 include/hashinator/hashinator.h      | 1 +
 unit_tests/stream_compaction/unit.cu | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index c333bf2..e2a37e4 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -1724,6 +1724,7 @@ class Hashmap {
          i++;
       }
       assert(false && "Hashmap completely overflown");
+      return false;
    }
 
 public:

diff --git a/unit_tests/stream_compaction/unit.cu b/unit_tests/stream_compaction/unit.cu
index 5c689f1..299bae3 100644
--- a/unit_tests/stream_compaction/unit.cu
+++ b/unit_tests/stream_compaction/unit.cu
@@ -6,8 +6,8 @@
 #include <gtest/gtest.h>
 #include "../../include/splitvector/splitvec.h"
 #include "../../include/splitvector/split_tools.h"
-#include "include/common.h"
-#include "include/splitvector/archMacros.h"
+#include "../../include/common.h"
+#include "../../include/splitvector/archMacros.h"
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
 #define expect_eq EXPECT_EQ
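Taken together with patch 3's wrappers, patch 2's accessors give callers raw but still thread-safe access to the map internals. A closing sketch, illustrative only: the function and parameter names are hypothetical, the false template argument suppresses the diagnostic printf, and the member names are taken from the hunks above.

   __device__ void widenOverflow(Hashinator::Hashmap<uint32_t, uint32_t>* hmap, size_t candidate) {
      auto* info = hmap->expose_mapinfo<false>();     // raw MapInfo*: fill, sizePower, overflow counters
      auto* data = hmap->expose_bucketdata<false>();  // raw hash_pair<uint32_t, uint32_t>* bucket array
      // The caller takes over all consistency guarantees, e.g. widening the
      // overflow bound atomically instead of writing to it directly.
      split::s_atomicMax(&(info->currentMaxBucketOverflow), candidate);
      (void)data;
   }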