From 07d1ee4d6b3d024d30672af54bdd2e57b1549609 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Mon, 8 Jan 2024 11:39:15 +0200
Subject: [PATCH 1/3] Added some overloads, increased use of streams in methods

---
 include/hashinator/hashinator.h        | 33 +++++++++-
 include/splitvector/split_allocators.h | 17 ++++--
 include/splitvector/split_tools.h      | 66 ++++++++++++++++++++
 include/splitvector/splitvec.h         | 84 ++++++++++++++++++++------
 4 files changed, 174 insertions(+), 26 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index b819e6b..4d48a4d 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -151,6 +151,16 @@ class Hashmap {
       return *this;
    }
 
+   /** Copy assign but using a provided stream */
+   void overwrite(const Hashmap& other, split_gpuStream_t stream = 0) {
+      if (this == &other) {
+         return;
+      }
+      SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_mapInfo, other._mapInfo, sizeof(MapInfo), split_gpuMemcpyDeviceToDevice, stream));
+      buckets.overwrite(other.buckets, stream);
+      return;
+   }
+
    Hashmap& operator=(Hashmap&& other) {
       if (this == &other) {
          return *this;
@@ -450,12 +460,33 @@ class Hashmap {
    }
 #endif
 
-   // Try to grow our buckets until we achieve a targetLF load factor
+#ifdef HASHINATOR_CPU_ONLY_MODE
+   // Try to grow our buckets until we achieve a targetLF load factor
    void resize_to_lf(float targetLF = 0.5) {
       while (load_factor() > targetLF) {
         rehash(_mapInfo->sizePower + 1);
      }
   }
+#else
+   // Try to grow our buckets until we achieve a targetLF load factor
+   void resize_to_lf(float targetLF = 0.5, targets t = targets::host, split_gpuStream_t s = 0) {
+      while (load_factor() > targetLF) {
+         switch (t) {
+         case targets::host:
+            rehash(_mapInfo->sizePower + 1);
+            break;
+         case targets::device:
+            device_rehash(_mapInfo->sizePower + 1, s);
+            break;
+         default:
+            std::cerr << "Defaulting to host rehashing" << std::endl;
+            resize(_mapInfo->sizePower + 1, targets::host);
+            break;
+         }
+      }
+      return;
+   }
+#endif
 
 #ifdef HASHINATOR_CPU_ONLY_MODE
    void resize(int newSizePower) { rehash(newSizePower); }
diff --git a/include/splitvector/split_allocators.h b/include/splitvector/split_allocators.h
index 5d0d17d..ccff591 100644
--- a/include/splitvector/split_allocators.h
+++ b/include/splitvector/split_allocators.h
@@ -33,7 +33,7 @@ namespace split {
 #define SPLIT_CHECK_ERR(err) (split::cuda_error(err, __FILE__, __LINE__))
 static void cuda_error(cudaError_t err, const char* file, int line) {
    if (err != cudaSuccess) {
-      printf("\n\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);
+      std::cerr << "\n\n" << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl;
       static_cast<size_t>(-1) / sizeof(value_type);
diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index 6e34b22..26375ed 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -651,6 +651,42 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
    return numel;
 }
 
+template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
+uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
+                     size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) {
+
+   uint32_t* d_counts;
+   uint32_t* d_offsets;
+   d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s));
+
+   // Phase 1 -- Calculate per warp workload
+   split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input, d_counts, rule, size);
+   d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t), s));
+
+   // Step 2 -- Exclusive Prefix Scan on offsets
+   if (nBlocks == 1) {
+      split_prefix_scan_raw<uint32_t, 2, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
+   } else {
+      split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
+   }
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+
+   // Step 3 -- Compaction
+   uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
+   split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP>
+       <<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
+           input, d_counts, d_offsets, output, rule, size, nBlocks, retval);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   uint32_t numel;
+   SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   return numel;
+}
+
 /**
  * @brief Same as copy_keys_if but using raw memory
  */
@@ -708,6 +744,20 @@ estimateMemoryForCompaction(const split::SplitVector<T, split::split_unified_allocator<T>>& input) noexcept {
    return 8 * nBlocks * sizeof(uint32_t);
 }
 
+template <size_t BLOCKSIZE = 1024>
+[[nodiscard]] size_t
+estimateMemoryForCompaction(const size_t inputSize) noexcept {
+   // Figure out Blocks to use
+   size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE);
+   size_t nBlocks = nextPow2(_s);
+   if (nBlocks == 0) {
+      nBlocks += 1;
+   }
+
+   // Allocate with Mempool
+   return 8 * nBlocks * sizeof(uint32_t);
+}
+
 /**
  * @brief Same as copy_if but only for Hashinator keys
  */
@@ -827,5 +877,21 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
    output.erase(&output[len], output.end());
 }
 
+template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
+size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size,
+               split_gpuStream_t s = 0) {
+
+   // Figure out Blocks to use
+   size_t _s = std::ceil((float(size)) / (float)BLOCKSIZE);
+   size_t nBlocks = nextPow2(_s);
+   if (nBlocks == 0) {
+      nBlocks += 1;
+   }
+   assert(stack && "Invalid stack!");
+   Cuda_mempool mPool(stack, max_size);
+   auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
+   return len;
+}
+
 } // namespace tools
 } // namespace split
diff --git a/include/splitvector/splitvec.h b/include/splitvector/splitvec.h
index 02dbd8b..f3c2e5c 100644
--- a/include/splitvector/splitvec.h
+++ b/include/splitvector/splitvec.h
@@ -359,6 +359,33 @@ class SplitVector {
       _location = Residency::host;
       return *this;
    }
+
+   /** Copy assign but using a provided stream */
+   HOSTONLY void overwrite(const SplitVector& other, split_gpuStream_t stream = 0) {
+      if (this == &other) {
+         return;
+      }
+      // Match other's size prior to copying
+      resize(other.size(), true, stream);
+      auto copySafe = [&]() -> void {
+         for (size_t i = 0; i < size(); i++) {
+            _data[i] = other._data[i];
+         }
+      };
+
+      if constexpr (std::is_trivially_copyable<T>::value) {
+         if (other._location == Residency::device) {
+            _location = Residency::device;
+            optimizeGPU(stream);
+            SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_data, other._data, size() * sizeof(T), split_gpuMemcpyDeviceToDevice, stream));
+            return;
+         }
+      }
+      copySafe();
+      _location = Residency::host;
+      return;
+   }
+
 #endif
 
    /**
@@ -616,7 +643,7 @@ class SplitVector {
     *
     * @param requested_space The size of the requested space.
    */
-   HOSTONLY void reallocate(size_t requested_space) {
+   HOSTONLY void reallocate(size_t requested_space, split_gpuStream_t stream = 0) {
       if (requested_space == 0) {
          if (_data != nullptr) {
            _deallocate_and_destroy(capacity(), _data);
@@ -633,19 +660,31 @@ class SplitVector {
          this->_deallocate();
          throw std::bad_alloc();
      }
-
-      // Copy over
-      for (size_t i = 0; i < size(); i++) {
-         _new_data[i] = _data[i];
-      }
-
-      // Deallocate old space
-      _deallocate_and_destroy(capacity(), _data);
-
+      // Store addresses
+      const size_t __size = *_size;
+      const size_t __old_capacity = *_capacity;
+      T* __new_data = _new_data;
+      T* __data = _data;
       // Swap pointers & update capacity
       // Size remains the same ofc
       _data = _new_data;
       *_capacity = requested_space;
+      // Perform copy on device
+      if (__size > 0) {
+         int device;
+         SPLIT_CHECK_ERR(split_gpuGetDevice(&device));
+         SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(__data, __size * sizeof(T), device, stream));
+         SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(__new_data, requested_space * sizeof(T), device, stream));
+         SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream));
+         SPLIT_CHECK_ERR(split_gpuMemcpy(__new_data, __data, __size * sizeof(T), split_gpuMemcpyDeviceToDevice));
+         SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream));
+      }
+      // for (size_t i = 0; i < size(); i++) {
+      //    _new_data[i] = _data[i];
+      // }
+
+      // Deallocate old space
+      _deallocate_and_destroy(__old_capacity, __data);
       return;
   }
@@ -660,8 +699,8 @@ class SplitVector {
     * will be invalidated after a call.
     */
    HOSTONLY
-   void reserve(size_t requested_space, bool eco = false) {
-      size_t current_space = *_capacity;
+   void reserve(size_t requested_space, bool eco = false, split_gpuStream_t stream = 0) {
+      const size_t current_space = *_capacity;
       // Vector was default initialized
       if (_data == nullptr) {
          _deallocate();
@@ -670,9 +709,14 @@ class SplitVector {
          return;
      }
      // Nope.
+      const size_t currentSize = size();
       if (requested_space <= current_space) {
-         for (size_t i = size(); i < requested_space; ++i) {
-            _allocator.construct(&_data[i], T());
+         if (std::is_trivially_constructible<T>::value && _location == Residency::device) {
+            SPLIT_CHECK_ERR(split_gpuMemsetAsync(&_data[currentSize], 0, (requested_space - currentSize) * sizeof(T), stream));
+         } else {
+            for (size_t i = currentSize; i < requested_space; ++i) {
+               _allocator.construct(&_data[i], T());
+            }
          }
          return;
      }
@@ -681,7 +725,7 @@ class SplitVector {
       if (!eco) {
          requested_space *= _alloc_multiplier;
      }
-      reallocate(requested_space);
+      reallocate(requested_space, stream);
       return;
   }
@@ -697,13 +741,13 @@ class SplitVector {
     * will be invalid from now on.
     */
    HOSTONLY
-   void resize(size_t newSize, bool eco = false) {
+   void resize(size_t newSize, bool eco = false, split_gpuStream_t stream = 0) {
       // Let's reserve some space and change our size
       if (newSize <= size()) {
         *_size = newSize;
         return;
      }
-      reserve(newSize, eco);
+      reserve(newSize, eco, stream);
       *_size = newSize;
   }
@@ -729,13 +773,13 @@ class SplitVector {
     * @brief Increase the capacity of the SplitVector by 1.
     */
    HOSTONLY
-   void grow() { reserve(capacity() + 1); }
+   void grow(split_gpuStream_t stream = 0) { reserve(capacity() + 1, false, stream); }
 
    /**
    * @brief Reduce the capacity of the SplitVector to match its size.
    */
    HOSTONLY
-   void shrink_to_fit() {
+   void shrink_to_fit(split_gpuStream_t stream = 0) {
       size_t curr_cap = *_capacity;
       size_t curr_size = *_size;
 
       if (curr_cap == curr_size) {
         return;
      }
 
-      reallocate(curr_size);
+      reallocate(curr_size, stream);
       return;
   }

From 57116ea6f16d09d13cad44c272f271ca51cfd257 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Tue, 30 Jan 2024 16:30:32 +0200
Subject: [PATCH 2/3] commented out erase optimizeGPU

---
 include/hashinator/hashinator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index d011545..387dd8d 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -1296,7 +1296,7 @@ class Hashmap {
 
    // Uses Hasher's erase_kernel to delete all elements
    void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) {
-      buckets.optimizeGPU(s);
+      //buckets.optimizeGPU(s);
       // Remember the last number of tombstones
       size_t tbStore = tombstone_count();
       DeviceHasher::erase(keys, buckets.data(), &_mapInfo->tombstoneCounter, _mapInfo->sizePower,

From a7dde832220a88b663017b788f333ed567e84573 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Thu, 1 Feb 2024 14:44:51 +0200
Subject: [PATCH 3/3] removed one comment

---
 include/hashinator/hashinator.h   | 1 -
 include/splitvector/split_tools.h | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index 780e5af..cb71f17 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -1298,7 +1298,6 @@ class Hashmap {
 
    // Uses Hasher's erase_kernel to delete all elements
    void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) {
-      //buckets.optimizeGPU(s);
       // Remember the last number of tombstones
       size_t tbStore = tombstone_count();
       DeviceHasher::erase(keys, buckets.data(), &_mapInfo->tombstoneCounter, _mapInfo->sizePower,
diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index 634eea6..bacd415 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -688,7 +688,7 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
    SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return numel;
 }
- 
+
 /**
  * @brief Same as copy_keys_if but using raw memory
  */
@@ -896,6 +896,6 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t
    auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
    return len;
 }
- 
+
 } // namespace tools
 } // namespace split
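
A few usage sketches illustrate the new entry points. First, the GPU-mode resize_to_lf
overload from patch 1, which selects between host and device rehashing. The map type,
the using-declarations and the stream below are illustrative assumptions rather than
part of the patches; the targets spelling follows the switch cases above:

   #include "include/hashinator/hashinator.h"

   using Hashinator::Hashmap;
   using Hashinator::targets;

   // Hypothetical helper: grow the buckets (sizePower + 1 per step) on the
   // device until the load factor drops to 0.5 or below, issuing all work
   // on the caller's stream instead of the default stream.
   void pack_to_half_load(Hashmap<uint32_t, uint32_t>& map, split_gpuStream_t stream) {
      map.resize_to_lf(0.5, targets::device, stream);
   }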
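The raw-pointer copy_if pairs with the new size-only estimateMemoryForCompaction, so a
caller can preallocate the scratch stack consumed by the counting, prefix-scan and
compaction kernels. A sketch assuming int payloads, a hypothetical keep-positive rule,
and the library's split_gpuMalloc/split_gpuFree wrappers for the scratch buffer:

   #include "include/splitvector/split_tools.h"

   // Hypothetical rule functor: keep strictly positive values.
   struct KeepPositive {
      __host__ __device__ bool operator()(const int& element) const { return element > 0; }
   };

   size_t compact_positives(int* d_in, int* d_out, size_t n, split_gpuStream_t stream) {
      // Ask the library how many scratch bytes compaction needs for n elements.
      const size_t stackBytes = split::tools::estimateMemoryForCompaction(n);
      void* stack = nullptr;
      SPLIT_CHECK_ERR(split_gpuMalloc(&stack, stackBytes));
      // Stream-ordered compaction; returns the number of elements written to d_out.
      const size_t kept = split::tools::copy_if(d_in, d_out, n, KeepPositive{}, stack, stackBytes, stream);
      SPLIT_CHECK_ERR(split_gpuFree(stack));
      return kept;
   }

Because copy_if_raw synchronizes the stream before copying the element count back to
the host, the scratch buffer can be released as soon as the call returns.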
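On the container side, overwrite is a stream-ordered deep copy: it resizes in eco mode,
takes the fast device-to-device split_gpuMemcpyAsync path for trivially copyable
payloads resident on the device, and falls back to an element-wise host copy otherwise.
A minimal sketch, with hypothetical names:

   #include "include/splitvector/splitvec.h"

   // Snapshot src into dst without touching the default stream.
   void snapshot(const split::SplitVector<float>& src, split::SplitVector<float>& dst,
                 split_gpuStream_t stream) {
      dst.overwrite(src, stream);
      // Synchronize only where the copy must be visible to the host.
      SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream));
   }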