From 07d1ee4d6b3d024d30672af54bdd2e57b1549609 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Mon, 8 Jan 2024 11:39:15 +0200
Subject: [PATCH 1/3] Added some overloads, increased use of streams in methods

---
 include/hashinator/hashinator.h        | 33 +++++++++-
 include/splitvector/split_allocators.h | 17 ++++--
 include/splitvector/split_tools.h      | 66 ++++++++++++++++++++
 include/splitvector/splitvec.h         | 84 ++++++++++++++++++++------
 4 files changed, 174 insertions(+), 26 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index b819e6b..4d48a4d 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -151,6 +151,16 @@ class Hashmap {
       return *this;
    }
 
+   /** Copy assign but using a provided stream */
+   void overwrite(const Hashmap& other, split_gpuStream_t stream = 0) {
+      if (this == &other) {
+         return;
+      }
+      SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_mapInfo, other._mapInfo, sizeof(MapInfo), split_gpuMemcpyDeviceToDevice, stream));
+      buckets.overwrite(other.buckets, stream);
+      return;
+   }
+
    Hashmap& operator=(Hashmap&& other) {
       if (this == &other) {
          return *this;
@@ -450,12 +460,33 @@ class Hashmap {
    }
 #endif
 
-   // Try to grow our buckets until we achieve a targetLF load factor
+#ifdef HASHINATOR_CPU_ONLY_MODE
+   // Try to grow our buckets until we achieve a targetLF load factor
    void resize_to_lf(float targetLF = 0.5) {
       while (load_factor() > targetLF) {
         rehash(_mapInfo->sizePower + 1);
      }
   }
+#else
+   // Try to grow our buckets until we achieve a targetLF load factor
+   void resize_to_lf(float targetLF = 0.5, targets t = targets::host, split_gpuStream_t s = 0) {
+      while (load_factor() > targetLF) {
+         switch (t) {
+         case targets::host:
+            rehash(_mapInfo->sizePower + 1);
+            break;
+         case targets::device:
+            device_rehash(_mapInfo->sizePower + 1, s);
+            break;
+         default:
+            std::cerr << "Defaulting to host rehashing" << std::endl;
+            resize(_mapInfo->sizePower + 1, targets::host);
+            break;
+         }
+      }
+      return;
+   }
+#endif
 
 #ifdef HASHINATOR_CPU_ONLY_MODE
    void resize(int newSizePower) { rehash(newSizePower); }
diff --git a/include/splitvector/split_allocators.h b/include/splitvector/split_allocators.h
index 5d0d17d..ccff591 100644
--- a/include/splitvector/split_allocators.h
+++ b/include/splitvector/split_allocators.h
@@ -33,7 +33,7 @@ namespace split {
 #define SPLIT_CHECK_ERR(err) (split::cuda_error(err, __FILE__, __LINE__))
 static void cuda_error(cudaError_t err, const char* file, int line) {
    if (err != cudaSuccess) {
-      printf("\n\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);
+      std::cerr << "\n\n" << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl;
       static_cast<size_t>(-1) / sizeof(value_type);
diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index 6e34b22..26375ed 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -651,6 +651,42 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
    return numel;
 }
 
+template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
+uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
+                     size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) {
+
+   uint32_t* d_counts;
+   uint32_t* d_offsets;
+   d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s));
+
+   // Phase 1 -- Calculate per warp workload
+   split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input, d_counts, rule, size);
+   d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t), s));
+
+   // Step 2 -- Exclusive Prefix Scan on offsets
+   if (nBlocks == 1) {
+      split_prefix_scan_raw<uint32_t, 2, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
+   } else {
+      split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
+   }
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+
+   // Step 3 -- Compaction
+   uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
+   split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP>
+       <<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
+           input, d_counts, d_offsets, output, rule, size, nBlocks, retval);
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   uint32_t numel;
+   SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
+   SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+   return numel;
+}
+
 /**
  * @brief Same as copy_keys_if but using raw memory
  */
@@ -708,6 +744,20 @@ estimateMemoryForCompaction(const split::SplitVector<T, split::split_unified_allocator<T>>& input) noexcept {
    return 8 * nBlocks * sizeof(uint32_t);
 }
 
+template <size_t BLOCKSIZE = 1024>
+[[nodiscard]] size_t
+estimateMemoryForCompaction(const size_t inputSize) noexcept {
+   // Figure out Blocks to use
+   size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE);
+   size_t nBlocks = nextPow2(_s);
+   if (nBlocks == 0) {
+      nBlocks += 1;
+   }
+
+   // Allocate with Mempool
+   return 8 * nBlocks * sizeof(uint32_t);
+}
+
 /**
  * @brief Same as copy_if but only for Hashinator keys
  */
@@ -827,5 +877,21 @@ void copy_if(split::SplitVector<T, split::split_unified_allocator<T>>& input,
    output.erase(&output[len], output.end());
 }
 
+template <typename T, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
+size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size,
+               split_gpuStream_t s = 0) {
+
+   // Figure out Blocks to use
+   size_t _s = std::ceil((float(size)) / (float)BLOCKSIZE);
+   size_t nBlocks = nextPow2(_s);
+   if (nBlocks == 0) {
+      nBlocks += 1;
+   }
+   assert(stack && "Invalid stack!");
+   Cuda_mempool mPool(stack, max_size);
+   auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
+   return len;
+}
+
 } // namespace tools
 } // namespace split
diff --git a/include/splitvector/splitvec.h b/include/splitvector/splitvec.h
index 02dbd8b..f3c2e5c 100644
--- a/include/splitvector/splitvec.h
+++ b/include/splitvector/splitvec.h
@@ -359,6 +359,33 @@ class SplitVector {
       _location = Residency::host;
       return *this;
    }
+
+   /** Copy assign but using a provided stream */
+   HOSTONLY void overwrite(const SplitVector& other, split_gpuStream_t stream = 0) {
+      if (this == &other) {
+         return;
+      }
+      // Match other's size prior to copying
+      resize(other.size(), true, stream);
+      auto copySafe = [&]() -> void {
+         for (size_t i = 0; i < size(); i++) {
+            _data[i] = other._data[i];
+         }
+      };
+
+      if constexpr (std::is_trivially_copyable<T>::value) {
+         if (other._location == Residency::device) {
+            _location = Residency::device;
+            optimizeGPU(stream);
+            SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_data, other._data, size() * sizeof(T), split_gpuMemcpyDeviceToDevice, stream));
+            return;
+         }
+      }
+      copySafe();
+      _location = Residency::host;
+      return;
+   }
+
 #endif
 
    /**
@@ -616,7 +643,7 @@ class SplitVector {
     *
     * @param requested_space The size of the requested space.
    */
-   HOSTONLY void reallocate(size_t requested_space) {
+   HOSTONLY void reallocate(size_t requested_space, split_gpuStream_t stream = 0) {
       if (requested_space == 0) {
          if (_data != nullptr) {
            _deallocate_and_destroy(capacity(), _data);
@@ -633,19 +660,31 @@ class SplitVector {
          this->_deallocate();
          throw std::bad_alloc();
      }
-
-      // Copy over
-      for (size_t i = 0; i < size(); i++) {
-         _new_data[i] = _data[i];
-      }
-
-      // Deallocate old space
-      _deallocate_and_destroy(capacity(), _data);
-
+      // Store addresses
+      const size_t __size = *_size;
+      const size_t __old_capacity = *_capacity;
+      T* __new_data = _new_data;
+      T* __data = _data;
       // Swap pointers & update capacity
       // Size remains the same ofc
       _data = _new_data;
       *_capacity = requested_space;
+      // Perform copy on device
+      if (__size > 0) {
+         int device;
+         SPLIT_CHECK_ERR(split_gpuGetDevice(&device));
+         SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(__data, __size * sizeof(T), device, stream));
+         SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(__new_data, requested_space * sizeof(T), device, stream));
+         SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream));
+         SPLIT_CHECK_ERR(split_gpuMemcpy(__new_data, __data, __size * sizeof(T), split_gpuMemcpyDeviceToDevice));
+         SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream));
+      }
+      // for (size_t i = 0; i < size(); i++) {
+      //    _new_data[i] = _data[i];
+      // }
+
+      // Deallocate old space
+      _deallocate_and_destroy(__old_capacity, __data);
       return;
   }
@@ -660,8 +699,8 @@ class SplitVector {
     * will be invalidated after a call.
     */
    HOSTONLY
-   void reserve(size_t requested_space, bool eco = false) {
-      size_t current_space = *_capacity;
+   void reserve(size_t requested_space, bool eco = false, split_gpuStream_t stream = 0) {
+      const size_t current_space = *_capacity;
       // Vector was default initialized
       if (_data == nullptr) {
          _deallocate();
@@ -670,9 +709,14 @@ class SplitVector {
          return;
      }
      // Nope.
+      const size_t currentSize = size();
       if (requested_space <= current_space) {
-         for (size_t i = size(); i < requested_space; ++i) {
-            _allocator.construct(&_data[i], T());
+         if (std::is_trivially_constructible<T>::value && _location == Residency::device) {
+            SPLIT_CHECK_ERR(split_gpuMemsetAsync(&_data[currentSize], 0, (requested_space - currentSize) * sizeof(T), stream));
+         } else {
+            for (size_t i = currentSize; i < requested_space; ++i) {
+               _allocator.construct(&_data[i], T());
+            }
          }
          return;
      }
@@ -681,7 +725,7 @@ class SplitVector {
       if (!eco) {
          requested_space *= _alloc_multiplier;
      }
-      reallocate(requested_space);
+      reallocate(requested_space, stream);
       return;
   }
@@ -697,13 +741,13 @@ class SplitVector {
     * will be invalid from now on.
     */
    HOSTONLY
-   void resize(size_t newSize, bool eco = false) {
+   void resize(size_t newSize, bool eco = false, split_gpuStream_t stream = 0) {
       // Let's reserve some space and change our size
       if (newSize <= size()) {
         *_size = newSize;
         return;
      }
-      reserve(newSize, eco);
+      reserve(newSize, eco, stream);
       *_size = newSize;
   }
@@ -729,13 +773,13 @@ class SplitVector {
     * @brief Increase the capacity of the SplitVector by 1.
     */
    HOSTONLY
-   void grow() { reserve(capacity() + 1); }
+   void grow(split_gpuStream_t stream = 0) { reserve(capacity() + 1, false, stream); }
 
    /**
    * @brief Reduce the capacity of the SplitVector to match its size.
    */
    HOSTONLY
-   void shrink_to_fit() {
+   void shrink_to_fit(split_gpuStream_t stream = 0) {
       size_t curr_cap = *_capacity;
       size_t curr_size = *_size;
 
       if (curr_cap == curr_size) {
         return;
      }
 
-      reallocate(curr_size);
+      reallocate(curr_size, stream);
       return;
   }

From 57116ea6f16d09d13cad44c272f271ca51cfd257 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Tue, 30 Jan 2024 16:30:32 +0200
Subject: [PATCH 2/3] commented out erase optimizeGPU

---
 include/hashinator/hashinator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index d011545..387dd8d 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -1296,7 +1296,7 @@ class Hashmap {
 
    // Uses Hasher's erase_kernel to delete all elements
    void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) {
-      buckets.optimizeGPU(s);
+      //buckets.optimizeGPU(s);
       // Remember the last number of tombstones
       size_t tbStore = tombstone_count();
       DeviceHasher::erase(keys, buckets.data(), &_mapInfo->tombstoneCounter, _mapInfo->sizePower,

From a7dde832220a88b663017b788f333ed567e84573 Mon Sep 17 00:00:00 2001
From: Markus Battarbee
Date: Thu, 1 Feb 2024 14:44:51 +0200
Subject: [PATCH 3/3] removed one comment

---
 include/hashinator/hashinator.h   | 1 -
 include/splitvector/split_tools.h | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index 780e5af..cb71f17 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -1298,7 +1298,6 @@ class Hashmap {
 
    // Uses Hasher's erase_kernel to delete all elements
    void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) {
-      //buckets.optimizeGPU(s);
       // Remember the last number of tombstones
       size_t tbStore = tombstone_count();
       DeviceHasher::erase(keys, buckets.data(), &_mapInfo->tombstoneCounter, _mapInfo->sizePower,
diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index 634eea6..bacd415 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -688,7 +688,7 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
    SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    return numel;
 }
- 
+
 /**
  * @brief Same as copy_keys_if but using raw memory
  */
@@ -896,6 +896,6 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t
    auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s);
    return len;
 }
- 
+
 } // namespace tools
 } // namespace split
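
A few usage sketches illustrate the new entry points. First, the GPU-mode resize_to_lf
overload from patch 1, which selects between host and device rehashing. The map type,
the using-declarations and the stream below are illustrative assumptions rather than
part of the patches; the targets spelling follows the switch cases above:

   #include "include/hashinator/hashinator.h"

   using Hashinator::Hashmap;
   using Hashinator::targets;

   // Hypothetical helper: grow the buckets (sizePower + 1 per step) on the
   // device until the load factor drops to 0.5 or below, issuing all work
   // on the caller's stream instead of the default stream.
   void pack_to_half_load(Hashmap<uint32_t, uint32_t>& map, split_gpuStream_t stream) {
      map.resize_to_lf(0.5, targets::device, stream);
   }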
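The raw-pointer copy_if pairs with the new size-only estimateMemoryForCompaction, so a
caller can preallocate the scratch stack consumed by the counting, prefix-scan and
compaction kernels. A sketch assuming int payloads, a hypothetical keep-positive rule,
and the library's split_gpuMalloc/split_gpuFree wrappers for the scratch buffer:

   #include "include/splitvector/split_tools.h"

   // Hypothetical rule functor: keep strictly positive values.
   struct KeepPositive {
      __host__ __device__ bool operator()(const int& element) const { return element > 0; }
   };

   size_t compact_positives(int* d_in, int* d_out, size_t n, split_gpuStream_t stream) {
      // Ask the library how many scratch bytes compaction needs for n elements.
      const size_t stackBytes = split::tools::estimateMemoryForCompaction(n);
      void* stack = nullptr;
      SPLIT_CHECK_ERR(split_gpuMalloc(&stack, stackBytes));
      // Stream-ordered compaction; returns the number of elements written to d_out.
      const size_t kept = split::tools::copy_if(d_in, d_out, n, KeepPositive{}, stack, stackBytes, stream);
      SPLIT_CHECK_ERR(split_gpuFree(stack));
      return kept;
   }

Because copy_if_raw synchronizes the stream before copying the element count back to
the host, the scratch buffer can be released as soon as the call returns.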
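On the container side, overwrite is a stream-ordered deep copy: it resizes in eco mode,
takes the fast device-to-device split_gpuMemcpyAsync path for trivially copyable
payloads resident on the device, and falls back to an element-wise host copy otherwise.
A minimal sketch, with hypothetical names:

   #include "include/splitvector/splitvec.h"

   // Snapshot src into dst without touching the default stream.
   void snapshot(const split::SplitVector<float>& src, split::SplitVector<float>& dst,
                 split_gpuStream_t stream) {
      dst.overwrite(src, stream);
      // Synchronize only where the copy must be visible to the host.
      SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream));
   }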