diff --git a/include/common.h b/include/common.h index 70c792a..cb48536 100644 --- a/include/common.h +++ b/include/common.h @@ -50,10 +50,9 @@ constexpr inline size_t nextPow2(size_t v) noexcept { * @brief Computes the next optimal overflow for the hasher kernels */ HASHINATOR_HOSTDEVICE -[[nodiscard]] -constexpr inline size_t nextOverflow(size_t currentOverflow, size_t virtualWarp) noexcept { - size_t remainder = currentOverflow % virtualWarp; - return ((remainder)==0)?currentOverflow: currentOverflow + (virtualWarp - remainder); +[[nodiscard]] constexpr inline size_t nextOverflow(size_t currentOverflow, size_t virtualWarp) noexcept { + size_t remainder = currentOverflow % virtualWarp; + return ((remainder) == 0) ? currentOverflow : currentOverflow + (virtualWarp - remainder); } /** diff --git a/include/hashinator/hashers.h b/include/hashinator/hashers.h index f7552be..fdefc87 100644 --- a/include/hashinator/hashers.h +++ b/include/hashinator/hashers.h @@ -23,6 +23,7 @@ #include "../splitvector/gpu_wrappers.h" #include "defaults.h" #include "hashfunctions.h" +#include "hash_pair.h" #ifdef __NVCC__ #include "kernels_NVIDIA.h" #endif @@ -160,6 +161,68 @@ class Hasher { SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); } + + /* ----------------------------------- Members used by Hashinator::Unordered_Set -----------------------------------*/ + + + static void insert_set(KEY_TYPE* keys, KEY_TYPE* buckets, int sizePower,size_t maxoverflow, size_t* d_overflow, + size_t* d_fill, size_t len, status* err,split_gpuStream_t s = 0) { + //Make sure this is being used by Unordered_Set + static_assert(std::is_same::value); + size_t blocks, blockSize; + *err = status::success; + launchParams(len, blocks, blockSize); + Hashinator::Hashers::insert_set_kernel + <<>>(keys, buckets, sizePower, maxoverflow, d_overflow, d_fill, len, err); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); +#ifndef NDEBUG + if (*err == status::fail) { + std::cerr << "***** Hashinator Runtime Warning ********" << std::endl; + std::cerr << "Warning: Hashmap completely overflown in Device Insert.\nNot all ellements were " + "inserted!\nConsider resizing before calling insert" + << std::endl; + std::cerr << "******************************" << std::endl; + } +#endif + } + + // Delete wrapper + static void erase_set(KEY_TYPE* keys, KEY_TYPE* buckets, size_t* d_tombstoneCounter, int sizePower, + size_t maxoverflow, size_t len, split_gpuStream_t s = 0) { + + //Make sure this is being used by Unordered_Set + static_assert(std::is_same::value); + size_t blocks, blockSize; + launchParams(len, blocks, blockSize); + Hashinator::Hashers::delete_set_kernel + <<>>(keys, buckets, d_tombstoneCounter, sizePower, maxoverflow, len); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + } + + // Reset wrapper + static void reset_set(KEY_TYPE* src, KEY_TYPE* dst, const int sizePower,size_t maxoverflow, + size_t len, split_gpuStream_t s = 0) { + //Make sure this is being used by Unordered_Set + static_assert(std::is_same::value); + size_t blocks, blockSize; + launchParams(len, blocks, blockSize); + Hashinator::Hashers::reset_to_empty_set + <<>>(src, dst, sizePower, maxoverflow, len); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + } + // Reset wrapper for all elements + static void reset_all_set(KEY_TYPE* dst, size_t len, split_gpuStream_t s = 0) { + //Make sure this is being used by Unordered_Set + static_assert(std::is_same::value); + size_t blocksNeeded = len / defaults::MAX_BLOCKSIZE; + blocksNeeded = blocksNeeded + (blocksNeeded == 0); + 
reset_all_to_empty_set<<>>(dst, len); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + } + + private: static void launchParams(size_t N, size_t& blocks, size_t& blockSize) { // fast ceil for positive ints diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashmap/hashmap.h similarity index 97% rename from include/hashinator/hashinator.h rename to include/hashinator/hashmap/hashmap.h index cb71f17..5fba42b 100644 --- a/include/hashinator/hashinator.h +++ b/include/hashinator/hashmap/hashmap.h @@ -1,4 +1,4 @@ -/* File: hashinator.h +/* File: hashmap.h * Authors: Kostis Papadakis, Urs Ganse and Markus Battarbee (2023) * Description: A hybrid hashmap that can operate on both * CPUs and GPUs using CUDA unified memory. @@ -24,20 +24,20 @@ #ifdef HASHINATOR_CPU_ONLY_MODE #define SPLIT_CPU_ONLY_MODE #endif -#include "../common.h" -#include "../splitvector/gpu_wrappers.h" -#include "../splitvector/split_allocators.h" -#include "../splitvector/splitvec.h" -#include "defaults.h" -#include "hash_pair.h" -#include "hashfunctions.h" +#include "../../common.h" +#include "../../splitvector/gpu_wrappers.h" +#include "../../splitvector/split_allocators.h" +#include "../../splitvector/splitvec.h" +#include "../defaults.h" +#include "../hash_pair.h" +#include "../hashfunctions.h" #include #include #include #include #ifndef HASHINATOR_CPU_ONLY_MODE -#include "../splitvector/split_tools.h" -#include "hashers.h" +#include "../../splitvector/split_tools.h" +#include "../hashers.h" #endif namespace Hashinator { @@ -99,7 +99,7 @@ class Hashmap { // Deallocates the bookeepping info and the device pointer void deallocate_device_handles() { - if (device_map==nullptr){ + if (device_map == nullptr) { return; } #ifndef HASHINATOR_CPU_ONLY_MODE @@ -138,11 +138,11 @@ class Hashmap { Hashmap(Hashmap&& other) { preallocate_device_handles(); _mapInfo = other._mapInfo; - other._mapInfo=nullptr; + other._mapInfo = nullptr; buckets = std::move(other.buckets); }; - Hashmap& operator=(const Hashmap& other) { + Hashmap& operator=(const Hashmap& other) { if (this == &other) { return *this; } @@ -167,8 +167,8 @@ class Hashmap { } _metaAllocator.deallocate(_mapInfo, 1); _mapInfo = other._mapInfo; - other._mapInfo=nullptr; - buckets =std::move(other.buckets); + other._mapInfo = nullptr; + buckets = std::move(other.buckets); return *this; } @@ -847,13 +847,13 @@ class Hashmap { if (w_tid == winner) { KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey); if (old == EMPTYBUCKET) { - threadOverflow =(probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex+1); + threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1); split::s_atomicExch(&buckets[probingindex].second, candidateVal); warpDone = 1; split::s_atomicAdd(&_mapInfo->fill, 1); if (threadOverflow > _mapInfo->currentMaxBucketOverflow) { split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow), - (unsigned long long)nextOverflow(threadOverflow,defaults::WARPSIZE)); + (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE)); } } else if (old == candidateKey) { // Parallel stuff are fun. Major edge case! @@ -931,14 +931,14 @@ class Hashmap { if (w_tid == winner) { KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey); if (old == EMPTYBUCKET) { - threadOverflow = (probingindex < optimalindex) ? 
(1 << sizePower) : (probingindex - optimalindex+1); + threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1); split::s_atomicExch(&buckets[probingindex].second, candidateVal); warpDone = 1; localCount = 1; split::s_atomicAdd(&_mapInfo->fill, 1); if (threadOverflow > _mapInfo->currentMaxBucketOverflow) { split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow), - (unsigned long long)nextOverflow(threadOverflow,defaults::WARPSIZE)); + (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE)); } } else if (old == candidateKey) { // Parallel stuff are fun. Major edge case! @@ -1113,7 +1113,7 @@ class Hashmap { split::tools::Cuda_mempool mPool(memory_for_pool, s); size_t retval = split::tools::copy_if_raw, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( - buckets, elements, rule, nBlocks, mPool, s); + buckets.data(), elements, buckets.size(), rule, nBlocks, mPool, s); return retval; } @@ -1133,8 +1133,8 @@ class Hashmap { return elements.size(); } template - size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, void *stack, size_t max_size, split_gpuStream_t s = 0, - bool prefetches = true) { + size_t extractKeysByPattern(split::SplitVector& elements, Rule rule, void* stack, size_t max_size, + split_gpuStream_t s = 0, bool prefetches = true) { elements.resize(_mapInfo->fill + 1, true); if (prefetches) { elements.optimizeGPU(s); @@ -1152,7 +1152,8 @@ class Hashmap { }; return extractKeysByPattern(elements, rule, s, prefetches); } - size_t extractAllKeys(split::SplitVector& elements, void *stack, size_t max_size, split_gpuStream_t s = 0, bool prefetches = true) { + size_t extractAllKeys(split::SplitVector& elements, void* stack, size_t max_size, split_gpuStream_t s = 0, + bool prefetches = true) { // Extract all keys auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE; @@ -1371,7 +1372,7 @@ class Hashmap { public: HASHINATOR_DEVICEONLY - device_iterator(Hashmap& hashtable, size_t index) : index(index),hashtable(&hashtable) {} + device_iterator(Hashmap& hashtable, size_t index) : index(index), hashtable(&hashtable) {} HASHINATOR_DEVICEONLY size_t getIndex() { return index; } @@ -1418,7 +1419,7 @@ class Hashmap { public: HASHINATOR_DEVICEONLY explicit const_device_iterator(const Hashmap& hashtable, size_t index) - : index(index), hashtable(&hashtable){} + : index(index), hashtable(&hashtable) {} HASHINATOR_DEVICEONLY size_t getIndex() { return index; } @@ -1629,7 +1630,8 @@ class Hashmap { void set_element(const KEY_TYPE& key, VAL_TYPE val) { size_t thread_overflowLookup = 0; insert_element(key, val, thread_overflowLookup); - atomicMax((unsigned long long*)&(_mapInfo->currentMaxBucketOverflow), nextOverflow(thread_overflowLookup,defaults::WARPSIZE/defaults::elementsPerWarp)); + atomicMax((unsigned long long*)&(_mapInfo->currentMaxBucketOverflow), + nextOverflow(thread_overflowLookup, defaults::WARPSIZE / defaults::elementsPerWarp)); } HASHINATOR_DEVICEONLY diff --git a/include/hashinator/kernels_AMD.h b/include/hashinator/kernels_AMD.h index fc46534..1caa1d9 100644 --- a/include/hashinator/kernels_AMD.h +++ b/include/hashinator/kernels_AMD.h @@ -203,7 +203,7 @@ __global__ void insert_kernel(hash_pair* src, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidate.second); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -243,7 +243,8 @@ __global__ 
void insert_kernel(hash_pair* src, hash_pair *d_overflow) { - split::s_atomicExch((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + split::s_atomicExch((unsigned long long*)d_overflow, + (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); } split::s_atomicAdd(d_fill, blockTotal); } @@ -334,7 +335,7 @@ __global__ void insert_kernel(KEY_TYPE* keys, VAL_TYPE* vals, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidateVal); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -374,7 +375,8 @@ __global__ void insert_kernel(KEY_TYPE* keys, VAL_TYPE* vals, hash_pair *d_overflow) { - split::s_atomicExch((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + split::s_atomicExch((unsigned long long*)d_overflow, + (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); } split::s_atomicAdd(d_fill, blockTotal); } @@ -561,7 +563,7 @@ __global__ void insert_index_kernel(KEY_TYPE* keys, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidateVal); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -601,7 +603,8 @@ __global__ void insert_index_kernel(KEY_TYPE* keys, hash_pair *d_overflow) { - split::s_atomicExch((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + split::s_atomicExch((unsigned long long*)d_overflow, + (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); } split::s_atomicAdd(d_fill, blockTotal); } @@ -716,5 +719,309 @@ __global__ void retrieve_kernel(hash_pair* src, hash_pair::max()> +__global__ void reset_all_to_empty_set(KEY_TYPE* dst, const size_t len) { + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + // Early exit here + if (tid >= len) { + return; + } + + if (dst[tid] != EMPTYBUCKET) { + dst[tid] = EMPTYBUCKET; + } + return; +} + +template ::max(), + class HashFunction = HashFunctions::Fibonacci, int WARPSIZE = defaults::WARPSIZE,int elementsPerWarp> +__global__ void insert_set_kernel(KEY_TYPE* keys, KEY_TYPE* buckets, int sizePower,size_t maxoverflow, size_t* d_overflow, + size_t* d_fill, size_t len, status* err) { + + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + const size_t proper_w_tid = tid % WARPSIZE; // the proper WID as if we had no Virtual warps + const size_t proper_wid = tid / WARPSIZE; + const size_t blockWid = proper_wid % (WARPSIZE / 4); // we have twice the warpsize and half the warps per block + + __shared__ uint32_t addMask[WARPSIZE / 2]; + __shared__ uint64_t warpOverflow[WARPSIZE / 2]; + // Early quit if we have more warps than elements to insert + if (wid >= len) { + return; + } + + // Zero out shared count; + if (proper_w_tid == 0 && blockWid == 0) { + for (int i = 0; i < WARPSIZE; i++) { + addMask[i] = 0; + warpOverflow[i] = 0; + } + } + __syncthreads(); + + uint64_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint64_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_AMD(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidateKey = keys[wid]; + const int bitMask = (1 << (sizePower)) - 1; + 
const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + uint32_t localCount = 0; + uint64_t vWarpDone = 0; // state of virtual warp + uint64_t threadOverflow = 0; + + for (size_t i = 0; i < (1 << sizePower); i += VIRTUALWARP) { + + // Check if this virtual warp is done. + if (vWarpDone) { + break; + } + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = buckets[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == EMPTYBUCKET, submask) & submask; + + // Check if this elements already exists + auto already_exists = split::s_warpVote(target == candidateKey, submask) & submask; + if (already_exists) {vWarpDone = 1;} + + // If any duplicate was there now is the time for the whole Virtual warp to find out! + vWarpDone = split::s_warpVote(vWarpDone > 0, submask) & submask; + + while (mask && !vWarpDone) { + int winner = split::s_findFirstSig(mask) - 1; + int sub_winner = winner - (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == sub_winner) { + KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex], EMPTYBUCKET, candidateKey); + if (old == EMPTYBUCKET) { + threadOverflow = std::min(i + w_tid, static_cast(1 << sizePower)) + 1; + vWarpDone = 1; + // Flip the bit which corresponds to the thread that added an element + localCount++; + } else if (old == candidateKey) { + vWarpDone = 1; + } + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + vWarpDone = split::s_warpVote(vWarpDone > 0, submask) & submask; + mask ^= (1UL << winner); + } + } + + // Update fill and overflow + __syncthreads(); + // Per warp reduction + int warpTotals = warpReduce(localCount); + uint64_t perWarpOverflow = warpReduceMax(threadOverflow); + __syncthreads(); + + // Store to shmem minding Bank Conflicts + if (proper_w_tid == 0) { + // Write the count to the same place + addMask[(blockWid)] = warpTotals; + warpOverflow[(blockWid)] = perWarpOverflow; + } + + __syncthreads(); + // First warp in block reductions + if (blockWid == 0) { + uint64_t blockOverflow = warpReduceMax(warpOverflow[(proper_w_tid)]); + int blockTotal = warpReduce(addMask[(proper_w_tid)]); + // First thread updates fill and overlfow (1 update per block) + if (proper_w_tid == 0) { + if (blockOverflow > *d_overflow) { + split::s_atomicExch((unsigned long long*)d_overflow, + (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); + } + split::s_atomicAdd(d_fill, blockTotal); + } + } + return; +} + + +/* + * In a similar way to the insert and retrieve kernels we + * delete keys in "keys" if they do exist in the set. + * If the keys do not exist we do nothing. 
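+ * Found keys are overwritten with TOMBSTONE and the number of deletions is
+ * accumulated into d_tombstoneCounter (one atomic update per block).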
+ * */ +template ::max(), + KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1, class HashFunction = HashFunctions::Fibonacci, + int WARPSIZE = defaults::WARPSIZE, int elementsPerWarp> +__global__ void delete_set_kernel(KEY_TYPE* keys, KEY_TYPE* buckets, size_t* d_tombstoneCounter, + int sizePower, size_t maxoverflow, size_t len) { + + + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + const size_t proper_w_tid = tid % WARPSIZE; // the proper WID as if we had no Virtual warps + const size_t proper_wid = tid / WARPSIZE; + const size_t blockWid = proper_wid % (WARPSIZE / 4); // we have twice the warpsize and half the warps per block + + __shared__ uint32_t deleteMask[WARPSIZE / 2]; + + // Early quit if we have more warps than elements to handle + if (wid >= len) { + return; + } + + // Zero out shared count; + if (proper_w_tid == 0 && blockWid == 0) { + for (int i = 0; i < WARPSIZE; i++) { + deleteMask[i] = 0; + } + } + __syncthreads(); + + uint64_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint64_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_AMD(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidateKey = keys[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + uint32_t localCount = 0; + + for (size_t i = 0; i < maxoverflow; i += VIRTUALWARP) { + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + const auto maskExists = + split::s_warpVote(buckets[probingindex] == candidateKey, SPLIT_VOTING_MASK) & submask; + const auto emptyFound = + split::s_warpVote(buckets[probingindex] == EMPTYBUCKET, SPLIT_VOTING_MASK) & submask; + // If we encountered empty and the key is not in the range of this warp that means the key is not in hashmap. 
+ if (!maskExists && emptyFound) { + return; + } + if (maskExists) { + int winner = split::s_findFirstSig(maskExists) - 1; + winner -= (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == winner) { + split::s_atomicExch(&buckets[probingindex], TOMBSTONE); + localCount++; + } + break; + } + } + + // Update tombstone counter + __syncthreads(); + // Per warp reduction + int warpTotals = warpReduce(localCount); + __syncthreads(); + + // Store to shmem minding Bank Conflicts + if (proper_w_tid == 0) { + // Write the count to the same place + deleteMask[(blockWid)] = warpTotals; + } + + __syncthreads(); + // First warp in block reductions + if (blockWid == 0) { + int blockTotal = warpReduce(deleteMask[(proper_w_tid)]); + // First thread updates fill and overlfow (1 update per block) + if (proper_w_tid == 0) { + split::s_atomicAdd(d_tombstoneCounter, blockTotal); + } + } + return; +} + +/* + * Resets all elements pointed by src to EMPTY in dst + * If an elements in src is not found this will assert(false) + * */ +template ::max(), + class HashFunction = HashFunctions::Fibonacci, int WARPSIZE = defaults::WARPSIZE, + int elementsPerWarp> +__global__ void reset_to_empty_set(KEY_TYPE* src, KEY_TYPE* dst,const int sizePower, size_t maxoverflow, size_t len) + + +{ + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + + // Early quit if we have more warps than elements to insert + if (wid >= len) { + return; + } + + uint64_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint64_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_AMD(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidate = src[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidate, sizePower); + uint64_t vWarpDone = 0; // state of virtual warp + + for (size_t i = 0; i < (1 << sizePower); i += VIRTUALWARP) { + + // Check if this virtual warp is done. 
+ if (vWarpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = dst[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == candidate, submask) & submask; + + while (mask && !vWarpDone) { + int winner = split::s_findFirstSig(mask) - 1; + int sub_winner = winner - (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == sub_winner) { + dst[probingindex] = EMPTYBUCKET; + vWarpDone = 1; + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + vWarpDone = split::s_warpVote(vWarpDone > 0, submask) & submask; + mask ^= (1UL << winner); + } + } + return; +} + } // namespace Hashers } // namespace Hashinator diff --git a/include/hashinator/kernels_NVIDIA.h b/include/hashinator/kernels_NVIDIA.h index 4f009c5..7d9e256 100644 --- a/include/hashinator/kernels_NVIDIA.h +++ b/include/hashinator/kernels_NVIDIA.h @@ -208,7 +208,7 @@ __global__ void insert_kernel(hash_pair* src, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidate.second); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -252,7 +252,7 @@ __global__ void insert_kernel(hash_pair* src, hash_pair(addMask[(proper_w_tid)]); // First thread updates fill and overlfow (1 update per block) if (proper_w_tid == 0) { - atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); split::s_atomicAdd(d_fill, blockTotal); ; } @@ -368,7 +368,7 @@ __global__ void insert_kernel(KEY_TYPE* keys, VAL_TYPE* vals, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidateVal); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -412,7 +412,7 @@ __global__ void insert_kernel(KEY_TYPE* keys, VAL_TYPE* vals, hash_pair(addMask[(proper_w_tid)]); // First thread updates fill and overlfow (1 update per block) if (proper_w_tid == 0) { - atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); split::s_atomicAdd(d_fill, blockTotal); ; } @@ -528,7 +528,7 @@ __global__ void insert_index_kernel(KEY_TYPE* keys, hash_pair(1<(1 << sizePower)) + 1; split::s_atomicExch(&buckets[probingindex].second, candidateVal); vWarpDone = 1; // Flip the bit which corresponds to the thread that added an element @@ -572,7 +572,7 @@ __global__ void insert_index_kernel(KEY_TYPE* keys, hash_pair(addMask[(proper_w_tid)]); // First thread updates fill and overlfow (1 update per block) if (proper_w_tid == 0) { - atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow,VIRTUALWARP)); + atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); split::s_atomicAdd(d_fill, blockTotal); ; } @@ -796,5 +796,333 @@ __global__ void retrieve_kernel(hash_pair* src, hash_pair::max()> +__global__ void reset_all_to_empty_set(KEY_TYPE* dst, const size_t len) { + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + // Early exit here + if (tid >= len) { + return; + } + if (dst[tid]!= EMPTYBUCKET) { + dst[tid]= 
EMPTYBUCKET; + } + return; +} + + +/*Warp Synchronous hashing kernel for hashinator's internal use: + * Similar to insert kernel used by Hashinator::hashmap. + * */ +template ::max(), + class HashFunction = HashFunctions::Fibonacci, int WARPSIZE = defaults::WARPSIZE,int elementsPerWarp> +__global__ void insert_set_kernel(KEY_TYPE* keys, KEY_TYPE* buckets, int sizePower,size_t maxoverflow, size_t* d_overflow, + size_t* d_fill, size_t len, status* err) { + + __shared__ uint32_t addMask[WARPSIZE]; + __shared__ uint64_t warpOverflow[WARPSIZE]; + + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + const size_t proper_w_tid = tid % WARPSIZE; // the proper WID as if we had no Virtual warps + const size_t proper_wid = tid / WARPSIZE; + const size_t blockWid = proper_wid % WARPSIZE; + + // Early quit if we have more warps than elements to insert + if (wid >= len) { + return; + } + + // Zero out shared count; + if (proper_w_tid == 0 && blockWid == 0) { + for (int i = 0; i < WARPSIZE; i++) { + addMask[i] = 0; + warpOverflow[i] = 0; + } + } + __syncthreads(); + + uint32_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint32_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_CUDA(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidateKey = keys[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + uint32_t vWarpDone = 0; // state of virtual warp + uint32_t localCount = 0; + uint64_t threadOverflow = 0; + + for (size_t i = 0; i < (1 << sizePower); i += VIRTUALWARP) { + + // Check if this virtual warp is done. + if (vWarpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = buckets[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == EMPTYBUCKET, submask); + + // Check if this elements already exists + auto already_exists = split::s_warpVote(target == candidateKey, submask); + if (already_exists) {vWarpDone=1;} + + + // If any duplicate was there now is the time for the whole Virtual warp to find out! + vWarpDone = split::s_warpVoteAny(vWarpDone, submask); + + while (mask && !vWarpDone) { + int winner = split::s_findFirstSig(mask) - 1; + int sub_winner = winner - (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == sub_winner) { + KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex], EMPTYBUCKET, candidateKey); + if (old == EMPTYBUCKET) { + //Key added + threadOverflow = std::min(i + w_tid, static_cast(1 << sizePower)) + 1; + vWarpDone = 1; + localCount++; + } else if (old == candidateKey) { + // Parallel stuff are fun. Major edge case! 
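+ // The CAS returned our own key: another virtual warp inserted it between our vote and the CAS, so treat it as already present.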
+ vWarpDone = 1; + } + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + vWarpDone = split::s_warpVoteAny(vWarpDone, submask); + mask ^= (1UL << winner); + } + } + + /* + Update fill and overflow in 2 steps: + Step 1--> First thread per warp reduces the total elements added (per Warp) + Step 2--> Reduce the blockTotal from the warpTotals but do it in registers using the first warp in the block + */ + + // Per warp reduction + __syncwarp(); + int warpTotals = warpReduce(localCount); + uint64_t perWarpOverflow = warpReduceMax(threadOverflow); + __syncwarp(); + + // Store to shmem minding Bank Conflicts + if (proper_w_tid == 0) { + // Write the count to the same place + addMask[(blockWid)] = warpTotals; + warpOverflow[(blockWid)] = perWarpOverflow; + } + + __syncthreads(); + // First warp in block reductions + if (blockWid == 0) { + uint64_t blockOverflow = warpReduceMax(warpOverflow[(proper_w_tid)]); + int blockTotal = warpReduce(addMask[(proper_w_tid)]); + // First thread updates fill and overlfow (1 update per block) + if (proper_w_tid == 0) { + atomicMax((unsigned long long*)d_overflow, (unsigned long long)nextOverflow(blockOverflow, VIRTUALWARP)); + split::s_atomicAdd(d_fill, blockTotal); + ; + } + } + + // Make sure everyone actually made it otherwise raise the error flag. + if (split::s_warpVote(vWarpDone, SPLIT_VOTING_MASK) != __activemask()) { + split::s_atomicExch((uint32_t*)err, (uint32_t)status::fail); + } + return; +} + +/* + * In a similar way to the insert and retrieve kernels we + * delete keys in "keys" if they do exist in the set. + * If the keys do not exist we do nothing. + * */ +template ::max(), + KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1, class HashFunction = HashFunctions::Fibonacci, + int WARPSIZE = defaults::WARPSIZE, int elementsPerWarp> +__global__ void delete_set_kernel(KEY_TYPE* keys, KEY_TYPE* buckets, size_t* d_tombstoneCounter, + int sizePower, size_t maxoverflow, size_t len) { + + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + const size_t proper_w_tid = tid % WARPSIZE; // the proper WID as if we had no Virtual warps + const size_t proper_wid = tid / WARPSIZE; + const size_t blockWid = proper_wid % WARPSIZE; + + __shared__ uint32_t deleteMask[WARPSIZE]; + // Early quit if we have more warps than elements to handle + if (wid >= len) { + return; + } + + // Zero out shmem; + if (proper_w_tid == 0 && blockWid == 0) { + for (int i = 0; i < WARPSIZE; i++) { + deleteMask[i] = 0; + } + } + __syncthreads(); + + uint32_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint32_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_CUDA(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidateKey = keys[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + uint32_t localCount = 0; + uint32_t vWarpDone = 0; // state of virtual warp + + for (size_t i = 0; i < maxoverflow; i += VIRTUALWARP) { + if (vWarpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + const auto maskExists = split::s_warpVote(buckets[probingindex] == candidateKey, submask); + 
const auto emptyFound = split::s_warpVote(buckets[probingindex] == EMPTYBUCKET, submask); + // If we encountered empty and the key is not in the range of this warp that means the key is not in hashmap. + if (!maskExists && emptyFound) { + vWarpDone = 1; + } + if (maskExists) { + int winner = split::s_findFirstSig(maskExists) - 1; + winner -= (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == winner) { + split::s_atomicExch(&buckets[probingindex], TOMBSTONE); + localCount++; + vWarpDone = 1; + } + } + vWarpDone = split::s_warpVoteAny(vWarpDone, submask); + } + + /* + Update tombstone counter and overflow in 2 steps: + Step 1--> First thread per warp reduces the total deleted elements (per Warp) + Step 2--> Reduce the blockTotal from the warpTotals but do it in registers using the first warp in the block + */ + + // Per warp reduction + __syncwarp(); + int warpTotals = warpReduce(localCount); + __syncwarp(); + + // Store to shmem minding Bank Conflicts + if (proper_w_tid == 0) { + // Write the count to the same place + deleteMask[(blockWid)] = warpTotals; + } + + __syncthreads(); + // First warp in block reductions + if (blockWid == 0) { + int blockTotal = warpReduce(deleteMask[(proper_w_tid)]); + // First thread updates the tombstone counter (1 update per block) + if (proper_w_tid == 0) { + split::s_atomicAdd(d_tombstoneCounter, blockTotal); + } + } + return; +} + +/* + * Resets all elements pointed by src to EMPTY in dst + * If an elements in src is not found this will assert(false) + * */ +template ::max(), + class HashFunction = HashFunctions::Fibonacci, int WARPSIZE = defaults::WARPSIZE, + int elementsPerWarp> +__global__ void reset_to_empty_set(KEY_TYPE* src, KEY_TYPE* dst,const int sizePower, size_t maxoverflow, size_t len) + +{ + const int VIRTUALWARP = WARPSIZE / elementsPerWarp; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid / VIRTUALWARP; + const size_t w_tid = tid % VIRTUALWARP; + + // Early quit if we have more warps than elements to insert + if (wid >= len) { + return; + } + + uint32_t subwarp_relative_index = (wid) % (WARPSIZE / VIRTUALWARP); + uint32_t submask; + if constexpr (elementsPerWarp == 1) { + // TODO mind AMD 64 thread wavefronts + submask = SPLIT_VOTING_MASK; + } else { + submask = split::getIntraWarpMask_CUDA(0, VIRTUALWARP * subwarp_relative_index + 1, + VIRTUALWARP * subwarp_relative_index + VIRTUALWARP); + } + + KEY_TYPE candidate = src[wid]; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidate, sizePower); + uint32_t vWarpDone = 0; // state of virtual warp + + for (size_t i = 0; i < (1 << sizePower); i += VIRTUALWARP) { + + // Check if this virtual warp is done. 
+ if (vWarpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = dst[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == candidate, submask); + + while (mask && !vWarpDone) { + int winner = split::s_findFirstSig(mask) - 1; + int sub_winner = winner - (subwarp_relative_index)*VIRTUALWARP; + if (w_tid == sub_winner) { + dst[probingindex] = EMPTYBUCKET; + vWarpDone = 1; + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + vWarpDone = split::s_warpVoteAny(vWarpDone, submask); + mask ^= (1UL << winner); + } + } + + if (split::s_warpVote(vWarpDone, SPLIT_VOTING_MASK) != __activemask()) { + assert(0); + } + return; +} + } // namespace Hashers } // namespace Hashinator diff --git a/include/hashinator/unordered_set/unordered_set.h b/include/hashinator/unordered_set/unordered_set.h new file mode 100644 index 0000000..5669f43 --- /dev/null +++ b/include/hashinator/unordered_set/unordered_set.h @@ -0,0 +1,1260 @@ +#pragma once +/* File: unordered_set.h + * Authors: Kostis Papadakis, Urs Ganse and Markus Battarbee (2023) + * + * This file defines the following classes: + * --Hashinator::Unordered_Set; + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * */ +#pragma once +#ifdef HASHINATOR_CPU_ONLY_MODE +#define SPLIT_CPU_ONLY_MODE +#endif +#include "../../common.h" +#include "../../splitvector/gpu_wrappers.h" +#include "../../splitvector/split_allocators.h" +#include "../../splitvector/splitvec.h" +#include "../defaults.h" +#include "../hash_pair.h" +#include "../hashfunctions.h" +#include +#include +#ifndef HASHINATOR_CPU_ONLY_MODE +#include "../../splitvector/split_tools.h" +#include "../hashers.h" +#endif +#define UNUSED(x) (void)(x) +namespace Hashinator { + +#ifndef HASHINATOR_CPU_ONLY_MODE +template +using DefaultMetaAllocator = split::split_unified_allocator; +#define DefaultHasher \ + Hashers::Hasher +#else +template +using DefaultMetaAllocator = split::split_host_allocator; +#define DefaultHasher void +#endif + +typedef struct Info { + Info(){}; + Info(int sz) + : sizePower(sz), fill(0), currentMaxBucketOverflow(defaults::BUCKET_OVERFLOW), tombstoneCounter(0), + err(status::invalid) {} + int sizePower; + size_t fill; + size_t currentMaxBucketOverflow; + size_t tombstoneCounter; + status err; +} SetInfo; + +template ::max(), + KEY_TYPE TOMBSTONE = EMPTYBUCKET - 1, class HashFunction = HashFunctions::Fibonacci, + class DeviceHasher = DefaultHasher, class Meta_Allocator = DefaultMetaAllocator> + +class Unordered_Set { + + // members +private: + SetInfo* _setInfo; + split::SplitVector buckets; + Meta_Allocator _metaAllocator; + Unordered_Set* device_set; + + HASHINATOR_HOSTDEVICE + uint32_t hash(KEY_TYPE in) const { + static_assert(std::is_arithmetic::value); + return HashFunction::_hash(in, _setInfo->sizePower); + } + + HASHINATOR_HOSTDEVICE + inline void set_status(status code) noexcept { _setInfo->err = code; } + + void addKey(KEY_TYPE key) noexcept { + int bitMask = (1 << _setInfo->sizePower) - 1; + auto hashIndex = hash(key); + // Try to find the matching bucket. 
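+ // Linear probing: skip tombstones and claim the first empty bucket within the allowed overflow window; if none is free, grow the table and retry.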
+ for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + if (candidate == EMPTYBUCKET) { + candidate = key; + _setInfo->fill++; + return; + } + if (candidate == TOMBSTONE) { + continue; + } + } + rehash(_setInfo->sizePower + 1); + return addKey(key); + } + + void preallocate_device_handles() { + #ifndef HASHINATOR_CPU_ONLY_MODE + SPLIT_CHECK_ERR(split_gpuMalloc((void**)&device_set, sizeof(Unordered_Set))); + #endif + } + + // Deallocates the bookeepping info and the device pointer + void deallocate_device_handles() { + if (device_set == nullptr) { + return; + } + #ifndef HASHINATOR_CPU_ONLY_MODE + SPLIT_CHECK_ERR(split_gpuFree(device_set)); + device_set= nullptr; + #endif + } + +public: + // Constructors Destructors and = Operators with move/cpy semantics + Unordered_Set(uint32_t sizePower = 5) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = SetInfo(sizePower); + buckets = split::SplitVector(1 << _setInfo->sizePower, EMPTYBUCKET); + } + + Unordered_Set(const Unordered_Set& other) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = *other._setInfo; + buckets = other.buckets; + } + + Unordered_Set(const std::initializer_list& list) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = SetInfo(5); + buckets = split::SplitVector(1 << _setInfo->sizePower, EMPTYBUCKET); + for (size_t i = 0; i < list.size(); i++) { + insert(list.begin()[i]); + } + } + + Unordered_Set(const std::vector& vec) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = SetInfo(5); + buckets = split::SplitVector(1 << _setInfo->sizePower, EMPTYBUCKET); + for (size_t i = 0; i < vec.size(); i++) { + insert(vec.begin()[i]); + } + } + + Unordered_Set(std::initializer_list&& list) { + preallocate_device_handles(); + _setInfo = _metaAllocator.allocate(1); + *_setInfo = SetInfo(5); + buckets = split::SplitVector(1 << _setInfo->sizePower, EMPTYBUCKET); + for (size_t i = 0; i < list.size(); i++) { + insert(list.begin()[i]); + } + } + + Unordered_Set(Unordered_Set&& other) noexcept { + preallocate_device_handles(); + *_setInfo = other.SetInfo; + other._setInfo = nullptr; + buckets = std::move(other.buckets); + } + + ~Unordered_Set() { + deallocate_device_handles(); + _metaAllocator.deallocate(_setInfo, 1); + } + + Unordered_Set& operator=(const Unordered_Set& other) { + if (this == &other) { + return *this; + } + *_setInfo = *(other._setInfo); + buckets = other.buckets; + return *this; + } + + Unordered_Set& operator=(Unordered_Set&& other) noexcept { + if (this == &other) { + return *this; + } + _metaAllocator.deallocate(_setInfo, 1); + _setInfo = other._setInfo; + other._setInfo = nullptr; + buckets = std::move(other.buckets); + return *this; + } + + HASHINATOR_HOSTDEVICE + inline status peek_status(void) noexcept { + status retval = _setInfo->err; + _setInfo->err = status::invalid; + return retval; + } + +#ifdef HASHINATOR_CPU_ONLY_MODE + void* operator new(size_t len) { + void* ptr = (void*)malloc(len); + return ptr; + } + + void operator delete(void* ptr) { free(ptr); } + + void* operator new[](size_t len) { + void* ptr = (void*)malloc(len); + return ptr; + } + + void operator delete[](void* ptr) { free(ptr); } + +#else + void* operator new(size_t len) { + void* ptr; + SPLIT_CHECK_ERR(split_gpuMallocManaged(&ptr, len)); + return ptr; + } + + void operator delete(void* ptr) { SPLIT_CHECK_ERR(split_gpuFree(ptr)); 
} + + void* operator new[](size_t len) { + void* ptr; + SPLIT_CHECK_ERR(split_gpuMallocManaged(&ptr, len)); + return ptr; + } + + void operator delete[](void* ptr) { split_gpuFree(ptr); } + + void copyMetadata(SetInfo* dst, split_gpuStream_t s = 0) { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(dst, _setInfo, sizeof(SetInfo), split_gpuMemcpyDeviceToHost, s)); + } + +#endif + +#ifndef HASHINATOR_CPU_ONLY_MODE + Unordered_Set* upload(split_gpuStream_t stream = 0) { + optimizeGPU(stream); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_set, this, sizeof(Unordered_Set), split_gpuMemcpyHostToDevice, stream)); + return device_set; + } + + void download(split_gpuStream_t stream = 0) { + // Copy over fill as it might have changed + optimizeCPU(stream); + if (_setInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) { + rehash(_setInfo->sizePower + 1); + } else { + if (tombstone_count() > 0) { + clean_tombstones(stream); + } + } + } + + void optimizeGPU(split_gpuStream_t stream = 0) noexcept { + int device; + SPLIT_CHECK_ERR(split_gpuGetDevice(&device)); + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_setInfo, sizeof(SetInfo), device, stream)); + buckets.optimizeGPU(stream); + } + + /*Manually prefetch data on Host*/ + void optimizeCPU(split_gpuStream_t stream = 0) noexcept { + SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_setInfo, sizeof(SetInfo), split_gpuCpuDeviceId, stream)); + buckets.optimizeCPU(stream); + } + +#endif + + void rehash(uint32_t newSizePower) { + if (newSizePower > 32) { + throw std::out_of_range("Hashmap ran into rehashing catastrophe and exceeded 32bit buckets."); + } + split::SplitVector newBuckets(1 << newSizePower, EMPTYBUCKET); + _setInfo->sizePower = newSizePower; + int bitMask = (1 << _setInfo->sizePower) - 1; // For efficient modulo of the array size + + // Iterate through all old elements and rehash them into the new array. + for (auto& e : buckets) { + // Skip empty buckets ; We also check for TOMBSTONE elements + // as we might be coming off a kernel that overflew the hashmap + if (e == EMPTYBUCKET || e == TOMBSTONE) { + continue; + } + + uint32_t newHash = hash(e); + bool found = false; + for (int i = 0; i < Hashinator::defaults::BUCKET_OVERFLOW; i++) { + KEY_TYPE& candidate = newBuckets[(newHash + i) & bitMask]; + if (candidate == EMPTYBUCKET) { + // Found an empty bucket, assign that one. + candidate = e; + found = true; + break; + } + } + + if (!found) { + // Having arrived here means that we unsuccessfully rehashed and + // are *still* overflowing our buckets. So we need to try again with a bigger one. 
+ return rehash(newSizePower + 1); + } + } + + // Replace our buckets with the new ones + buckets = newBuckets; + _setInfo->currentMaxBucketOverflow = Hashinator::defaults::BUCKET_OVERFLOW; + _setInfo->tombstoneCounter = 0; + } + + +#ifndef HASHINATOR_CPU_ONLY_MODE + template + size_t extractPattern(KEY_TYPE* elements, Rule rule, split_gpuStream_t s = 0) { + // Figure out Blocks to use + size_t _s = std::ceil((float(buckets.size())) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + nBlocks+=(nBlocks==0); + + // Allocate with Mempool + const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); + split::tools::Cuda_mempool mPool(memory_for_pool, s); + size_t retval = + split::tools::copy_if_raw( + buckets, elements, rule, nBlocks, mPool, s); + return retval; + } + + size_t extractAllKeys(split::SplitVector& elements, split_gpuStream_t s = 0, bool prefetches = true) { + // Extract all keys + if (prefetches){ + elements.optimizeGPU(s); + } + auto rule = [] __host__ __device__(const KEY_TYPE& kval) -> bool { + return kval != EMPTYBUCKET && kval!= TOMBSTONE; + }; + return extractPattern(elements.data(), rule, s); + } + + void device_rehash(int newSizePower, split_gpuStream_t s = 0) { + if (newSizePower > 32) { + throw std::out_of_range("Hashmap ran into rehashing catastrophe and exceeded 32bit buckets."); + } + + size_t priorFill = _setInfo->fill; + // Extract all valid elements + KEY_TYPE* validElements; + SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&validElements, + (_setInfo->fill + 1) * sizeof(KEY_TYPE), s)); + optimizeGPU(s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + + auto isValidKey = [] __host__ __device__(KEY_TYPE& element) { + return ( (element !=TOMBSTONE) && (element!=EMPTYBUCKET) ); + }; + + uint32_t nValidElements = extractPattern(validElements, isValidKey, s); + + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + assert(nValidElements == _setInfo->fill && "Something really bad happened during rehashing! Ask Kostis!"); + // We can now clear our buckets + // Easy optimization: If our bucket had no valid elements and the same size was requested + // we can just clear it + if (newSizePower == _setInfo->sizePower && nValidElements == 0) { + clear(targets::device, s, true); + set_status((priorFill == _setInfo->fill) ? status::success : status::fail); + split_gpuFreeAsync(validElements, s); + return; + } + optimizeCPU(s); + buckets = std::move(split::SplitVector(1 << newSizePower, KEY_TYPE(EMPTYBUCKET))); + optimizeGPU(s); + *_setInfo = SetInfo(newSizePower); + // Insert valid elements to now larger buckets + insert(validElements, nValidElements, 1, s); + set_status((priorFill == _setInfo->fill) ? status::success : status::fail); + split_gpuFreeAsync(validElements, s); + return; + } + + + void clean_tombstones(split_gpuStream_t s = 0, bool prefetches = false) { + + if (_setInfo->tombstoneCounter == 0) { + return; + } + + // Reset the tomstone counter + _setInfo->tombstoneCounter = 0; + // Allocate memory for overflown elements. 
So far this is the same size as our buckets but we can be better than + // this + + KEY_TYPE* overflownElements; + SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&overflownElements, + (1 << _setInfo->sizePower) * sizeof(KEY_TYPE), s)); + + if (prefetches) { + optimizeGPU(s); + } + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + + int currentSizePower = _setInfo->sizePower; + KEY_TYPE* bck_ptr = buckets.data(); + + auto isOverflown = [bck_ptr, currentSizePower] __host__ __device__(KEY_TYPE & element)->bool { + if (element == TOMBSTONE) { + element = EMPTYBUCKET; + return false; + } + if (element == EMPTYBUCKET) { + return false; + } + const size_t hashIndex = HashFunction::_hash(element, currentSizePower); + const int bitMask = (1 << (currentSizePower)) - 1; + bool isOverflown = (bck_ptr[hashIndex & bitMask] != element); + return isOverflown; + }; + + // Extract overflown elements and reset overflow + uint32_t nOverflownElements = extractPattern(overflownElements, isOverflown, s); + _setInfo->currentMaxBucketOverflow = defaults::BUCKET_OVERFLOW; + + if (nOverflownElements == 0) { + SPLIT_CHECK_ERR(split_gpuFreeAsync(overflownElements, s)); + return; + } + // If we do have overflown elements we put them back in the buckets + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + DeviceHasher::reset_set(overflownElements, buckets.data(), _setInfo->sizePower, _setInfo->currentMaxBucketOverflow, + nOverflownElements, s); + _setInfo->fill -= nOverflownElements; + DeviceHasher::insert_set(overflownElements, buckets.data(), _setInfo->sizePower, _setInfo->currentMaxBucketOverflow, + &_setInfo->currentMaxBucketOverflow, &_setInfo->fill, nOverflownElements, &_setInfo->err, s); + + SPLIT_CHECK_ERR(split_gpuFreeAsync(overflownElements, s)); + return; + } +#else + void clean_tombstones() { + rehash(); + } +#endif + +#ifdef HASHINATOR_CPU_ONLY_MODE + // Try to get the overflow back to the original one + void performCleanupTasks() { + while (_setInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) { + rehash(_setInfo->sizePower + 1); + } + // When operating in CPU only mode we rehash to get rid of tombstones + if (tombstone_ratio() > 0.025) { + rehash(_setInfo->sizePower); + } + } +#else + // Try to get the overflow back to the original one + void performCleanupTasks(split_gpuStream_t s = 0) { + while (_setInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) { + device_rehash(_setInfo->sizePower + 1, s); + } + if (tombstone_ratio() > 0.025) { + clean_tombstones(s); + } + } + +#endif + + void rehash() { rehash(_setInfo->sizePower); } + + // Iterators + class iterator { + Unordered_Set* set; + size_t index; + + public: + iterator(Unordered_Set& set, size_t index) : set(&set), index(index) {} + + iterator& operator++() { + index++; + while (index < set->buckets.size()) { + if (set->buckets[index] != EMPTYBUCKET && set->buckets[index] != TOMBSTONE) { + break; + } + index++; + } + return *this; + } + + iterator operator++(int) { // Postfix version + iterator temp = *this; + ++(*this); + return temp; + } + bool operator==(iterator other) const { return &set->buckets[index] == &other.set->buckets[other.index]; } + bool operator!=(iterator other) const { return &set->buckets[index] != &other.set->buckets[other.index]; } + KEY_TYPE& operator*() const { return set->buckets[index]; } + KEY_TYPE* operator->() const { return &set->buckets[index]; } + size_t getIndex() { return index; } + }; + + // Const iterator. 
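+ // Same traversal as iterator above but yields const references; both skip EMPTYBUCKET and TOMBSTONE slots.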
+ class const_iterator { + const Unordered_Set* set; + size_t index; + + public: + explicit const_iterator(const Unordered_Set& set, size_t index) : set(&set), index(index) {} + const_iterator& operator++() { + index++; + while (index < set->buckets.size()) { + if (set->buckets[index] != EMPTYBUCKET && set->buckets[index] != TOMBSTONE) { + break; + } + index++; + } + return *this; + } + const_iterator operator++(int) { // Postfix version + const_iterator temp = *this; + ++(*this); + return temp; + } + bool operator==(const_iterator other) const { return &set->buckets[index] == &other.set->buckets[other.index]; } + bool operator!=(const_iterator other) const { return &set->buckets[index] != &other.set->buckets[other.index]; } + const KEY_TYPE& operator*() const { return set->buckets[index]; } + const KEY_TYPE* operator->() const { return &set->buckets[index]; } + size_t getIndex() { return index; } + }; + + iterator begin() { + for (size_t i = 0; i < buckets.size(); i++) { + if (buckets[i] != EMPTYBUCKET && buckets[i] != TOMBSTONE) { + return iterator(*this, i); + } + } + return end(); + } + + const_iterator begin() const { + for (size_t i = 0; i < buckets.size(); i++) { + if (buckets[i] != EMPTYBUCKET && buckets[i] != TOMBSTONE) { + return const_iterator(*this, i); + } + } + return end(); + } + + iterator end() { return iterator(*this, buckets.size()); } + + const_iterator end() const { return const_iterator(*this, buckets.size()); } + + +#ifndef HASHINATOR_CPU_ONLY_MODE + // Device Iterator type. Iterates through all non-empty buckets. + class device_iterator { + private: + size_t index; + Unordered_Set* set; + + public: + HASHINATOR_DEVICEONLY + device_iterator(Unordered_Set& set, size_t index) : index(index), set(&set) {} + + HASHINATOR_DEVICEONLY + size_t getIndex() { return index; } + + HASHINATOR_DEVICEONLY + device_iterator& operator++() { + index++; + while (index < set->buckets.size()) { + if (set->buckets[index] != EMPTYBUCKET && set->buckets[index] != TOMBSTONE) { + break; + } + index++; + } + return *this; + } + + HASHINATOR_DEVICEONLY + device_iterator operator++(int) { + device_iterator temp = *this; + ++(*this); + return temp; + } + + HASHINATOR_DEVICEONLY + bool operator==(device_iterator other) const { return &set->buckets[index] == &other.set->buckets[other.index]; } + HASHINATOR_DEVICEONLY + bool operator!=(device_iterator other) const { return &set->buckets[index] != &other.set->buckets[other.index]; } + + HASHINATOR_DEVICEONLY + KEY_TYPE& operator*() const { return set->buckets[index]; } + HASHINATOR_DEVICEONLY + KEY_TYPE* operator->() const { return &set->buckets[index]; } + }; + + class const_device_iterator { + private: + size_t index; + const Unordered_Set* set; + + public: + HASHINATOR_DEVICEONLY + explicit const_device_iterator(const Unordered_Set& set, size_t index) : index(index), set(&set) {} + + HASHINATOR_DEVICEONLY + size_t getIndex() { return index; } + + HASHINATOR_DEVICEONLY + const_device_iterator& operator++() { + index++; + while (index < set->buckets.size()) { + if (set->buckets[index] != EMPTYBUCKET && set->buckets[index] != TOMBSTONE) { + break; + } + index++; + } + return *this; + } + + HASHINATOR_DEVICEONLY + const_device_iterator operator++(int) { + const_device_iterator temp = *this; + ++(*this); + return temp; + } + + HASHINATOR_DEVICEONLY + bool operator==(const_device_iterator other) const { + return &set->buckets[index] == &other.set->buckets[other.index]; + } + HASHINATOR_DEVICEONLY + bool operator!=(const_device_iterator other) const { + 
return &set->buckets[index] != &other.set->buckets[other.index]; + } + + HASHINATOR_DEVICEONLY + const KEY_TYPE& operator*() const { return set->buckets[index]; } + HASHINATOR_DEVICEONLY + const KEY_TYPE* operator->() const { return &set->buckets[index]; } + }; + + HASHINATOR_DEVICEONLY + device_iterator device_end() { return device_iterator(*this, buckets.size()); } + + HASHINATOR_DEVICEONLY + const_device_iterator device_end() const { return const_device_iterator(*this, buckets.size()); } + + HASHINATOR_DEVICEONLY + device_iterator device_begin() { + for (size_t i = 0; i < buckets.size(); i++) { + if (buckets[i] != EMPTYBUCKET && buckets[i] != TOMBSTONE) { + return device_iterator(*this, i); + } + } + return device_end(); + } + + HASHINATOR_DEVICEONLY + const_device_iterator device_begin() const noexcept { + for (size_t i = 0; i < buckets.size(); i++) { + if (buckets[i] != EMPTYBUCKET && buckets[i] != TOMBSTONE) { + return const_device_iterator(*this, i); + } + } + return device_end(); + } + + // Element access by iterator + HASHINATOR_DEVICEONLY + device_iterator device_find(KEY_TYPE key) { + int bitMask = (1 << _setInfo->sizePower) - 1; // For efficient modulo of the array size + auto hashIndex = hash(key); + + // Try to find the matching bucket. + for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + const KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + + if (candidate == TOMBSTONE) { + continue; + } + + if (candidate == key) { + // Found a match, return that + return device_iterator(*this, (hashIndex + i) & bitMask); + } + + if (candidate.first == EMPTYBUCKET) { + // Found an empty bucket. Return empty. + return device_end(); + } + } + // Not found + return device_end(); + } + + HASHINATOR_DEVICEONLY + const const_device_iterator device_find(KEY_TYPE key) const { + int bitMask = (1 << _setInfo->sizePower) - 1; // For efficient modulo of the array size + auto hashIndex = hash(key); + + // Try to find the matching bucket. + for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + const KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + + if (candidate == TOMBSTONE) { + continue; + } + + if (candidate == key) { + // Found a match, return that + return const_device_iterator(*this, (hashIndex + i) & bitMask); + } + + if (candidate == EMPTYBUCKET) { + // Found an empty bucket. Return empty. + return device_end(); + } + } + // Not found + return device_end(); + } + + HASHINATOR_DEVICEONLY + void insert_element(const KEY_TYPE& key, size_t& thread_overflowLookup) { + int bitMask = (1 << _setInfo->sizePower) - 1; // For efficient modulo of the array size + auto hashIndex = hash(key); + size_t i = 0; + while (i < buckets.size()) { + uint32_t vecindex = (hashIndex + i) & bitMask; + KEY_TYPE old = split::s_atomicCAS(&buckets[vecindex], EMPTYBUCKET, key); + // Key does not exist so we create it and incerement fill + if (old == EMPTYBUCKET) { + split::s_atomicAdd((unsigned int*)(&_setInfo->fill), 1); + thread_overflowLookup = i + 1; + return; + } + // Key exists so we overwrite it. 
Fill stays the same + if (old == key) { + thread_overflowLookup = i + 1; + return; + } + i++; + } + assert(false && "Hashmap completely overflown"); + } + + HASHINATOR_DEVICEONLY + hash_pair device_insert(KEY_TYPE newEntry) { + bool found = device_find(newEntry) != device_end(); + if (!found) { + add_element(newEntry); + } + return hash_pair(device_find(newEntry.first), !found); + } + + HASHINATOR_DEVICEONLY + void add_element(const KEY_TYPE& key) { + size_t thread_overflowLookup = 0; + insert_element(key, thread_overflowLookup); + atomicMax((unsigned long long*)&(_setInfo->currentMaxBucketOverflow), + nextOverflow(thread_overflowLookup, defaults::WARPSIZE / defaults::elementsPerWarp)); + } +#endif + + void print_pair(const KEY_TYPE& i) const noexcept { + size_t currentSizePower = _setInfo->sizePower; + const size_t hashIndex = HashFunction::_hash(i, currentSizePower); + const int bitMask = (1 << (currentSizePower)) - 1; + size_t optimalIndex = hashIndex & bitMask; + const_iterator it = find(i); + int64_t overflow = llabs(it.getIndex() - optimalIndex); + if (i == TOMBSTONE) { + std::cout << "[â•€] "; + } else if (i == EMPTYBUCKET) { + std::cout << "[â–¢] "; + } else { + if (overflow > 0) { + printf("[%d,\033[1;31m%li\033[0m] ", i, overflow); + } else { + printf("[%d,%zu] ", i, overflow); + } + } + } + + void dump_buckets() const noexcept { + printf("Hashinator Stats \n"); + printf("Fill= %zu, LoadFactor=%f \n", _setInfo->fill, load_factor()); + printf("Tombstones= %zu\n", _setInfo->tombstoneCounter); + for (size_t i = 0; i < buckets.size(); ++i) { + print_pair(buckets[i]); + } + printf("\n"); + } + + HASHINATOR_HOSTDEVICE + void stats() const noexcept{ + printf("Hashinator Stats \n"); + printf("Bucket size= %lu\n", buckets.size()); + printf("Fill= %lu, LoadFactor=%f \n", _setInfo->fill, load_factor()); + printf("Tombstones= %lu\n", _setInfo->tombstoneCounter); + printf("Overflow= %lu\n", _setInfo->currentMaxBucketOverflow); + } + + HASHINATOR_HOSTDEVICE + inline int getSizePower(void) const noexcept { return _setInfo->sizePower; } + + // For STL compatibility: size(), bucket_count(), count(KEY_TYPE), clear() + HASHINATOR_HOSTDEVICE + size_t size() const noexcept { return _setInfo->fill; } + + HASHINATOR_HOSTDEVICE + size_t bucket_count() const noexcept { return buckets.size(); } + + HASHINATOR_HOSTDEVICE + float load_factor() const noexcept { return (float)size() / bucket_count(); } + + HASHINATOR_HOSTDEVICE + size_t tombstone_count() const noexcept { return _setInfo->tombstoneCounter; } + + HASHINATOR_HOSTDEVICE + float tombstone_ratio() const noexcept { + if (tombstone_count() == 0) { + return 0.0; + } + return (float)_setInfo->tombstoneCounter / (float)buckets.size(); + } + + bool contains(const KEY_TYPE& key) const noexcept { return (find(key) != end()) ; } + + bool empty() const noexcept { return begin() == end(); } + + size_t count(const KEY_TYPE& key) const noexcept { return contains(key) ? 
1 : 0; } + +#ifdef HASHINATOR_CPU_ONLY_MODE + void clear(targets t= targets::host){ + UNUSED(t); + buckets = split::SplitVector(1 << _setInfo->sizePower, {EMPTYBUCKET}); + *_setInfo = SetInfo(_setInfo->sizePower); + return; + } +#else + void clear(targets t = targets::host, split_gpuStream_t s = 0, bool prefetches = true) { + switch (t) { + case targets::host: + buckets = split::SplitVector(1 << _setInfo->sizePower, {EMPTYBUCKET}); + *_setInfo = SetInfo(_setInfo->sizePower); + break; + case targets::device: + if (prefetches) { + buckets.optimizeGPU(s); + } + DeviceHasher::reset_all_set(buckets.data(), buckets.size(), s); + _setInfo->fill = 0; + set_status((_setInfo->fill == 0) ? success : fail); + break; + default: + clear(targets::host); + break; + } + return; + } +#endif + + + iterator find(const KEY_TYPE& key) noexcept { + const int bitMask = (1 << _setInfo->sizePower) - 1; + const auto hashIndex = hash(key); + + for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + const KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + if (candidate == key) { + auto index = (hashIndex + i) & bitMask; + return iterator(*this, index); + } + if (candidate == TOMBSTONE) { + continue; + } + if (candidate == EMPTYBUCKET) { + return end(); + } + } + return end(); + } + + const const_iterator find(const KEY_TYPE& key) const noexcept { + const int bitMask = (1 << _setInfo->sizePower) - 1; + const auto hashIndex = hash(key); + + for (size_t i = 0; i < _setInfo->currentMaxBucketOverflow; i++) { + const KEY_TYPE& candidate = buckets[(hashIndex + i) & bitMask]; + if (candidate == key) { + auto index = (hashIndex + i) & bitMask; + return const_iterator(*this, index); + } + if (candidate == TOMBSTONE) { + continue; + } + if (candidate == EMPTYBUCKET) { + return end(); + } + } + return end(); + } + + hash_pair insert(const KEY_TYPE& key) noexcept { + // try to find key + performCleanupTasks(); + iterator it = find(key); + + // if the key already exists we mutate it + if (it != end()) { + *it = key; + return {it, it != end()}; + } + // otherwise we add it + addKey(key); + iterator retval = find(key); + return {retval, retval != end()}; + } + + hash_pair insert(KEY_TYPE&& key) noexcept { + // try to find key + iterator it = find(key); + + // if the key already exists we mutate it + if (it != end()) { + *it = key; + return {it, it != end()}; + } + // otherwise we add it + addKey(key); + iterator retval = find(key); + return {retval, retval != end()}; + } + + iterator erase(iterator pos) { + auto index = pos.getIndex(); + assert(index ( 1 << _setInfo->sizePower )); + KEY_TYPE& key = buckets[index]; + if (key != EMPTYBUCKET && key != TOMBSTONE) { + key = TOMBSTONE; + _setInfo->fill--; + _setInfo->tombstoneCounter++; + } + return ++pos; // return next valid element; + } + + iterator erase(const_iterator pos) { + auto index = pos.getIndex(); + assert(index < 1 << _setInfo->sizePower); + KEY_TYPE& key = buckets[index]; + if (key != EMPTYBUCKET && key != TOMBSTONE) { + key = TOMBSTONE; + _setInfo->fill--; + _setInfo->tombstoneCounter++; + } + return ++pos; // return next valid element; + } + + bool erase(const KEY_TYPE& key) { + auto it = find(key); + if (it!=end()){ + erase(it); + return true; + } + return false; + } + + +#ifdef HASHINATOR_CPU_ONLY_MODE + void resize(int newSizePower,targets t = targets::host) { + UNUSED(t); + rehash(newSizePower); + } + + void insert(KEY_TYPE* keys,size_t len,float targetLF = 0.5) { + UNUSED(targetLF); + for (size_t i =0 ; i < len; ++i){ + insert(keys[i]); + + } + } 
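// Hypothetical host-side usage sketch for the members above. Assumptions (not taken
// from this patch): the include path, a constructor taking an initial sizePower, and
// uint32_t with the default sentinel values as the key type. Only member functions
// defined in this class are called.
#include <cassert>
#include <cstdint>
#include "hashinator/unordered_set/unordered_set.h" // assumed include path

inline void unordered_set_host_example() {
   Hashinator::Unordered_Set<uint32_t> s(10); // assumed ctor: 2^10 buckets
   uint32_t keys[4] = {10, 20, 30, 40};
   s.insert(keys, 4);                         // bulk host insert: loops over insert(key)
   assert(s.contains(30) && s.size() == 4);
   s.erase(20);                               // single-key erase, true if the key was found
   assert(!s.contains(20) && s.size() == 3);
   s.clear();                                 // every bucket back to EMPTYBUCKET
}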
+ + void erase(KEY_TYPE* keys,size_t len,float targetLF = 0.5) { + UNUSED(targetLF); + for (size_t i =0 ; i < len; ++i){ + erase(keys[i]); + + } + } +#else + void resize(int newSizePower, targets t = targets::host, split_gpuStream_t s = 0) { + switch (t) { + case targets::host: + rehash(newSizePower); + break; + case targets::device: + device_rehash(newSizePower, s); + break; + default: + resize(newSizePower, targets::host); + break; + } + return; + } + + + void insert(KEY_TYPE* keys,size_t len,float targetLF = 0.5, split_gpuStream_t s = 0, bool prefetches = true) { + // TODO fix these if paths or at least annotate them . + if (len == 0) { + set_status(status::success); + return; + } + if (prefetches) { + buckets.optimizeGPU(s); + } + int64_t neededPowerSize = std::ceil(std::log2((_setInfo->fill + len) * (1.0 / targetLF))); + if (neededPowerSize > _setInfo->sizePower) { + resize(neededPowerSize, targets::device, s); + } + _setInfo->currentMaxBucketOverflow = _setInfo->currentMaxBucketOverflow; + DeviceHasher::insert_set(keys, buckets.data(), _setInfo->sizePower, _setInfo->currentMaxBucketOverflow, + &_setInfo->currentMaxBucketOverflow, &_setInfo->fill, len, &_setInfo->err, s); + return; + } + + // Uses Hasher's erase_kernel to delete elements + void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) { + if (len == 0) { + set_status(status::success); + return; + } + buckets.optimizeGPU(s); + // Remember the last number of tombstones + size_t tbStore = tombstone_count(); + DeviceHasher::erase_set(keys, buckets.data(), &_setInfo->tombstoneCounter, _setInfo->sizePower, + _setInfo->currentMaxBucketOverflow, len, s); + size_t tombstonesAdded = tombstone_count() - tbStore; + // Fill should be decremented by the number of tombstones added; + _setInfo->fill -= tombstonesAdded; + return; + } + +#endif + +#ifndef HASHINATOR_CPU_ONLY_MODE + template + HASHINATOR_DEVICEONLY void warpInsert(const KEY_TYPE& candidateKey, const size_t w_tid) noexcept { + + const int sizePower = _setInfo->sizePower; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + const size_t optimalindex = (hashIndex)&bitMask; + const auto submask = SPLIT_VOTING_MASK; + bool warpDone = false; + uint64_t threadOverflow = 1; + +#ifdef HASHINATOR_DEBUG +// Safety check: make sure everyone has the same key/val and all threads are here. +#ifdef __CUDACC__ + assert(__activemask() == SPLIT_VOTING_MASK && "Tried to warpInsert with part of warp predicated off"); +#endif + KEY_TYPE storeKey = split::s_shuffle(candidateKey, 0, SPLIT_VOTING_MASK); + bool isSafe = (split::s_warpVote(candidateKey == storeKey, SPLIT_VOTING_MASK) == SPLIT_VOTING_MASK); + assert(isSafe && "Tried to warpInsert with different keys in the same warp"); +#endif + + for (size_t i = 0; i < (1 << sizePower); i += defaults::WARPSIZE) { + // Check if this virtual warp is done. 
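         // Probing strategy used below: every lane of the virtual warp inspects one of the
         // consecutive buckets starting at the hashed position, the lanes ballot on empty
         // slots and on an already-present key, and the lowest set bit of the ballot
         // (s_findFirstSig - 1) selects the single lane that attempts the atomicCAS.
         // warpInsert is meant to be called by all lanes with the same candidateKey,
         // each passing its own lane index within the virtual warp as w_tid.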
+ if (warpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = buckets[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == EMPTYBUCKET, submask); + + // Check if this elements already exists + auto already_exists = split::s_warpVote(target == candidateKey, submask); + if (already_exists) { + int winner = split::s_findFirstSig(already_exists) - 1; + if (w_tid == winner) { + warpDone = 1; + } + } + + // If any duplicate was there now is the time for the whole Virtual warp to find out! + warpDone = split::s_warpVote(warpDone > 0, submask) & submask; + + while (mask && !warpDone) { + int winner = split::s_findFirstSig(mask) - 1; + if (w_tid == winner) { + KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex], EMPTYBUCKET, candidateKey); + if (old == EMPTYBUCKET) { + threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1); + warpDone = 1; + split::s_atomicAdd(&_setInfo->fill, 1); + if (threadOverflow > _setInfo->currentMaxBucketOverflow) { + split::s_atomicExch((unsigned long long*)(&_setInfo->currentMaxBucketOverflow), + (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE)); + } + } else if (old == candidateKey) { + warpDone = 1; + } + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + warpDone = split::s_warpVote(warpDone > 0, submask); + mask ^= (1UL << winner); + } + } + } + + template + HASHINATOR_DEVICEONLY bool warpInsert_V(const KEY_TYPE& candidateKey,const size_t w_tid) noexcept { + + const int sizePower = _setInfo->sizePower; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + const size_t optimalindex = (hashIndex)&bitMask; + const auto submask = SPLIT_VOTING_MASK; + bool warpDone = false; + uint64_t threadOverflow = 1; + int localCount = 0; + +#ifdef HASHINATOR_DEBUG +// Safety check: make sure everyone has the same key/val and all threads are here. +#ifdef __CUDACC__ + assert(__activemask() == SPLIT_VOTING_MASK && "Tried to warpInsert_V with part of warp predicated off"); +#endif + KEY_TYPE storeKey = split::s_shuffle(candidateKey, 0, SPLIT_VOTING_MASK); + KEY_TYPE storeVal = split::s_shuffle(candidateVal, 0, SPLIT_VOTING_MASK); + bool isSafe = (split::s_warpVote(candidateKey == storeKey, SPLIT_VOTING_MASDK) == SPLIT_VOTING_MASK; + assert(isSafe && "Tried to warpInsert_V with different keys in the same warp"); +#endif + + for (size_t i = 0; i < (1 << sizePower); i += defaults::WARPSIZE) { + // Check if this virtual warp is done. + if (warpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + auto target = buckets[probingindex]; + + // vote for available emptybuckets in warp region + // Note that this has to be done before voting for already existing elements (below) + auto mask = split::s_warpVote(target == EMPTYBUCKET, submask); + + // Check if this elements already exists + auto already_exists = split::s_warpVote(target == candidateKey, submask); + if (already_exists) { + int winner = split::s_findFirstSig(already_exists) - 1; + if (w_tid == winner) { + warpDone = 1; + } + } + + // If any duplicate was there now is the time for the whole Virtual warp to find out! 
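         // The vote below broadcasts any lane's warpDone flag to every lane of the virtual
         // warp so that all lanes leave the probing loop together. Unlike warpInsert, this
         // variant also records localCount on a successful CAS, so the boolean return value
         // reports whether a new key was actually added rather than already present.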
+ warpDone = split::s_warpVote(warpDone > 0, submask) & submask; + + while (mask && !warpDone) { + int winner = split::s_findFirstSig(mask) - 1; + if (w_tid == winner) { + KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex], EMPTYBUCKET, candidateKey); + if (old == EMPTYBUCKET) { + threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1); + warpDone = 1; + localCount = 1; + split::s_atomicAdd(&_setInfo->fill, 1); + if (threadOverflow > _setInfo->currentMaxBucketOverflow) { + split::s_atomicExch((unsigned long long*)(&_setInfo->currentMaxBucketOverflow), + (unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE)); + } + } else if (old == candidateKey) { + warpDone = 1; + } + } + // If any of the virtual warp threads are done the the whole + // Virtual warp is done + warpDone = split::s_warpVote(warpDone > 0, submask); + mask ^= (1UL << winner); + } + } + + auto res = split::s_warpVote(localCount > 0, submask); + return (res > 0); + } + + HASHINATOR_DEVICEONLY + void warpErase(const KEY_TYPE& candidateKey, const size_t w_tid) noexcept { + + const int sizePower = _setInfo->sizePower; + const size_t maxoverflow = _setInfo->currentMaxBucketOverflow; + const int bitMask = (1 << (sizePower)) - 1; + const auto hashIndex = HashFunction::_hash(candidateKey, sizePower); + const auto submask = SPLIT_VOTING_MASK; + bool warpDone = false; + int winner = 0; + +#ifdef HASHINATOR_DEBUG +// Safety check: make sure everyone has the same key/val and all threads are here. +#ifdef __CUDACC__ + assert(__activemask() == SPLIT_VOTING_MASK && "Tried to warpFind with part of warp predicated off"); +#endif + KEY_TYPE storeKey = split::s_shuffle(candidateKey, 0, SPLIT_VOTING_MASK); + bool isSafe = split::s_warpVote(candidateKey == storeKey, SPLIT_VOTING_MASK) == SPLIT_VOTING_MASK; + assert(isSafe && "Tried to warpFind with different keys/vals in the same warp"); +#endif + + for (size_t i = 0; i < maxoverflow; i += defaults::WARPSIZE) { + + if (warpDone) { + break; + } + + // Get the position we should be looking into + size_t probingindex = ((hashIndex + i + w_tid) & bitMask); + const auto maskExists = + split::s_warpVote(buckets[probingindex] == candidateKey, SPLIT_VOTING_MASK) & submask; + const auto emptyFound = + split::s_warpVote(buckets[probingindex] == EMPTYBUCKET, SPLIT_VOTING_MASK) & submask; + // If we encountered empty and the key is not in the range of this warp that means the key is not in hashmap. 
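         // Erasure never compacts the probe chain: the winning lane overwrites the matching
         // bucket with TOMBSTONE and atomically updates tombstoneCounter and fill, and later
         // lookups simply probe past the tombstone until a later cleanup/rehash reclaims it.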
+ if (!maskExists && emptyFound) { + warpDone = true; + } + if (maskExists) { + winner = split::s_findFirstSig(maskExists) - 1; + if (w_tid == winner) { + buckets[probingindex] = TOMBSTONE; + split::s_atomicAdd(&_setInfo->tombstoneCounter, 1); + split::s_atomicSub((unsigned int*)&_setInfo->fill, 1); + } + warpDone = true; + } + } + return; + } +#endif + +}; // Unordered_Set + +} // namespace Hashinator diff --git a/include/splitvector/devicevec.h b/include/splitvector/devicevec.h new file mode 100644 index 0000000..a543f95 --- /dev/null +++ b/include/splitvector/devicevec.h @@ -0,0 +1,783 @@ +#pragma once +#include "archMacros.h" +#include "gpu_wrappers.h" +#include +#ifdef __NVCC__ +#include +#else +#include +#endif +#define HOSTONLY __host__ +#define DEVICEONLY __device__ +#define HOSTDEVICE __host__ __device__ + +namespace split { + +template > +class DeviceVector { + static_assert(std::is_trivially_copyable::value && "DeviceVector only works for POD types"); + +private: + enum MEMBER { SIZE, CAPACITY }; + + // packed to 1 cache line + struct __attribute__((__packed__)) Meta { + size_t size; + size_t capacity; + char padding[64 - 2 * sizeof(size_t)]={0}; // pad up to cache line + HOSTDEVICE + inline size_t& operator[](MEMBER member) noexcept { + switch (member) { + case MEMBER::SIZE: + return *reinterpret_cast(this); + case MEMBER::CAPACITY: + return *(reinterpret_cast(this) + 1); + default: + assert(false && "Invalid case"); + } + } + }; + + // Members + Meta* _meta = nullptr; + T* _data = nullptr; + Allocator _allocator; + mutable split_gpuStream_t _stream; + + void setupSpace(void* ptr) noexcept { + _meta = reinterpret_cast(ptr); + _data = reinterpret_cast(reinterpret_cast(ptr) + sizeof(Meta)); + } + + [[nodiscard]] void* _allocate(const size_t sz) { + void* _ptr =nullptr; if (_stream == NULL) { + _ptr = _allocator.allocate_raw( sizeof(Meta) + sz*sizeof(T)); + } else { + _ptr = _allocator.allocate_raw( sizeof(Meta) + sz*sizeof(T),_stream); + } + return _ptr; + } + + void _deallocate(void* _ptr) { + if (_ptr == nullptr) { + return; + } + if (_stream == NULL) { + _allocator.deallocate((_ptr)); + } else { + _allocator.deallocate(_ptr,_stream); + } + _ptr = nullptr; + } + + DEVICEONLY + inline void _rangeCheckDevice(size_t index) const noexcept { + if (index >= _meta->size) { + assert(true && " out of range "); + } + } + + HOSTONLY + inline void _rangeCheckHost(size_t index) const noexcept { + Meta currentMeta = getMeta(); + if (index >= currentMeta.size) { + assert(true && " out of range "); + } + } + + HOSTONLY + inline Meta getMeta() const noexcept { + Meta buffer; + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(&buffer, _meta, sizeof(Meta), split_gpuMemcpyDeviceToHost)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&buffer, _meta, sizeof(Meta), split_gpuMemcpyDeviceToHost, _stream)); + } + return buffer; + } + + HOSTONLY + inline void getMeta(Meta& buffer) const noexcept { + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(&buffer, _meta, sizeof(Meta), split_gpuMemcpyDeviceToHost)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&buffer, _meta, sizeof(Meta), split_gpuMemcpyDeviceToHost, _stream)); + } + } + + HOSTONLY + inline void setMeta(const Meta& buffer) noexcept { + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(_meta, &buffer, sizeof(Meta), split_gpuMemcpyHostToDevice)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_meta, &buffer, sizeof(Meta), split_gpuMemcpyHostToDevice, _stream)); + } + } + + HOSTONLY + inline T 
getElementFromDevice(const size_t index) const noexcept { + T retval; + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(&retval, &_data[index], sizeof(T), split_gpuMemcpyDeviceToHost)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&retval, &_data[index], sizeof(T), split_gpuMemcpyDeviceToHost, _stream)); + } + return retval; + } + + HOSTONLY + inline void setElementFromHost(const size_t index, const T& val) noexcept { + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(&_data[index], &val, sizeof(T), split_gpuMemcpyHostToDevice)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&_data[index], &val, sizeof(T), split_gpuMemcpyHostToDevice, _stream)); + } + return; + } + +public: + DeviceVector(size_t sz = 0) : _stream(NULL) { + void* ptr = _allocate(sz); + setupSpace(ptr); + size_t s = sz; + SPLIT_CHECK_ERR(split_gpuMemcpy(&_meta->size, &s, sizeof(size_t), split_gpuMemcpyHostToDevice)); + SPLIT_CHECK_ERR(split_gpuMemcpy(&_meta->capacity, &sz, sizeof(size_t), split_gpuMemcpyHostToDevice)); + } + + DeviceVector(size_t sz, split_gpuStream_t s) : _stream(s) { + void* ptr = _allocate(sz); + setupSpace(ptr); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&_meta->size, &sz, sizeof(size_t), split_gpuMemcpyHostToDevice, s)); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&_meta->capacity, &sz, sizeof(size_t), split_gpuMemcpyHostToDevice, s)); + } + + DeviceVector(const DeviceVector& other) : _stream(NULL) { + Meta otherMeta = other.getMeta(); + void* ptr = _allocate(otherMeta.size); + setupSpace(ptr); + Meta newMeta{.size = otherMeta.size, .capacity = otherMeta.size}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpy(_data, other._data, otherMeta.size * sizeof(T), split_gpuMemcpyDeviceToDevice)); + return; + } + + DeviceVector(const DeviceVector& other, split_gpuStream_t s) : _stream(s) { + Meta otherMeta = other.getMeta(); + void* ptr = _allocate(otherMeta.size); + setupSpace(ptr); + Meta newMeta{.size = otherMeta.size, .capacity = otherMeta.size}; + setMeta(newMeta); + SPLIT_CHECK_ERR( + split_gpuMemcpyAsync(_data, other._data, otherMeta.size * sizeof(T), split_gpuMemcpyDeviceToDevice, s)); + return; + } + + DeviceVector(const DeviceVector&& other) : _stream(NULL) { + _meta = other._meta; + _data = other._data; + other._meta = nullptr; + other._data = nullptr; + } + + DeviceVector(const DeviceVector&& other, split_gpuStream_t s) : _stream(s) { + _meta = other._meta; + _data = other._data; + other._meta = nullptr; + other._data = nullptr; + } + + DeviceVector(const SplitVector& vec) : _stream(NULL) { + void* ptr = _allocate(vec.size()); + setupSpace(ptr); + Meta newMeta{.size = vec.size(), .capacity = vec.size()}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpy(_data, vec.data(), vec.size() * sizeof(T), split_gpuMemcpyHostToDevice)); + return; + } + + DeviceVector(const SplitVector& vec, split_gpuStream_t s) : _stream(s) { + void* ptr = _allocate(vec.size()); + setupSpace(ptr); + Meta newMeta{.size = vec.size(), .capacity = vec.size()}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_data, vec.data(), vec.size() * sizeof(T), split_gpuMemcpyHostToDevice, s)); + return; + } + + DeviceVector(const std::vector& vec) : _stream(NULL) { + void* ptr = _allocate(vec.size()); + setupSpace(ptr); + Meta newMeta{.size = vec.size(), .capacity = vec.size()}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpy(_data, vec.data(), vec.size() * sizeof(T), split_gpuMemcpyHostToDevice)); + return; + } + + DeviceVector(const std::vector& vec, split_gpuStream_t s) : _stream(s) { + void* ptr 
= _allocate(vec.size()); + setupSpace(ptr); + Meta newMeta{.size = vec.size(), .capacity = vec.size()}; + setMeta(newMeta); + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_data, vec.data(), vec.size() * sizeof(T), split_gpuMemcpyHostToDevice, s)); + return; + } + + ~DeviceVector() { + if (_meta == nullptr) { + return; + } + _deallocate(_meta); + } + + HOSTONLY + void setStream(split_gpuStream_t s) const noexcept {_stream=s;} + + HOSTONLY + DeviceVector& operator=(const DeviceVector& other) { + Meta otherMeta = other.getMeta(); + resize(otherMeta.size); + if (_stream == NULL) { + SPLIT_CHECK_ERR(split_gpuMemcpy(_meta, other._meta, sizeof(Meta) + otherMeta.size * sizeof(T), + split_gpuMemcpyDeviceToDevice)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_meta, other._meta, sizeof(Meta) + otherMeta.size * sizeof(T), + split_gpuMemcpyDeviceToDevice, _stream)); + } + return *this; + } + + HOSTONLY + DeviceVector& operator=(DeviceVector&& other) noexcept { + if (this == &other) { + return *this; + } + _deallocate(_meta); + _meta = other._meta; + _data = other._data; + other._meta = nullptr; + other._data = nullptr; + return *this; + } + + HOSTONLY + void* operator new(size_t len) { + void* _ptr = nullptr; + SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&_ptr, len)); + return _ptr; + } + + HOSTONLY + void operator delete(void* _ptr) { SPLIT_CHECK_ERR(split_gpuFree(_ptr)); } + + HOSTONLY + void* operator new[](size_t len) { + void* _ptr = nullptr; + SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&_ptr, len)); + return _ptr; + } + HOSTONLY + void operator delete[](void* _ptr) { SPLIT_CHECK_ERR(split_gpuFree(_ptr)); } + + DEVICEONLY + size_t device_size() const noexcept { return _meta->size; } + + DEVICEONLY + size_t device_capacity() const noexcept { return _meta->capacity; } + + HOSTONLY + size_t size() const noexcept { return (getMeta()).size; } + + HOSTONLY + size_t capacity() const noexcept { return (getMeta()).capacity; } + + HOSTONLY + T get(size_t index) const { + _rangeCheckHost(index); + return getElementFromDevice(index); + } + + HOSTONLY + void set(size_t index, const T& val) { + _rangeCheckHost(index); + return setElementFromHost(index, val); + } + + DEVICEONLY + T device_get(size_t index) const { + _rangeCheckDevice(index); + return _data[index]; + } + + DEVICEONLY + void device_set(size_t index, const T& val) { + _rangeCheckDevice(index); + split::s_atomicExch(&_data[index], val); + return; + } + + DEVICEONLY T& at(size_t index) { + _rangeCheckDevice(index); + return _data[index]; + } + + DEVICEONLY const T& at(size_t index) const { + _rangeCheckDevice(index); + return _data[index]; + } + + DEVICEONLY T& operator[](size_t index) noexcept { return _data[index]; } + + DEVICEONLY const T& operator[](size_t index) const noexcept { return _data[index]; } + + HOSTDEVICE T* data() noexcept { return &(_data[0]); } + + HOSTDEVICE const T* data() const noexcept { return &(_data[0]); } + + HOSTONLY void reallocate(size_t requested_space) { + if (requested_space == 0) { + _deallocate(_meta); + _meta = nullptr; + return; + } + + void* _new_data = _allocate(requested_space); + const auto currentMeta = getMeta(); + size_t currentSize = currentMeta.size; + if (_stream == NULL) { + SPLIT_CHECK_ERR( + split_gpuMemcpy(_new_data, _meta, sizeof(Meta) + currentSize * sizeof(T), split_gpuMemcpyDeviceToDevice)); + } else { + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_new_data, _meta, sizeof(Meta) + currentSize * sizeof(T), + split_gpuMemcpyDeviceToDevice, _stream)); + } + _deallocate(_meta); + setupSpace(_new_data); 
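      // The Meta header travels with the payload in the single device-to-device copy above,
      // so after re-pointing _meta/_data only the capacity field has to be refreshed to
      // reflect the newly requested space.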
+ auto newMeta = currentMeta; + newMeta.capacity = requested_space; + setMeta(newMeta); + return; + } + + HOSTONLY + void clear() noexcept { + Meta currentMeta = getMeta(); + currentMeta.size = 0; + setMeta(currentMeta); + return; + } + + DEVICEONLY + void device_clear() noexcept { + _meta[MEMBER::SIZE] = 0; + return; + } + + HOSTONLY + void reserve(size_t requested_space) { + Meta currentMeta = getMeta(); + if (requested_space <= currentMeta.capacity) { + return; + } + reallocate(1.5 * requested_space); + return; + } + + HOSTONLY + void resize(size_t newSize) { + Meta currentMeta = getMeta(); + if (newSize <= currentMeta.size) { + currentMeta.size = newSize; + setMeta(currentMeta); + return; + } + reserve(newSize); + currentMeta = getMeta(); + currentMeta.size = newSize; + setMeta(currentMeta); + return; + } + + HOSTONLY + void grow() { reserve(capacity() + 1); } + + DEVICEONLY + void device_resize(size_t newSize) { + if (newSize > capacity()) { + return; + } + _meta[MEMBER::SIZE] = newSize; + } + + HOSTONLY + void push_back(const T& val) { + Meta currentMeta = getMeta(); + resize(currentMeta.size + 1); + currentMeta.size++; + setElementFromHost(currentMeta.size - 1, val); + return; + } + + DEVICEONLY + bool device_push_back(const T& val) { + size_t old = split::s_atomicAdd((unsigned long*)&_meta[MEMBER::SIZE], 1); + // TODO relpace this > in splitvec as well + if (old > (_meta->operator[](MEMBER::CAPACITY)) - 1) { + atomicSub((unsigned*)&_meta[MEMBER::SIZE], 1); + return false; + } + split::s_atomicCAS(&(_data[old]), _data[old], val); + return true; + } + + class iterator { + + private: + const T* _data; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = int64_t; + using pointer = const T*; + using reference = const T&; + + iterator(pointer data) : _data(data) {} + pointer data() const { return _data; } + pointer operator->() const { return _data; } + reference operator*() const { + assert(false); + return *_data; + } + bool operator==(const iterator& other) const { return _data == other._data; } + bool operator!=(const iterator& other) const { return _data != other._data; } + iterator& operator++() { + _data += 1; + return *this; + } + iterator operator++(int) { return iterator(_data + 1); } + iterator operator--(int) { return iterator(_data - 1); } + iterator operator--() { + _data -= 1; + return *this; + } + iterator& operator+=(int64_t offset) { + _data += offset; + return *this; + } + iterator& operator-=(int64_t offset) { + _data -= offset; + return *this; + } + iterator operator+(int64_t offset) const { + iterator itt(*this); + return itt += offset; + } + iterator operator-(int64_t offset) const { + iterator itt(*this); + return itt -= offset; + } + }; + + // Device Iterators + class device_iterator { + + private: + T* _data; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = int64_t; + using pointer = T*; + using reference = T&; + + // device_iterator(){} + DEVICEONLY + device_iterator(pointer data) : _data(data) {} + + DEVICEONLY + pointer data() { return _data; } + DEVICEONLY + pointer operator->() { return _data; } + DEVICEONLY + reference operator*() { return *_data; } + + DEVICEONLY + bool operator==(const device_iterator& other) const { return _data == other._data; } + DEVICEONLY + bool operator!=(const device_iterator& other) const { return _data != other._data; } + DEVICEONLY + device_iterator& operator++() { + _data += 1; + return *this; + } 
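      // Note on the postfix forms below: they return an iterator advanced (or moved back)
      // by one position without modifying this iterator itself, mirroring the host-side
      // iterator above.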
+ DEVICEONLY + device_iterator operator++(int) { return device_iterator(_data + 1); } + DEVICEONLY + device_iterator operator--(int) { return device_iterator(_data - 1); } + DEVICEONLY + device_iterator operator--() { + _data -= 1; + return *this; + } + DEVICEONLY + device_iterator& operator+=(int64_t offset) { + _data += offset; + return *this; + } + DEVICEONLY + device_iterator& operator-=(int64_t offset) { + _data -= offset; + return *this; + } + DEVICEONLY + device_iterator operator+(int64_t offset) const { + device_iterator itt(*this); + return itt += offset; + } + DEVICEONLY + device_iterator operator-(int64_t offset) const { + device_iterator itt(*this); + return itt -= offset; + } + }; + + class const_device_iterator { + + private: + const T* _data; + + public: + using device_iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = int64_t; + using pointer = const T*; + using reference = const T&; + + DEVICEONLY + const_device_iterator(pointer data) : _data(data) {} + + DEVICEONLY + pointer data() const { return _data; } + DEVICEONLY + pointer operator->() const { return _data; } + DEVICEONLY + reference operator*() const { return *_data; } + + DEVICEONLY + bool operator==(const const_device_iterator& other) const { return _data == other._data; } + DEVICEONLY + bool operator!=(const const_device_iterator& other) const { return _data != other._data; } + DEVICEONLY + const_device_iterator& operator++() { + _data += 1; + return *this; + } + DEVICEONLY + const_device_iterator operator++(int) { return const_iterator(_data + 1); } + DEVICEONLY + const_device_iterator operator--(int) { return const_iterator(_data - 1); } + DEVICEONLY + const_device_iterator operator--() { + _data -= 1; + return *this; + } + DEVICEONLY + const_device_iterator& operator+=(int64_t offset) { + _data += offset; + return *this; + } + DEVICEONLY + const_device_iterator& operator-=(int64_t offset) { + _data -= offset; + return *this; + } + DEVICEONLY + const_device_iterator operator+(int64_t offset) const { + const_device_iterator itt(*this); + return itt += offset; + } + DEVICEONLY + const_device_iterator operator-(int64_t offset) const { + const_device_iterator itt(*this); + return itt -= offset; + } + }; + + HOSTONLY + iterator begin() const noexcept { return iterator(_data); } + + HOSTONLY + iterator end() const noexcept { return iterator(_data + size()); } + + DEVICEONLY + device_iterator device_begin() noexcept { return device_iterator(_data); } + + DEVICEONLY + const_device_iterator device_begin() const noexcept { return const_device_iterator(_data); } + + DEVICEONLY + device_iterator device_end() noexcept { return device_iterator(_data + device_size()); } + + DEVICEONLY + const_device_iterator device_end() const noexcept { return const_device_iterator(_data + device_size()); } + + HOSTONLY + T back() const noexcept { return get(size() - 1); } + + HOSTONLY + T front() const noexcept { return get(0); } + + DEVICEONLY + T& device_back() noexcept { return _data[size() - 1]; } + + DEVICEONLY + T& device_front() noexcept { return _data[0]; } + + DEVICEONLY + const T& device_back() const noexcept { return _data[size() - 1]; } + + DEVICEONLY + const T& device_front() const noexcept { return _data[0]; } + + HOSTONLY + void set(const iterator& it, T val) { + size_t index = it.data() - _data; + return setElementFromHost(index, val); + } + + T get(const iterator& it) { + size_t index = it.data() - _data; + return getElementFromDevice(index); + } + + HOSTONLY + void 
remove_from_back(size_t n) noexcept { + const size_t end = size() - n; + Meta currentMeta = getMeta(); + currentMeta.size = end; + setMeta(currentMeta); + } + + DEVICEONLY + void device_remove_from_back(size_t n) noexcept { + const size_t end = device_size() - n; + _meta[MEMBER::SIZE] = end; + } + + HOSTONLY + void pop_back() noexcept { remove_from_back(1); } + + DEVICEONLY + void device_pop_back() noexcept { device_remove_from_back(1); } + + HOSTONLY + iterator erase(iterator it) noexcept { + const int64_t index = it.data() - begin().data(); + Meta currentMeta = getMeta(); + for (auto i = index; i < size() - 1; i++) { + set(i, get(i + 1)); + } + currentMeta.size -= 1; + setMeta(currentMeta); + iterator retval = &_data[index]; + return retval; + } + + HOSTONLY + iterator erase(iterator p0, iterator p1) noexcept { + const int64_t start = p0.data() - begin().data(); + const int64_t end = p1.data() - begin().data(); + const int64_t offset = end - start; + Meta currentMeta = getMeta(); + for (auto i = start; i < size() - offset; ++i) { + set(i, get(i + offset)); + } + currentMeta.size -= end - start; + setMeta(currentMeta); + iterator it = &_data[start]; + return it; + } + + HOSTONLY + iterator insert(iterator it, const T& val) { + + // If empty or inserting at the end no relocating is needed + if (it == end()) { + push_back(val); + return end()--; + } + + int64_t index = it.data() - begin().data(); + if (index < 0 || index > size()) { + throw std::out_of_range("Insert"); + } + + // Do we do need to increase our capacity? + if (size() == capacity()) { + grow(); + } + + for (int64_t i = size() - 1; i >= index; i--) { + set(i + 1, get(i)); + } + + set(index, val); + Meta currentMeta = getMeta(); + currentMeta.size++; + setMeta(currentMeta); + return iterator(_data + index); + } + + template ::value>::type> + HOSTONLY iterator insert(iterator it, InputIterator p0, InputIterator p1) { + + const int64_t count = std::distance(p0, p1); + const int64_t index = it.data() - begin().data(); + + if (index < 0 || index > size()) { + throw std::out_of_range("Insert"); + } + + size_t old_size = size(); + resize(size() + count); + + iterator retval = &_data[index]; + + // Copy + for (int64_t i = old_size - 1; i >= index; i--) { + set(count + i, get(i)); + } + + // Overwrite + size_t i = index; + for (auto p = p0; p != p1; ++p) { + set(i, get(p)); + i++; + } + return retval; + } + +}; // DeviceVector + +/*Equal operator*/ +template +static inline HOSTONLY bool operator==(const DeviceVector& lhs, const DeviceVector& rhs) noexcept { + if (lhs.size() != rhs.size()) { + return false; + } + for (size_t i = 0; i < lhs.size(); i++) { + if (!(lhs.get(i) == rhs.get(i))) { + return false; + } + } + // if we end up here the vectors are equal + return true; +} + +/*Not-Equal operator*/ +template +static inline HOSTONLY bool operator!=(const DeviceVector& lhs, const DeviceVector& rhs) noexcept { + return !(rhs == lhs); +} +} // namespace split diff --git a/include/splitvector/split_allocators.h b/include/splitvector/split_allocators.h index ccff591..64284ea 100644 --- a/include/splitvector/split_allocators.h +++ b/include/splitvector/split_allocators.h @@ -77,7 +77,6 @@ class split_unified_allocator { */ split_unified_allocator() throw() {} - /** * @brief Copy constructor with different type. */ @@ -129,6 +128,91 @@ class split_unified_allocator { void destroy(pointer p) { p->~value_type(); } }; +/** + * @brief Custom allocator for device only memory + * @tparam T Type of the allocated objects. 
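 * @note Memory comes from split_gpuMalloc and is device-only, i.e. not dereferenceable
 *       on the host. allocate, allocate_raw and deallocate additionally provide
 *       split_gpuStream_t overloads that use the stream-ordered asynchronous
 *       malloc/free paths.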
+ */ +template +class split_device_allocator { +public: + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + template + struct rebind { + typedef split_device_allocator other; + }; + /** + * @brief Default constructor. + */ + split_device_allocator() throw() {} + + /** + * @brief Copy constructor. + */ + split_device_allocator(split_device_allocator const&) throw() {} + + /** + * @brief Copy constructor with different type. + */ + template + split_device_allocator(split_device_allocator const&) throw() {} + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } + + pointer allocate(size_type n, const void* /*hint*/ = 0) { + T* ret; + assert(n && "allocate 0"); + SPLIT_CHECK_ERR(split_gpuMalloc((void**)&ret, n * sizeof(value_type))); + if (ret == nullptr) { + throw std::bad_alloc(); + } + return ret; + } + + pointer allocate(size_type n, split_gpuStream_t stream, const void* /*hint*/ = 0) { + T* ret; + SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&ret, n * sizeof(value_type),stream)); + return ret; + } + + + static void* allocate_raw(size_type n, const void* /*hint*/ = 0) { + void* ret; + SPLIT_CHECK_ERR(split_gpuMalloc((void**)&ret, n)); + if (ret == nullptr) { + throw std::bad_alloc(); + } + return ret; + } + + static void* allocate_raw(size_type n, split_gpuStream_t stream, const void* /*hint*/ = 0) { + void* ret; + SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&ret, n,stream)); + return ret; + } + + void deallocate(void* p) { SPLIT_CHECK_ERR(split_gpuFree(p)); } + + void deallocate(void* p,split_gpuStream_t stream) { SPLIT_CHECK_ERR(split_gpuFreeAsync(p,stream)); } + + size_type max_size() const throw() { + size_type max = static_cast(-1) / sizeof(value_type); + return (max > 0 ? max : 1); + } + + template + __host__ __device__ void construct(U* p, Args&&... 
args) { + ::new (p) U(std::forward(args)...); + } + + void destroy(pointer p) { p->~value_type(); } +}; + #endif /** diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index e754965..05abaca 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -3,7 +3,7 @@ * Description: Set of tools used by SplitVector * * This file defines the following classes or functions: - * --split::tools::Cuda_mempool + * --split::tools::GPU_Mempool * --split::tools::copy_if_raw * --split::tools::copy_if * --split::tools::scan_reduce_raw @@ -31,6 +31,9 @@ * */ #pragma once #include "gpu_wrappers.h" +#ifndef SPLIT_CPU_ONLY_MODE +#include "devicevec.h" +#endif #define NUM_BANKS 32 // TODO depends on device #define LOG_NUM_BANKS 5 #define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS) @@ -500,7 +503,7 @@ __global__ void split_compact_raw(T* input, uint32_t* counts, uint32_t* offsets, * It uses async mallocs * */ -class Cuda_mempool { +class GPU_Mempool { private: size_t total_bytes; size_t bytes_used; @@ -509,24 +512,24 @@ class Cuda_mempool { bool isOwner; public: - explicit Cuda_mempool(size_t bytes, split_gpuStream_t str) { + explicit GPU_Mempool(size_t bytes, split_gpuStream_t str) { s = str; SPLIT_CHECK_ERR(split_gpuMallocAsync(&_data, bytes, s)); total_bytes = bytes; bytes_used = 0; isOwner = true; } - explicit Cuda_mempool(void* ptr, size_t bytes) { + explicit GPU_Mempool(void* ptr, size_t bytes) { total_bytes = bytes; bytes_used = 0; isOwner = false; _data = ptr; } - Cuda_mempool() = delete; - Cuda_mempool(const Cuda_mempool& other) = delete; - Cuda_mempool(Cuda_mempool&& other) = delete; - ~Cuda_mempool() { + GPU_Mempool() = delete; + GPU_Mempool(const GPU_Mempool& other) = delete; + GPU_Mempool(GPU_Mempool&& other) = delete; + ~GPU_Mempool() { if (isOwner) { SPLIT_CHECK_ERR(split_gpuFreeAsync(_data, s)); } @@ -565,7 +568,7 @@ __global__ void scan_reduce_raw(T* input, uint32_t* output, Rule rule, size_t si * @brief Same as split_prefix_scan but with raw memory */ template -void split_prefix_scan_raw(T* input, T* output, Cuda_mempool& mPool, const size_t input_size, split_gpuStream_t s = 0) { +void split_prefix_scan_raw(T* input, T* output, GPU_Mempool& mPool, const size_t input_size, split_gpuStream_t s = 0) { // Scan is performed in half Blocksizes size_t scanBlocksize = BLOCKSIZE / 2; @@ -612,49 +615,9 @@ void split_prefix_scan_raw(T* input, T* output, Cuda_mempool& mPool, const size_ } } -/** - * @brief Same as copy_if but using raw memory - */ -template -uint32_t copy_if_raw(split::SplitVector>& input, T* output, Rule rule, - size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { - - uint32_t* d_counts; - uint32_t* d_offsets; - d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); - - // Phase 1 -- Calculate per warp workload - size_t _size = input.size(); - split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); - d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); - - // Step 2 -- Exclusive Prefix Scan on offsets - if (nBlocks == 1) { - split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); - } else { - split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); - } - 
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - - // Step 3 -- Compaction - uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); - split::tools::split_compact_raw - <<>>( - input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - uint32_t numel; - SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - return numel; -} - template uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, - size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { + size_t nBlocks, GPU_Mempool& mPool, split_gpuStream_t s = 0) { uint32_t* d_counts; uint32_t* d_offsets; @@ -688,47 +651,6 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, return numel; } -/** - * @brief Same as copy_keys_if but using raw memory - */ -template -size_t copy_keys_if_raw(split::SplitVector>& input, U* output, Rule rule, - size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { - - uint32_t* d_counts; - uint32_t* d_offsets; - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); - - // Phase 1 -- Calculate per warp workload - size_t _size = input.size(); - split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); - d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); - - // Step 2 -- Exclusive Prefix Scan on offsets - if (nBlocks == 1) { - split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); - } else { - split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); - } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - - // Step 3 -- Compaction - uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); - split::tools::split_compact_keys_raw - <<>>( - input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - uint32_t numel; - SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - return numel; -} - /** * @brief Estimates memory needed for compacting the input splitvector */ @@ -746,9 +668,11 @@ estimateMemoryForCompaction(const split::SplitVector -[[nodiscard]] size_t -estimateMemoryForCompaction(const size_t inputSize) noexcept { +[[nodiscard]] size_t estimateMemoryForCompaction(const size_t inputSize) noexcept { // Figure out Blocks to use size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE); size_t nBlocks = nextPow2(_s); @@ -777,8 +701,8 @@ void copy_keys_if(split::SplitVector>& inpu // Allocate with Mempool const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); - Cuda_mempool mPool(memory_for_pool, s); - auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + GPU_Mempool mPool(memory_for_pool, s); + auto len = copy_keys_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } @@ -810,14 +734,14 @@ void copy_if(split::SplitVector>& input, // Allocate with Mempool const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); - Cuda_mempool mPool(memory_for_pool, s); - auto 
len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + GPU_Mempool mPool(memory_for_pool, s); + auto len = copy_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } template void copy_keys_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, Cuda_mempool&& mPool, + split::SplitVector>& output, Rule rule, GPU_Mempool&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -826,13 +750,14 @@ void copy_keys_if(split::SplitVector>& inpu if (nBlocks == 0) { nBlocks += 1; } - auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward(mPool), s); + auto len = + copy_keys_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, std::forward(mPool), s); output.erase(&output[len], output.end()); } template void copy_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, Cuda_mempool&& mPool, + split::SplitVector>& output, Rule rule, GPU_Mempool&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -841,7 +766,7 @@ void copy_if(split::SplitVector>& input, if (nBlocks == 0) { nBlocks += 1; } - auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + auto len = copy_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } @@ -857,8 +782,8 @@ void copy_keys_if(split::SplitVector>& inpu nBlocks += 1; } assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); - auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + GPU_Mempool mPool(stack, max_size); + auto len = copy_keys_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } @@ -874,25 +799,69 @@ void copy_if(split::SplitVector>& input, nBlocks += 1; } assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); - auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + GPU_Mempool mPool(stack, max_size); + auto len = copy_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } +#ifndef SPLIT_CPU_ONLY_MODE template -size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size, - split_gpuStream_t s = 0) { +void copy_if(split::DeviceVector& input, + split::DeviceVector& output, Rule rule, void* stack, size_t max_size, + split_gpuStream_t s = 0) { // Figure out Blocks to use - size_t _s = std::ceil((float(size)) / (float)BLOCKSIZE); + size_t _s = std::ceil((float(input.size())) / (float)BLOCKSIZE); size_t nBlocks = nextPow2(_s); if (nBlocks == 0) { nBlocks += 1; } assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); - auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); - return len; + GPU_Mempool mPool(stack, max_size); + auto len = copy_if_raw(input.data(), output.data(), input.size(), rule, nBlocks, mPool, s); + output.erase(output.data()+len, output.end()); +} +#endif + +template +[[nodiscard]] size_t copy_if(T* input, T* output, size_t inputSize, Rule rule, GPU_Mempool& mPool, + split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + return copy_if_raw(input, output, inputSize, rule, nBlocks, mPool, s); +} + +template +[[nodiscard]] size_t copy_if(T* input, T* output, size_t inputSize, Rule rule, void* 
stack, size_t max_size,split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + assert(stack && "Invalid stack!"); + GPU_Mempool mPool(stack, max_size); + return copy_if_raw(input, output, inputSize, rule, nBlocks, mPool, s); +} + +template +[[nodiscard]] size_t copy_if(T* input, T* output, size_t inputSize, Rule rule, split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(inputSize)) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + size_t mem = estimateMemoryForCompaction(inputSize); + GPU_Mempool mPool(mem, s); + return copy_if_raw(input, output, inputSize, rule, nBlocks, mPool, s); } } // namespace tools diff --git a/include/splitvector/splitvec.h b/include/splitvector/splitvec.h index 3803fa2..bbdf375 100644 --- a/include/splitvector/splitvec.h +++ b/include/splitvector/splitvec.h @@ -481,7 +481,7 @@ class SplitVector { // This is done because _capacity would page-fault otherwise as pointed by Markus SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); - if (*_capacity==0){ + if (*_capacity == 0) { return; } @@ -501,7 +501,7 @@ class SplitVector { SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_size, sizeof(size_t), split_gpuCpuDeviceId, stream)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream)); - if (*_capacity==0){ + if (*_capacity == 0) { return; } SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_data, capacity() * sizeof(T), split_gpuCpuDeviceId, stream)); @@ -525,9 +525,7 @@ class SplitVector { * Splitvector */ HOSTDEVICE - [[nodiscard]] inline Residency getResidency()const noexcept{ - return _location; - } + [[nodiscard]] inline Residency getResidency() const noexcept { return _location; } /** * @brief Copies metadata to a provided destination SplitInfo structure. 
@@ -1476,7 +1474,7 @@ class SplitVector { const int64_t end = p1.data() - begin().data(); const int64_t range = end - start; - const size_t sz=size(); + const size_t sz = size(); if constexpr (!std::is_trivial::value) { for (int64_t i = start; i < end; i++) { _data[i].~T(); diff --git a/meson.build b/meson.build index e48b066..5b59965 100644 --- a/meson.build +++ b/meson.build @@ -18,6 +18,7 @@ endif hashinator_unit = executable('hashmap_test', 'unit_tests/hashmap_unit_test/main.cu',dependencies :gtest_dep ) splitvector_device_unit = executable('splitvector_device_test', 'unit_tests/gtest_vec_device/vec_test.cu',dependencies :gtest_dep ) splitvector_host_unit = executable('splitvector_host_test', 'unit_tests/gtest_vec_host/vec_test.cu',dependencies :gtest_dep ) +splitvector_device_vector_unit = executable('splitvector_device_vector_test', 'unit_tests/gtest_dev_vec/vec_test.cu',dependencies :gtest_dep ) compaction_unit = executable('compaction_test', 'unit_tests/stream_compaction/race.cu',dependencies :gtest_dep ) compaction2_unit = executable('compaction2_test', 'unit_tests/stream_compaction/preallocated.cu', cuda_args:['--default-stream=per-thread','-Xcompiler','-fopenmp'],link_args : ['-fopenmp'],dependencies :gtest_dep) compaction3_unit = executable('compaction3_test', 'unit_tests/stream_compaction/unit.cu', cuda_args:'--default-stream=per-thread',link_args : ['-fopenmp'],dependencies :gtest_dep) @@ -29,12 +30,17 @@ insertion_mechanism = executable('insertion', 'unit_tests/insertion_mechanism/ma tombstoneTest = executable('tbPerf', 'unit_tests/benchmark/tbPerf.cu', dependencies :gtest_dep) realisticTest = executable('realistic', 'unit_tests/benchmark/realistic.cu', dependencies :gtest_dep) hybridGPU = executable('hybrid_gpu', 'unit_tests/hybrid/main.cu',dependencies :gtest_dep ) +unordered_set_unit = executable('unordered_set_test', 'unit_tests/unordered_set_unit_test/main.cu',dependencies :gtest_dep ) +unordered_set_unit_cpu = executable('unordered_set_test_cpu', 'unit_tests/unordered_set_unit_test/main.cpp',dependencies :gtest_dep ) #Test-Runner test('HashinatorTest', hashinator_unit) +test('UnorderedSetTest', unordered_set_unit) +test('UnorderedSetTestCPU', unordered_set_unit_cpu) test('SplitVectorDeviceTest', splitvector_device_unit) test('SplitVectorHostTest', splitvector_host_unit) +test('SplitVectorDeviceVectorTest',splitvector_device_vector_unit) test('CompactionTest', compaction_unit) test('CompactionTest2', compaction2_unit) test('CompactionTest3', compaction3_unit) diff --git a/unit_tests/Makefile b/unit_tests/Makefile index 2cf0177..14a74c4 100644 --- a/unit_tests/Makefile +++ b/unit_tests/Makefile @@ -7,7 +7,7 @@ EXTRA= --std=c++17 EXTRA+= -gencode arch=compute_60,code=sm_60 EXTRA+= -DHASHMAPDEBUG --expt-relaxed-constexpr --expt-extended-lambda -lpthread GTEST= -L/home/kstppd/libs/googletest/build/lib -I/home/kstppd/libs/googletest/googletest/include -lgtest -lgtest_main -lpthread -OBJ= gtest_vec_host.o gtest_vec_device.o gtest_hashmap.o stream_compaction.o stream_compaction2.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o +OBJ= gtest_vec_host.o gtest_vec_device.o gtest_device_vec.o gtest_hashmap.o stream_compaction.o stream_compaction2.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o default: tests @@ -19,6 +19,7 @@ allclean: rm ${OBJ} & rm gtestvechost & rm gtestvecdevice & + rm 
 	rm gtest_hashmap &
 	rm compaction &
 	rm compaction2 &
@@ -54,6 +55,9 @@ gtest_vec_host.o: gtest_vec_host/vec_test.cu
 gtest_vec_device.o: gtest_vec_device/vec_test.cu
 	${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o gtestvecdevice gtest_vec_device/vec_test.cu
 
+gtest_device_vec.o: gtest_dev_vec/vec_test.cu
+	${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o gtestdevicevec gtest_dev_vec/vec_test.cu
+
 stream_compaction.o: stream_compaction/race.cu
 	${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o compaction stream_compaction/race.cu
diff --git a/unit_tests/benchmark/loadFactor.cu b/unit_tests/benchmark/loadFactor.cu
index 01f4510..675f696 100644
--- a/unit_tests/benchmark/loadFactor.cu
+++ b/unit_tests/benchmark/loadFactor.cu
@@ -2,8 +2,7 @@
 #include
 #include
 #include
-#include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 constexpr int R = 2;
 using namespace std::chrono;
diff --git a/unit_tests/benchmark/main.cu b/unit_tests/benchmark/main.cu
index f4591df..4c36a2d 100644
--- a/unit_tests/benchmark/main.cu
+++ b/unit_tests/benchmark/main.cu
@@ -3,11 +3,11 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
+constexpr int R = 10;
 #include
 #define PROFILE_START(msg) nvtxRangePushA((msg))
 #define PROFILE_END() nvtxRangePop()
-constexpr int R = 50;
 using namespace std::chrono;
 using namespace Hashinator;
diff --git a/unit_tests/benchmark/realistic.cu b/unit_tests/benchmark/realistic.cu
index 310c135..64f257f 100644
--- a/unit_tests/benchmark/realistic.cu
+++ b/unit_tests/benchmark/realistic.cu
@@ -3,7 +3,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 constexpr int R = 10;
 using namespace std::chrono;
diff --git a/unit_tests/benchmark/tbPerf.cu b/unit_tests/benchmark/tbPerf.cu
index 5d8872a..f8c204c 100644
--- a/unit_tests/benchmark/tbPerf.cu
+++ b/unit_tests/benchmark/tbPerf.cu
@@ -3,7 +3,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 static constexpr int R = 10;
 using namespace std::chrono;
diff --git a/unit_tests/delete_by_compaction/main.cu b/unit_tests/delete_by_compaction/main.cu
index 6cc3e03..fb063e7 100644
--- a/unit_tests/delete_by_compaction/main.cu
+++ b/unit_tests/delete_by_compaction/main.cu
@@ -2,7 +2,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 #include
 #define BLOCKSIZE 32
diff --git a/unit_tests/gtest_dev_vec/vec_test.cu b/unit_tests/gtest_dev_vec/vec_test.cu
new file mode 100644
index 0000000..723d05b
--- /dev/null
+++ b/unit_tests/gtest_dev_vec/vec_test.cu
@@ -0,0 +1,389 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include "../../include/splitvector/splitvec.h"
+#include "../../include/splitvector/devicevec.h"
+#include "../../include/splitvector/split_tools.h"
+#define expect_true EXPECT_TRUE
+
+using vec_type_t = int;
+using vector = split::DeviceVector<vec_type_t>;
+
+
+void fill_vec(vector* v, size_t targetSize){
+   std::random_device rd;
+   std::mt19937 gen(rd());
+   std::uniform_int_distribution<vec_type_t> dist(1, std::numeric_limits<vec_type_t>::max());
+   while (v->size() < targetSize) {
+      vec_type_t val = dist(gen);
+      v->push_back(val);
+   }
+}
+
+void printVecStats(vector* v){
+   std::cout<<v->size()<<std::endl;
+   std::cout<<v->capacity()<<std::endl;
+   std::cout<<v->size()<<std::endl;
+   std::cout<<v->capacity()<<std::endl;
+   for (size_t i=0; i<v->size(); i++){
+      std::cout<<v->get(i)<<", ";
+   }
+   std::cout<<"\n-------------\n";
+}
+
+
+TEST(SpDeviceVector,Construction){
+
+   constexpr size_t N=1<<10;
+   vector* a=new vector;
+   expect_true(a->size()==0);
+   expect_true(a->capacity()==0);
+
+   vector* b=new vector(N);
+   expect_true(b->size()==N);
+   expect_true(b->capacity()==N);
+
+   vector* c =new vector(N);
+   vector* d =new vector(*c);
+   expect_true(d->size()==c->size());
+
+
+   std::vector<vec_type_t> s{1,2,3,4,5};
+   vector* e = new vector(s);
+   expect_true(e->size()==s.size());
+
+
+   split::SplitVector<vec_type_t> k{1,2,3,4,5};
+   vector* f = new vector(k);
+   expect_true(f->size()==k.size());
+   delete a;
+   delete b;
+   delete c;
+   delete d;
+   delete e;
+   delete f;
+}
+
+TEST(SpDeviceVector,AssignmentOperator){
+
+   constexpr size_t N=1<<10;
+   vector* a=new vector;
+   expect_true(a->size()==0);
+   expect_true(a->capacity()==0);
+
+   vector* b=new vector;
+   *b=*a;
+   expect_true(b->size()==0);
+   expect_true(b->capacity()==0);
+
+   vector* c =new vector(N);
+   c->reserve(10*N);
+   vector* d =new vector;
+   *d=*c;
+
+   expect_true(d->size()==c->size());
+
+   for (size_t i =0; i<d->size(); i++){
+      expect_true(d->get(i)==c->get(i));
+   }
+   expect_true((*d==*c));
+   expect_true(!(*d!=*c));
+   delete a;
+   delete b;
+   delete c;
+   delete d;
+}
+
+
+TEST(SpDeviceVector,SizeModifiers){
+
+   constexpr size_t N=(1<<10);
+   vector* a = new vector;
+   expect_true(a->size()==0);
+   expect_true(a->capacity()==0);
+   a->reserve(N);
+   expect_true(a->size()==0);
+   expect_true(a->capacity()>=N);
+   auto cap =a->capacity();
+   a->resize(N);
+   expect_true(a->size()==N);
+   expect_true(a->capacity()==cap);
+   delete a;
+}
+
+TEST(SpDeviceVector,Clear){
+
+   constexpr size_t N=(1<<10);
+   vector* a = new vector;
+   expect_true(a->size()==0);
+   expect_true(a->capacity()==0);
+   a->reserve(N);
+   expect_true(a->size()==0);
+   expect_true(a->capacity()>=N);
+   auto cap =a->capacity();
+   a->resize(N);
+   expect_true(a->size()==N);
+   expect_true(a->capacity()==cap);
+   a->clear();
+   expect_true(a->size()==0);
+   delete a;
+}
+
+TEST(SpDeviceVector,HostPushBack){
+
+   constexpr size_t N=(1<<10);
+   vector* a=new vector;
+   for (size_t i =0; i<N; i++){
+      a->push_back(i);
+   }
+
+   for (size_t i =0; i<N; i++){
+      expect_true(a->get(i)==i);
+   }
+
+   vector* b=new vector;
+   for (size_t i =0; i<N; i++){
+      b->push_back(i);
+   }
+
+   for (size_t i =0; i<N; i++){
+      expect_true(b->get(i)==i);
+   }
+
+   vector* c=new vector;
+   for (size_t i =0;i<10*N;i++){
+      c->push_back(i);
+   }
+
+   for (size_t i =0;i<10*N;i++){
+      expect_true(c->get(i)==i);
+   }
+   delete a;
+   delete b;
+   delete c;
+}
+
+__global__
+void kernel_set(vector* a){
+   int index = blockIdx.x * blockDim.x + threadIdx.x;
+   a->device_set(index,index);
+}
+
+__global__
+void kernel_at(vector* a){
+   int index = blockIdx.x * blockDim.x + threadIdx.x;
+   a->at(index)=index;
+}
+
+__global__
+void kernel_pushback(vector* a){
+   int index = blockIdx.x * blockDim.x + threadIdx.x;
+   a->device_push_back(index);
+}
+
+TEST(SpDeviceVector,DeviceSet){
+
+   constexpr size_t N=(1<<10);
+   vector* a=new vector;
+   a->resize(N);
+   kernel_set<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+
+   for (size_t i =0; i<N; i++){
+      expect_true(a->get(i)==i);
+   }
+   delete a;
+}
+
+TEST(SpDeviceVector,DeviceAt){
+
+   constexpr size_t N=(1<<10);
+   vector* a=new vector;
+   a->resize(N);
+   kernel_at<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+
+   for (size_t i =0; i<N; i++){
+      expect_true(a->get(i)==i);
+   }
+   delete a;
+}
+
+TEST(SpDeviceVector,DevicePushBack){
+
+   constexpr size_t N=(1<<10);
+   vector* a=new vector;
+   a->reserve(10*N);
+   kernel_pushback<<<10,N>>>(a);
+   split_gpuDeviceSynchronize();
+   expect_true(a->size()==10*N);
+   delete a;
+}
+
+TEST(SpDeviceVector,RemoveFromBack_PopBack_Host){
+   constexpr size_t N=32;
+   vector* a=new vector;
+   a->reserve(N);
+   kernel_pushback<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+   for (auto i= a->begin(); i!=a->end();++i){
+      a->set(i,a->get(i)*2);
+   }
+   a->remove_from_back(2);
+   expect_true(a->size()==N-2);
+   a->remove_from_back(1);
+   expect_true(a->size()==N-3);
+   a->pop_back();
+   expect_true(a->size()==N-4);
+   delete a;
+}
+
+TEST(SpDeviceVector,HostErase){
+   constexpr size_t N=32;
+   vector* a=new vector;
+   a->reserve(N);
+   kernel_pushback<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+   for (auto i= a->begin(); i!=a->end();++i){
+      a->set(i,a->get(i)*2);
+   }
+   auto it=a->begin();
+   a->erase(it);
+   expect_true(a->get(a->begin())==2);
+   delete a;
+}
+
+TEST(SpDeviceVector,HostInsert){
+   constexpr size_t N=32;
+   vector* a=new vector;
+   a->reserve(N);
+   kernel_pushback<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+   for (auto i= a->begin(); i!=a->end();++i){
+      a->set(i,a->get(i)*2);
+   }
+   a->insert(a->end(),63);
+   expect_true(a->back()==63);
+   a->insert(a->begin(),42);
+   expect_true(a->front()==42);
+   expect_true(a->size()==N+2);
+   delete a;
+}
+
+
+TEST(SpDeviceVector,HostInsertRange){
+   constexpr size_t N=32;
+   vector* a=new vector;
+   a->reserve(N);
+   kernel_pushback<<<1,N>>>(a);
+   split_gpuDeviceSynchronize();
+   for (auto i= a->begin(); i!=a->end();++i){
+      a->set(i,a->get(i)*2);
+   }
+   vector* b=new vector(N);
+   for (auto i= b->begin(); i!=b->end();++i){
+      b->set(i,1);
+   }
+
+   a->insert(a->end(),b->begin(),b->end());
+   a->insert(a->begin(),b->begin(),b->end());
+   for (int i=0;i<32;i++){
+      expect_true(a->get(i)==1);
+   }
+   for (int i=32;i<64;i++){
+      expect_true(a->get(i)>=0 && a->get(i)<63);
+   }
+   for (int i=64;i<96;i++){
+      expect_true(a->get(i)==1);
+   }
+   delete a;
+   delete b;
+}
+
+bool run_compaction_test(size_t sz){
+   vector* v=new vector;
+   fill_vec(v,sz);
+   auto predicate_on =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 == 0 ;};
+   auto predicate_off =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 != 0 ;};
+   vector* output1 = new vector(v->size());
+   vector* output2 = new vector(v->size());
+   const size_t len1 = split::tools::copy_if(v->data(),output1->data(),v->size(),predicate_on);
+   const size_t len2 = split::tools::copy_if(v->data(),output2->data(),v->size(),predicate_off);
+   auto r=v->size();
+   delete v;
+   delete output1;
+   delete output2;
+   return len1+len2==r;
+}
+
+bool run_compaction_test2(size_t sz){
+   vector* v=new vector;
+   void* stack=nullptr;
+   size_t maxBytes = split::tools::estimateMemoryForCompaction(sz);
+   SPLIT_CHECK_ERR (split_gpuMalloc( (void**)&stack ,maxBytes));
+   fill_vec(v,sz);
+   auto predicate_on =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 == 0 ;};
+   auto predicate_off =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 != 0 ;};
+   vector* output1 = new vector(v->size());
+   vector* output2 = new vector(v->size());
+   const size_t len1 = split::tools::copy_if(v->data(),output1->data(),v->size(),predicate_on,stack,maxBytes);
+   const size_t len2 = split::tools::copy_if(v->data(),output2->data(),v->size(),predicate_off,stack,maxBytes);
+   auto r=v->size();
+   delete v;
+   delete output1;
+   delete output2;
+   SPLIT_CHECK_ERR (split_gpuFree(stack));
+   return len1+len2==r;
+}
+
+bool run_compaction_test2_streams(size_t sz){
+   vector* v=new vector;
+   void* stack=nullptr;
+   size_t maxBytes = split::tools::estimateMemoryForCompaction(sz);
+   SPLIT_CHECK_ERR (split_gpuMalloc( (void**)&stack ,maxBytes));
+   fill_vec(v,sz);
+   auto predicate_on =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 == 0 ;};
+   auto predicate_off =[]__host__ __device__ (vec_type_t element)->bool{ return element%2 != 0 ;};
+   split_gpuStream_t s1,s2;
+   SPLIT_CHECK_ERR (split_gpuStreamCreate(&s1));
+   SPLIT_CHECK_ERR (split_gpuStreamCreate(&s2));
+   vector* output1 = new vector(v->size());
+   vector* output2 = new vector(v->size());
+   output1->setStream(s1);
+   output2->setStream(s2);
+   v->setStream(s1);
+   const size_t len1 = split::tools::copy_if(v->data(),output1->data(),v->size(),predicate_on,stack,maxBytes,s1);
+   v->setStream(s2);
+   const size_t len2 = split::tools::copy_if(v->data(),output2->data(),v->size(),predicate_off,stack,maxBytes,s2);
+   auto r=v->size();
+   delete v;
+   delete output1;
+   delete output2;
+
+   SPLIT_CHECK_ERR (split_gpuStreamDestroy(s1));
+   SPLIT_CHECK_ERR (split_gpuStreamDestroy(s2));
+   SPLIT_CHECK_ERR (split_gpuFree(stack));
+   return len1+len2==r;
+}
+
+TEST(SpDeviceVector,StreamCompaction){
+   for (size_t i = 100; i< 50000; i*=4){
+      expect_true(run_compaction_test(i));
+      expect_true(run_compaction_test2(i));
+      expect_true(run_compaction_test2_streams(i));
+   }
+}
+
+
+int main(int argc, char* argv[]){
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}
diff --git a/unit_tests/gtest_vec_device/vec_test.cu b/unit_tests/gtest_vec_device/vec_test.cu
index c6e663a..2f3336e 100644
--- a/unit_tests/gtest_vec_device/vec_test.cu
+++ b/unit_tests/gtest_vec_device/vec_test.cu
@@ -2,8 +2,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
diff --git a/unit_tests/gtest_vec_host/vec_test.cu b/unit_tests/gtest_vec_host/vec_test.cu
index efd65a6..3cc7ea0 100644
--- a/unit_tests/gtest_vec_host/vec_test.cu
+++ b/unit_tests/gtest_vec_host/vec_test.cu
@@ -6,7 +6,7 @@
 #ifndef SPLIT_CPU_ONLY_MODE
 #define SPLIT_CPU_ONLY_MODE
 #endif
-#include "../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/splitvec.h"
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
diff --git a/unit_tests/hashmap_unit_test/main.cu b/unit_tests/hashmap_unit_test/main.cu
index e9cfa86..f6e30b6 100644
--- a/unit_tests/hashmap_unit_test/main.cu
+++ b/unit_tests/hashmap_unit_test/main.cu
@@ -2,7 +2,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 #include
 #include
 #include
diff --git a/unit_tests/hybrid/main.cu b/unit_tests/hybrid/main.cu
index 83d09dd..732d284 100644
--- a/unit_tests/hybrid/main.cu
+++ b/unit_tests/hybrid/main.cu
@@ -2,7 +2,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 #include
diff --git a/unit_tests/insertion_mechanism/main.cu b/unit_tests/insertion_mechanism/main.cu
index d270fca..d186481 100644
--- a/unit_tests/insertion_mechanism/main.cu
+++ b/unit_tests/insertion_mechanism/main.cu
@@ -4,7 +4,7 @@
 #include
 #include
 #include
-#include "../../include/hashinator/hashinator.h"
+#include "../../include/hashinator/hashmap/hashmap.h"
 #include
 #define BLOCKSIZE 1024
diff --git a/unit_tests/pointer_test/main.cu b/unit_tests/pointer_test/main.cu
index bd777c6..56745b9 100644
--- a/unit_tests/pointer_test/main.cu
+++ b/unit_tests/pointer_test/main.cu
@@ -2,8 +2,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #define N 1024
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
diff --git a/unit_tests/stream_compaction/preallocated.cu b/unit_tests/stream_compaction/preallocated.cu
index 20b01af..ddb9895 100644
--- a/unit_tests/stream_compaction/preallocated.cu
+++ b/unit_tests/stream_compaction/preallocated.cu
@@ -5,8 +5,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
 #define expect_eq EXPECT_EQ
diff --git a/unit_tests/stream_compaction/race.cu b/unit_tests/stream_compaction/race.cu
index 5f9f4ea..a5b8e34 100644
--- a/unit_tests/stream_compaction/race.cu
+++ b/unit_tests/stream_compaction/race.cu
@@ -4,8 +4,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #include
 #include
 #include
diff --git a/unit_tests/stream_compaction/unit.cu b/unit_tests/stream_compaction/unit.cu
index 92f5568..0706bbd 100644
--- a/unit_tests/stream_compaction/unit.cu
+++ b/unit_tests/stream_compaction/unit.cu
@@ -4,8 +4,8 @@
 #include
 #include
 #include
-#include "../../include/splitvector/splitvec.h"
-#include "../../include/splitvector/split_tools.h"
+#include "../../../include/splitvector/splitvec.h"
+#include "../../../include/splitvector/split_tools.h"
 #define expect_true EXPECT_TRUE
 #define expect_false EXPECT_FALSE
 #define expect_eq EXPECT_EQ
diff --git a/unit_tests/unordered_set_unit_test/main.cpp b/unit_tests/unordered_set_unit_test/main.cpp
new file mode 100644
index 0000000..7325c07
--- /dev/null
+++ b/unit_tests/unordered_set_unit_test/main.cpp
@@ -0,0 +1,268 @@
+#include
+
+#define SPLIT_CPU_ONLY_MODE
+#define HASHINATOR_CPU_ONLY_MODE
+
+#include "../../include/hashinator/unordered_set/unordered_set.h"
+#include
+#include
+
+#define expect_true EXPECT_TRUE
+#define expect_false EXPECT_FALSE
+#define expect_eq EXPECT_EQ
+#define SMALL_SIZE (1<<10)
+#define LARGE_SIZE ( 1<<20 )
+
+using namespace Hashinator;
+typedef uint32_t key_type;
+typedef split::SplitVector<key_type> vector ;
+typedef Unordered_Set<key_type> UnorderedSet;
+
+
+bool isFreeOfDuplicates(const vector& v){
+
+   for (const auto & it : v){
+      auto cnt = std::count( v.begin() ,v.end(),it);
+      if (cnt>1 ){return false;}
+   }
+   return true ;
+}
+
+bool isFreeOfDuplicates( UnorderedSet* s){
+   vector out;
+   for (auto& k:*s ){
+      out.push_back(k);
+   }
+   expect_true(out.size()==s->size());
+   expect_true( isFreeOfDuplicates(out));
+   return true;
+}
+TEST(Unordered_UnitTest , Construction){
+   UnorderedSet s(12);
+   UnorderedSet s2=s;
+   expect_true(s2.bucket_count()==s.bucket_count());
+   UnorderedSet s3= UnorderedSet(12);
+   expect_true(s3.bucket_count()==1<<12);
+   expect_true(true);
+}
+
+TEST(Unordered_UnitTest , Construction_InitializerList){
+   UnorderedSet s{std::initializer_list<key_type>{1,2,3,4,1,2,3,4}};
+   expect_true(s.size()==4);
+}
+
+TEST(Unordered_UnitTest , Empty){
+   UnorderedSet s;
+   expect_true(s.size()==0);
+   expect_true(s.empty());
+   s.insert(1);
+   expect_false(s.empty());
+}
+
+TEST(Unordered_UnitTest , InsertFindHost){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      auto it = s.find(i);
+      expect_true(*it==i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+}
+
+TEST(Unordered_UnitTest , InsertHost){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==LARGE_SIZE);
+}
+
+
+TEST(Unordered_UnitTest , InsertKernel){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet s;
+   s.insert(v.data(),v.size()) ;
+   expect_true(s.size()==LARGE_SIZE);
+}
+
+
+TEST(Unordered_UnitTest , InsertEraseHost){
+   std::unordered_set<key_type> s;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==LARGE_SIZE);
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.erase(i);
+   }
+   expect_true(s.size()==0);
+}
+
+TEST(Unordered_UnitTest , Insert_Erase_Kernel){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet s;
+   s.insert(v.data(),v.size()) ;
+   expect_true(s.size()==LARGE_SIZE);
+   s.erase(v.data(),v.size()) ;
+   expect_true(s.size()==0);
+}
+
+TEST(Unordered_UnitTest , NewOverloadManagedMemory){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size()) ;
+   expect_true(s->size()==LARGE_SIZE);
+   s->erase(v.data(),v.size()) ;
+   expect_true(s->size()==0);
+   delete s;
+}
+
+TEST(Unordered_UnitTest , Contains_Count){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      expect_true(s.contains(i));
+      expect_false(s.contains(SMALL_SIZE+i));
+      expect_true(s.count(i)==1);
+   }
+}
+
+TEST(Unordered_UnitTest , InsertEraseHostSmall){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      auto it = s.find(i);
+      expect_true(*it==i);
+      s.erase(it);
+      auto it2 = s.find(i);
+      expect_true(it2==s.end());
+   }
+   expect_true(s.size()==0);
+   expect_true(s.tombstone_count()==SMALL_SIZE);
+
+   s.rehash();
+   expect_true(s.size()==0);
+   expect_true(s.tombstone_count()==0);
+
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      auto it = s.find(i);
+      expect_true(*it==i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+}
+
+
+TEST(Unordered_UnitTest , Clear){
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+
+      s->clear();
+      expect_true(s->size()==0);
+      expect_true(s->empty());
+      delete s;
+   }
+
+
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+
+      s->clear(targets::device);
+      expect_true(s->size()==0);
+      expect_true(s->empty());
+      delete s;
+   }
+}
+
+TEST(Unordered_UnitTest , Resize){
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size()) ;
+   expect_true(s->size()==SMALL_SIZE);
+   auto priorFill=s->size();
+
+   auto sizePower = s->getSizePower();
+   s->resize(sizePower+1,targets::host);
+   expect_true(s->size()==priorFill);
+   delete s;
+}
+
+TEST(Unordered_UnitTest , LoadFactorReduction){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   //At this point we are heavily overflown
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Let's clean up to get back to a proper overflow
+   s->performCleanupTasks();
+   expect_true( isFreeOfDuplicates(s) );
+   delete s;
+}
+
+TEST(Unordered_UnitTest , TombstoneCleaning){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   //At this point we are heavily overflown
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Erase half of the keys; this leaves tombstones behind
+   s->erase(v.data(),v.size()/2);
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   expect_true( isFreeOfDuplicates(s) );
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   s->performCleanupTasks();
+   expect_true( s->tombstone_count()==0);
+   delete s;
+}
+
+
+int main(int argc, char* argv[]){
+   srand(time(NULL));
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}
diff --git a/unit_tests/unordered_set_unit_test/main.cu b/unit_tests/unordered_set_unit_test/main.cu
new file mode 100644
index 0000000..5f551d9
--- /dev/null
+++ b/unit_tests/unordered_set_unit_test/main.cu
@@ -0,0 +1,415 @@
+#include
+#include "../../include/hashinator/unordered_set/unordered_set.h"
+#include
+#include
+
+#define expect_true EXPECT_TRUE
+#define expect_false EXPECT_FALSE
+#define expect_eq EXPECT_EQ
+#define SMALL 10
+#define LARGE 20
+#define SMALL_SIZE (1<<SMALL)
+#define LARGE_SIZE (1<<LARGE)
+
+using namespace Hashinator;
+typedef uint32_t key_type;
+typedef split::SplitVector<key_type> vector ;
+typedef Unordered_Set<key_type> UnorderedSet;
+
+
+bool isFreeOfDuplicates(const vector& v){
+
+   for (const auto & it : v){
+      auto cnt = std::count( v.begin() ,v.end(),it);
+      if (cnt>1 ){return false;}
+   }
+   return true ;
+}
+
+
+bool isFreeOfDuplicates( UnorderedSet* s){
+   vector out(s->size());
+   size_t count = s->extractAllKeys(out);
+   expect_true(count==s->size());
+   expect_true( isFreeOfDuplicates(out) );
+   return true;
+}
+
+__global__
+void gpu_write(UnorderedSet* s, key_type* src, size_t N){
+   size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   if (index < N ){
+      s->add_element(src[index]);
+   }
+}
+
+
+TEST(Unordered_UnitTest , Construction){
+   UnorderedSet s(12);
+   UnorderedSet s2=s;
+   expect_true(s2.bucket_count()==s.bucket_count());
+   UnorderedSet s3= UnorderedSet(12);
+   expect_true(s3.bucket_count()==1<<12);
+   expect_true(true);
+}
+
+TEST(Unordered_UnitTest , Construction_InitializerList){
+   UnorderedSet s{std::initializer_list<key_type>{1,2,3,4,1,2,3,4}};
+   expect_true(s.size()==4);
+}
+
+TEST(Unordered_UnitTest , Empty){
+   UnorderedSet s;
+   expect_true(s.size()==0);
+   expect_true(s.empty());
+   s.insert(1);
+   expect_false(s.empty());
+}
+
+TEST(Unordered_UnitTest , InsertFindHost){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      auto it = s.find(i);
+      expect_true(*it==i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+}
+
+TEST(Unordered_UnitTest , InsertHost){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==LARGE_SIZE);
+}
+
+
+TEST(Unordered_UnitTest , InsertKernel){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet s;
+   s.insert(v.data(),v.size()) ;
+   expect_true(s.size()==LARGE_SIZE);
+}
+
+
+TEST(Unordered_UnitTest , InsertEraseHost){
+   std::unordered_set<key_type> s;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==LARGE_SIZE);
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      s.erase(i);
+   }
+   expect_true(s.size()==0);
+}
+
+TEST(Unordered_UnitTest , Insert_Erase_Kernel){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet s;
+   s.insert(v.data(),v.size()) ;
+   expect_true(s.size()==LARGE_SIZE);
+   for (const auto& key:v){
+      expect_true(s.contains(key));
+   }
+   s.erase(v.data(),v.size()) ;
+   expect_true(s.size()==0);
+}
+
+TEST(Unordered_UnitTest , NewOverloadManagedMemory){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size()) ;
+   expect_true(s->size()==LARGE_SIZE);
+   s->erase(v.data(),v.size()) ;
+   expect_true(s->size()==0);
+   delete s;
+}
+
+TEST(Unordered_UnitTest , Contains_Count){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      expect_true(s.contains(i));
+      expect_false(s.contains(SMALL_SIZE+i));
+      expect_true(s.count(i)==1);
+   }
+}
+
+TEST(Unordered_UnitTest , InsertEraseHostSmall){
+   UnorderedSet s;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      auto it = s.find(i);
+      expect_true(*it==i);
+      s.erase(it);
+      auto it2 = s.find(i);
+      expect_true(it2==s.end());
+   }
+   expect_true(s.size()==0);
+   expect_true(s.tombstone_count()==SMALL_SIZE);
+
+   s.rehash();
+   expect_true(s.size()==0);
+   expect_true(s.tombstone_count()==0);
+
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      s.insert(i);
+      auto it = s.find(i);
+      expect_true(*it==i);
+   }
+   expect_true(s.size()==SMALL_SIZE);
+   expect_true(s.tombstone_count()==0);
+}
+
+TEST(Unordered_UnitTest , ExtractPattern){
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size()) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   vector out(SMALL_SIZE);
+   size_t count = s->extractAllKeys(out);
+   expect_true(count==s->size());
+   expect_true( isFreeOfDuplicates(out) );
+   delete s;
+}
+
+TEST(Unordered_UnitTest , Clear){
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+
+      s->clear();
+      expect_true(s->size()==0);
+      expect_true(s->empty());
+      delete s;
+   }
+
+
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+
+      s->clear(targets::device);
+      expect_true(s->size()==0);
+      expect_true(s->empty());
+      delete s;
+   }
+}
+
+TEST(Unordered_UnitTest , Resize){
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+      auto priorFill=s->size();
+
+      auto sizePower = s->getSizePower();
+      s->resize(sizePower+1,targets::host);
+      expect_true(s->size()==priorFill);
+   }
+
+   {
+      vector v;
+      for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+         v.push_back(i);
+      }
+      UnorderedSet* s = new UnorderedSet;
+      s->insert(v.data(),v.size()) ;
+      expect_true(s->size()==SMALL_SIZE);
+      auto priorFill=s->size();
+
+      auto sizePower = s->getSizePower();
+      s->resize(sizePower+1,targets::device);
+      expect_true(s->size()==priorFill);
+
+      vector out(SMALL_SIZE);
+      size_t count = s->extractAllKeys(out);
+      expect_true(count==s->size());
+      expect_true( isFreeOfDuplicates(out) );
+
+      delete s;
+   }
+}
+
+TEST(Unordered_UnitTest , LoadFactorReduction){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   //At this point we are heavily overflown
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Let's clean up to get back to a proper overflow
+   s->performCleanupTasks();
+   expect_true( isFreeOfDuplicates(s) );
+   delete s;
+}
+
+TEST(Unordered_UnitTest , TombstoneCleaning){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+   expect_true(s->size()==SMALL_SIZE);
+
+   //At this point we are heavily overflown
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Erase half of the keys; this leaves tombstones behind
+   s->erase(v.data(),v.size()/2);
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   expect_true( isFreeOfDuplicates(s) );
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   s->performCleanupTasks();
+   expect_true( s->tombstone_count()==0);
+   delete s;
+}
+
+TEST(Unordered_UnitTest , DeviceKernelWrite){
+
+   vector v;
+   for (uint32_t i = 0 ; i < SMALL_SIZE;++i){
+      v.push_back(i);
+   }
+   UnorderedSet* s = new UnorderedSet;
+   s->insert(v.data(),v.size(),1.0) ;
+
+   auto* ds= s->upload();
+   gpu_write<<>>(ds,v.data(),v.size());
+   split_gpuDeviceSynchronize();
+   s->download();
+   expect_true(s->size()==SMALL_SIZE);
+   expect_true( isFreeOfDuplicates(s) );
+
+   //Erase half of the keys; this leaves tombstones behind
+   s->erase(v.data(),v.size()/2);
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   expect_true( isFreeOfDuplicates(s) );
+   expect_true( s->tombstone_count()==SMALL_SIZE/2);
+   s->performCleanupTasks();
+   expect_true( s->tombstone_count()==0);
+   delete s;
+}
+
+__global__
+void gpu_write_warpWide(UnorderedSet* set,key_type* src,size_t N ){
+   size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t wid = index / Hashinator::defaults::WARPSIZE;
+   const size_t w_tid = index % defaults::WARPSIZE;
+   if (wid < N ){
+      key_type key= src[wid];
+      set->warpInsert(key,w_tid);
+   }
+}
+
+__global__
+void gpu_erase_warpWide(UnorderedSet* set,key_type* src,size_t N){
+   size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+   const size_t wid = index / Hashinator::defaults::WARPSIZE;
+   const size_t w_tid = index % defaults::WARPSIZE;
+   if (wid < N ){
+      key_type key= src[wid];
+      set->warpErase(key,w_tid);
+   }
+}
+
+TEST(Unordered_UnitTest ,WarpInsert){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   size_t N = v.size();
+   size_t blocksize=1024;
+   size_t blocks=N/blocksize;
+   size_t warpsize = Hashinator::defaults::WARPSIZE;
+   size_t threadsNeeded = N*warpsize;
+   blocks = threadsNeeded/blocksize;
+   UnorderedSet* s = new UnorderedSet(LARGE+1);
+   gpu_write_warpWide<<<blocks,blocksize>>>(s,v.data(),v.size());
+   split_gpuDeviceSynchronize();
+   for (const auto& key:v){
+      expect_true(s->contains(key));
+   }
+   delete s;
+}
+
+
+TEST(Unordered_UnitTest ,WarpInsertErase){
+   vector v;
+   for (uint32_t i = 0 ; i < LARGE_SIZE;++i){
+      v.push_back(i);
+   }
+   size_t N = v.size();
+   size_t blocksize=1024;
+   size_t blocks=N/blocksize;
+   size_t warpsize = Hashinator::defaults::WARPSIZE;
+   size_t threadsNeeded = N*warpsize;
+   blocks = threadsNeeded/blocksize;
+   UnorderedSet* s = new UnorderedSet;
+   s->resize(LARGE+1);
+   gpu_write_warpWide<<<blocks,blocksize>>>(s,v.data(),v.size());
+   split_gpuDeviceSynchronize();
+   for (const auto& key:v){
+      expect_true(s->contains(key));
+   }
+
+   gpu_erase_warpWide<<<blocks,blocksize>>>(s,v.data(),v.size());
+   split_gpuDeviceSynchronize();
+   for (const auto& key:v){
+      expect_false(s->contains(key));
+   }
+   expect_true(s->size()==0);
+   split_gpuDeviceSynchronize();
+
+
+   delete s;
+}
+
+int main(int argc, char* argv[]){
+   srand(time(NULL));
+   ::testing::InitGoogleTest(&argc, argv);
+   return RUN_ALL_TESTS();
+}