Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Device Vector and Unordered Set Implementations #38

Closed
wants to merge 48 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
dbf4758
First commit of SplitDeviceVector + some unit tests for that
kstppd Oct 13, 2023
858be51
Equality operators
kstppd Oct 13, 2023
7032171
Account for host only modes
kstppd Oct 13, 2023
99799d1
Add device iterators
kstppd Oct 15, 2023
264da71
Modify copy_if signature to support T* instead of splitvectors for input
kstppd Oct 16, 2023
91671d9
Formatter pass
kstppd Oct 16, 2023
3668c0d
Enable stream compactions for SplitDeviceVector
kstppd Oct 16, 2023
e51066b
Rename test
kstppd Oct 16, 2023
ba3fb3c
Add host read only iterator...
kstppd Oct 17, 2023
ec853fc
Fix a small memory leak in the unit tests
kstppd Oct 17, 2023
0b57317
WIP more iterator functionality from SplitDeviceVector
kstppd Oct 18, 2023
453e993
WIP more functionallities
kstppd Oct 19, 2023
8751d54
WIP more functionallities...
kstppd Oct 19, 2023
31ab957
WIP more functionallities...
kstppd Oct 19, 2023
bd6f266
Bug fix. Insert range was using capacity and not size for resizing
kstppd Oct 19, 2023
7aed7c0
Range Insertion
kstppd Oct 20, 2023
6139ffd
Merge fixes from dev
kstppd Oct 24, 2023
dbeb97c
Add more overloads for compacting deviceVectors
kstppd Oct 24, 2023
043d0c0
And some more
kstppd Oct 24, 2023
6155653
Move SplitDeviceVector to its own header file and rename the class
kstppd Oct 30, 2023
b95ac83
Add non thread safe at() for device
kstppd Oct 31, 2023
3fbc99e
Merge branch 'dev' into deviceVector
kstppd Oct 31, 2023
3dc9bd3
Add [] operator for device code
kstppd Oct 31, 2023
c26f5b6
Make deviceVector allocator aware
kstppd Nov 2, 2023
e0f3a28
Merge branch 'dev' into deviceVector
kstppd Jan 5, 2024
e51fe66
Add device vector unit tests to meson
kstppd Jan 5, 2024
56ffab3
Turn memset to async variant in split tools compactions
kstppd Jan 6, 2024
1ffd8c4
Fix to warp wide erase
kstppd Jan 17, 2024
ce0ccc5
setStream method for deviceVector
kstppd Jan 21, 2024
d45af37
Renaming frenzy
kstppd Jan 29, 2024
6ea7745
Add unordered set file
kstppd Jan 29, 2024
2378156
formatter pass
kstppd Jan 29, 2024
b3e267c
another renaming and moving frenzy
kstppd Jan 29, 2024
b557725
Initial commit
kstppd Jan 29, 2024
72fae89
WIP
kstppd Jan 29, 2024
102b868
WIP 2 unordered set
kstppd Jan 30, 2024
41196d1
WIP 3 unordered set
kstppd Jan 31, 2024
4373f8e
WIP 4 unordered set
kstppd Jan 31, 2024
23be2b4
WIP 5 unordered set
kstppd Jan 31, 2024
eeed696
Oopss
kstppd Jan 31, 2024
6d77b0c
Merge dev updates to stream compaction
kstppd Feb 1, 2024
996a385
Fix splitvec's .data() method to not dereference with no reason
kstppd Feb 1, 2024
0646a5b
Add warp accessors to unordered set
kstppd Feb 5, 2024
8f37799
Add AMD backend for hasher kernels
kstppd Feb 5, 2024
ffeb724
Fix signature of reset_all_to_emtpy_set
kstppd Feb 5, 2024
8bf7650
Merge dev
kstppd Feb 5, 2024
19e8996
Merge device vector
kstppd Feb 5, 2024
247fad0
Fix split tools
kstppd Feb 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions include/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,9 @@ constexpr inline size_t nextPow2(size_t v) noexcept {
* @brief Computes the next optimal overflow for the hasher kernels
*/
HASHINATOR_HOSTDEVICE
[[nodiscard]]
constexpr inline size_t nextOverflow(size_t currentOverflow, size_t virtualWarp) noexcept {
size_t remainder = currentOverflow % virtualWarp;
return ((remainder)==0)?currentOverflow: currentOverflow + (virtualWarp - remainder);
[[nodiscard]] constexpr inline size_t nextOverflow(size_t currentOverflow, size_t virtualWarp) noexcept {
   // Round currentOverflow up to the next multiple of virtualWarp; already-aligned values pass through unchanged.
   const size_t excess = currentOverflow % virtualWarp;
   if (excess == 0) {
      return currentOverflow;
   }
   return currentOverflow + (virtualWarp - excess);
}

/**
Expand Down
63 changes: 63 additions & 0 deletions include/hashinator/hashers.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "../splitvector/gpu_wrappers.h"
#include "defaults.h"
#include "hashfunctions.h"
#include "hash_pair.h"
#ifdef __NVCC__
#include "kernels_NVIDIA.h"
#endif
Expand Down Expand Up @@ -160,6 +161,68 @@ class Hasher {
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
}


/* ----------------------------------- Members used by Hashinator::Unordered_Set -----------------------------------*/


static void insert_set(KEY_TYPE* keys, KEY_TYPE* buckets, int sizePower,size_t maxoverflow, size_t* d_overflow,
size_t* d_fill, size_t len, status* err,split_gpuStream_t s = 0) {
//Make sure this is being used by Unordered_Set
static_assert(std::is_same<void,VAL_TYPE>::value);
size_t blocks, blockSize;
*err = status::success;
launchParams(len, blocks, blockSize);
Hashinator::Hashers::insert_set_kernel<KEY_TYPE, EMPTYBUCKET, HashFunction, defaults::WARPSIZE,elementsPerWarp>
<<<blocks, blockSize, 0, s>>>(keys, buckets, sizePower, maxoverflow, d_overflow, d_fill, len, err);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
#ifndef NDEBUG
if (*err == status::fail) {
std::cerr << "***** Hashinator Runtime Warning ********" << std::endl;
std::cerr << "Warning: Hashmap completely overflown in Device Insert.\nNot all ellements were "
"inserted!\nConsider resizing before calling insert"
<< std::endl;
std::cerr << "******************************" << std::endl;
}
#endif
}

// Delete wrapper
static void erase_set(KEY_TYPE* keys, KEY_TYPE* buckets, size_t* d_tombstoneCounter, int sizePower,
size_t maxoverflow, size_t len, split_gpuStream_t s = 0) {

//Make sure this is being used by Unordered_Set
static_assert(std::is_same<void,VAL_TYPE>::value);
size_t blocks, blockSize;
launchParams(len, blocks, blockSize);
Hashinator::Hashers::delete_set_kernel<KEY_TYPE, EMPTYBUCKET, TOMBSTONE, HashFunction, defaults::WARPSIZE,
elementsPerWarp>
<<<blocks, blockSize, 0, s>>>(keys, buckets, d_tombstoneCounter, sizePower, maxoverflow, len);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
}

// Reset wrapper
static void reset_set(KEY_TYPE* src, KEY_TYPE* dst, const int sizePower,size_t maxoverflow,
size_t len, split_gpuStream_t s = 0) {
//Make sure this is being used by Unordered_Set
static_assert(std::is_same<void,VAL_TYPE>::value);
size_t blocks, blockSize;
launchParams(len, blocks, blockSize);
Hashinator::Hashers::reset_to_empty_set<KEY_TYPE, EMPTYBUCKET, HashFunction, defaults::WARPSIZE,
elementsPerWarp>
<<<blocks, blockSize, 0, s>>>(src, dst, sizePower, maxoverflow, len);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
}
// Reset wrapper for all elements
static void reset_all_set(KEY_TYPE* dst, size_t len, split_gpuStream_t s = 0) {
//Make sure this is being used by Unordered_Set
static_assert(std::is_same<void,VAL_TYPE>::value);
size_t blocksNeeded = len / defaults::MAX_BLOCKSIZE;
blocksNeeded = blocksNeeded + (blocksNeeded == 0);
reset_all_to_empty_set<KEY_TYPE, EMPTYBUCKET><<<blocksNeeded, defaults::MAX_BLOCKSIZE, 0, s>>>(dst, len);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
}


private:
static void launchParams(size_t N, size_t& blocks, size_t& blockSize) {
// fast ceil for positive ints
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* File: hashinator.h
/* File: hashmap.h
* Authors: Kostis Papadakis, Urs Ganse and Markus Battarbee (2023)
* Description: A hybrid hashmap that can operate on both
* CPUs and GPUs using CUDA unified memory.
Expand All @@ -24,20 +24,20 @@
#ifdef HASHINATOR_CPU_ONLY_MODE
#define SPLIT_CPU_ONLY_MODE
#endif
#include "../common.h"
#include "../splitvector/gpu_wrappers.h"
#include "../splitvector/split_allocators.h"
#include "../splitvector/splitvec.h"
#include "defaults.h"
#include "hash_pair.h"
#include "hashfunctions.h"
#include "../../common.h"
#include "../../splitvector/gpu_wrappers.h"
#include "../../splitvector/split_allocators.h"
#include "../../splitvector/splitvec.h"
#include "../defaults.h"
#include "../hash_pair.h"
#include "../hashfunctions.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <stdexcept>
#ifndef HASHINATOR_CPU_ONLY_MODE
#include "../splitvector/split_tools.h"
#include "hashers.h"
#include "../../splitvector/split_tools.h"
#include "../hashers.h"
#endif

namespace Hashinator {
Expand Down Expand Up @@ -99,7 +99,7 @@ class Hashmap {

// Deallocates the bookeepping info and the device pointer
void deallocate_device_handles() {
if (device_map==nullptr){
if (device_map == nullptr) {
return;
}
#ifndef HASHINATOR_CPU_ONLY_MODE
Expand Down Expand Up @@ -138,11 +138,11 @@ class Hashmap {
Hashmap(Hashmap<KEY_TYPE, VAL_TYPE>&& other) {
preallocate_device_handles();
_mapInfo = other._mapInfo;
other._mapInfo=nullptr;
other._mapInfo = nullptr;
buckets = std::move(other.buckets);
};

Hashmap& operator=(const Hashmap<KEY_TYPE,VAL_TYPE>& other) {
Hashmap& operator=(const Hashmap<KEY_TYPE, VAL_TYPE>& other) {
if (this == &other) {
return *this;
}
Expand All @@ -167,8 +167,8 @@ class Hashmap {
}
_metaAllocator.deallocate(_mapInfo, 1);
_mapInfo = other._mapInfo;
other._mapInfo=nullptr;
buckets =std::move(other.buckets);
other._mapInfo = nullptr;
buckets = std::move(other.buckets);
return *this;
}

Expand Down Expand Up @@ -847,13 +847,13 @@ class Hashmap {
if (w_tid == winner) {
KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey);
if (old == EMPTYBUCKET) {
threadOverflow =(probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex+1);
threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1);
split::s_atomicExch(&buckets[probingindex].second, candidateVal);
warpDone = 1;
split::s_atomicAdd(&_mapInfo->fill, 1);
if (threadOverflow > _mapInfo->currentMaxBucketOverflow) {
split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow),
(unsigned long long)nextOverflow(threadOverflow,defaults::WARPSIZE));
(unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE));
}
} else if (old == candidateKey) {
// Parallel stuff are fun. Major edge case!
Expand Down Expand Up @@ -931,14 +931,14 @@ class Hashmap {
if (w_tid == winner) {
KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey);
if (old == EMPTYBUCKET) {
threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex+1);
threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1);
split::s_atomicExch(&buckets[probingindex].second, candidateVal);
warpDone = 1;
localCount = 1;
split::s_atomicAdd(&_mapInfo->fill, 1);
if (threadOverflow > _mapInfo->currentMaxBucketOverflow) {
split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow),
(unsigned long long)nextOverflow(threadOverflow,defaults::WARPSIZE));
(unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE));
}
} else if (old == candidateKey) {
// Parallel stuff are fun. Major edge case!
Expand Down Expand Up @@ -1113,7 +1113,7 @@ class Hashmap {
split::tools::Cuda_mempool mPool(memory_for_pool, s);
size_t retval =
split::tools::copy_if_raw<hash_pair<KEY_TYPE, VAL_TYPE>, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>(
buckets, elements, rule, nBlocks, mPool, s);
buckets.data(), elements, buckets.size(), rule, nBlocks, mPool, s);
return retval;
}

Expand All @@ -1133,8 +1133,8 @@ class Hashmap {
return elements.size();
}
template <typename Rule>
size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, void *stack, size_t max_size, split_gpuStream_t s = 0,
bool prefetches = true) {
size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, void* stack, size_t max_size,
split_gpuStream_t s = 0, bool prefetches = true) {
elements.resize(_mapInfo->fill + 1, true);
if (prefetches) {
elements.optimizeGPU(s);
Expand All @@ -1152,7 +1152,8 @@ class Hashmap {
};
return extractKeysByPattern(elements, rule, s, prefetches);
}
size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, void *stack, size_t max_size, split_gpuStream_t s = 0, bool prefetches = true) {
size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, void* stack, size_t max_size, split_gpuStream_t s = 0,
bool prefetches = true) {
// Extract all keys
auto rule = [] __host__ __device__(const hash_pair<KEY_TYPE, VAL_TYPE>& kval) -> bool {
return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE;
Expand Down Expand Up @@ -1371,7 +1372,7 @@ class Hashmap {

public:
HASHINATOR_DEVICEONLY
device_iterator(Hashmap<KEY_TYPE, VAL_TYPE>& hashtable, size_t index) : index(index),hashtable(&hashtable) {}
device_iterator(Hashmap<KEY_TYPE, VAL_TYPE>& hashtable, size_t index) : index(index), hashtable(&hashtable) {}

HASHINATOR_DEVICEONLY
size_t getIndex() { return index; }
Expand Down Expand Up @@ -1418,7 +1419,7 @@ class Hashmap {
public:
HASHINATOR_DEVICEONLY
explicit const_device_iterator(const Hashmap<KEY_TYPE, VAL_TYPE>& hashtable, size_t index)
: index(index), hashtable(&hashtable){}
: index(index), hashtable(&hashtable) {}

HASHINATOR_DEVICEONLY
size_t getIndex() { return index; }
Expand Down Expand Up @@ -1629,7 +1630,8 @@ class Hashmap {
void set_element(const KEY_TYPE& key, VAL_TYPE val) {
size_t thread_overflowLookup = 0;
insert_element(key, val, thread_overflowLookup);
atomicMax((unsigned long long*)&(_mapInfo->currentMaxBucketOverflow), nextOverflow(thread_overflowLookup,defaults::WARPSIZE/defaults::elementsPerWarp));
atomicMax((unsigned long long*)&(_mapInfo->currentMaxBucketOverflow),
nextOverflow(thread_overflowLookup, defaults::WARPSIZE / defaults::elementsPerWarp));
}

HASHINATOR_DEVICEONLY
Expand Down
Loading
Loading