Device vector and unordered set functionalities #50

Open
wants to merge 58 commits into base: dev

Changes from all commits (58 commits)
dbf4758
First commit of SplitDeviceVector + some unit tests for that
kstppd Oct 13, 2023
858be51
Equality operators
kstppd Oct 13, 2023
7032171
Account for host only modes
kstppd Oct 13, 2023
99799d1
Add device iterators
kstppd Oct 15, 2023
264da71
Modify copy_if signature to support T* instead of splitvectors for input
kstppd Oct 16, 2023
91671d9
Formatter pass
kstppd Oct 16, 2023
3668c0d
Enable stream compactions for SplitDeviceVector
kstppd Oct 16, 2023
e51066b
Rename test
kstppd Oct 16, 2023
ba3fb3c
Add host read only iterator...
kstppd Oct 17, 2023
ec853fc
Fix a small memory leak in the unit tests
kstppd Oct 17, 2023
0b57317
WIP more iterator functionality from SplitDeviceVector
kstppd Oct 18, 2023
453e993
WIP more functionalities
kstppd Oct 19, 2023
8751d54
WIP more functionalities...
kstppd Oct 19, 2023
31ab957
WIP more functionalities...
kstppd Oct 19, 2023
bd6f266
Bug fix. Insert range was using capacity and not size for resizing
kstppd Oct 19, 2023
7aed7c0
Range Insertion
kstppd Oct 20, 2023
6139ffd
Merge fixes from dev
kstppd Oct 24, 2023
dbeb97c
Add more overloads for compacting deviceVectors
kstppd Oct 24, 2023
043d0c0
And some more
kstppd Oct 24, 2023
6155653
Move SplitDeviceVector to its own header file and rename the class
kstppd Oct 30, 2023
b95ac83
Add non thread safe at() for device
kstppd Oct 31, 2023
3fbc99e
Merge branch 'dev' into deviceVector
kstppd Oct 31, 2023
3dc9bd3
Add [] operator for device code
kstppd Oct 31, 2023
c26f5b6
Make deviceVector allocator aware
kstppd Nov 2, 2023
e0f3a28
Merge branch 'dev' into deviceVector
kstppd Jan 5, 2024
e51fe66
Add device vector unit tests to meson
kstppd Jan 5, 2024
56ffab3
Turn memset to async variant in split tools compactions
kstppd Jan 6, 2024
1ffd8c4
Fix to warp wide erase
kstppd Jan 17, 2024
ce0ccc5
setStream method for deviceVector
kstppd Jan 21, 2024
d45af37
Renaming frenzy
kstppd Jan 29, 2024
6ea7745
Add unordered set file
kstppd Jan 29, 2024
2378156
formatter pass
kstppd Jan 29, 2024
b3e267c
another renaming and moving frenzy
kstppd Jan 29, 2024
b557725
Initial commit
kstppd Jan 29, 2024
72fae89
WIP
kstppd Jan 29, 2024
102b868
WIP 2 unordered set
kstppd Jan 30, 2024
41196d1
WIP 3 unordered set
kstppd Jan 31, 2024
4373f8e
WIP 4 unordered set
kstppd Jan 31, 2024
23be2b4
WIP 5 unordered set
kstppd Jan 31, 2024
eeed696
Oops
kstppd Jan 31, 2024
6d77b0c
Merge dev updates to stream compaction
kstppd Feb 1, 2024
996a385
Fix splitvec's .data() method to not dereference for no reason
kstppd Feb 1, 2024
0646a5b
Add warp accessors to unordered set
kstppd Feb 5, 2024
8f37799
Add AMD backend for hasher kernels
kstppd Feb 5, 2024
ffeb724
Fix signature of reset_all_to_empty_set
kstppd Feb 5, 2024
760e63a
Merge unordered_set
kstppd Feb 5, 2024
daddb6d
Add warp accessor retrieval to benchmark script
kstppd Feb 6, 2024
359ed52
Cherry pick nodiscard fixes and fix it a bit more
markusbattarbee Feb 6, 2024
eedaf21
AMD backend for unordered set. So far tested only on Instinct.
kstppd Feb 20, 2024
b93708e
Make device vector accept stream as argument in methods rather than …
kstppd Mar 1, 2024
bfadb03
Formatting pass
kstppd Mar 1, 2024
8438852
Fix signed-unsigned error which was causing erroneous outputs from wa…
markusbattarbee Mar 28, 2024
29cd8c3
Fix signed-unsigned error which was causing erroneous outputs from wa…
markusbattarbee Mar 28, 2024
8725a4b
Merge device_vector and unordered_set
kstppd Apr 9, 2024
cd1ea1c
Merge dev
kstppd Apr 9, 2024
45dff96
Merge dev
kstppd Apr 9, 2024
ea5f152
Move hashmap and unordered_set one dir up
kstppd Apr 9, 2024
58b0eb5
nudge CI
kstppd Apr 9, 2024
1 change: 1 addition & 0 deletions README.md
@@ -157,3 +157,4 @@ Hashinator and SplitVector include a suite of unit tests using [googletest](http
## Credits for people who contributed but do not appear in the contribution list.
+ Special thanks to [Urs Ganse](https://github.com/ursg) for the initial CPU version of Hashinator.
+ Thanks to [Jaro Hokkanen](https://github.com/hokkanen). Hashinator's arch agnostic approach was inspired by his work.

2 changes: 2 additions & 0 deletions include/common.h
@@ -76,6 +76,8 @@ inline bool isDeviceAccessible(void* ptr){
}
return true;
#endif
+(void)ptr;
+return false;
}

/**
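The two `+` lines give the host-only build a defined result: with no device backend compiled in, the pointer is deliberately unused and the query reports that nothing is device accessible (previously this path apparently fell off the end of the function without returning). A minimal sketch of the resulting behavior; the include path is an assumption:

#include <cassert>
#include <cstdlib>
#include "common.h" // assumed include path for isDeviceAccessible

int main() {
   void* p = std::malloc(64);
   // CPU-only build: the new fallthrough returns false for any host pointer.
   assert(!isDeviceAccessible(p));
   std::free(p);
   return 0;
}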
63 changes: 63 additions & 0 deletions include/hashinator/hashers.h
@@ -23,6 +23,7 @@
#include "../splitvector/gpu_wrappers.h"
#include "defaults.h"
#include "hashfunctions.h"
#include "hash_pair.h"
#ifdef __NVCC__
#include "kernels_NVIDIA.h"
#endif
@@ -160,6 +161,68 @@ class Hasher {
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
}


/* ----------------------------------- Members used by Hashinator::Unordered_Set -----------------------------------*/


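// Insert wrapper: launches insert_set_kernel over len keys; a completely full
// table is reported through *err (checked after the stream sync in debug builds).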
static void insert_set(KEY_TYPE* keys, KEY_TYPE* buckets, int sizePower, size_t maxoverflow, size_t* d_overflow,
size_t* d_fill, size_t len, status* err, split_gpuStream_t s = 0) {
// Make sure this is being used by Unordered_Set
static_assert(std::is_same<void, VAL_TYPE>::value);
size_t blocks, blockSize;
*err = status::success;
launchParams(len, blocks, blockSize);
Hashinator::Hashers::insert_set_kernel<KEY_TYPE, EMPTYBUCKET, HashFunction, defaults::WARPSIZE,elementsPerWarp>
<<<blocks, blockSize, 0, s>>>(keys, buckets, sizePower, maxoverflow, d_overflow, d_fill, len, err);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
#ifndef NDEBUG
if (*err == status::fail) {
std::cerr << "***** Hashinator Runtime Warning ********" << std::endl;
std::cerr << "Warning: Hashmap completely overflown in Device Insert.\nNot all ellements were "
"inserted!\nConsider resizing before calling insert"
<< std::endl;
std::cerr << "******************************" << std::endl;
}
#endif
}

// Delete wrapper
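// (matching keys are flipped to TOMBSTONE and tallied in d_tombstoneCounter)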
static void erase_set(KEY_TYPE* keys, KEY_TYPE* buckets, size_t* d_tombstoneCounter, int sizePower,
size_t maxoverflow, size_t len, split_gpuStream_t s = 0) {

// Make sure this is being used by Unordered_Set
static_assert(std::is_same<void, VAL_TYPE>::value);
size_t blocks, blockSize;
launchParams(len, blocks, blockSize);
Hashinator::Hashers::delete_set_kernel<KEY_TYPE, EMPTYBUCKET, TOMBSTONE, HashFunction, defaults::WARPSIZE,
elementsPerWarp>
<<<blocks, blockSize, 0, s>>>(keys, buckets, d_tombstoneCounter, sizePower, maxoverflow, len);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
}

// Reset wrapper
static void reset_set(KEY_TYPE* src, KEY_TYPE* dst, const int sizePower, size_t maxoverflow,
size_t len, split_gpuStream_t s = 0) {
// Make sure this is being used by Unordered_Set
static_assert(std::is_same<void, VAL_TYPE>::value);
size_t blocks, blockSize;
launchParams(len, blocks, blockSize);
Hashinator::Hashers::reset_to_empty_set<KEY_TYPE, EMPTYBUCKET, HashFunction, defaults::WARPSIZE,
elementsPerWarp>
<<<blocks, blockSize, 0, s>>>(src, dst, sizePower, maxoverflow, len);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
}
// Reset wrapper for all elements
static void reset_all_set(KEY_TYPE* dst, size_t len, split_gpuStream_t s = 0) {
// Make sure this is being used by Unordered_Set
static_assert(std::is_same<void, VAL_TYPE>::value);
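// Grid sizing below: the integer division floors, and the second line raises
// a zero block count to one so that small inputs still launch a block.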
size_t blocksNeeded = len / defaults::MAX_BLOCKSIZE;
blocksNeeded = blocksNeeded + (blocksNeeded == 0);
reset_all_to_empty_set<KEY_TYPE, EMPTYBUCKET><<<blocksNeeded, defaults::MAX_BLOCKSIZE, 0, s>>>(dst, len);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
}


private:
static void launchParams(size_t N, size_t& blocks, size_t& blockSize) {
// fast ceil for positive ints
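Taken together, these wrappers give Hashinator::Unordered_Set the same host-side launch pattern the Hashmap already uses. Below is a hedged sketch of the intended call sequence: only the wrapper signatures come from this diff, while SetHasher (standing for any Hasher instantiation with VAL_TYPE = void, which the static_asserts enforce), the Hashinator::status spelling, the include path, and the CUDA-backend allocation calls are assumptions.

#include <cstdint>
#include <cuda_runtime.h>
#include "hashinator/hashers.h" // assumed include path

using key_t = uint32_t;

void bulk_set_roundtrip(key_t* keys, size_t len, int sizePower, split_gpuStream_t s) {
   const size_t nBuckets = size_t(1) << sizePower;
   key_t* buckets = nullptr;
   size_t *d_overflow = nullptr, *d_fill = nullptr, *d_tombstones = nullptr;
   Hashinator::status* err = nullptr; // namespace assumed
   // Unified memory: the wrappers sync the stream and then read these host-side.
   cudaMallocManaged(&buckets, nBuckets * sizeof(key_t));
   cudaMallocManaged(&d_overflow, sizeof(size_t));
   cudaMallocManaged(&d_fill, sizeof(size_t));
   cudaMallocManaged(&d_tombstones, sizeof(size_t));
   cudaMallocManaged(&err, sizeof(Hashinator::status));
   *d_overflow = 32; // illustrative initial probe window
   *d_fill = 0;
   *d_tombstones = 0;

   // Buckets must start out as EMPTYBUCKET; reset_all_set does that in one launch.
   SetHasher::reset_all_set(buckets, nBuckets, s);
   // Bulk insert; insert_set synchronizes s, so *err and *d_fill are valid here.
   SetHasher::insert_set(keys, buckets, sizePower, *d_overflow, d_overflow, d_fill, len, err, s);
   // Bulk erase of the same keys: hits become TOMBSTONE and are tallied.
   SetHasher::erase_set(keys, buckets, d_tombstones, sizePower, *d_overflow, len, s);
}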
36 changes: 19 additions & 17 deletions include/hashinator/hashinator.h → include/hashinator/hashmap.h
@@ -1,4 +1,4 @@
-/* File: hashinator.h
+/* File: hashmap.h
* Authors: Kostis Papadakis, Urs Ganse and Markus Battarbee (2023)
* Description: A hybrid hashmap that can operate on both
* CPUs and GPUs using CUDA unified memory.
@@ -99,7 +99,7 @@ class Hashmap {

// Deallocates the bookkeeping info and the device pointer
void deallocate_device_handles() {
-if (device_map==nullptr){
+if (device_map == nullptr) {
return;
}
#ifndef HASHINATOR_CPU_ONLY_MODE
@@ -138,11 +138,11 @@ class Hashmap {
Hashmap(Hashmap<KEY_TYPE, VAL_TYPE>&& other) {
preallocate_device_handles();
_mapInfo = other._mapInfo;
-other._mapInfo=nullptr;
+other._mapInfo = nullptr;
buckets = std::move(other.buckets);
};

-Hashmap& operator=(const Hashmap<KEY_TYPE,VAL_TYPE>& other) {
+Hashmap& operator=(const Hashmap<KEY_TYPE, VAL_TYPE>& other) {
if (this == &other) {
return *this;
}
@@ -167,8 +167,8 @@
}
_metaAllocator.deallocate(_mapInfo, 1);
_mapInfo = other._mapInfo;
-other._mapInfo=nullptr;
-buckets =std::move(other.buckets);
+other._mapInfo = nullptr;
+buckets = std::move(other.buckets);
return *this;
}

@@ -853,13 +853,13 @@ class Hashmap {
if (w_tid == winner) {
KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey);
if (old == EMPTYBUCKET) {
-threadOverflow =(probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex+1);
+threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1);
split::s_atomicExch(&buckets[probingindex].second, candidateVal);
warpDone = 1;
split::s_atomicAdd(&_mapInfo->fill, 1);
if (threadOverflow > _mapInfo->currentMaxBucketOverflow) {
split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow),
-(unsigned long long)nextOverflow(threadOverflow,defaults::WARPSIZE));
+(unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE));
}
} else if (old == candidateKey) {
// Parallel stuff is fun. Major edge case!
@@ -937,14 +937,14 @@ class Hashmap {
if (w_tid == winner) {
KEY_TYPE old = split::s_atomicCAS(&buckets[probingindex].first, EMPTYBUCKET, candidateKey);
if (old == EMPTYBUCKET) {
-threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex+1);
+threadOverflow = (probingindex < optimalindex) ? (1 << sizePower) : (probingindex - optimalindex + 1);
split::s_atomicExch(&buckets[probingindex].second, candidateVal);
warpDone = 1;
localCount = 1;
split::s_atomicAdd(&_mapInfo->fill, 1);
if (threadOverflow > _mapInfo->currentMaxBucketOverflow) {
split::s_atomicExch((unsigned long long*)(&_mapInfo->currentMaxBucketOverflow),
-(unsigned long long)nextOverflow(threadOverflow,defaults::WARPSIZE));
+(unsigned long long)nextOverflow(threadOverflow, defaults::WARPSIZE));
}
} else if (old == candidateKey) {
// Parallel stuff is fun. Major edge case!
@@ -1119,7 +1119,7 @@ class Hashmap {
split::tools::splitStackArena mPool(memory_for_pool, s);
size_t retval =
split::tools::copy_if_raw<hash_pair<KEY_TYPE, VAL_TYPE>, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>(
-buckets, elements, rule, nBlocks, mPool, s);
+buckets.data(), elements, buckets.size(), rule, nBlocks, mPool, s);
return retval;
}
template <typename Rule>
@@ -1152,8 +1152,8 @@ class Hashmap {
return elements.size();
}
template <typename Rule>
-size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, void *stack, size_t max_size, split_gpuStream_t s = 0,
-bool prefetches = true) {
+size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, void* stack, size_t max_size,
+split_gpuStream_t s = 0, bool prefetches = true) {
elements.resize(_mapInfo->fill + 1, true);
if (prefetches) {
elements.optimizeGPU(s);
@@ -1177,7 +1177,8 @@
};
return extractKeysByPattern(elements, rule, s, prefetches);
}
-size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, void *stack, size_t max_size, split_gpuStream_t s = 0, bool prefetches = true) {
+size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, void* stack, size_t max_size, split_gpuStream_t s = 0,
+bool prefetches = true) {
// Extract all keys
auto rule = [] __host__ __device__(const hash_pair<KEY_TYPE, VAL_TYPE>& kval) -> bool {
return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE;
@@ -1409,7 +1410,7 @@ class Hashmap {

public:
HASHINATOR_DEVICEONLY
-device_iterator(Hashmap<KEY_TYPE, VAL_TYPE>& hashtable, size_t index) : index(index),hashtable(&hashtable) {}
+device_iterator(Hashmap<KEY_TYPE, VAL_TYPE>& hashtable, size_t index) : index(index), hashtable(&hashtable) {}

HASHINATOR_DEVICEONLY
size_t getIndex() { return index; }
@@ -1456,7 +1457,7 @@ class Hashmap {
public:
HASHINATOR_DEVICEONLY
explicit const_device_iterator(const Hashmap<KEY_TYPE, VAL_TYPE>& hashtable, size_t index)
-: index(index), hashtable(&hashtable){}
+: index(index), hashtable(&hashtable) {}

HASHINATOR_DEVICEONLY
size_t getIndex() { return index; }
@@ -1667,7 +1668,8 @@ class Hashmap {
void set_element(const KEY_TYPE& key, VAL_TYPE val) {
size_t thread_overflowLookup = 0;
insert_element(key, val, thread_overflowLookup);
-atomicMax((unsigned long long*)&(_mapInfo->currentMaxBucketOverflow), nextOverflow(thread_overflowLookup,defaults::WARPSIZE/defaults::elementsPerWarp));
+atomicMax((unsigned long long*)&(_mapInfo->currentMaxBucketOverflow),
+nextOverflow(thread_overflowLookup, defaults::WARPSIZE / defaults::elementsPerWarp));
}

HASHINATOR_DEVICEONLY
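The other user-visible change in this file is the pair of extraction overloads that take a caller-provided scratch buffer (stack, max_size) instead of allocating one internally, so hot loops can reuse a single allocation across calls. A hedged sketch of a call site follows; the overload signatures and the rule-lambda shape match the diff above, while the instantiation, sizing, and managed allocation are illustrative assumptions:

Hashmap<uint32_t, uint32_t> hmap;                 // assumed instantiation
split::SplitVector<uint32_t> keys;                // resized internally by the call

size_t max_size = size_t(1) << 20;                // illustrative scratch size in bytes
void* stack = nullptr;
cudaMallocManaged(&stack, max_size);              // CUDA backend assumed

// Keep the keys of entries whose value passes a predicate.
auto rule = [] __host__ __device__(const hash_pair<uint32_t, uint32_t>& kval) -> bool {
   return kval.second > 42;
};
size_t nMatched = hmap.extractKeysByPattern(keys, rule, stack, max_size);

// Or take every live key (EMPTYBUCKET and TOMBSTONE are skipped internally):
size_t nAll = hmap.extractAllKeys(keys, stack, max_size);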