
Device resize does not construct elements unless necessary. #52

Merged · 10 commits · May 28, 2024
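The central change, visible in the hunks below: prefetch hints move from a trailing runtime `bool prefetches` argument to a compile-time template parameter, a device-side pointer to the `buckets` member is cached alongside `device_map`, and a rehash to the current size power now resets the existing buckets in place instead of constructing a replacement vector. A minimal calling-convention sketch, assuming a GPU build (the map and vector types are the ones in this diff; everything else is illustrative):

```cpp
#include "hashinator/hashinator.h"

using namespace Hashinator;

void example(split_gpuStream_t s) {
   Hashmap<uint32_t, uint32_t> hmap;
   split::SplitVector<uint32_t> keys;

   // Before this PR: prefetching was a trailing runtime argument.
   //   hmap.extractAllKeys(keys, s, /*prefetches=*/false);
   // After: it is a template parameter, resolved at compile time.
   hmap.extractAllKeys<false>(keys, s); // caller manages memory residency itself
   hmap.extractAllKeys(keys, s);        // default <true> keeps the old prefetching behavior
}
```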
114 changes: 76 additions & 38 deletions include/hashinator/hashinator.h
@@ -63,6 +63,7 @@ class Hashmap {
private:
// CUDA device handle
Hashmap* device_map;
split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>* device_buckets;
//~CUDA device handle

// Host members
@@ -83,6 +84,7 @@ class Hashmap {
void preallocate_device_handles() {
#ifndef HASHINATOR_CPU_ONLY_MODE
SPLIT_CHECK_ERR(split_gpuMalloc((void**)&device_map, sizeof(Hashmap)));
device_buckets = (split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>*)((char*)device_map + offsetof(Hashmap, buckets));
#endif
}
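`device_buckets` is derived once here with `offsetof`: it aliases the `buckets` member inside the device-resident copy of the map, so device-side loops can be handed the vector without copying the whole `Hashmap` around first. A standalone sketch of the pointer arithmetic with hypothetical types (note: `offsetof` on a non-standard-layout type like `Hashmap` is only conditionally supported, so the real code relies on compiler support):

```cpp
#include <cstddef>
#include <cuda_runtime.h>

struct Map {
   int meta;
   float payload; // stands in for the buckets member
};

int main() {
   Map* d_map = nullptr;
   cudaMalloc(reinterpret_cast<void**>(&d_map), sizeof(Map));
   // Host-side arithmetic only: no device memory is dereferenced here.
   float* d_payload = reinterpret_cast<float*>(
       reinterpret_cast<char*>(d_map) + offsetof(Map, payload));
   (void)d_payload; // valid to hand to kernels or memcpys expecting Map::payload
   cudaFree(d_map);
   return 0;
}
```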

@@ -94,6 +96,7 @@
#ifndef HASHINATOR_CPU_ONLY_MODE
SPLIT_CHECK_ERR(split_gpuFree(device_map));
device_map = nullptr;
device_buckets = nullptr;
#endif
}

@@ -107,6 +110,7 @@
*_mapInfo = MapInfo(5);
buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()));
SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
};

Hashmap(int sizepower) {
@@ -115,20 +119,23 @@
*_mapInfo = MapInfo(sizepower);
buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()));
SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
};

Hashmap(const Hashmap<KEY_TYPE, VAL_TYPE>& other) {
preallocate_device_handles();
_mapInfo = _metaAllocator.allocate(1);
*_mapInfo = *(other._mapInfo);
buckets = other.buckets;
SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
};

Hashmap(Hashmap<KEY_TYPE, VAL_TYPE>&& other) {
preallocate_device_handles();
_mapInfo = other._mapInfo;
other._mapInfo=nullptr;
buckets = std::move(other.buckets);
SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
};

Hashmap& operator=(const Hashmap<KEY_TYPE,VAL_TYPE>& other) {
@@ -137,6 +144,7 @@
}
*_mapInfo = *(other._mapInfo);
buckets = other.buckets;
SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
return *this;
}

@@ -146,7 +154,8 @@
return;
}
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_mapInfo,other._mapInfo, sizeof(MapInfo), split_gpuMemcpyDeviceToDevice, stream));
buckets.overwrite(other.buckets);
buckets.overwrite(other.buckets, stream);
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice, stream));
return;
}

@@ -157,7 +166,8 @@
_metaAllocator.deallocate(_mapInfo, 1);
_mapInfo = other._mapInfo;
other._mapInfo=nullptr;
buckets =std::move(other.buckets);
buckets = std::move(other.buckets);
SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
return *this;
}

@@ -246,11 +256,15 @@
buckets = newBuckets;
_mapInfo->currentMaxBucketOverflow = Hashinator::defaults::BUCKET_OVERFLOW;
_mapInfo->tombstoneCounter = 0;
#ifndef HASHINATOR_CPU_ONLY_MODE
SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
#endif
}

#ifndef HASHINATOR_CPU_ONLY_MODE
// Resize the table to fit more things. This is automatically invoked once
// maxBucketOverflow has triggered. This can only be done on host (so far)
template <bool prefetches = true>
void device_rehash(int newSizePower, split_gpuStream_t s = 0) {
if (newSizePower > 32) {
throw std::out_of_range("Hashmap ran into rehashing catastrophe and exceeded 32bit buckets.");
@@ -261,8 +275,10 @@
hash_pair<KEY_TYPE, VAL_TYPE>* validElements;
SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&validElements,
(_mapInfo->fill + 1) * sizeof(hash_pair<KEY_TYPE, VAL_TYPE>), s));
optimizeGPU(s);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
if (prefetches) {
optimizeGPU(s);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
}

auto isValidKey = [] __host__ __device__(hash_pair<KEY_TYPE, VAL_TYPE> & element) {
if (element.first != TOMBSTONE && element.first != EMPTYBUCKET) {
@@ -283,10 +299,16 @@
split_gpuFreeAsync(validElements, s);
return;
}
optimizeCPU(s);
buckets = std::move(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
1 << newSizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE())));
optimizeGPU(s);
if (newSizePower == _mapInfo->sizePower) {
// Just clear the current contents
DeviceHasher::reset_all(buckets.data(),_mapInfo, buckets.size(), s);
} else {
// Need new buckets
buckets = std::move(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
1 << newSizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE())));
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice, s));
optimizeGPU(s);
}
*_mapInfo = Info(newSizePower);
// Insert valid elements to now larger buckets
insert(validElements, nValidElements, 1, s);
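This branch is where the PR's title lands: when the requested size power equals the current one, the old code still built a brand-new `SplitVector` — value-constructing all `2^sizePower` elements host-side (hence the removed `optimizeCPU(s)`) before migrating pages back to the GPU — while the new code wipes the existing allocation in place with `reset_all`. A standalone CUDA analog of the pattern, with simplified, hypothetical names:

```cpp
#include <cuda_runtime.h>
#include <cstdint>

struct Bucket { uint32_t key, val; };
constexpr uint32_t EMPTY = 0xFFFFFFFFu;

__global__ void reset_all(Bucket* b, size_t n) {
   size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
   if (i < n) { b[i] = Bucket{EMPTY, 0}; }
}

void rehash_to(Bucket*& buckets, size_t& capacity, size_t newCapacity, cudaStream_t s) {
   if (newCapacity == capacity) {
      // Same capacity: one kernel over the existing allocation, no construction.
      reset_all<<<(capacity + 255) / 256, 256, 0, s>>>(buckets, capacity);
   } else {
      // Different capacity: only now pay for allocation plus reset.
      cudaFreeAsync(buckets, s);
      cudaMallocAsync(reinterpret_cast<void**>(&buckets), newCapacity * sizeof(Bucket), s);
      reset_all<<<(newCapacity + 255) / 256, 256, 0, s>>>(buckets, newCapacity);
      capacity = newCapacity;
   }
}
```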
@@ -430,7 +452,8 @@
return;
}
#else
void clear(targets t = targets::host, split_gpuStream_t s = 0, bool prefetches = true) {
template <bool prefetches = true>
void clear(targets t = targets::host, split_gpuStream_t s = 0, size_t len = 0) {
switch (t) {
case targets::host:
buckets =
@@ -440,9 +463,12 @@

case targets::device:
if (prefetches) {
buckets.optimizeGPU(s);
optimizeGPU(s);
}
DeviceHasher::reset_all(buckets.data(),_mapInfo, buckets.size(), s);
if (len==0) { // No caller-provided size: fall back to buckets.size(), which may page-fault.
len = buckets.size();
}
DeviceHasher::reset_all(buckets.data(),_mapInfo, len, s);
#ifdef HASHINATOR_DEBUG
set_status((_mapInfo->fill == 0) ? success : fail);
#endif
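The new optional `len` argument matters on unified memory: reading `buckets.size()` on the host can fault metadata pages back from the device right before the reset kernel pushes them over again. A caller that already tracks the capacity can sidestep that, sketched here with assumed names:

```cpp
// Sketch: the caller tracks the table capacity host-side.
const int sizePower = 10;                  // assumed known to the caller
size_t capacity = size_t(1) << sizePower;
hmap.clear<false>(targets::device, s, capacity); // no size() read, no prefetch
```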
@@ -561,6 +587,7 @@
buckets.swap(other.buckets);
std::swap(_mapInfo, other._mapInfo);
std::swap(device_map, other.device_map);
std::swap(device_buckets, other.device_buckets);
}

#ifdef HASHINATOR_CPU_ONLY_MODE
@@ -576,12 +603,13 @@
}
#else
// Try to get the overflow back to the original one
template <bool prefetches = true>
void performCleanupTasks(split_gpuStream_t s = 0) {
while (_mapInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) {
device_rehash(_mapInfo->sizePower + 1, s);
}
if (tombstone_ratio() > 0.025) {
clean_tombstones(s);
clean_tombstones<prefetches>(s);
}
while (_mapInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) {
device_rehash<prefetches>(_mapInfo->sizePower + 1, s);
}
}
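`performCleanupTasks` now forwards its template flag to both `clean_tombstones` and `device_rehash`, so a single compile-time knob disables the prefetch hints along the whole maintenance path:

```cpp
hmap.performCleanupTasks<false>(s); // no optimizeGPU()/optimizeCPU() hints issued below this call
```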

@@ -1080,9 +1108,9 @@
* Then call this:
* hmap.extractPattern(elements,Rule<uint32_t,uint32_t>());
* */
template <typename Rule>
template <typename Rule, bool prefetches = true>
size_t extractPattern(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>& elements, Rule rule,
split_gpuStream_t s = 0, bool prefetches = true) {
split_gpuStream_t s = 0) {
elements.resize(_mapInfo->fill + 1, true);
if (prefetches) {
elements.optimizeGPU(s);
@@ -1116,7 +1144,7 @@
void extractPatternLoop(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>& elements, Rule rule, split_gpuStream_t s = 0) {
// Extract elements matching the Pattern Rule(element)==true;
split::tools::copy_if_loop<hash_pair<KEY_TYPE, VAL_TYPE>, Rule, defaults::MAX_BLOCKSIZE,
defaults::WARPSIZE>(buckets, elements, rule, s);
defaults::WARPSIZE>(*device_buckets, elements, rule, s);
}
void extractLoop(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>& elements, split_gpuStream_t s = 0) {
// Extract all valid elements
@@ -1126,9 +1154,8 @@
extractPatternLoop(elements, rule, s);
}

template <typename Rule>
size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, split_gpuStream_t s = 0,
bool prefetches = true) {
template <typename Rule, bool prefetches = true>
size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, split_gpuStream_t s = 0) {
elements.resize(_mapInfo->fill + 1, true);
if (prefetches) {
elements.optimizeGPU(s);
@@ -1138,12 +1165,13 @@
defaults::WARPSIZE>(buckets, elements, rule, s);
//FIXME: there is an issue where paging to host occurs and following calls to hashmap operations take a hit.
//temp fix: call optimizeGPU() here
optimizeGPU(s);
if (prefetches) {
optimizeGPU(s);
}
return elements.size();
}
template <typename Rule>
size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, void *stack, size_t max_size, split_gpuStream_t s = 0,
bool prefetches = true) {
template <typename Rule, bool prefetches = true>
size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, void *stack, size_t max_size, split_gpuStream_t s = 0) {
elements.resize(_mapInfo->fill + 1, true);
if (prefetches) {
elements.optimizeGPU(s);
@@ -1157,17 +1185,19 @@
void extractKeysByPatternLoop(split::SplitVector<KEY_TYPE>& elements, Rule rule, split_gpuStream_t s = 0) {
// Extract element **keys** matching the Pattern Rule(element)==true;
split::tools::copy_if_keys_loop<hash_pair<KEY_TYPE, VAL_TYPE>, KEY_TYPE, Rule, defaults::MAX_BLOCKSIZE,
defaults::WARPSIZE>(buckets, elements, rule, s);
defaults::WARPSIZE>(*device_buckets, elements, rule, s);
}

size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, split_gpuStream_t s = 0, bool prefetches = true) {
template <bool prefetches = true>
size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, split_gpuStream_t s = 0) {
// Extract all keys
auto rule = [] __host__ __device__(const hash_pair<KEY_TYPE, VAL_TYPE>& kval) -> bool {
return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE;
};
return extractKeysByPattern(elements, rule, s, prefetches);
}
size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, void *stack, size_t max_size, split_gpuStream_t s = 0, bool prefetches = true) {
template <bool prefetches = true>
size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, void *stack, size_t max_size, split_gpuStream_t s = 0) {
// Extract all keys
auto rule = [] __host__ __device__(const hash_pair<KEY_TYPE, VAL_TYPE>& kval) -> bool {
return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE;
@@ -1182,7 +1212,8 @@
extractKeysByPatternLoop(elements, rule, s);
}

void clean_tombstones(split_gpuStream_t s = 0, bool prefetches = false) {
template <bool prefetches = true>
void clean_tombstones(split_gpuStream_t s = 0) {

if (_mapInfo->tombstoneCounter == 0) {
return;
@@ -1240,8 +1271,8 @@
}

// Uses Hasher's insert_kernel to insert all elements
void insert(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0,
bool prefetches = true) {
template <bool prefetches = true>
void insert(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0) {
// Here we estimate how much, if at all, we need to grow our buckets
// TODO: fix these if-paths or at least annotate them.
if (len == 0) {
@@ -1262,7 +1293,8 @@
}

// Uses Hasher's insert_index_kernel to insert all elements, with the index as the value
void insertIndex(KEY_TYPE* keys, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0, bool prefetches = true) {
template <bool prefetches = true>
void insertIndex(KEY_TYPE* keys, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0) {
// Here we estimate how much, if at all, we need to grow our buckets
// TODO: fix these if-paths or at least annotate them.
if (len == 0) {
@@ -1283,8 +1315,8 @@
}

// Uses Hasher's insert_kernel to insert all elements
void insert(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0,
bool prefetches = true) {
template <bool prefetches = true>
void insert(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0) {
if (len == 0) {
set_status(status::success);
return;
@@ -1303,7 +1335,8 @@
}

// Uses Hasher's retrieve_kernel to read all elements
void retrieve(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, split_gpuStream_t s = 0,bool prefetches=true) {
template <bool prefetches = true>
void retrieve(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, split_gpuStream_t s = 0) {
if (prefetches){
buckets.optimizeGPU(s);
}
@@ -1313,7 +1346,8 @@
}

// Uses Hasher's retrieve_kernel to read all elements
void retrieve(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, split_gpuStream_t s = 0, bool prefetches=true) {
template <bool prefetches = true>
void retrieve(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, split_gpuStream_t s = 0) {
if (prefetches){
buckets.optimizeGPU(s);
}
@@ -1322,7 +1356,8 @@
}

// Uses Hasher's erase_kernel to delete all elements
void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0,bool prefetches=true) {
template <bool prefetches = true>
void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) {
if (prefetches){
buckets.optimizeGPU(s);
}
@@ -1341,8 +1376,11 @@
* The pointer is internally cleaned up by the destructors, however the user **must**
* call download() after usage on device.
*/
template <bool prefetches = true>
Hashmap* upload(split_gpuStream_t stream = 0) {
optimizeGPU(stream);
if (prefetches) {
optimizeGPU(stream);
}
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice, stream));
return device_map;
}
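A sketch of the intended round trip with the handle returned above (the kernel is hypothetical, and the `download()` step is taken on faith from the doc comment; its signature is not shown in this diff):

```cpp
Hashmap<uint32_t, uint32_t>* dmap = hmap.upload(s); // default <true>: prefetch first
// my_kernel<<<blocks, 256, 0, s>>>(dmap);          // illustrative device-side use
dmap = hmap.upload<false>(s);                       // skip the prefetch when already resident
// Per the doc comment above, download() must be called after device usage.
```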
4 changes: 2 additions & 2 deletions include/splitvector/split_tools.h
@@ -1005,7 +1005,7 @@ void copy_if_loop(
split::SplitVector<T, split::split_unified_allocator<T>>& input,
split::SplitVector<T, split::split_unified_allocator<T>>& output,
Rule rule, split_gpuStream_t s = 0) {
#ifndef NDEBUG
#ifdef HASHINATOR_DEBUG
bool input_ok = isDeviceAccessible( reinterpret_cast<void*>(&input));
bool output_ok= isDeviceAccessible( reinterpret_cast<void*>(&output));
assert( (input_ok && output_ok) && "This method supports splitvectors dynamically allocated on device or unified memory!");
@@ -1018,7 +1018,7 @@ void copy_if_keys_loop(
split::SplitVector<T, split::split_unified_allocator<T>>& input,
split::SplitVector<U, split::split_unified_allocator<U>>& output,
Rule rule, split_gpuStream_t s = 0) {
#ifndef NDEBUG
#ifdef HASHINATOR_DEBUG
bool input_ok = isDeviceAccessible( reinterpret_cast<void*>(&input));
bool output_ok= isDeviceAccessible( reinterpret_cast<void*>(&output));
assert( (input_ok && output_ok) && "This method supports splitvectors dynamically allocated on device or unified memory!");
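Both device-accessibility asserts now key off the library's own `HASHINATOR_DEBUG` macro instead of the global `NDEBUG`, decoupling these potentially expensive checks from the application's assert policy. Illustrative build lines, assuming an nvcc build (the flags are an assumption, not part of this PR):

```cpp
// nvcc -O3 app.cu                      -> checks compiled out, even without -DNDEBUG
// nvcc -O3 -DHASHINATOR_DEBUG app.cu   -> device-accessibility asserts enabled
```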