From e0c4392445a75b867daf7a0bbb7b08f266211631 Mon Sep 17 00:00:00 2001 From: Markus Battarbee Date: Tue, 6 Feb 2024 11:34:17 +0200 Subject: [PATCH 01/10] Remove nodiscard options which are breaking compilation on lumi --- include/common.h | 2 +- include/splitvector/splitvec.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/common.h b/include/common.h index 70c792a..83abefe 100644 --- a/include/common.h +++ b/include/common.h @@ -50,7 +50,7 @@ constexpr inline size_t nextPow2(size_t v) noexcept { * @brief Computes the next optimal overflow for the hasher kernels */ HASHINATOR_HOSTDEVICE -[[nodiscard]] +//[[nodiscard]] constexpr inline size_t nextOverflow(size_t currentOverflow, size_t virtualWarp) noexcept { size_t remainder = currentOverflow % virtualWarp; return ((remainder)==0)?currentOverflow: currentOverflow + (virtualWarp - remainder); diff --git a/include/splitvector/splitvec.h b/include/splitvector/splitvec.h index 3803fa2..10290a5 100644 --- a/include/splitvector/splitvec.h +++ b/include/splitvector/splitvec.h @@ -524,10 +524,10 @@ class SplitVector { * @brief Returns the residency information of this * Splitvector */ - HOSTDEVICE - [[nodiscard]] inline Residency getResidency()const noexcept{ - return _location; - } + // HOSTDEVICE + // [[nodiscard]] inline Residency getResidency()const noexcept{ + // return _location; + // } /** * @brief Copies metadata to a provided destination SplitInfo structure. From f266573c61f7bed57528c867e9be4861cdd26817 Mon Sep 17 00:00:00 2001 From: kstppd Date: Wed, 7 Feb 2024 13:12:05 +0200 Subject: [PATCH 02/10] Reduce stream syncs in split tools --- include/splitvector/split_tools.h | 23 ++----- meson.build | 2 + unit_tests/stream_compaction/bench.cu | 93 +++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 16 deletions(-) create mode 100644 unit_tests/stream_compaction/bench.cu diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index e754965..c319826 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -622,14 +622,12 @@ uint32_t copy_if_raw(split::SplitVector>& i uint32_t* d_counts; uint32_t* d_offsets; d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); // Phase 1 -- Calculate per warp workload size_t _size = input.size(); split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); // Step 2 -- Exclusive Prefix Scan on offsets @@ -638,17 +636,14 @@ uint32_t copy_if_raw(split::SplitVector>& i } else { split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); // Step 3 -- Compaction uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::split_compact_raw <<>>( input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -659,13 +654,11 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, uint32_t* d_counts; uint32_t* d_offsets; d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s)); // Phase 1 -- Calculate per warp workload split::tools::scan_reduce_raw<<>>(input, d_counts, rule, size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); // Step 2 -- Exclusive Prefix Scan on offsets @@ -674,17 +667,14 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, } else { split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); // Step 3 -- Compaction uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::split_compact_raw <<>>( input, d_counts, d_offsets, output, rule, size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -697,16 +687,13 @@ size_t copy_keys_if_raw(split::SplitVector> uint32_t* d_counts; uint32_t* d_offsets; - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); // Phase 1 -- Calculate per warp workload size_t _size = input.size(); split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); // Step 2 -- Exclusive Prefix Scan on offsets @@ -715,17 +702,14 @@ size_t copy_keys_if_raw(split::SplitVector> } else { split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); // Step 3 -- Compaction uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::split_compact_keys_raw <<>>( input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -779,6 +763,7 @@ void copy_keys_if(split::SplitVector>& inpu const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); Cuda_mempool mPool(memory_for_pool, s); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -812,6 +797,7 @@ void copy_if(split::SplitVector>& input, const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); Cuda_mempool mPool(memory_for_pool, s); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -827,6 +813,7 @@ void copy_keys_if(split::SplitVector>& inpu nBlocks += 1; } auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward(mPool), s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -842,6 +829,7 @@ void copy_if(split::SplitVector>& input, nBlocks += 1; } auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -859,6 +847,7 @@ void copy_keys_if(split::SplitVector>& inpu assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -876,6 +865,7 @@ void copy_if(split::SplitVector>& input, assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -892,6 +882,7 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return len; } diff --git a/meson.build b/meson.build index e48b066..5c10eb6 100644 --- a/meson.build +++ b/meson.build @@ -24,6 +24,7 @@ compaction3_unit = executable('compaction3_test', 'unit_tests/stream_compaction/ pointer_unit = executable('pointer_test', 'unit_tests/pointer_test/main.cu',dependencies :gtest_dep ) hybridCPU = executable('hybrid_cpu', 'unit_tests/hybrid/main.cu',cpp_args:'-DHASHINATOR_CPU_ONLY_MODE',dependencies :gtest_dep ) hashinator_bench = executable('bench', 'unit_tests/benchmark/main.cu', dependencies :gtest_dep,link_args:'-lnvToolsExt') +compaction_bench = executable('streamBench', 'unit_tests/stream_compaction/bench.cu' ,link_args:'-lnvToolsExt') deletion_mechanism = executable('deletion', 'unit_tests/delete_by_compaction/main.cu', dependencies :gtest_dep) insertion_mechanism = executable('insertion', 'unit_tests/insertion_mechanism/main.cu', dependencies :gtest_dep) tombstoneTest = executable('tbPerf', 'unit_tests/benchmark/tbPerf.cu', dependencies :gtest_dep) @@ -39,6 +40,7 @@ test('CompactionTest', compaction_unit) test('CompactionTest2', compaction2_unit) test('CompactionTest3', compaction3_unit) test('HashinatorBench', hashinator_bench) +test('CompactionBench', compaction_bench) test('Insertion', insertion_mechanism) test('Deletion', deletion_mechanism) test('PointerTest', pointer_unit) diff --git a/unit_tests/stream_compaction/bench.cu b/unit_tests/stream_compaction/bench.cu new file mode 100644 index 0000000..74b8ba6 --- /dev/null +++ b/unit_tests/stream_compaction/bench.cu @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include +#include "../../include/splitvector/splitvec.h" +#include "../../include/splitvector/split_tools.h" +#include +#include +#include +#include +using namespace std::chrono; +using type_t = uint32_t; +using splitvector = split::SplitVector ; +using thrustvector = thrust::device_vector ; +constexpr int R = 100; +#define PROFILE_START(msg) nvtxRangePushA((msg)) +#define PROFILE_END() nvtxRangePop() + +template +auto timeMe(Fn fn, Args && ... args){ + std::chrono::time_point start,stop; + double total_time=0; + start = std::chrono::high_resolution_clock::now(); + fn(args...); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + total_time+=duration; + return total_time; +} + +template +void fillVec(T& vec,size_t sz){ + std::random_device dev; + std::mt19937 rng(dev()); + std::uniform_int_distribution dist(0,std::numeric_limits::max()); + for (size_t i=0; i< sz;++i){ + vec[i]=i;//dist(rng); + } + return; +} + + +void stream_compaction_split(splitvector& v,splitvector& output, type_t* stack, size_t sz){ + auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; + split::tools::copy_if(v.data(),output.data(),sz,pred,(void*)stack,sz); +} + +void stream_compaction_thrust(thrustvector& v,thrustvector& output){ + auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; + thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), pred); +} + +int main(int argc, char* argv[]){ + + + int sz=10; + if (argc>=2){ + sz=atoi(argv[1]); + } + size_t N = 1< Date: Mon, 19 Feb 2024 11:48:50 +0200 Subject: [PATCH 03/10] WIP in faster stream compactions --- include/splitvector/gpu_wrappers.h | 21 +++++++++ include/splitvector/split_tools.h | 63 +++++++++++++++++++++++++++ unit_tests/benchmark/main.cu | 24 +++++----- unit_tests/stream_compaction/bench.cu | 53 +++++++++++++--------- 4 files changed, 129 insertions(+), 32 deletions(-) diff --git a/include/splitvector/gpu_wrappers.h b/include/splitvector/gpu_wrappers.h index b7d7a6e..fa00d3d 100644 --- a/include/splitvector/gpu_wrappers.h +++ b/include/splitvector/gpu_wrappers.h @@ -261,5 +261,26 @@ __device__ __forceinline__ T s_shuffle_down(T variable, unsigned int delta, U ma return __shfl_down(variable, delta); #endif } + +/** + * @brief Wrapper for performing an up register shuffle operation. + * + * @tparam T The data type of the variable. + * @tparam U The data type of the mask. + * @param variable The variable to shuffle. + * @param delta The offset. + * @param mask Voting mask. + * @return The shuffled variable. + */ +template +__device__ __forceinline__ T s_shuffle_up(T variable, unsigned int delta, U mask = 0) noexcept { + static_assert(std::is_integral::value && "Only integers supported"); +#ifdef __NVCC__ + return __shfl_up_sync(mask, variable, delta); +#endif +#ifdef __HIP__ + return __shfl_up(variable, delta); +#endif +} } // namespace split #endif diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index c319826..58d01c3 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -886,5 +886,68 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t return len; } +template +__global__ void blockCompact(T* input,T* output,size_t inputSize,Rule rule) +{ + __shared__ uint32_t warpSums[WARPLENGTH]; + constexpr int VIRTUALWARP = WARPLENGTH; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid /WARPLENGTH; + const size_t w_tid = tid % WARPLENGTH; + //full warp votes for rule-> mask = [01010101010101010101010101010101] + const int active=rule(input[tid]); + const auto mask = split::s_warpVote(active==1,SPLIT_VOTING_MASK); + const auto warpCount=s_pop_count(mask); + if (w_tid==0){ + warpSums[wid]= (wid==0)?0:warpCount; + } + __syncthreads(); + //Prefix scan WarpSums on the first warp + if (wid==0){ + auto value = warpSums[w_tid]; + for (int d=1; d<32; d=2*d) { + int res= __shfl_up_sync(SPLIT_VOTING_MASK, value,d); + if (tid%32 >= d) value+= res; + } + warpSums[w_tid]=value; + } + __syncthreads(); + auto offset= (wid==0)?0:warpSums[wid]; + auto pp=s_pop_count(mask>>(static_cast(std::min(WARPLENGTH,(int)inputSize))-w_tid)); + const auto warpTidWriteIndex =offset + pp; + if(active){ + output[warpTidWriteIndex]=input[tid]; + } +} + + +template +size_t copy_if_small(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size, + split_gpuStream_t s = 0) { + + // Figure out Blocks to use + size_t _s = std::ceil((float(size)) / (float)BLOCKSIZE); + size_t nBlocks = nextPow2(_s); + if (nBlocks == 0) { + nBlocks += 1; + } + assert(stack && "Invalid stack!"); + Cuda_mempool mPool(stack, max_size); + auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + return len; +} + +template +size_t copy_if_small2(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size, + split_gpuStream_t s = 0) { + (void)stack; + (void)max_size; + // Figure out Blocks to use + split::tools::blockCompact<<<1,size>>>(input,output,size,rule); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + return 0; +} + } // namespace tools } // namespace split diff --git a/unit_tests/benchmark/main.cu b/unit_tests/benchmark/main.cu index f4591df..bd8bada 100644 --- a/unit_tests/benchmark/main.cu +++ b/unit_tests/benchmark/main.cu @@ -112,28 +112,28 @@ int main(int argc, char* argv[]){ hmap.memAdvise(cudaMemAdviseSetAccessedBy,device); hmap.optimizeGPU(); hmap.optimizeGPU(); + double t_insert={0}; + double t_retrieve={0}; + double t_extract={0}; + double t_erase={0}; + + for (int i =0; ibool{ return (element%2)==0 ;}; - split::tools::copy_if(v.data(),output.data(),sz,pred,(void*)stack,sz); + auto len = split::tools::copy_if_small(v.data(),output.data(),sz,pred,(void*)stack,sz); } + + +void stream_compaction_split(splitvector& v,splitvector& output, type_t* stack, size_t sz){ + auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; + auto len = split::tools::copy_if_small2(v.data(),output.data(),sz,pred,(void*)stack,sz); +} void stream_compaction_thrust(thrustvector& v,thrustvector& output){ auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), pred); @@ -54,40 +60,47 @@ void stream_compaction_thrust(thrustvector& v,thrustvector& output){ int main(int argc, char* argv[]){ - int sz=10; + + int sz=6; if (argc>=2){ sz=atoi(argv[1]); } - size_t N = 1< Date: Mon, 19 Feb 2024 12:10:22 +0200 Subject: [PATCH 04/10] Revert a few stream syncs that are actually needed --- include/splitvector/split_tools.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index 58d01c3..12597ad 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -644,6 +644,7 @@ uint32_t copy_if_raw(split::SplitVector>& i input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -675,6 +676,7 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, input, d_counts, d_offsets, output, rule, size, nBlocks, retval); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -710,6 +712,7 @@ size_t copy_keys_if_raw(split::SplitVector> input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -797,7 +800,6 @@ void copy_if(split::SplitVector>& input, const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); Cuda_mempool mPool(memory_for_pool, s); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -813,7 +815,6 @@ void copy_keys_if(split::SplitVector>& inpu nBlocks += 1; } auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward(mPool), s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -829,7 +830,6 @@ void copy_if(split::SplitVector>& input, nBlocks += 1; } auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -847,7 +847,6 @@ void copy_keys_if(split::SplitVector>& inpu assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -865,7 +864,6 @@ void copy_if(split::SplitVector>& input, assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -882,7 +880,6 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return len; } From e6f1ad3a67a27eadb926976730c9241bdd4e0711 Mon Sep 17 00:00:00 2001 From: kstppd Date: Fri, 1 Mar 2024 21:48:13 +0200 Subject: [PATCH 05/10] Block compaction with one kernel up to 1024 elements --- include/splitvector/split_tools.h | 312 ++++++++++++++++++++------ unit_tests/stream_compaction/bench.cu | 66 ++---- 2 files changed, 268 insertions(+), 110 deletions(-) diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index 12597ad..f4614e2 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -433,6 +433,7 @@ void split_prefix_scan(split::SplitVector>& * Modified from (http://www-graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2) to support 64-bit uints * Included here as well for standalone use of splitvec outside of hashintor */ +__host__ __device__ constexpr inline size_t nextPow2(size_t v) noexcept { v--; v |= v >> 1; @@ -612,6 +613,240 @@ void split_prefix_scan_raw(T* input, T* output, Cuda_mempool& mPool, const size_ } } + + +#ifdef __NVCC__ +template +__global__ void block_compact(T* input,T* output,size_t inputSize,Rule rule,uint32_t *retval) +{ + __shared__ uint32_t warpSums[WARPLENGTH]; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid /WARPLENGTH; + const size_t w_tid = tid % WARPLENGTH; + //full warp votes for rule-> mask = [01010101010101010101010101010101] + const int active=(tid( ceilf(static_cast(inputSize)/WARPLENGTH) )); + auto reduceCounts=[activeWARPS](int localCount)->int{ + for (int i = activeWARPS / 2; i > 0; i = i / 2) { + localCount += split::s_shuffle_down(localCount, i, SPLIT_VOTING_MASK); + } + return localCount; + }; + auto localCount=warpSums[w_tid]; + int totalCount = reduceCounts(localCount); + if (w_tid==0){ + *retval=(uint32_t)(totalCount); + } + } + //Prefix scan WarpSums on the first warp + if (wid==0){ + auto value = warpSums[w_tid]; + for (int d=1; d<32; d=2*d) { + int res= __shfl_up_sync(SPLIT_VOTING_MASK, value,d); + if (tid%32 >= d) value+= res; + } + warpSums[w_tid]=value; + } + __syncthreads(); + auto offset= (wid==0)?0:warpSums[wid-1]; + auto pp=s_pop_count(mask&((1< +__global__ void block_compact_keys(T* input,U* output,size_t inputSize,Rule rule,uint32_t *retval) +{ + __shared__ uint32_t warpSums[WARPLENGTH]; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid /WARPLENGTH; + const size_t w_tid = tid % WARPLENGTH; + //full warp votes for rule-> mask = [01010101010101010101010101010101] + const int active=rule((input[tid])); + const auto mask = split::s_warpVote(active==1,SPLIT_VOTING_MASK); + const auto warpCount=s_pop_count(mask); + if (w_tid==0){ + warpSums[wid]=warpCount; + } + __syncthreads(); + //Figure out the total here because we overwrite shared mem later + if (wid==0){ + int activeWARPS=nextPow2( static_cast( ceilf(static_cast(inputSize)/WARPLENGTH) )); + auto reduceCounts=[activeWARPS](int localCount)->int{ + for (int i = activeWARPS / 2; i > 0; i = i / 2) { + localCount += split::s_shuffle_down(localCount, i, SPLIT_VOTING_MASK); + } + return localCount; + }; + auto localCount=warpSums[w_tid]; + int totalCount = reduceCounts(localCount); + if (w_tid==0){ + *retval=(uint32_t)(totalCount); + } + } + //Prefix scan WarpSums on the first warp + if (wid==0){ + auto value = warpSums[w_tid]; + for (int d=1; d<32; d=2*d) { + int res= __shfl_up_sync(SPLIT_VOTING_MASK, value,d); + if (tid%32 >= d) value+= res; + } + warpSums[w_tid]=value; + } + __syncthreads(); + auto offset=(wid==0)?0:warpSums[wid-1]; + auto pp=s_pop_count(mask&((1< +__global__ void block_compact(T* input,T* output,size_t inputSize,Rule rule,uint32_t *retval) +{ + __shared__ uint32_t warpSums[WARPLENGTH/2]; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid /WARPLENGTH; + const size_t w_tid = tid % WARPLENGTH; + //full warp votes for rule-> mask = [01010101010101010101010101010101] + const int active=(tid( ceilf(static_cast(inputSize)/WARPLENGTH) )); + auto reduceCounts=[activeWARPS](int localCount)->int{ + for (int i = activeWARPS / 2; i > 0; i = i / 2) { + localCount += split::s_shuffle_down(localCount, i, SPLIT_VOTING_MASK); + } + return localCount; + }; + auto localCount=warpSums[w_tid]; + int totalCount = reduceCounts(localCount); + if (w_tid==0){ + *retval=(uint32_t)(totalCount); + } + } + //Prefix scan WarpSums on the first warp + if (wid==0){ + auto value = warpSums[w_tid]; + for (int d=1; d<16; d=2*d) { + int res = split::s_shuffle_up(value,d,SPLIT_VOTING_MASK); + if (tid%16 >= d) value+= res; + } + warpSums[w_tid]=value; + } + __syncthreads(); + auto offset= (wid==0)?0:warpSums[wid-1]; + auto pp=s_pop_count(mask&((1ul< +__global__ void block_compact_keys(T* input,U* output,size_t inputSize,Rule rule,uint32_t *retval) +{ + __shared__ uint32_t warpSums[WARPLENGTH/2]; + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t wid = tid /WARPLENGTH; + const size_t w_tid = tid % WARPLENGTH; + //full warp votes for rule-> mask = [01010101010101010101010101010101] + const int active=(tid( ceilf(static_cast(inputSize)/WARPLENGTH) )); + auto reduceCounts=[activeWARPS](int localCount)->int{ + for (int i = activeWARPS / 2; i > 0; i = i / 2) { + localCount += split::s_shuffle_down(localCount, i, SPLIT_VOTING_MASK); + } + return localCount; + }; + auto localCount=warpSums[w_tid]; + int totalCount = reduceCounts(localCount); + if (w_tid==0){ + *retval=(uint32_t)(totalCount); + } + } + //Prefix scan WarpSums on the first warp + if (wid==0){ + auto value = warpSums[w_tid]; + for (int d=1; d<16; d=2*d) { + int res = split::s_shuffle_up(value,d,SPLIT_VOTING_MASK); + if (tid%16 >= d) value+= res; + } + warpSums[w_tid]=value; + } + __syncthreads(); + auto offset= (wid==0)?0:warpSums[wid-1]; + auto pp=s_pop_count(mask&((1ul< +size_t copy_if_block(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size, + split_gpuStream_t s = 0) { + assert(stack && "Invalid stack!"); + Cuda_mempool mPool(stack, max_size); + uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t)); + split::tools::block_compact<<<1,std::min(BLOCKSIZE,nextPow2(size))>>>(input,output,size,rule,dlen); + uint32_t len=0; + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&len, dlen, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + return len; +} + +template +size_t copy_if_block(T* input, T* output, size_t size, Rule rule, Cuda_mempool& mPool, + split_gpuStream_t s = 0) { + uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t)); + split::tools::block_compact<<<1,std::min(BLOCKSIZE,nextPow2(size)),0,s>>>(input,output,size,rule,dlen); + uint32_t len=0; + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&len, dlen, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + return len; +} + +template +size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, Cuda_mempool& mPool, + split_gpuStream_t s = 0) { + uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t)); + split::tools::block_compact_keys<<<1,std::min(BLOCKSIZE,nextPow2(size)),0,s>>>(input,output,size,rule,dlen); + uint32_t len=0; + SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&len, dlen, sizeof(uint32_t), split_gpuMemcpyDeviceToHost,s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); + return len; +} + /** * @brief Same as copy_if but using raw memory */ @@ -619,13 +854,16 @@ template >& input, T* output, Rule rule, size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { + size_t _size = input.size(); + if (_size<=BLOCKSIZE){ + return copy_if_block(input.data(),output,_size,rule,mPool,s); + } uint32_t* d_counts; uint32_t* d_offsets; d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); // Phase 1 -- Calculate per warp workload - size_t _size = input.size(); split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); @@ -652,6 +890,9 @@ template >& input, U* output, Rule rule, size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { + size_t _size = input.size(); + if (_size<=BLOCKSIZE){ + return copy_if_keys_block(input.data(),output,_size,rule,mPool,s); + } uint32_t* d_counts; uint32_t* d_offsets; d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); // Phase 1 -- Calculate per warp workload - size_t _size = input.size(); split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); @@ -882,69 +1126,5 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); return len; } - -template -__global__ void blockCompact(T* input,T* output,size_t inputSize,Rule rule) -{ - __shared__ uint32_t warpSums[WARPLENGTH]; - constexpr int VIRTUALWARP = WARPLENGTH; - const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; - const size_t wid = tid /WARPLENGTH; - const size_t w_tid = tid % WARPLENGTH; - //full warp votes for rule-> mask = [01010101010101010101010101010101] - const int active=rule(input[tid]); - const auto mask = split::s_warpVote(active==1,SPLIT_VOTING_MASK); - const auto warpCount=s_pop_count(mask); - if (w_tid==0){ - warpSums[wid]= (wid==0)?0:warpCount; - } - __syncthreads(); - //Prefix scan WarpSums on the first warp - if (wid==0){ - auto value = warpSums[w_tid]; - for (int d=1; d<32; d=2*d) { - int res= __shfl_up_sync(SPLIT_VOTING_MASK, value,d); - if (tid%32 >= d) value+= res; - } - warpSums[w_tid]=value; - } - __syncthreads(); - auto offset= (wid==0)?0:warpSums[wid]; - auto pp=s_pop_count(mask>>(static_cast(std::min(WARPLENGTH,(int)inputSize))-w_tid)); - const auto warpTidWriteIndex =offset + pp; - if(active){ - output[warpTidWriteIndex]=input[tid]; - } -} - - -template -size_t copy_if_small(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size, - split_gpuStream_t s = 0) { - - // Figure out Blocks to use - size_t _s = std::ceil((float(size)) / (float)BLOCKSIZE); - size_t nBlocks = nextPow2(_s); - if (nBlocks == 0) { - nBlocks += 1; - } - assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); - auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - return len; -} - -template -size_t copy_if_small2(T* input, T* output, size_t size, Rule rule, void* stack, size_t max_size, - split_gpuStream_t s = 0) { - (void)stack; - (void)max_size; - // Figure out Blocks to use - split::tools::blockCompact<<<1,size>>>(input,output,size,rule); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); - return 0; -} - } // namespace tools } // namespace split diff --git a/unit_tests/stream_compaction/bench.cu b/unit_tests/stream_compaction/bench.cu index 78e159b..303083d 100644 --- a/unit_tests/stream_compaction/bench.cu +++ b/unit_tests/stream_compaction/bench.cu @@ -5,17 +5,12 @@ #include #include "../../include/splitvector/splitvec.h" #include "../../include/splitvector/split_tools.h" -#include -#include -#include -#include using namespace std::chrono; using type_t = uint32_t; using splitvector = split::SplitVector ; -using thrustvector = thrust::device_vector ; -constexpr int R = 100; -#define PROFILE_START(msg) nvtxRangePushA((msg)) -#define PROFILE_END() nvtxRangePop() +constexpr int R = 1000; +#define PROFILE_START(msg) +#define PROFILE_END() template auto timeMe(Fn fn, Args && ... args){ @@ -40,59 +35,42 @@ void fillVec(T& vec,size_t sz){ return; } - -void stream_compaction_split_old(splitvector& v,splitvector& output, type_t* stack, size_t sz){ - auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; - auto len = split::tools::copy_if_small(v.data(),output.data(),sz,pred,(void*)stack,sz); +template +void printVec(const T& vec){ + for (size_t i=0; i< vec.size();++i){ + std::cout<bool{ return (element%2)==0 ;}; - auto len = split::tools::copy_if_small2(v.data(),output.data(),sz,pred,(void*)stack,sz); -} -void stream_compaction_thrust(thrustvector& v,thrustvector& output){ - auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; - thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), pred); -} - int main(int argc, char* argv[]){ - - - int sz=6; + int sz=10; if (argc>=2){ sz=atoi(argv[1]); } - size_t N = 64; + void* mem; + size_t N = sz; splitvector v0(N),v0_out(N); srand(1); fillVec(v0,N); - splitvector stack(N); - v0.optimizeGPU(); v0_out.optimizeGPU(); - stack.optimizeGPU(); split_gpuDeviceSynchronize(); - - double t_split={0}; - double t_split_old={0}; - for (size_t i =0 ; i < R ; ++i){ - PROFILE_START("SPLIT"); - t_split+=timeMe(stream_compaction_split,v0,v0_out,stack.data(),N); + PROFILE_START("Stream_Compaction"); + auto pred =[]__host__ __device__ (type_t element)->bool{ return (element>10) ;}; + auto pred_ =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; +#if 0 + split::tools::copy_if(v0,v0_out,pred); +#else + split::tools::copy_if(v0,v0_out,pred_); +#endif PROFILE_END(); - splitvector vv(v0_out); - - PROFILE_START("SPLITOLD"); - t_split_old+=timeMe(stream_compaction_split_old,v0,v0_out,stack.data(),N); - PROFILE_END(); - assert(vv==v0_out); + break; } - - printf("%d\t%f,%f\n",sz,t_split_old/R,t_split/R); - return 0; } From 99f0c52fa9f3f2e21611b2285b929af6a9c825a3 Mon Sep 17 00:00:00 2001 From: kstppd Date: Fri, 1 Mar 2024 21:53:53 +0200 Subject: [PATCH 06/10] Remove sync --- include/splitvector/split_tools.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index f4614e2..a63cad3 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -1010,7 +1010,6 @@ void copy_keys_if(split::SplitVector>& inpu const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); Cuda_mempool mPool(memory_for_pool, s); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } From 602108c0f412e5f84ee9b033568aa74a141771e6 Mon Sep 17 00:00:00 2001 From: kstppd Date: Fri, 1 Mar 2024 22:07:45 +0200 Subject: [PATCH 07/10] Rename Cuda_mempool to SplitStackArena --- include/hashinator/hashinator.h | 2 +- include/splitvector/split_tools.h | 46 ++++++++++---------- unit_tests/benchmark/main.cu | 2 +- unit_tests/stream_compaction/preallocated.cu | 12 ++--- unit_tests/stream_compaction/race.cu | 2 +- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h index cb71f17..4a8048e 100644 --- a/include/hashinator/hashinator.h +++ b/include/hashinator/hashinator.h @@ -1110,7 +1110,7 @@ class Hashmap { // Allocate with Mempool const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); - split::tools::Cuda_mempool mPool(memory_for_pool, s); + split::tools::splitStackArena mPool(memory_for_pool, s); size_t retval = split::tools::copy_if_raw, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( buckets, elements, rule, nBlocks, mPool, s); diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index a63cad3..fbd1153 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -3,7 +3,7 @@ * Description: Set of tools used by SplitVector * * This file defines the following classes or functions: - * --split::tools::Cuda_mempool + * --split::tools::splitStackArena * --split::tools::copy_if_raw * --split::tools::copy_if * --split::tools::scan_reduce_raw @@ -501,7 +501,7 @@ __global__ void split_compact_raw(T* input, uint32_t* counts, uint32_t* offsets, * It uses async mallocs * */ -class Cuda_mempool { +class splitStackArena { private: size_t total_bytes; size_t bytes_used; @@ -510,24 +510,24 @@ class Cuda_mempool { bool isOwner; public: - explicit Cuda_mempool(size_t bytes, split_gpuStream_t str) { + explicit splitStackArena(size_t bytes, split_gpuStream_t str) { s = str; SPLIT_CHECK_ERR(split_gpuMallocAsync(&_data, bytes, s)); total_bytes = bytes; bytes_used = 0; isOwner = true; } - explicit Cuda_mempool(void* ptr, size_t bytes) { + explicit splitStackArena(void* ptr, size_t bytes) { total_bytes = bytes; bytes_used = 0; isOwner = false; _data = ptr; } - Cuda_mempool() = delete; - Cuda_mempool(const Cuda_mempool& other) = delete; - Cuda_mempool(Cuda_mempool&& other) = delete; - ~Cuda_mempool() { + splitStackArena() = delete; + splitStackArena(const splitStackArena& other) = delete; + splitStackArena(splitStackArena&& other) = delete; + ~splitStackArena() { if (isOwner) { SPLIT_CHECK_ERR(split_gpuFreeAsync(_data, s)); } @@ -566,7 +566,7 @@ __global__ void scan_reduce_raw(T* input, uint32_t* output, Rule rule, size_t si * @brief Same as split_prefix_scan but with raw memory */ template -void split_prefix_scan_raw(T* input, T* output, Cuda_mempool& mPool, const size_t input_size, split_gpuStream_t s = 0) { +void split_prefix_scan_raw(T* input, T* output, splitStackArena& mPool, const size_t input_size, split_gpuStream_t s = 0) { // Scan is performed in half Blocksizes size_t scanBlocksize = BLOCKSIZE / 2; @@ -816,7 +816,7 @@ template >>(input,output,size,rule,dlen); uint32_t len=0; @@ -826,7 +826,7 @@ size_t copy_if_block(T* input, T* output, size_t size, Rule rule, void* stack, s } template -size_t copy_if_block(T* input, T* output, size_t size, Rule rule, Cuda_mempool& mPool, +size_t copy_if_block(T* input, T* output, size_t size, Rule rule, splitStackArena& mPool, split_gpuStream_t s = 0) { uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::block_compact<<<1,std::min(BLOCKSIZE,nextPow2(size)),0,s>>>(input,output,size,rule,dlen); @@ -837,7 +837,7 @@ size_t copy_if_block(T* input, T* output, size_t size, Rule rule, Cuda_mempool& } template -size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, Cuda_mempool& mPool, +size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, splitStackArena& mPool, split_gpuStream_t s = 0) { uint32_t *dlen = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::block_compact_keys<<<1,std::min(BLOCKSIZE,nextPow2(size)),0,s>>>(input,output,size,rule,dlen); @@ -852,7 +852,7 @@ size_t copy_if_keys_block(T* input, U* output, size_t size, Rule rule, Cuda_memp */ template uint32_t copy_if_raw(split::SplitVector>& input, T* output, Rule rule, - size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { + size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { size_t _size = input.size(); if (_size<=BLOCKSIZE){ @@ -888,7 +888,7 @@ uint32_t copy_if_raw(split::SplitVector>& i template uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, - size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { + size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { if (size<=BLOCKSIZE){ return copy_if_block(input,output,size,rule,mPool,s); @@ -926,7 +926,7 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, */ template size_t copy_keys_if_raw(split::SplitVector>& input, U* output, Rule rule, - size_t nBlocks, Cuda_mempool& mPool, split_gpuStream_t s = 0) { + size_t nBlocks, splitStackArena& mPool, split_gpuStream_t s = 0) { size_t _size = input.size(); if (_size<=BLOCKSIZE){ @@ -1008,7 +1008,7 @@ void copy_keys_if(split::SplitVector>& inpu // Allocate with Mempool const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); - Cuda_mempool mPool(memory_for_pool, s); + splitStackArena mPool(memory_for_pool, s); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } @@ -1041,14 +1041,14 @@ void copy_if(split::SplitVector>& input, // Allocate with Mempool const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); - Cuda_mempool mPool(memory_for_pool, s); + splitStackArena mPool(memory_for_pool, s); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } template void copy_keys_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, Cuda_mempool&& mPool, + split::SplitVector>& output, Rule rule, splitStackArena&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1057,13 +1057,13 @@ void copy_keys_if(split::SplitVector>& inpu if (nBlocks == 0) { nBlocks += 1; } - auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward(mPool), s); + auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward(mPool), s); output.erase(&output[len], output.end()); } template void copy_if(split::SplitVector>& input, - split::SplitVector>& output, Rule rule, Cuda_mempool&& mPool, + split::SplitVector>& output, Rule rule, splitStackArena&& mPool, split_gpuStream_t s = 0) { // Figure out Blocks to use @@ -1088,7 +1088,7 @@ void copy_keys_if(split::SplitVector>& inpu nBlocks += 1; } assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); + splitStackArena mPool(stack, max_size); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } @@ -1105,7 +1105,7 @@ void copy_if(split::SplitVector>& input, nBlocks += 1; } assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); + splitStackArena mPool(stack, max_size); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); output.erase(&output[len], output.end()); } @@ -1121,7 +1121,7 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t nBlocks += 1; } assert(stack && "Invalid stack!"); - Cuda_mempool mPool(stack, max_size); + splitStackArena mPool(stack, max_size); auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); return len; } diff --git a/unit_tests/benchmark/main.cu b/unit_tests/benchmark/main.cu index bd8bada..2e465f0 100644 --- a/unit_tests/benchmark/main.cu +++ b/unit_tests/benchmark/main.cu @@ -101,7 +101,7 @@ void benchErase(hashmap& hmap,key_type* gpuKeys, val_type* gpuVals,int sz){ int main(int argc, char* argv[]){ - int sz= 24; + int sz= 18; if (argc>=2){ sz=atoi(argv[1]); } diff --git a/unit_tests/stream_compaction/preallocated.cu b/unit_tests/stream_compaction/preallocated.cu index 20b01af..971a723 100644 --- a/unit_tests/stream_compaction/preallocated.cu +++ b/unit_tests/stream_compaction/preallocated.cu @@ -101,10 +101,10 @@ bool preallocated_compactions_basic(int power){ //These mempools are now allocation free. They essentially just manage the buffer correclty! //Please !!ALWAYS!! forwarding here to preserve move semantics as the pool might change later on split::tools::copy_if(v,output1,predicate_on, - std::forward(split::tools::Cuda_mempool{buffer,bytesNeeded})); + std::forward(split::tools::splitStackArena{buffer,bytesNeeded})); split::tools::copy_if(v,output2,predicate_off, - std::forward(split::tools::Cuda_mempool{buffer,bytesNeeded})); + std::forward(split::tools::splitStackArena{buffer,bytesNeeded})); //Deallocate our good buffer SPLIT_CHECK_ERR (split_gpuFree(buffer)); @@ -189,7 +189,7 @@ bool preallocated_compactions_medium(int power){ //This guy goes on stream 1 split::tools::copy_if(v,output1,predicate_on, - std::forward(split::tools::Cuda_mempool{buffer,bytesNeeded}),streams[0]); + std::forward(split::tools::splitStackArena{buffer,bytesNeeded}),streams[0]); //This guy goes on stream 2 /* @@ -197,7 +197,7 @@ bool preallocated_compactions_medium(int power){ */ void* start = reinterpret_cast ( reinterpret_cast(buffer)+bytesNeeded); split::tools::copy_if(v,output2,predicate_off, - std::forward(split::tools::Cuda_mempool{start,bytesNeeded}),streams[1]); + std::forward(split::tools::splitStackArena{start,bytesNeeded}),streams[1]); //Wait for them! @@ -274,10 +274,10 @@ bool preallocated_compactions_HAM(int power){ void* tidIndex_1 = reinterpret_cast ( reinterpret_cast(buffer)+tid*streamsPerThread*bytesNeeded); void* tidIndex_2 = reinterpret_cast ( reinterpret_cast(buffer)+tid*streamsPerThread*bytesNeeded+bytesNeeded); split::tools::copy_if(vecs[i],out1[i],predicate_on, - std::forward(split::tools::Cuda_mempool{tidIndex_1,bytesNeeded})); + std::forward(split::tools::splitStackArena{tidIndex_1,bytesNeeded})); split::tools::copy_if(vecs[i],out2[i],predicate_off, - std::forward(split::tools::Cuda_mempool{tidIndex_2,bytesNeeded})); + std::forward(split::tools::splitStackArena{tidIndex_2,bytesNeeded})); } diff --git a/unit_tests/stream_compaction/race.cu b/unit_tests/stream_compaction/race.cu index 5f9f4ea..f0fe82f 100644 --- a/unit_tests/stream_compaction/race.cu +++ b/unit_tests/stream_compaction/race.cu @@ -67,7 +67,7 @@ void split_test_prefix(split_vector& input_split,split_vector& output_split,size input_split[i]=i;//tmp; } - //split::tools::Cuda_mempool mPool(1024*64); + //split::tools::splitStackArena mPool(1024*64); input_split.optimizeGPU(); From 9e0a83c83b60c4ccb42b9d4ffb48168cb7ab1172 Mon Sep 17 00:00:00 2001 From: kstppd Date: Fri, 15 Mar 2024 16:52:21 +0200 Subject: [PATCH 08/10] Remove forgotten optimize call and add conditional prefetches to retrieve and erase methods --- include/hashinator/hashinator.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h index 4a8048e..ddd5f55 100644 --- a/include/hashinator/hashinator.h +++ b/include/hashinator/hashinator.h @@ -1275,29 +1275,35 @@ class Hashmap { if (neededPowerSize > _mapInfo->sizePower) { resize(neededPowerSize, targets::device, s); } - buckets.optimizeGPU(s); DeviceHasher::insert(src, buckets.data(), _mapInfo->sizePower, _mapInfo->currentMaxBucketOverflow, &_mapInfo->currentMaxBucketOverflow, &_mapInfo->fill, len, &_mapInfo->err, s); return; } // Uses Hasher's retrieve_kernel to read all elements - void retrieve(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, split_gpuStream_t s = 0) { - buckets.optimizeGPU(s); + void retrieve(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, split_gpuStream_t s = 0,bool prefetches=true) { + if (prefetches){ + buckets.optimizeGPU(s); + } DeviceHasher::retrieve(keys, vals, buckets.data(), _mapInfo->sizePower, _mapInfo->currentMaxBucketOverflow, len, s); return; } // Uses Hasher's retrieve_kernel to read all elements - void retrieve(hash_pair* src, size_t len, split_gpuStream_t s = 0) { - buckets.optimizeGPU(s); + void retrieve(hash_pair* src, size_t len, split_gpuStream_t s = 0, bool prefetches=true) { + if (prefetches){ + buckets.optimizeGPU(s); + } DeviceHasher::retrieve(src, buckets.data(), _mapInfo->sizePower, _mapInfo->currentMaxBucketOverflow, len, s); return; } // Uses Hasher's erase_kernel to delete all elements - void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) { + void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0,bool prefetches=true) { + if (prefetches){ + buckets.optimizeGPU(s); + } // Remember the last numeber of tombstones size_t tbStore = tombstone_count(); DeviceHasher::erase(keys, buckets.data(), &_mapInfo->tombstoneCounter, _mapInfo->sizePower, From 376e9f62930d646cd58af32e9ee1a974bf6d1a25 Mon Sep 17 00:00:00 2001 From: kstppd Date: Tue, 26 Mar 2024 17:06:38 +0200 Subject: [PATCH 09/10] Add methods that return emptybucket and tombstone values --- include/hashinator/hashinator.h | 6 +++ unit_tests/hashmap_unit_test/main.cu | 73 +++++++++++++++------------- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h index ddd5f55..b1b6811 100644 --- a/include/hashinator/hashinator.h +++ b/include/hashinator/hashinator.h @@ -416,6 +416,12 @@ class Hashmap { HASHINATOR_HOSTDEVICE size_t bucket_count() const { return buckets.size(); } + HASHINATOR_HOSTDEVICE + constexpr KEY_TYPE get_emptybucket() const { return EMPTYBUCKET; } + + HASHINATOR_HOSTDEVICE + constexpr KEY_TYPE get_tombstone() const { return TOMBSTONE; } + HASHINATOR_HOSTDEVICE float load_factor() const { return (float)size() / bucket_count(); } diff --git a/unit_tests/hashmap_unit_test/main.cu b/unit_tests/hashmap_unit_test/main.cu index e9cfa86..9727f8b 100644 --- a/unit_tests/hashmap_unit_test/main.cu +++ b/unit_tests/hashmap_unit_test/main.cu @@ -27,9 +27,9 @@ typedef Hashmap hashmap; struct Predicate{ HASHINATOR_HOSTDEVICE - inline bool operator()( hash_pair& element)const{ - return element.second%2==0; - } + inline bool operator()( hash_pair& element)const{ + return element.second%2==0; + } }; template @@ -165,7 +165,7 @@ bool recover_all_elements(const hashmap& hmap, vector& src){ if (!sane){ return false; } - //std::cout<<"Key validated "<first<<" "<second<first<<" "<second<>>(d_hmap,src.data(),src.size()); @@ -645,7 +645,7 @@ bool test_hashmap_2(int power){ return false; } } - + //Verify odd elements; cpuOK=recover_odd_elements(hmap,src); gpu_recover_odd_elements<<>>(hmap,src.data(),src.size()); @@ -973,11 +973,11 @@ TEST(HashmapUnitTets ,Test_Resize_Perf_Device){ template struct Rule{ -Rule(){} + Rule(){} __host__ __device__ - inline bool operator()( hash_pair& element)const{ - return element.first<1000; - } + inline bool operator()( hash_pair& element)const{ + return element.first<1000; + } }; @@ -1013,7 +1013,7 @@ TEST(HashmapUnitTets ,Test_ErrorCodes_ExtractKeysByPatternNoAllocations){ size_t mem=2*sizeof(key_type)*(1<(),buffer,mem); - + expect_true(out1==out2); SPLIT_CHECK_ERR (split_gpuFree(buffer)); } @@ -1037,25 +1037,25 @@ TEST(HashmapUnitTets ,Test_Copy_Metadata){ } std::vector generateUniqueRandomKeys(size_t size, size_t range=std::numeric_limits::max()) { - std::vector elements; - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution dist(1, range); - - for (size_t i = 0; i < size; ++i) { - key_type randomNum = i;//dist(gen); - if (std::find(elements.begin(), elements.end(), randomNum) == elements.end()) { - elements.push_back(randomNum); - } else { - --i; - } - } - return elements; + std::vector elements; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dist(1, range); + + for (size_t i = 0; i < size; ++i) { + key_type randomNum = i;//dist(gen); + if (std::find(elements.begin(), elements.end(), randomNum) == elements.end()) { + elements.push_back(randomNum); + } else { + --i; + } + } + return elements; } void insertDuplicates(std::vector& vec, key_type element, size_t count) { if (count>0){ - vec.insert(vec.end(), count, element); + vec.insert(vec.end(), count, element); } srand(time(NULL)); std::random_shuffle(vec.begin(),vec.end()); @@ -1084,7 +1084,14 @@ TEST(HashmapUnitTets ,Test_Duplicate_Insertion){ } } - +TEST(HashmapUnitTets ,EMPTY_TOMBSTONE_values){ + Hashmap hmap; + const auto tombstone = hmap.get_tombstone(); + const auto emptybucket = hmap.get_emptybucket(); + expect_true(tombstone==emptybucket); + //These can be also checked during compile time + static_assert(tombstone==emptybucket); +} int main(int argc, char* argv[]){ srand(time(NULL)); From 3f214f520618e258c4d8d8e4f68ecd97f2d03ea6 Mon Sep 17 00:00:00 2001 From: Markus Battarbee Date: Thu, 28 Mar 2024 20:57:02 +0200 Subject: [PATCH 10/10] Fix signed-unsigned error which was causing erroneous outputs from warpInsert_V --- include/hashinator/hashinator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h index b1b6811..7516a17 100644 --- a/include/hashinator/hashinator.h +++ b/include/hashinator/hashinator.h @@ -962,7 +962,7 @@ class Hashmap { } auto res = split::s_warpVote(localCount > 0, submask); - return (res > 0); + return (res != 0); } HASHINATOR_DEVICEONLY