From f266573c61f7bed57528c867e9be4861cdd26817 Mon Sep 17 00:00:00 2001 From: kstppd Date: Wed, 7 Feb 2024 13:12:05 +0200 Subject: [PATCH 1/2] Reduce stream syncs in split tools --- include/splitvector/split_tools.h | 23 ++----- meson.build | 2 + unit_tests/stream_compaction/bench.cu | 93 +++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 16 deletions(-) create mode 100644 unit_tests/stream_compaction/bench.cu diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index e754965..c319826 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -622,14 +622,12 @@ uint32_t copy_if_raw(split::SplitVector>& i uint32_t* d_counts; uint32_t* d_offsets; d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); // Phase 1 -- Calculate per warp workload size_t _size = input.size(); split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); // Step 2 -- Exclusive Prefix Scan on offsets @@ -638,17 +636,14 @@ uint32_t copy_if_raw(split::SplitVector>& i } else { split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); // Step 3 -- Compaction uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::split_compact_raw <<>>( input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -659,13 +654,11 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, uint32_t* d_counts; uint32_t* d_offsets; d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s)); // Phase 1 -- Calculate per warp workload split::tools::scan_reduce_raw<<>>(input, d_counts, rule, size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); // Step 2 -- Exclusive Prefix Scan on offsets @@ -674,17 +667,14 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, } else { split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); // Step 3 -- Compaction uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::split_compact_raw <<>>( input, d_counts, d_offsets, output, rule, size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -697,16 +687,13 @@ size_t copy_keys_if_raw(split::SplitVector> uint32_t* d_counts; uint32_t* d_offsets; - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); // Phase 1 -- Calculate per warp workload size_t _size = input.size(); split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); // Step 2 -- Exclusive Prefix Scan on offsets @@ -715,17 +702,14 @@ size_t copy_keys_if_raw(split::SplitVector> } else { split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); // Step 3 -- Compaction uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::split_compact_keys_raw <<>>( input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -779,6 +763,7 @@ void copy_keys_if(split::SplitVector>& inpu const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); Cuda_mempool mPool(memory_for_pool, s); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -812,6 +797,7 @@ void copy_if(split::SplitVector>& input, const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); Cuda_mempool mPool(memory_for_pool, s); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -827,6 +813,7 @@ void copy_keys_if(split::SplitVector>& inpu nBlocks += 1; } auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward(mPool), s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -842,6 +829,7 @@ void copy_if(split::SplitVector>& input, nBlocks += 1; } auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -859,6 +847,7 @@ void copy_keys_if(split::SplitVector>& inpu assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -876,6 +865,7 @@ void copy_if(split::SplitVector>& input, assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -892,6 +882,7 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return len; } diff --git a/meson.build b/meson.build index e48b066..5c10eb6 100644 --- a/meson.build +++ b/meson.build @@ -24,6 +24,7 @@ compaction3_unit = executable('compaction3_test', 'unit_tests/stream_compaction/ pointer_unit = executable('pointer_test', 'unit_tests/pointer_test/main.cu',dependencies :gtest_dep ) hybridCPU = executable('hybrid_cpu', 'unit_tests/hybrid/main.cu',cpp_args:'-DHASHINATOR_CPU_ONLY_MODE',dependencies :gtest_dep ) hashinator_bench = executable('bench', 'unit_tests/benchmark/main.cu', dependencies :gtest_dep,link_args:'-lnvToolsExt') +compaction_bench = executable('streamBench', 'unit_tests/stream_compaction/bench.cu' ,link_args:'-lnvToolsExt') deletion_mechanism = executable('deletion', 'unit_tests/delete_by_compaction/main.cu', dependencies :gtest_dep) insertion_mechanism = executable('insertion', 'unit_tests/insertion_mechanism/main.cu', dependencies :gtest_dep) tombstoneTest = executable('tbPerf', 'unit_tests/benchmark/tbPerf.cu', dependencies :gtest_dep) @@ -39,6 +40,7 @@ test('CompactionTest', compaction_unit) test('CompactionTest2', compaction2_unit) test('CompactionTest3', compaction3_unit) test('HashinatorBench', hashinator_bench) +test('CompactionBench', compaction_bench) test('Insertion', insertion_mechanism) test('Deletion', deletion_mechanism) test('PointerTest', pointer_unit) diff --git a/unit_tests/stream_compaction/bench.cu b/unit_tests/stream_compaction/bench.cu new file mode 100644 index 0000000..74b8ba6 --- /dev/null +++ b/unit_tests/stream_compaction/bench.cu @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include +#include "../../include/splitvector/splitvec.h" +#include "../../include/splitvector/split_tools.h" +#include +#include +#include +#include +using namespace std::chrono; +using type_t = uint32_t; +using splitvector = split::SplitVector ; +using thrustvector = thrust::device_vector ; +constexpr int R = 100; +#define PROFILE_START(msg) nvtxRangePushA((msg)) +#define PROFILE_END() nvtxRangePop() + +template +auto timeMe(Fn fn, Args && ... args){ + std::chrono::time_point start,stop; + double total_time=0; + start = std::chrono::high_resolution_clock::now(); + fn(args...); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + total_time+=duration; + return total_time; +} + +template +void fillVec(T& vec,size_t sz){ + std::random_device dev; + std::mt19937 rng(dev()); + std::uniform_int_distribution dist(0,std::numeric_limits::max()); + for (size_t i=0; i< sz;++i){ + vec[i]=i;//dist(rng); + } + return; +} + + +void stream_compaction_split(splitvector& v,splitvector& output, type_t* stack, size_t sz){ + auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; + split::tools::copy_if(v.data(),output.data(),sz,pred,(void*)stack,sz); +} + +void stream_compaction_thrust(thrustvector& v,thrustvector& output){ + auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; + thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), pred); +} + +int main(int argc, char* argv[]){ + + + int sz=10; + if (argc>=2){ + sz=atoi(argv[1]); + } + size_t N = 1< Date: Mon, 19 Feb 2024 12:10:22 +0200 Subject: [PATCH 2/2] Revert a few stream syncs that are actually needed --- include/splitvector/split_tools.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index c319826..8de0f89 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -644,6 +644,7 @@ uint32_t copy_if_raw(split::SplitVector>& i input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -675,6 +676,7 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, input, d_counts, d_offsets, output, rule, size, nBlocks, retval); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -710,6 +712,7 @@ size_t copy_keys_if_raw(split::SplitVector> input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return numel; } @@ -797,7 +800,6 @@ void copy_if(split::SplitVector>& input, const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); Cuda_mempool mPool(memory_for_pool, s); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -813,7 +815,6 @@ void copy_keys_if(split::SplitVector>& inpu nBlocks += 1; } auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, std::forward(mPool), s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -829,7 +830,6 @@ void copy_if(split::SplitVector>& input, nBlocks += 1; } auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -847,7 +847,6 @@ void copy_keys_if(split::SplitVector>& inpu assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -865,7 +864,6 @@ void copy_if(split::SplitVector>& input, assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_if_raw(input, output.data(), rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } @@ -882,7 +880,6 @@ size_t copy_if(T* input, T* output, size_t size, Rule rule, void* stack, size_t assert(stack && "Invalid stack!"); Cuda_mempool mPool(stack, max_size); auto len = copy_if_raw(input, output, size, rule, nBlocks, mPool, s); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); return len; }