diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h index e754965..8de0f89 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -622,14 +622,12 @@ uint32_t copy_if_raw(split::SplitVector>& i uint32_t* d_counts; uint32_t* d_offsets; d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); // Phase 1 -- Calculate per warp workload size_t _size = input.size(); split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); // Step 2 -- Exclusive Prefix Scan on offsets @@ -638,14 +636,12 @@ uint32_t copy_if_raw(split::SplitVector>& i } else { split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); // Step 3 -- Compaction uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::split_compact_raw <<>>( input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); @@ -659,13 +655,11 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, uint32_t* d_counts; uint32_t* d_offsets; d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s)); // Phase 1 -- Calculate per warp workload split::tools::scan_reduce_raw<<>>(input, d_counts, rule, size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); // Step 2 -- Exclusive Prefix Scan on offsets @@ -674,14 +668,12 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, } else { split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); // Step 3 -- Compaction uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::split_compact_raw <<>>( input, d_counts, d_offsets, output, rule, size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); @@ -697,16 +689,13 @@ size_t copy_keys_if_raw(split::SplitVector> uint32_t* d_counts; uint32_t* d_offsets; - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); // Phase 1 -- Calculate per warp workload size_t _size = input.size(); split::tools::scan_reduce_raw<<>>(input.data(), d_counts, rule, _size); d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); // Step 2 -- Exclusive Prefix Scan on offsets @@ -715,14 +704,12 @@ size_t copy_keys_if_raw(split::SplitVector> } else { split_prefix_scan_raw(d_counts, d_offsets, mPool, nBlocks, s); } - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); // Step 3 -- Compaction uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); split::tools::split_compact_keys_raw <<>>( input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); - SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); uint32_t numel; SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); @@ -779,6 +766,7 @@ void copy_keys_if(split::SplitVector>& inpu const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); Cuda_mempool mPool(memory_for_pool, s); auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); + SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); output.erase(&output[len], output.end()); } diff --git a/meson.build b/meson.build index e48b066..5c10eb6 100644 --- a/meson.build +++ b/meson.build @@ -24,6 +24,7 @@ compaction3_unit = executable('compaction3_test', 'unit_tests/stream_compaction/ pointer_unit = executable('pointer_test', 'unit_tests/pointer_test/main.cu',dependencies :gtest_dep ) hybridCPU = executable('hybrid_cpu', 'unit_tests/hybrid/main.cu',cpp_args:'-DHASHINATOR_CPU_ONLY_MODE',dependencies :gtest_dep ) hashinator_bench = executable('bench', 'unit_tests/benchmark/main.cu', dependencies :gtest_dep,link_args:'-lnvToolsExt') +compaction_bench = executable('streamBench', 'unit_tests/stream_compaction/bench.cu' ,link_args:'-lnvToolsExt') deletion_mechanism = executable('deletion', 'unit_tests/delete_by_compaction/main.cu', dependencies :gtest_dep) insertion_mechanism = executable('insertion', 'unit_tests/insertion_mechanism/main.cu', dependencies :gtest_dep) tombstoneTest = executable('tbPerf', 'unit_tests/benchmark/tbPerf.cu', dependencies :gtest_dep) @@ -39,6 +40,7 @@ test('CompactionTest', compaction_unit) test('CompactionTest2', compaction2_unit) test('CompactionTest3', compaction3_unit) test('HashinatorBench', hashinator_bench) +test('CompactionBench', compaction_bench) test('Insertion', insertion_mechanism) test('Deletion', deletion_mechanism) test('PointerTest', pointer_unit) diff --git a/unit_tests/stream_compaction/bench.cu b/unit_tests/stream_compaction/bench.cu new file mode 100644 index 0000000..74b8ba6 --- /dev/null +++ b/unit_tests/stream_compaction/bench.cu @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include +#include "../../include/splitvector/splitvec.h" +#include "../../include/splitvector/split_tools.h" +#include +#include +#include +#include +using namespace std::chrono; +using type_t = uint32_t; +using splitvector = split::SplitVector ; +using thrustvector = thrust::device_vector ; +constexpr int R = 100; +#define PROFILE_START(msg) nvtxRangePushA((msg)) +#define PROFILE_END() nvtxRangePop() + +template +auto timeMe(Fn fn, Args && ... args){ + std::chrono::time_point start,stop; + double total_time=0; + start = std::chrono::high_resolution_clock::now(); + fn(args...); + stop = std::chrono::high_resolution_clock::now(); + auto duration = duration_cast(stop- start).count(); + total_time+=duration; + return total_time; +} + +template +void fillVec(T& vec,size_t sz){ + std::random_device dev; + std::mt19937 rng(dev()); + std::uniform_int_distribution dist(0,std::numeric_limits::max()); + for (size_t i=0; i< sz;++i){ + vec[i]=i;//dist(rng); + } + return; +} + + +void stream_compaction_split(splitvector& v,splitvector& output, type_t* stack, size_t sz){ + auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; + split::tools::copy_if(v.data(),output.data(),sz,pred,(void*)stack,sz); +} + +void stream_compaction_thrust(thrustvector& v,thrustvector& output){ + auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; + thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), pred); +} + +int main(int argc, char* argv[]){ + + + int sz=10; + if (argc>=2){ + sz=atoi(argv[1]); + } + size_t N = 1<