Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce stream syncs in split tools #40

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 1 addition & 13 deletions include/splitvector/split_tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -622,14 +622,12 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
uint32_t* d_counts;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

up above I see what I believe are unnecessary syncs:
on lines 595, 603, 611

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as below

uint32_t* d_offsets;
d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

d_counts here is passed to scan_reduce_raw as output, and in that kernel it gets directly written to, not incremented. Thus, the memset appears unnecessary, as long as the kernel actually writes to all elements. This same logic check should be done to the other memsets as well.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be addressed later as I am in a bit of a git wreck


// Phase 1 -- Calculate per warp workload
size_t _size = input.size();
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));

// Step 2 -- Exclusive Prefix Scan on offsets
Expand All @@ -638,14 +636,12 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
} else {
split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
}
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));

// Step 3 -- Compaction
uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP>
<<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
uint32_t numel;
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
Expand All @@ -659,13 +655,11 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
uint32_t* d_counts;
uint32_t* d_offsets;
d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s));

// Phase 1 -- Calculate per warp workload
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input, d_counts, rule, size);
d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));

// Step 2 -- Exclusive Prefix Scan on offsets
Expand All @@ -674,14 +668,12 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule,
} else {
split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
}
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));

// Step 3 -- Compaction
uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP>
<<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
input, d_counts, d_offsets, output, rule, size, nBlocks, retval);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
uint32_t numel;
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
Expand All @@ -697,16 +689,13 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>

uint32_t* d_counts;
uint32_t* d_offsets;
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));

// Phase 1 -- Calculate per warp workload
size_t _size = input.size();
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));

// Step 2 -- Exclusive Prefix Scan on offsets
Expand All @@ -715,14 +704,12 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>
} else {
split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s);
}
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));

// Step 3 -- Compaction
uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
split::tools::split_compact_keys_raw<T, U, Rule, BLOCKSIZE, WARP>
<<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
uint32_t numel;
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
Expand Down Expand Up @@ -779,6 +766,7 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t);
Cuda_mempool mPool(memory_for_pool, s);
auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is not needed. Copy_keys_if_raw has a stream sync before returning.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in a new PR

output.erase(&output[len], output.end());
}

Expand Down
2 changes: 2 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ compaction3_unit = executable('compaction3_test', 'unit_tests/stream_compaction/
pointer_unit = executable('pointer_test', 'unit_tests/pointer_test/main.cu',dependencies :gtest_dep )
hybridCPU = executable('hybrid_cpu', 'unit_tests/hybrid/main.cu',cpp_args:'-DHASHINATOR_CPU_ONLY_MODE',dependencies :gtest_dep )
hashinator_bench = executable('bench', 'unit_tests/benchmark/main.cu', dependencies :gtest_dep,link_args:'-lnvToolsExt')
compaction_bench = executable('streamBench', 'unit_tests/stream_compaction/bench.cu' ,link_args:'-lnvToolsExt')
deletion_mechanism = executable('deletion', 'unit_tests/delete_by_compaction/main.cu', dependencies :gtest_dep)
insertion_mechanism = executable('insertion', 'unit_tests/insertion_mechanism/main.cu', dependencies :gtest_dep)
tombstoneTest = executable('tbPerf', 'unit_tests/benchmark/tbPerf.cu', dependencies :gtest_dep)
Expand All @@ -39,6 +40,7 @@ test('CompactionTest', compaction_unit)
test('CompactionTest2', compaction2_unit)
test('CompactionTest3', compaction3_unit)
test('HashinatorBench', hashinator_bench)
test('CompactionBench', compaction_bench)
test('Insertion', insertion_mechanism)
test('Deletion', deletion_mechanism)
test('PointerTest', pointer_unit)
Expand Down
93 changes: 93 additions & 0 deletions unit_tests/stream_compaction/bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#include <iostream>
#include <stdlib.h>
#include <chrono>
#include <limits>
#include <random>
#include "../../include/splitvector/splitvec.h"
#include "../../include/splitvector/split_tools.h"
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <nvToolsExt.h>
using namespace std::chrono;
using type_t = uint32_t;
using splitvector = split::SplitVector<type_t> ;
using thrustvector = thrust::device_vector<type_t> ;
constexpr int R = 100;
#define PROFILE_START(msg) nvtxRangePushA((msg))
#define PROFILE_END() nvtxRangePop()

/// Times a single invocation of fn(args...) and returns the elapsed
/// wall-clock time in microseconds (as double, for easy accumulation).
/// Fix: the original spelled out std::chrono::_V2::system_clock, a
/// libstdc++-internal namespace that is non-portable; use the clock's
/// own time_point via auto instead. Arguments are now perfectly
/// forwarded to avoid extra copies of heavy arguments.
template <class Fn, class... Args>
auto timeMe(Fn fn, Args&&... args) {
   const auto start = std::chrono::high_resolution_clock::now();
   fn(std::forward<Args>(args)...);
   const auto stop = std::chrono::high_resolution_clock::now();
   const auto duration =
       std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
   return static_cast<double>(duration);
}

/// Fills vec[0..sz) with the deterministic ascending sequence 0,1,2,...
/// so benchmark runs are reproducible and comparable between backends.
/// Fix: removed dead code — the random_device/mt19937/distribution were
/// constructed but never used (the commented-out dist(rng) call was the
/// only reference), and the trailing bare `return;` was redundant.
template <typename T>
void fillVec(T& vec, size_t sz) {
   for (size_t i = 0; i < sz; ++i) {
      vec[i] = i;
   }
}


// Split-tools backend of the benchmark: compacts the even elements of v
// into output, using the caller-provided scratch buffer `stack` (sized sz)
// as temporary storage for split::tools::copy_if.
void stream_compaction_split(splitvector& v, splitvector& output, type_t* stack, size_t sz) {
   auto keepEven = [] __host__ __device__(type_t candidate) -> bool {
      return (candidate % 2) == 0;
   };
   split::tools::copy_if(v.data(), output.data(), sz, keepEven, static_cast<void*>(stack), sz);
}

// Thrust baseline of the benchmark: compacts the even elements of v into
// output using thrust::copy_if with the device execution policy.
void stream_compaction_thrust(thrustvector& v, thrustvector& output) {
   auto keepEven = [] __host__ __device__(type_t candidate) -> bool {
      return (candidate % 2) == 0;
   };
   thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), keepEven);
}

// Benchmark driver: compares split::tools stream compaction against the
// thrust baseline over R repetitions and prints mean times in microseconds.
// Usage: ./streamBench [exponent]   (problem size N = 2^exponent, default 2^10)
// Fixes: removed dead srand(1) (rand() is never called — the fill is
// deterministic); widened the shift to size_t(1) << sz so exponents >= 31
// no longer overflow a 32-bit int shift; removed a stray ';;'.
int main(int argc, char* argv[]) {

   int sz = 10;
   if (argc >= 2) {
      sz = atoi(argv[1]);
   }
   const size_t N = size_t(1) << sz;

   splitvector v0(N), v0_out(N);
   thrustvector v1(N), v1_out(N);
   fillVec(v0, N);
   fillVec(v1, N);
   splitvector stack(N);
   // Prefetch the split vectors to the device before any timing begins.
   v0.optimizeGPU();
   v0_out.optimizeGPU();
   stack.optimizeGPU();
   split_gpuDeviceSynchronize();

   double t_split = 0.0;
   double t_thrust = 0.0;
   for (int i = 0; i < R; i++) {

      v0.optimizeGPU();
      v0_out.optimizeGPU();
      stack.optimizeGPU();

      // Each NVTX range contains one untimed warm-up call followed by the
      // timed call that is accumulated into the running total.
      PROFILE_START("THRUST");
      stream_compaction_thrust(v1, v1_out);
      t_thrust += timeMe(stream_compaction_thrust, v1, v1_out);
      PROFILE_END();

      PROFILE_START("SPLIT");
      stream_compaction_split(v0, v0_out, stack.data(), N);
      t_split += timeMe(stream_compaction_split, v0, v0_out, stack.data(), N);
      PROFILE_END();
   }
   // Columns: exponent, mean split time (us), mean thrust time (us).
   printf("%d\t%f\t%f\n", sz, t_split / R, t_thrust / R);
   return 0;
}
Loading