-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Reduce stream syncs in split tools #40
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -622,14 +622,12 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i | |
uint32_t* d_counts; | ||
uint32_t* d_offsets; | ||
d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. d_counts here is passed to scan_reduce_raw as output, and in that kernel it gets directly written to, not incremented. Thus, the memset appears unnecessary, as long as the kernel actually writes to all elements. The same logic check should be done for the other memsets as well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To be addressed later, as I am in a bit of a git wreck |
||
|
||
// Phase 1 -- Calculate per warp workload | ||
size_t _size = input.size(); | ||
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size); | ||
d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); | ||
|
||
// Step 2 -- Exclusive Prefix Scan on offsets | ||
|
@@ -638,14 +636,12 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i | |
} else { | ||
split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s); | ||
} | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
|
||
// Step 3 -- Compaction | ||
uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); | ||
split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP> | ||
<<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>( | ||
input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
uint32_t numel; | ||
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
|
@@ -659,13 +655,11 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, | |
uint32_t* d_counts; | ||
uint32_t* d_offsets; | ||
d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t), s)); | ||
|
||
// Phase 1 -- Calculate per warp workload | ||
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input, d_counts, rule, size); | ||
d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); | ||
|
||
// Step 2 -- Exclusive Prefix Scan on offsets | ||
|
@@ -674,14 +668,12 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, | |
} else { | ||
split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s); | ||
} | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
|
||
// Step 3 -- Compaction | ||
uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); | ||
split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP> | ||
<<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>( | ||
input, d_counts, d_offsets, output, rule, size, nBlocks, retval); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
uint32_t numel; | ||
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
|
@@ -697,16 +689,13 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>> | |
|
||
uint32_t* d_counts; | ||
uint32_t* d_offsets; | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
d_counts = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s)); | ||
|
||
// Phase 1 -- Calculate per warp workload | ||
size_t _size = input.size(); | ||
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size); | ||
d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t)); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s)); | ||
|
||
// Step 2 -- Exclusive Prefix Scan on offsets | ||
|
@@ -715,14 +704,12 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>> | |
} else { | ||
split_prefix_scan_raw<uint32_t, BLOCKSIZE, WARP>(d_counts, d_offsets, mPool, nBlocks, s); | ||
} | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
|
||
// Step 3 -- Compaction | ||
uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t)); | ||
split::tools::split_compact_keys_raw<T, U, Rule, BLOCKSIZE, WARP> | ||
<<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>( | ||
input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
uint32_t numel; | ||
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s)); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
|
@@ -779,6 +766,7 @@ void copy_keys_if(split::SplitVector<T, split::split_unified_allocator<T>>& inpu | |
const size_t memory_for_pool = 8 * nBlocks * sizeof(uint32_t); | ||
Cuda_mempool mPool(memory_for_pool, s); | ||
auto len = copy_keys_if_raw(input, output.data(), rule, nBlocks, mPool, s); | ||
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s)); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This one is not needed. Copy_keys_if_raw has a stream sync before returning. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Addressed in a new PR |
||
output.erase(&output[len], output.end()); | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
#include <iostream> | ||
#include <stdlib.h> | ||
#include <chrono> | ||
#include <limits> | ||
#include <random> | ||
#include "../../include/splitvector/splitvec.h" | ||
#include "../../include/splitvector/split_tools.h" | ||
#include <thrust/device_vector.h> | ||
#include <thrust/execution_policy.h> | ||
#include <thrust/functional.h> | ||
#include <nvToolsExt.h> | ||
using namespace std::chrono; | ||
using type_t = uint32_t; | ||
using splitvector = split::SplitVector<type_t> ; | ||
using thrustvector = thrust::device_vector<type_t> ; | ||
constexpr int R = 100; | ||
#define PROFILE_START(msg) nvtxRangePushA((msg)) | ||
#define PROFILE_END() nvtxRangePop() | ||
|
||
template <class Fn, class ... Args> | ||
auto timeMe(Fn fn, Args && ... args){ | ||
std::chrono::time_point<std::chrono::_V2::system_clock, std::chrono::_V2::system_clock::duration> start,stop; | ||
double total_time=0; | ||
start = std::chrono::high_resolution_clock::now(); | ||
fn(args...); | ||
stop = std::chrono::high_resolution_clock::now(); | ||
auto duration = duration_cast<microseconds>(stop- start).count(); | ||
total_time+=duration; | ||
return total_time; | ||
} | ||
|
||
template <typename T> | ||
void fillVec(T& vec,size_t sz){ | ||
std::random_device dev; | ||
std::mt19937 rng(dev()); | ||
std::uniform_int_distribution<std::mt19937::result_type> dist(0,std::numeric_limits<type_t>::max()); | ||
for (size_t i=0; i< sz;++i){ | ||
vec[i]=i;//dist(rng); | ||
} | ||
return; | ||
} | ||
|
||
|
||
void stream_compaction_split(splitvector& v,splitvector& output, type_t* stack, size_t sz){ | ||
auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; | ||
split::tools::copy_if(v.data(),output.data(),sz,pred,(void*)stack,sz); | ||
} | ||
|
||
void stream_compaction_thrust(thrustvector& v,thrustvector& output){ | ||
auto pred =[]__host__ __device__ (type_t element)->bool{ return (element%2)==0 ;}; | ||
thrust::copy_if(thrust::device, v.begin(), v.end(), output.begin(), pred); | ||
} | ||
|
||
int main(int argc, char* argv[]){ | ||
|
||
|
||
int sz=10; | ||
if (argc>=2){ | ||
sz=atoi(argv[1]); | ||
} | ||
size_t N = 1<<sz; | ||
srand(1); | ||
splitvector v0(N),v0_out(N); | ||
thrustvector v1(N),v1_out(N); | ||
fillVec(v0,N); | ||
fillVec(v1,N); | ||
splitvector stack(N); | ||
v0.optimizeGPU(); | ||
v0_out.optimizeGPU(); | ||
stack.optimizeGPU(); | ||
split_gpuDeviceSynchronize(); | ||
|
||
|
||
double t_split={0}; | ||
double t_thrust={0}; | ||
for (int i =0; i<R; i++){ | ||
|
||
v0.optimizeGPU(); | ||
v0_out.optimizeGPU(); | ||
stack.optimizeGPU(); | ||
PROFILE_START("THRUST"); | ||
stream_compaction_thrust(v1,v1_out); | ||
t_thrust+=timeMe(stream_compaction_thrust,v1,v1_out); | ||
PROFILE_END(); | ||
|
||
PROFILE_START("SPLIT"); | ||
stream_compaction_split(v0,v0_out,stack.data(),N); | ||
t_split+=timeMe(stream_compaction_split,v0,v0_out,stack.data(),N); | ||
PROFILE_END(); | ||
} | ||
printf("%d\t%f\t%f\n",sz,t_split/R,t_thrust/R);; | ||
return 0; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
up above I see what I believe are unnecessary syncs:
on lines 595, 603, 611
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same as below