Skip to content

Commit

Permalink
fixes to sizes, prefetches, and asserts
Browse files Browse the repository at this point in the history
  • Loading branch information
markusbattarbee committed Feb 1, 2024
1 parent d21a3dd commit 3afb0ee
Showing 1 changed file with 10 additions and 12 deletions.
22 changes: 10 additions & 12 deletions include/splitvector/split_tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,13 +407,13 @@ void split_prefix_scan(split::SplitVector<T, split::split_unified_allocator<T>>&
split::tools::split_prescan<<<gridSize, scanBlocksize, 2 * scanElements * sizeof(T), s>>>(
input.data(), output.data(), partial_sums.data(), scanElements, input.size());
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));

const size_t _pssize = partial_sums.size();
if (gridSize > 1) {
if (partial_sums.size() <= scanElements) {
if (_pssize <= scanElements) {
vector partial_sums_dummy(gridSize);
// TODO +FIXME extra shmem allocations
split::tools::split_prescan<<<1, scanBlocksize, 2 * scanElements * sizeof(T), s>>>(
partial_sums.data(), partial_sums.data(), partial_sums_dummy.data(), gridSize, partial_sums.size());
partial_sums.data(), partial_sums.data(), partial_sums_dummy.data(), gridSize, _pssize);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
} else {
vector partial_sums_clone(partial_sums);
Expand Down Expand Up @@ -533,7 +533,7 @@ class Cuda_mempool {
}

void* allocate(const size_t bytes) {
assert(bytes_used + bytes < total_bytes && "Mempool run out of space and crashed!");
assert(bytes_used + bytes <= total_bytes && "Mempool run out of space and crashed!");
void* ptr = reinterpret_cast<void*>(reinterpret_cast<char*>(_data) + bytes_used);
bytes_used += bytes;
return ptr;
Expand Down Expand Up @@ -626,8 +626,8 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));

// Phase 1 -- Calculate per warp workload
input.optimizeGPU(s);
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, input.size());
size_t _size = input.size();
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));
Expand All @@ -641,11 +641,10 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));

// Step 3 -- Compaction
input.optimizeGPU(s);
uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP>
<<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
input.data(), d_counts, d_offsets, output, rule, input.size(), nBlocks, retval);
input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
uint32_t numel;
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
Expand All @@ -668,8 +667,8 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));

// Phase 1 -- Calculate per warp workload
input.optimizeGPU(s);
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, input.size());
size_t _size = input.size();
split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));
Expand All @@ -683,11 +682,10 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));

// Step 3 -- Compaction
input.optimizeGPU(s);
uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
split::tools::split_compact_keys_raw<T, U, Rule, BLOCKSIZE, WARP>
<<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
input.data(), d_counts, d_offsets, output, rule, input.size(), nBlocks, retval);
input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
uint32_t numel;
SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
Expand Down

0 comments on commit 3afb0ee

Please sign in to comment.