diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index 33c0398..f65678b 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -407,13 +407,13 @@ void split_prefix_scan(split::SplitVector<T, split::split_unified_allocator<T>>&
    split::tools::split_prescan<<<gridSize, scanBlocksize, 2 * scanElements * sizeof(T), s>>>(
        input.data(), output.data(), partial_sums.data(), scanElements, input.size());
    SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
-
+   const size_t _pssize = partial_sums.size();
 
    if (gridSize > 1) {
-      if (partial_sums.size() <= scanElements) {
+      if (_pssize <= scanElements) {
          vector partial_sums_dummy(gridSize); // TODO +FIXME extra shmem allocations
          split::tools::split_prescan<<<1, scanBlocksize, 2 * scanElements * sizeof(T), s>>>(
-             partial_sums.data(), partial_sums.data(), partial_sums_dummy.data(), gridSize, partial_sums.size());
+             partial_sums.data(), partial_sums.data(), partial_sums_dummy.data(), gridSize, _pssize);
          SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
       } else {
          vector partial_sums_clone(partial_sums);
@@ -533,7 +533,7 @@ class Cuda_mempool {
    }
 
    void* allocate(const size_t bytes) {
-      assert(bytes_used + bytes < total_bytes && "Mempool run out of space and crashed!");
+      assert(bytes_used + bytes <= total_bytes && "Mempool run out of space and crashed!");
       void* ptr = reinterpret_cast<void*>(reinterpret_cast<char*>(_data) + bytes_used);
       bytes_used += bytes;
       return ptr;
@@ -626,8 +626,8 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));
 
    // Phase 1 -- Calculate per warp workload
-   input.optimizeGPU(s);
-   split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, input.size());
+   size_t _size = input.size();
+   split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
    d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
    SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));
@@ -641,11 +641,10 @@ uint32_t copy_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>& i
    SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
 
    // Step 3 -- Compaction
-   input.optimizeGPU(s);
    uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
    split::tools::split_compact_raw<T, Rule, BLOCKSIZE, WARP>
        <<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
-          input.data(), d_counts, d_offsets, output, rule, input.size(), nBlocks, retval);
+          input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
    SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    uint32_t numel;
    SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
@@ -668,8 +667,8 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_counts, 0, nBlocks * sizeof(uint32_t),s));
 
    // Phase 1 -- Calculate per warp workload
-   input.optimizeGPU(s);
-   split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, input.size());
+   size_t _size = input.size();
+   split::tools::scan_reduce_raw<<<nBlocks, BLOCKSIZE, 0, s>>>(input.data(), d_counts, rule, _size);
    d_offsets = (uint32_t*)mPool.allocate(nBlocks * sizeof(uint32_t));
    SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    SPLIT_CHECK_ERR(split_gpuMemsetAsync(d_offsets, 0, nBlocks * sizeof(uint32_t),s));
@@ -683,11 +682,10 @@ size_t copy_keys_if_raw(split::SplitVector<T, split::split_unified_allocator<T>>
    SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
 
    // Step 3 -- Compaction
-   input.optimizeGPU(s);
    uint32_t* retval = (uint32_t*)mPool.allocate(sizeof(uint32_t));
    split::tools::split_compact_keys_raw<T, U, Rule, BLOCKSIZE, WARP>
        <<<nBlocks, BLOCKSIZE, 2 * (BLOCKSIZE / WARP) * sizeof(unsigned int), s>>>(
-          input.data(), d_counts, d_offsets, output, rule, input.size(), nBlocks, retval);
+          input.data(), d_counts, d_offsets, output, rule, _size, nBlocks, retval);
    SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
    uint32_t numel;
    SPLIT_CHECK_ERR(split_gpuMemcpyAsync(&numel, retval, sizeof(uint32_t), split_gpuMemcpyDeviceToHost, s));
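
A minimal host-side sketch of the boundary condition fixed in the Cuda_mempool hunk: with "<=", a request that fills the pool exactly to its last byte is accepted, while the old "<" asserted on that legal allocation. Mempool below is a hypothetical stand-in in plain C++ (no GPU allocation); only the bounds check mirrors the patch.

#include <cassert>
#include <cstddef>
#include <cstdio>

struct Mempool {
   char* _data;
   size_t total_bytes;
   size_t bytes_used = 0;

   void* allocate(const size_t bytes) {
      // "<=" accepts an allocation that ends exactly at total_bytes;
      // the old "<" wrongly rejected that exactly-fitting request.
      assert(bytes_used + bytes <= total_bytes && "Mempool ran out of space!");
      void* ptr = _data + bytes_used;
      bytes_used += bytes;
      return ptr;
   }
};

int main() {
   char buffer[64];
   Mempool pool{buffer, sizeof(buffer)};
   pool.allocate(32);
   pool.allocate(32); // pool now exactly full: passes with "<=", asserted with "<"
   printf("used %zu of %zu bytes\n", pool.bytes_used, pool.total_bytes);
   return 0;
}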
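
The copy_if_raw and copy_keys_if_raw hunks read input.size() once into _size and reuse it for both the scan_reduce_raw and compaction launches, instead of calling size() (and the optimizeGPU prefetch hint) again between kernels; for a unified-memory container, each host-side re-read between launches is avoidable traffic. The sketch below shows only that caching pattern, assuming hypothetical count_positive/copy_positive kernels in place of the library's scan_reduce_raw and split_compact_raw, and a plain "> 0" predicate in place of rule.

#include <cstdio>

__global__ void count_positive(const int* in, unsigned int* count, size_t n) {
   size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
   if (i < n && in[i] > 0) {
      atomicAdd(count, 1u); // Phase 1 stand-in: measure the workload
   }
}

__global__ void copy_positive(const int* in, int* out, unsigned int* pos, size_t n) {
   size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
   if (i < n && in[i] > 0) {
      out[atomicAdd(pos, 1u)] = in[i]; // Step 3 stand-in: compact (order not preserved)
   }
}

int main() {
   int *in, *out;
   unsigned int *count, *pos;
   const size_t n = 1 << 10; // cached once on the host, like _size in the patch
   cudaMallocManaged(&in, n * sizeof(int));
   cudaMallocManaged(&out, n * sizeof(int));
   cudaMallocManaged(&count, sizeof(unsigned int));
   cudaMallocManaged(&pos, sizeof(unsigned int));
   for (size_t i = 0; i < n; ++i) {
      in[i] = (i % 3 == 0) ? (int)i : -(int)i;
   }
   *count = 0;
   *pos = 0;

   // Both launches reuse the cached n; nothing re-reads a managed size field
   // between them.
   const unsigned int blocks = (unsigned int)((n + 255) / 256);
   count_positive<<<blocks, 256>>>(in, count, n);
   copy_positive<<<blocks, 256>>>(in, out, pos, n);
   cudaDeviceSynchronize();
   printf("kept %u of %zu elements\n", *count, n);

   cudaFree(in);
   cudaFree(out);
   cudaFree(count);
   cudaFree(pos);
   return 0;
}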