Use thread_index_type to avoid index overflow in grid-stride loops #13895

Merged (10 commits) on Aug 24, 2023
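
The problem being fixed: cudf::size_type is a 32-bit signed integer, so a grid-stride loop indexed with it can exceed the 32-bit range once the input is large enough or the grid is wide enough, wrapping to bogus (possibly negative) indices. Below is a minimal before/after sketch of the pattern; it is illustrative only, not code from this PR, and it assumes thread_index_type is a 64-bit signed alias as in libcudf.

// Illustrative sketch only, not code from this PR.
// Assumed aliases: cudf::size_type is 32-bit; thread_index_type is assumed
// to be a 64-bit signed alias as in libcudf.
#include <cstdint>
using size_type         = std::int32_t;
using thread_index_type = std::int64_t;

// Before: 32-bit loop index. With enough elements or a large enough grid,
// the index arithmetic exceeds the 32-bit range and wraps to bogus
// (possibly negative) values, causing out-of-bounds accesses.
__global__ void fill_32bit(int* out, std::int64_t n)
{
  for (size_type i = threadIdx.x + blockIdx.x * blockDim.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = 1;
  }
}

// After: 64-bit loop index, with the stride widened once up front, so the
// same loop stays in range for any practical element count.
__global__ void fill_64bit(int* out, std::int64_t n)
{
  thread_index_type const stride =
    static_cast<thread_index_type>(blockDim.x) * gridDim.x;
  for (thread_index_type i =
         threadIdx.x + static_cast<thread_index_type>(blockIdx.x) * blockDim.x;
       i < n; i += stride) {
    out[i] = 1;
  }
}
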
24 changes: 14 additions & 10 deletions cpp/src/bitmask/null_mask.cu
@@ -104,13 +104,15 @@ __global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination,
                                      bool valid,
                                      size_type number_of_mask_words)
 {
-  auto x                  = destination + word_index(begin_bit);
-  auto const last_word    = word_index(end_bit) - word_index(begin_bit);
-  bitmask_type fill_value = valid ? 0xffff'ffff : 0;
+  auto x                            = destination + word_index(begin_bit);
+  thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit);
+  bitmask_type fill_value           = valid ? 0xffff'ffff : 0;
 
-  for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type const stride = blockDim.x * gridDim.x;
+
+  for (thread_index_type destination_word_index = grid_1d::global_thread_id();
        destination_word_index < number_of_mask_words;
-       destination_word_index += blockDim.x * gridDim.x) {
+       destination_word_index += stride) {
     if (destination_word_index == 0 || destination_word_index == last_word) {
       bitmask_type mask = ~bitmask_type{0};
       if (destination_word_index == 0) {
@@ -189,9 +191,10 @@ __global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination,
                                     size_type source_end_bit,
                                     size_type number_of_mask_words)
 {
-  for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type const stride = blockDim.x * gridDim.x;
+  for (thread_index_type destination_word_index = grid_1d::global_thread_id();
        destination_word_index < number_of_mask_words;
-       destination_word_index += blockDim.x * gridDim.x) {
+       destination_word_index += stride) {
     destination[destination_word_index] = detail::get_mask_offset_word(
       source, destination_word_index, source_begin_bit, source_end_bit);
   }
@@ -261,14 +264,15 @@ __global__ void count_set_bits_kernel(bitmask_type const* bitmask,
 
   auto const first_word_index{word_index(first_bit_index)};
   auto const last_word_index{word_index(last_bit_index)};
-  auto const tid          = threadIdx.x + blockIdx.x * blockDim.x;
-  auto thread_word_index  = tid + first_word_index;
+  thread_index_type const tid         = grid_1d::global_thread_id();
+  thread_index_type const stride      = blockDim.x * gridDim.x;
+  thread_index_type thread_word_index = tid + first_word_index;
   size_type thread_count{0};
 
   // First, just count the bits in all words
   while (thread_word_index <= last_word_index) {
     thread_count += __popc(bitmask[thread_word_index]);
-    thread_word_index += blockDim.x * gridDim.x;
+    thread_word_index += stride;
   }
 
   // Subtract any slack bits counted from the first and last word
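
The rewritten loops call grid_1d::global_thread_id() from cudf/detail/utilities/cuda.cuh, whose definition is not part of this diff. As a rough sketch of what such a helper computes (an assumption about its shape, not the verbatim cudf implementation), the essential detail is that the block/thread product is widened to 64 bits before the multiply:

// Sketch only; the real helper lives in cudf/detail/utilities/cuda.cuh and
// may differ in signature and details.
__device__ inline thread_index_type global_thread_id()
{
  // Widen blockIdx.x first so the multiply happens in 64-bit arithmetic
  // rather than wrapping in 32-bit unsigned math.
  return static_cast<thread_index_type>(blockIdx.x) * blockDim.x + threadIdx.x;
}
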
15 changes: 6 additions & 9 deletions cpp/src/transform/jit/kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,15 +37,12 @@ namespace jit {
 template <typename TypeOut, typename TypeIn>
 __global__ void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data)
 {
-  int tid    = threadIdx.x;
-  int blkid  = blockIdx.x;
-  int blksz  = blockDim.x;
-  int gridsz = gridDim.x;
+  // cannot use global_thread_id utility due to a JIT build issue by including
+  // the `cudf/detail/utilities/cuda.cuh` header
+  thread_index_type const start  = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type const stride = blockDim.x * gridDim.x;
 
-  int start = tid + blkid * blksz;
-  int step  = blksz * gridsz;
-
-  for (cudf::size_type i = start; i < size; i += step) {
+  for (auto i = start; i < static_cast<thread_index_type>(size); i += stride) {
     GENERIC_UNARY_OP(&out_data[i], in_data[i]);
   }
 }
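
Because the JIT-compiled kernel cannot pull in that helper header, it spells the same index math out inline. One general C++ note on writing the inline form by hand (not a claim about this PR's behavior): threadIdx.x, blockIdx.x, and blockDim.x are 32-bit unsigned values, so casting one operand first keeps the multiply itself in 64-bit arithmetic. A sketch of that variant, reusing size, out_data, in_data, and GENERIC_UNARY_OP from the kernel above:

// Sketch only: inline 64-bit grid-stride indices without the cudf helper.
// Casting one operand before the multiply keeps the product in 64 bits.
thread_index_type const start =
  threadIdx.x + static_cast<thread_index_type>(blockIdx.x) * blockDim.x;
thread_index_type const stride =
  static_cast<thread_index_type>(blockDim.x) * gridDim.x;

for (auto i = start; i < static_cast<thread_index_type>(size); i += stride) {
  GENERIC_UNARY_OP(&out_data[i], in_data[i]);
}
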