Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix memory access error in cudf::shift for sliced strings #13894

Merged
merged 11 commits into from
Aug 23, 2023
Merged
84 changes: 37 additions & 47 deletions cpp/src/strings/copying/shift.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/strings/detail/utilities.hpp>

Expand All @@ -31,35 +33,31 @@ namespace cudf::strings::detail {

namespace {

struct adjust_offsets_fn {
column_device_view const d_column;
struct output_sizes_fn {
column_device_view const d_column; // input strings column
string_view const d_filler;
size_type const offset;

__device__ size_type get_string_size_at(size_type idx)
{
return d_column.is_null(idx) ? 0 : d_column.element<string_view>(idx).size_bytes();
}

__device__ size_type operator()(size_type idx)
{
auto const last_index = offset < 0 ? d_column.size() + offset : offset;
if (offset < 0) {
auto const first = d_column.element<size_type>(-offset);
auto const last_index = d_column.size() + offset;
if (idx < last_index) {
return d_column.element<size_type>(idx - offset) - first;
} else {
auto const last = d_column.element<size_type>(d_column.size() - 1);
return (last - first) + ((idx - last_index + 1) * d_filler.size_bytes());
}
// shift left: a,b,c,d,e,f -> b,c,d,e,f,x
return (idx < last_index) ? get_string_size_at(idx - offset) : d_filler.size_bytes();
} else {
if (idx < offset) {
return idx * d_filler.size_bytes();
} else {
auto const total_filler = d_filler.size_bytes() * offset;
return total_filler + d_column.element<size_type>(idx - offset);
}
// shift right: a,b,c,d,e,f -> x,a,b,c,d,e
return (idx < last_index) ? d_filler.size_bytes() : get_string_size_at(idx - offset);
}
}
};

struct shift_chars_fn {
column_device_view const d_column;
column_device_view const d_column; // input strings column
string_view const d_filler;
size_type const offset;

Expand All @@ -68,8 +66,11 @@ struct shift_chars_fn {
if (offset < 0) {
auto const last_index = -offset;
if (idx < last_index) {
auto const first_index = d_column.size() + offset;
return d_column.element<char>(idx + first_index);
auto const first_index =
offset + d_column.child(strings_column_view::offsets_column_index)
.element<size_type>(d_column.offset() + d_column.size());
return d_column.child(strings_column_view::chars_column_index)
.element<char>(idx + first_index);
} else {
auto const char_index = idx - last_index;
return d_filler.data()[char_index % d_filler.size_bytes()];
Expand All @@ -78,7 +79,10 @@ struct shift_chars_fn {
if (idx < offset) {
return d_filler.data()[idx % d_filler.size_bytes()];
} else {
return d_column.element<char>(idx - offset);
return d_column.child(strings_column_view::chars_column_index)
.element<char>(idx - offset +
d_column.child(strings_column_view::offsets_column_index)
.element<size_type>(d_column.offset()));
}
}
}
Expand All @@ -97,44 +101,30 @@ std::unique_ptr<column> shift(strings_column_view const& input,
// adjust offset when greater than the size of the input
if (std::abs(offset) > input.size()) { offset = input.size(); }

// output offsets column is the same size as the input
auto const input_offsets =
cudf::detail::slice(
input.offsets(), {input.offset(), input.offset() + input.size() + 1}, stream)
.front();
auto const offsets_size = input_offsets.size();
auto offsets_column = cudf::detail::allocate_like(
input_offsets, offsets_size, mask_allocation_policy::NEVER, stream, mr);

// run kernel to simultaneously shift and adjust the values in the output offsets column
auto d_offsets = mutable_column_device_view::create(offsets_column->mutable_view(), stream);
auto const d_input_offsets = column_device_view::create(input_offsets, stream);
thrust::transform(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(offsets_size),
d_offsets->data<size_type>(),
adjust_offsets_fn{*d_input_offsets, d_fill_str, offset});
// build the output offsets by computing the sizes of each output row
auto const d_input = column_device_view::create(input.parent(), stream);
auto sizes_itr = cudf::detail::make_counting_transform_iterator(
0, output_sizes_fn{*d_input, d_fill_str, offset});
auto [offsets_column, total_bytes] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
auto offsets_view = offsets_column->view();

// compute the shift-offset for the output characters child column
auto const shift_offset = [&] {
auto const index = (offset >= 0) ? offset : offsets_size - 1 + offset;
return (offset < 0 ? -1 : 1) *
cudf::detail::get_value<size_type>(offsets_column->view(), index, stream);
auto const index = (offset >= 0) ? offset : input.size() + offset;
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
return (offset < 0 ? -1 : 1) * cudf::detail::get_value<size_type>(offsets_view, index, stream);
}();

// create output chars child column
auto const chars_size =
cudf::detail::get_value<size_type>(offsets_column->view(), offsets_size - 1, stream);
auto chars_column = create_chars_child_column(chars_size, stream, mr);
auto chars_column = create_chars_child_column(static_cast<size_type>(total_bytes), stream, mr);
auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream);
auto const d_input_chars = column_device_view::create(input.chars(), stream);

// run kernel to shift the characters
// run kernel to shift all the characters
thrust::transform(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(chars_size),
thrust::counting_iterator<size_type>(total_bytes),
d_chars->data<char>(),
shift_chars_fn{*d_input_chars, d_fill_str, shift_offset});
shift_chars_fn{*d_input, d_fill_str, shift_offset});

// caller sets the null-mask
return make_strings_column(
Expand Down
38 changes: 32 additions & 6 deletions cpp/tests/copying/shift_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,22 +206,48 @@ TEST_F(ShiftTests, StringsShiftTest)
auto results = cudf::shift(input, 2, fill);
auto expected_right =
cudf::test::strings_column_wrapper({"xx", "xx", "", "bb", "ccc"}, {1, 1, 0, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_right, *results);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_right, *results);

results = cudf::shift(input, -2, fill);
auto expected_left =
cudf::test::strings_column_wrapper({"ccc", "ddddddé", "", "xx", "xx"}, {1, 1, 0, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_left, *results);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_left, *results);

auto sliced = cudf::slice(input, {1, 4}).front();

results = cudf::shift(sliced, 1, fill);
auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_right, *results);
auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"}, {1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_right, *results);

results = cudf::shift(sliced, -1, fill);
auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_left, *results);
auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"}, {1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_left, *results);
}

TEST_F(ShiftTests, StringsShiftNullFillTest)
{
auto input = cudf::test::strings_column_wrapper(
{"a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj"});
auto phil = cudf::string_scalar("", false);

auto results = cudf::shift(input, -1, phil);
auto expected = cudf::test::strings_column_wrapper(
{"b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

results = cudf::shift(input, 1, phil);
expected = cudf::test::strings_column_wrapper(
{"", "a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii"}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

auto sliced = cudf::slice(input, {5, 10}).front();
results = cudf::shift(sliced, -2, phil);
expected = cudf::test::strings_column_wrapper({"hhhh", "iii", "jjjjj", "", ""}, {1, 1, 1, 0, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

results = cudf::shift(sliced, 2, phil);
expected = cudf::test::strings_column_wrapper({"", "", "ff", "ggg", "hhhh"}, {0, 0, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(ShiftTests, OffsetGreaterThanSize)
Expand Down