From c5febdcbb357d625dc45fd67077ce43d8ec19dc4 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 16 Aug 2023 17:32:46 -0400 Subject: [PATCH 1/3] Fix memory access error in cudf::shift for sliced strings --- cpp/src/strings/copying/shift.cu | 32 +++++++++++++++++--------- cpp/tests/copying/shift_tests.cpp | 38 ++++++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 5f8fc483a34..f49c5e36055 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -52,14 +52,15 @@ struct adjust_offsets_fn { return idx * d_filler.size_bytes(); } else { auto const total_filler = d_filler.size_bytes() * offset; - return total_filler + d_column.element(idx - offset); + return total_filler + d_column.element(idx - offset) - + d_column.element(0); } } } }; struct shift_chars_fn { - column_device_view const d_column; + column_device_view const d_column; // input strings column string_view const d_filler; size_type const offset; @@ -68,8 +69,11 @@ struct shift_chars_fn { if (offset < 0) { auto const last_index = -offset; if (idx < last_index) { - auto const first_index = d_column.size() + offset; - return d_column.element(idx + first_index); + auto const first_index = + offset + d_column.child(strings_column_view::offsets_column_index) + .element(d_column.offset() + d_column.size()); + return d_column.child(strings_column_view::chars_column_index) + .element(idx + first_index); } else { auto const char_index = idx - last_index; return d_filler.data()[char_index % d_filler.size_bytes()]; @@ -78,7 +82,10 @@ struct shift_chars_fn { if (idx < offset) { return d_filler.data()[idx % d_filler.size_bytes()]; } else { - return d_column.element(idx - offset); + return d_column.child(strings_column_view::chars_column_index) + .element(idx - offset + + d_column.child(strings_column_view::offsets_column_index) + .element(d_column.offset())); } } } @@ -97,6 +104,9 @@ std::unique_ptr shift(strings_column_view const& input, // adjust offset when greater than the size of the input if (std::abs(offset) > input.size()) { offset = input.size(); } + // TODO: This is fancy and all but we should try to use the make_offsets_child + // utility to check for overflow since we are inserting an arbitrary fill-value. + // output offsets column is the same size as the input auto const input_offsets = cudf::detail::slice( @@ -115,7 +125,7 @@ std::unique_ptr shift(strings_column_view const& input, d_offsets->data(), adjust_offsets_fn{*d_input_offsets, d_fill_str, offset}); - // compute the shift-offset for the output characters child column + // compute the shift-offset for the output characters child column auto const shift_offset = [&] { auto const index = (offset >= 0) ? offset : offsets_size - 1 + offset; return (offset < 0 ? -1 : 1) * @@ -125,18 +135,18 @@ std::unique_ptr shift(strings_column_view const& input, // create output chars child column auto const chars_size = cudf::detail::get_value(offsets_column->view(), offsets_size - 1, stream); - auto chars_column = create_chars_child_column(chars_size, stream, mr); - auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); - auto const d_input_chars = column_device_view::create(input.chars(), stream); + auto chars_column = create_chars_child_column(chars_size, stream, mr); + auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); + auto const d_input = column_device_view::create(input.parent(), stream); // run kernel to shift the characters thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(chars_size), d_chars->data(), - shift_chars_fn{*d_input_chars, d_fill_str, shift_offset}); + shift_chars_fn{*d_input, d_fill_str, shift_offset}); // d_input_chars - // caller sets the null-mask + // caller sets the null-mask return make_strings_column( input.size(), std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); } diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index 9ca07040175..17e56ea8ed8 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -206,22 +206,48 @@ TEST_F(ShiftTests, StringsShiftTest) auto results = cudf::shift(input, 2, fill); auto expected_right = cudf::test::strings_column_wrapper({"xx", "xx", "", "bb", "ccc"}, {1, 1, 0, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_right, *results); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_right, *results); results = cudf::shift(input, -2, fill); auto expected_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "", "xx", "xx"}, {1, 1, 0, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_left, *results); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_left, *results); auto sliced = cudf::slice(input, {1, 4}).front(); results = cudf::shift(sliced, 1, fill); - auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_right, *results); + auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"}, {1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_right, *results); results = cudf::shift(sliced, -1, fill); - auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_left, *results); + auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"}, {1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_left, *results); +} + +TEST_F(ShiftTests, StringsShiftNullFillTest) +{ + auto input = cudf::test::strings_column_wrapper( + {"a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj"}); + auto phil = cudf::string_scalar("", false); + + auto results = cudf::shift(input, -1, phil); + auto expected = cudf::test::strings_column_wrapper( + {"b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::shift(input, 1, phil); + expected = cudf::test::strings_column_wrapper( + {"", "a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii"}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto sliced = cudf::slice(input, {5, 10}).front(); + results = cudf::shift(sliced, -2, phil); + expected = cudf::test::strings_column_wrapper({"hhhh", "iii", "jjjjj", "", ""}, {1, 1, 1, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::shift(sliced, 2, phil); + expected = cudf::test::strings_column_wrapper({"", "", "ff", "ggg", "hhhh"}, {0, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(ShiftTests, OffsetGreaterThanSize) From 6750582f6c37ff43c59d2119fb0b7f7115186676 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 17 Aug 2023 16:08:06 -0400 Subject: [PATCH 2/3] call make-offsets utility to check for overflow --- cpp/src/strings/copying/shift.cu | 80 ++++++++++++-------------------- 1 file changed, 30 insertions(+), 50 deletions(-) diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index f49c5e36055..fd4ff94c6b8 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include @@ -31,30 +33,25 @@ namespace cudf::strings::detail { namespace { -struct adjust_offsets_fn { - column_device_view const d_column; +struct output_sizes_fn { + column_device_view const d_column; // input strings column string_view const d_filler; size_type const offset; + __device__ size_type get_string_size_at(size_type idx) + { + return d_column.is_null(idx) ? 0 : d_column.element(idx).size_bytes(); + } + __device__ size_type operator()(size_type idx) { + auto const last_index = offset < 0 ? d_column.size() + offset : offset; if (offset < 0) { - auto const first = d_column.element(-offset); - auto const last_index = d_column.size() + offset; - if (idx < last_index) { - return d_column.element(idx - offset) - first; - } else { - auto const last = d_column.element(d_column.size() - 1); - return (last - first) + ((idx - last_index + 1) * d_filler.size_bytes()); - } + // shift left: a,b,c,d,e,f -> b,c,d,e,f,x + return (idx < last_index) ? get_string_size_at(idx - offset) : d_filler.size_bytes(); } else { - if (idx < offset) { - return idx * d_filler.size_bytes(); - } else { - auto const total_filler = d_filler.size_bytes() * offset; - return total_filler + d_column.element(idx - offset) - - d_column.element(0); - } + // shift right: a,b,c,d,e,f -> x,a,b,c,d,e + return (idx < last_index) ? d_filler.size_bytes() : get_string_size_at(idx - offset); } } }; @@ -104,49 +101,32 @@ std::unique_ptr shift(strings_column_view const& input, // adjust offset when greater than the size of the input if (std::abs(offset) > input.size()) { offset = input.size(); } - // TODO: This is fancy and all but we should try to use the make_offsets_child - // utility to check for overflow since we are inserting an arbitrary fill-value. - - // output offsets column is the same size as the input - auto const input_offsets = - cudf::detail::slice( - input.offsets(), {input.offset(), input.offset() + input.size() + 1}, stream) - .front(); - auto const offsets_size = input_offsets.size(); - auto offsets_column = cudf::detail::allocate_like( - input_offsets, offsets_size, mask_allocation_policy::NEVER, stream, mr); - - // run kernel to simultaneously shift and adjust the values in the output offsets column - auto d_offsets = mutable_column_device_view::create(offsets_column->mutable_view(), stream); - auto const d_input_offsets = column_device_view::create(input_offsets, stream); - thrust::transform(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(offsets_size), - d_offsets->data(), - adjust_offsets_fn{*d_input_offsets, d_fill_str, offset}); + // build the output offsets by computing the sizes of each output row + auto const d_input = column_device_view::create(input.parent(), stream); + auto sizes_itr = cudf::detail::make_counting_transform_iterator( + 0, output_sizes_fn{*d_input, d_fill_str, offset}); + auto [offsets_column, total_bytes] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + auto offsets_view = offsets_column->view(); - // compute the shift-offset for the output characters child column + // compute the shift-offset for the output characters child column auto const shift_offset = [&] { - auto const index = (offset >= 0) ? offset : offsets_size - 1 + offset; - return (offset < 0 ? -1 : 1) * - cudf::detail::get_value(offsets_column->view(), index, stream); + auto const index = (offset >= 0) ? offset : input.size() + offset; + return (offset < 0 ? -1 : 1) * cudf::detail::get_value(offsets_view, index, stream); }(); // create output chars child column - auto const chars_size = - cudf::detail::get_value(offsets_column->view(), offsets_size - 1, stream); - auto chars_column = create_chars_child_column(chars_size, stream, mr); - auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); - auto const d_input = column_device_view::create(input.parent(), stream); + auto chars_column = create_chars_child_column(static_cast(total_bytes), stream, mr); + auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); - // run kernel to shift the characters + // run kernel to shift all the characters thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), - thrust::counting_iterator(chars_size), + thrust::counting_iterator(total_bytes), d_chars->data(), - shift_chars_fn{*d_input, d_fill_str, shift_offset}); // d_input_chars + shift_chars_fn{*d_input, d_fill_str, shift_offset}); - // caller sets the null-mask + // caller sets the null-mask return make_strings_column( input.size(), std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); } From bd7715026a00ee1b24c54ba22e9bca1ba188ab1a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 22 Aug 2023 12:54:57 -0400 Subject: [PATCH 3/3] align offset < 0 statements --- cpp/src/strings/copying/shift.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index fd4ff94c6b8..b54c433c23d 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -111,7 +111,7 @@ std::unique_ptr shift(strings_column_view const& input, // compute the shift-offset for the output characters child column auto const shift_offset = [&] { - auto const index = (offset >= 0) ? offset : input.size() + offset; + auto const index = (offset < 0) ? input.size() + offset : offset; return (offset < 0 ? -1 : 1) * cudf::detail::get_value(offsets_view, index, stream); }();