From f46cb31b602c7e23a9c08a64add28526f4ac7bf2 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 2 Aug 2023 20:00:35 -0400 Subject: [PATCH 1/5] Reduce `lists::contains` dispatches for scalars (#13805) This PR is to prepare for https://github.com/rapidsai/cudf/pull/13672 which uses experimental comparators for both nested and non-nested types. `lists::contains` currently has two APIs that support: 1. `cudf::scalar` 2. `cudf::column_view` Both APIs currently need to call `cudf::type_dispatcher`. However, by converting `cudf::scalar` to a `cudf::column_view` by materializing all rows, we cut down the number of dispatches to half. Compile times: Before this PR: [12:07](https://downloads.rapids.ai/ci/cudf/pull-request/13788/0498f7d/cuda11_x86_64.ninja_log.html) After this PR: [5:46](https://downloads.rapids.ai/ci/cudf/pull-request/13805/7664a98/cuda11_x86_64.ninja_log.html) Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13805 --- cpp/src/lists/contains.cu | 142 ++++++++++++-------------------------- 1 file changed, 46 insertions(+), 96 deletions(-) diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index a3293e36825..9d39f2f9a90 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -154,16 +155,11 @@ struct search_list_nested_types_fn { duplicate_find_option const find_option; KeyValidityIter const key_validity_iter; EqComparator const d_comp; - bool const search_key_is_scalar; search_list_nested_types_fn(duplicate_find_option const find_option, KeyValidityIter const key_validity_iter, - EqComparator const& d_comp, - bool search_key_is_scalar) - : find_option(find_option), - key_validity_iter(key_validity_iter), - d_comp(d_comp), - 
search_key_is_scalar(search_key_is_scalar) + EqComparator const& d_comp) + : find_option(find_option), key_validity_iter(key_validity_iter), d_comp(d_comp) { } @@ -186,9 +182,8 @@ struct search_list_nested_types_fn { auto const [begin, end] = element_index_pair_iter(list.size()); auto const found_iter = thrust::find_if(thrust::seq, begin, end, [=] __device__(auto const idx) { - return !list.is_null(idx) && - d_comp(static_cast(list.element_offset(idx)), - static_cast(search_key_is_scalar ? 0 : list.row_index())); + return !list.is_null(idx) && d_comp(static_cast(list.element_offset(idx)), + static_cast(list.row_index())); }); // If the key is found, return its found position in the list from `found_iter`. return found_iter == end ? NOT_FOUND_SENTINEL : *found_iter; @@ -199,93 +194,53 @@ struct search_list_nested_types_fn { * @brief Function to search for key element(s) in the corresponding rows of a lists column, * specialized for non-nested types. */ -template +template void index_of_non_nested_types(InputIterator input_it, size_type num_rows, OutputIterator output_it, - SearchKeyType const& search_keys, + column_view const& search_keys, bool search_keys_have_nulls, duplicate_find_option find_option, rmm::cuda_stream_view stream) { - auto const do_search = [=](auto const keys_iter) { - thrust::transform(rmm::exec_policy(stream), - input_it, - input_it + num_rows, - keys_iter, - output_it, - search_list_non_nested_types_fn{find_option}); - }; - - if constexpr (search_key_is_scalar) { - auto const keys_iter = cudf::detail::make_optional_iterator( - search_keys, nullate::DYNAMIC{search_keys_have_nulls}); - do_search(keys_iter); - } else { - auto const keys_cdv_ptr = column_device_view::create(search_keys, stream); - auto const keys_iter = cudf::detail::make_optional_iterator( - *keys_cdv_ptr, nullate::DYNAMIC{search_keys_have_nulls}); - do_search(keys_iter); - } + auto const keys_cdv_ptr = column_device_view::create(search_keys, stream); + auto const keys_iter = 
cudf::detail::make_optional_iterator( + *keys_cdv_ptr, nullate::DYNAMIC{search_keys_have_nulls}); + thrust::transform(rmm::exec_policy(stream), + input_it, + input_it + num_rows, + keys_iter, + output_it, + search_list_non_nested_types_fn{find_option}); } /** * @brief Function to search for index of key element(s) in the corresponding rows of a lists * column, specialized for nested types. */ -template +template void index_of_nested_types(InputIterator input_it, size_type num_rows, OutputIterator output_it, column_view const& child, - SearchKeyType const& search_keys, + column_view const& search_keys, duplicate_find_option find_option, rmm::cuda_stream_view stream) { - // Create a `table_view` from the search key(s). - // If the input search key is a (nested type) scalar, a new column is materialized from that - // scalar before a `table_view` is generated from it. As such, the new created column will also be - // returned to keep the result `table_view` valid. - [[maybe_unused]] auto const [keys_tview, unused_column] = - [&]() -> std::pair> { - if constexpr (search_key_is_scalar) { - auto tmp_column = make_column_from_scalar(search_keys, 1, stream); - return {table_view{{tmp_column->view()}}, std::move(tmp_column)}; - } else { - return {table_view{{search_keys}}, nullptr}; - } - }(); - + auto const keys_tview = cudf::table_view{{search_keys}}; auto const child_tview = table_view{{child}}; auto const has_nulls = has_nested_nulls(child_tview) || has_nested_nulls(keys_tview); auto const comparator = cudf::experimental::row::equality::two_table_comparator(child_tview, keys_tview, stream); auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); - auto const do_search = [=](auto const key_validity_iter) { - thrust::transform( - rmm::exec_policy(stream), - input_it, - input_it + num_rows, - output_it, - search_list_nested_types_fn{find_option, key_validity_iter, d_comp, search_key_is_scalar}); - }; - - if constexpr (search_key_is_scalar) { - auto const 
key_validity_iter = cudf::detail::make_validity_iterator(search_keys); - do_search(key_validity_iter); - } else { - auto const keys_dv_ptr = column_device_view::create(search_keys, stream); - auto const key_validity_iter = cudf::detail::make_validity_iterator(*keys_dv_ptr); - do_search(key_validity_iter); - } + auto const keys_dv_ptr = column_device_view::create(search_keys, stream); + auto const key_validity_iter = cudf::detail::make_validity_iterator(*keys_dv_ptr); + thrust::transform(rmm::exec_policy(stream), + input_it, + input_it + num_rows, + output_it, + search_list_nested_types_fn{find_option, key_validity_iter, d_comp}); } /** @@ -295,10 +250,10 @@ void index_of_nested_types(InputIterator input_it, struct dispatch_index_of { // SFINAE with conditional return type because we need to support device lambda in this function. // This is required due to a limitation of nvcc. - template + template std::enable_if_t(), std::unique_ptr> operator()( lists_column_view const& lists, - SearchKeyType const& search_keys, + column_view const& search_keys, duplicate_find_option find_option, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -313,27 +268,10 @@ struct dispatch_index_of { cudf::data_type_error); CUDF_EXPECTS(search_keys.type().id() != type_id::EMPTY, "Type cannot be empty."); - auto constexpr search_key_is_scalar = std::is_same_v; - auto const search_keys_have_nulls = [&search_keys, stream] { - if constexpr (search_key_is_scalar) { - return !search_keys.is_valid(stream); - } else { - return search_keys.has_nulls(); - } - }(); + auto const search_keys_have_nulls = search_keys.has_nulls(); auto const num_rows = lists.size(); - if (search_key_is_scalar && search_keys_have_nulls) { - // If the scalar key is invalid/null, the entire output column will be all nulls. 
- return make_numeric_column(data_type{cudf::type_to_id()}, - num_rows, - cudf::create_null_mask(num_rows, mask_state::ALL_NULL, mr), - num_rows, - stream, - mr); - } - auto const lists_cdv_ptr = column_device_view::create(lists.parent(), stream); auto const input_it = cudf::detail::make_counting_transform_iterator( size_type{0}, @@ -346,11 +284,10 @@ struct dispatch_index_of { auto const output_it = out_positions->mutable_view().template begin(); if constexpr (not cudf::is_nested()) { - index_of_non_nested_types( + index_of_non_nested_types( input_it, num_rows, output_it, search_keys, search_keys_have_nulls, find_option, stream); } else { // list + struct - index_of_nested_types( - input_it, num_rows, output_it, child, search_keys, find_option, stream); + index_of_nested_types(input_it, num_rows, output_it, child, search_keys, find_option, stream); } if (search_keys_have_nulls || lists.has_nulls()) { @@ -414,8 +351,21 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::type_dispatcher( - search_key.type(), dispatch_index_of{}, lists, search_key, find_option, stream, mr); + if (!search_key.is_valid(stream)) { + return make_numeric_column(data_type{cudf::type_to_id()}, + lists.size(), + cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), + lists.size(), + stream, + mr); + } + if (lists.size() == 0) { + return make_numeric_column( + data_type{type_to_id()}, 0, cudf::mask_state::UNALLOCATED, stream, mr); + } + + auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr); + return index_of(lists, search_key_col->view(), find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, From 11fd25c25e8d593ddfb4e0d29281aaf63898d9ba Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 2 Aug 2023 19:59:04 -0500 Subject: [PATCH 2/5] Fix unbounded sequence issue in `DataFrame` constructor (#13811) In `cudf`, we 
currently have a hang in this scenario: ```python In [1]: import cudf In [2]: class A: ...: def __getitem__(self, key): ...: return 1 ...: In [3]: cudf.DataFrame([A()]) ``` This PR introduces additional checks before letting the list-like inputs pass onto `itertools` for transposing. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13811 --- python/cudf/cudf/core/dataframe.py | 6 ++++++ python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a510e6829d1..d421258b06b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -843,6 +843,12 @@ def _init_from_list_like(self, data, index=None, columns=None): data = DataFrame.from_pandas(pd.DataFrame(data)) self._data = data._data else: + if any( + not isinstance(col, (abc.Iterable, abc.Sequence)) + for col in data + ): + raise TypeError("Inputs should be an iterable or sequence.") + data = list(itertools.zip_longest(*data)) if columns is not None and len(data) == 0: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index aad0b757177..e35ab147bf4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10243,3 +10243,15 @@ def test_dataframe_init_columns_named_index(): pdf = pd.DataFrame(data, columns=columns) assert_eq(gdf, pdf) + + +def test_dataframe_constructor_unbounded_sequence(): + class A: + def __getitem__(self, key): + return 1 + + with pytest.raises(TypeError): + cudf.DataFrame([A()]) + + with pytest.raises(TypeError): + cudf.DataFrame({"a": A()}) From 399efb960f689085bf671f6fa62916b1020e3b30 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 2 Aug 2023 23:50:47 -0700 Subject: [PATCH 3/5] Fix for Parquet writer when requested pages per row is smaller 
than fragment size (#13806) #12685 introduced a bug in page calculation. If the `max_page_size_rows` parameter is set smaller than the page fragment size, the writer will produce a spurious empty page. This PR fixes this by only checking the fragment size if there are already rows in the page, and then returns the old check for number of rows exceeding the page limit. Interestingly, libcudf can read these files with empty pages just fine, but parquet-mr cannot. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/13806 --- cpp/src/io/parquet/page_enc.cu | 12 ++++++++--- cpp/tests/io/parquet_test.cpp | 38 ++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 05f8bba7477..190f70d0747 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -432,9 +432,15 @@ __global__ void __launch_bounds__(128) max_RLE_page_size(col_g.num_def_level_bits(), num_vals) + max_RLE_page_size(col_g.num_rep_level_bits(), num_vals)); - if (num_rows >= ck_g.num_rows || - (values_in_page > 0 && (page_size + fragment_data_size > this_max_page_size)) || - rows_in_page + frag_g.num_rows > max_page_size_rows) { + // checks to see when we need to close the current page and start a new one + auto const is_last_chunk = num_rows >= ck_g.num_rows; + auto const is_page_bytes_exceeded = page_size + fragment_data_size > this_max_page_size; + auto const is_page_rows_exceeded = rows_in_page + frag_g.num_rows > max_page_size_rows; + // only check for limit overflow if there's already at least one fragment for this page + auto const is_page_too_big = + values_in_page > 0 && (is_page_bytes_exceeded || is_page_rows_exceeded); + + if (is_last_chunk || is_page_too_big) { if (ck_g.use_dictionary) { // Additional byte to store entry bit 
width page_size = 1 + max_RLE_page_size(ck_g.dict_rle_bits, values_in_page); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 4e28f536728..a5054daed19 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3709,6 +3709,44 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } +TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) +{ + constexpr auto rows_per_page = 1'000; + constexpr auto fragment_size = 5'000; + constexpr auto num_rows = 3 * rows_per_page; + const std::string s1(32, 'a'); + auto col0_elements = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s1; }); + auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + num_rows); + + auto const expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("CheckPageRowsTooSmall.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .max_page_fragment_size(fragment_size) + .max_page_size_rows(rows_per_page); + cudf::io::write_parquet(out_opts); + + // check that file is written correctly when rows/page < fragment size + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::FileMetaData fmd; + + read_footer(source, &fmd); + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() == 1); + auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; + ASSERT_TRUE(first_chunk.data_page_offset > 0); + + // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded + // version should be smaller than size of the struct. 
+ auto const ph = read_page_header( + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + + // there should be only one page since the fragment size is larger than rows_per_page + EXPECT_EQ(ph.data_page_header.num_values, num_rows); +} + TEST_F(ParquetWriterTest, Decimal128Stats) { // check that decimal128 min and max statistics are written in network byte order From 9c559c94fcaa1525d7c95faf94e5486fcd992ef1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 3 Aug 2023 08:22:51 -0400 Subject: [PATCH 4/5] Remove the libcudf cudf::offset_type type (#13788) Replace all occurrences of `cudf::offset_type` with `cudf::size_type` This helps clear up code where sizes are computed and then converted to offsets in-place. Also, reference #13733 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - https://github.com/brandon-b-miller - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/13788 --- cpp/benchmarks/copying/contiguous_split.cu | 2 +- cpp/benchmarks/lists/copying/scatter_lists.cu | 12 ++--- cpp/examples/strings/custom_prealloc.cu | 2 +- .../cudf/detail/sizes_to_offsets_iterator.cuh | 4 +- cpp/include/cudf/lists/detail/scatter.cuh | 8 ++-- cpp/include/cudf/lists/lists_column_view.hpp | 6 +-- .../cudf/strings/strings_column_view.hpp | 6 +-- .../cudf/tdigest/tdigest_column_view.hpp | 4 +- cpp/include/cudf/types.hpp | 1 - cpp/include/cudf_test/column_utilities.hpp | 4 +- cpp/include/cudf_test/column_wrapper.hpp | 2 +- cpp/src/copying/concatenate.cu | 6 +-- cpp/src/copying/contiguous_split.cu | 15 +++---- cpp/src/groupby/groupby.cu | 2 +- cpp/src/groupby/sort/group_collect.cu | 4 +- cpp/src/groupby/sort/group_merge_lists.cu | 6 +-- cpp/src/io/csv/writer_impl.cu | 8 ++-- cpp/src/io/json/json_column.cu | 4 +- cpp/src/io/json/write_json.cu | 12 ++--- 
cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- cpp/src/io/statistics/statistics.cuh | 2 +- .../combine/concatenate_list_elements.cu | 12 +++-- cpp/src/lists/combine/concatenate_rows.cu | 19 ++++---- cpp/src/lists/copying/scatter_helper.cu | 12 ++--- cpp/src/lists/interleave_columns.cu | 24 +++++----- cpp/src/lists/lists_column_factories.cu | 8 ++-- cpp/src/lists/reverse.cu | 2 +- cpp/src/lists/sequences.cu | 10 ++--- .../stream_compaction/apply_boolean_mask.cu | 10 ++--- cpp/src/lists/utilities.cu | 10 ++--- cpp/src/quantiles/tdigest/tdigest.cu | 16 +++---- .../quantiles/tdigest/tdigest_aggregation.cu | 44 +++++++++---------- cpp/src/rolling/detail/nth_element.cuh | 4 +- cpp/src/rolling/detail/rolling.cuh | 2 +- .../rolling/detail/rolling_collect_list.cu | 4 +- cpp/src/rolling/grouped_rolling.cu | 4 +- cpp/src/strings/capitalize.cu | 12 ++--- cpp/src/strings/combine/concatenate.cu | 6 +-- cpp/src/strings/combine/join_list_elements.cu | 6 +-- cpp/src/strings/convert/convert_booleans.cu | 2 +- cpp/src/strings/convert/convert_durations.cu | 2 +- .../strings/convert/convert_fixed_point.cu | 2 +- cpp/src/strings/convert/convert_hex.cu | 4 +- cpp/src/strings/convert/convert_lists.cu | 4 +- cpp/src/strings/convert/convert_urls.cu | 19 ++++---- cpp/src/strings/copying/concatenate.cu | 4 +- cpp/src/strings/copying/shift.cu | 18 ++++---- cpp/src/strings/extract/extract_all.cu | 8 ++-- cpp/src/strings/json/json_path.cu | 16 +++---- cpp/src/strings/padding.cu | 4 +- cpp/src/strings/repeat_strings.cu | 10 ++--- cpp/src/strings/reverse.cu | 6 +-- cpp/src/strings/search/find_multiple.cu | 6 +-- cpp/src/strings/search/findall.cu | 6 +-- cpp/src/strings/split/split_re.cu | 12 ++--- cpp/src/strings/strings_column_view.cpp | 4 +- cpp/src/text/subword/bpe_tokenizer.cu | 20 ++++----- cpp/src/text/subword/load_merges_file.cu | 2 +- cpp/src/transform/row_bit_count.cu | 12 ++--- cpp/tests/column/factories_test.cpp | 12 ++--- cpp/tests/copying/concatenate_tests.cpp | 8 ++-- 
.../copying/copy_if_else_nested_tests.cpp | 2 +- cpp/tests/copying/gather_struct_tests.cpp | 4 +- cpp/tests/copying/get_value_tests.cpp | 20 ++++----- .../copying/scatter_list_scalar_tests.cpp | 8 ++-- cpp/tests/copying/split_tests.cpp | 6 +-- cpp/tests/groupby/collect_list_tests.cpp | 8 ++-- cpp/tests/io/parquet_test.cpp | 12 ++--- cpp/tests/lists/extract_tests.cpp | 6 +-- .../quantiles/percentile_approx_test.cpp | 4 +- cpp/tests/reductions/tdigest_tests.cu | 6 +-- cpp/tests/strings/array_tests.cpp | 2 +- cpp/tests/strings/contains_tests.cpp | 6 +-- cpp/tests/strings/factories_test.cu | 8 ++-- cpp/tests/transform/row_bit_count_test.cu | 43 +++++++++--------- cpp/tests/utilities/column_utilities.cu | 6 +-- cpp/tests/utilities/tdigest_utilities.cu | 6 +-- .../column_utilities_tests.cpp | 2 +- java/src/main/native/src/ColumnViewJni.cu | 6 +-- java/src/main/native/src/row_conversion.cu | 6 +-- python/cudf/cudf/_lib/cpp/types.pxd | 1 - python/cudf/cudf/_lib/pylibcudf/column.pxd | 4 +- python/cudf/cudf/_lib/pylibcudf/column.pyx | 4 +- 83 files changed, 325 insertions(+), 345 deletions(-) diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index cad5a7c7b58..910fc689c0b 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -151,7 +151,7 @@ void BM_contiguous_split_strings(benchmark::State& state, ContiguousSplitImpl& i } int64_t const total_bytes = - total_desired_bytes + ((num_rows + 1) * sizeof(cudf::offset_type)) + + total_desired_bytes + ((num_rows + 1) * sizeof(cudf::size_type)) + (include_validity ? 
(max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) : 0); diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index 85d730b94fb..dbc3234dabf 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -62,26 +62,26 @@ void BM_lists_scatter(::benchmark::State& state) target_base_col->mutable_view().end()); auto source_offsets = - make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, num_rows + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto target_offsets = - make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, num_rows + 1, cudf::mask_state::UNALLOCATED, stream, mr); thrust::sequence(rmm::exec_policy(stream), - source_offsets->mutable_view().begin(), - source_offsets->mutable_view().end(), + source_offsets->mutable_view().begin(), + source_offsets->mutable_view().end(), 0, num_elements_per_row); thrust::sequence(rmm::exec_policy(stream), - target_offsets->mutable_view().begin(), - target_offsets->mutable_view().end(), + target_offsets->mutable_view().begin(), + target_offsets->mutable_view().end(), 0, num_elements_per_row); diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index a956550f505..0af4c47e947 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -41,7 +41,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, cudf::column_device_view const d_visibilities, cudf::string_view redaction, char* working_memory, - cudf::offset_type const* d_offsets, + cudf::size_type const* d_offsets, cudf::string_view* d_output) { // The row index is resolved from the CUDA thread/block objects diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh 
b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 532d66c8483..155b1ce5691 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -303,9 +303,9 @@ std::pair, size_type> make_offsets_child_column( { auto count = static_cast(std::distance(begin, end)); auto offsets_column = make_numeric_column( - data_type{type_to_id()}, count + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, count + 1, mask_state::UNALLOCATED, stream, mr); auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data(); + auto d_offsets = offsets_view.template data(); // The number of offsets is count+1 so to build the offsets from the sizes // using exclusive-scan technically requires count+1 input values even though diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 18cb147d1e4..f04b2fda2bf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -240,11 +240,11 @@ std::unique_ptr scatter(scalar const& slr, rmm::device_buffer null_mask = slr_valid ? 
cudf::detail::create_null_mask(1, mask_state::UNALLOCATED, stream, mr) : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); - auto offset_column = make_numeric_column( - data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); + auto offset_column = + make_numeric_column(data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); thrust::sequence(rmm::exec_policy_nosync(stream), - offset_column->mutable_view().begin(), - offset_column->mutable_view().end(), + offset_column->mutable_view().begin(), + offset_column->mutable_view().end(), 0, lv->view().size()); auto wrapped = column_view(data_type{type_id::LIST}, diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index 60bdc654af6..8c6368eacb6 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -71,9 +71,7 @@ class lists_column_view : private column_view { using column_view::null_mask; using column_view::offset; using column_view::size; - static_assert(std::is_same_v, - "offset_type is expected to be the same as size_type."); - using offset_iterator = offset_type const*; ///< Iterator type for offsets + using offset_iterator = size_type const*; ///< Iterator type for offsets /** * @brief Returns the parent column. @@ -119,7 +117,7 @@ class lists_column_view : private column_view { */ [[nodiscard]] offset_iterator offsets_begin() const noexcept { - return offsets().begin() + offset(); + return offsets().begin() + offset(); } /** diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index e617dbde024..f1aa8e49f00 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,8 +67,8 @@ class strings_column_view : private column_view { using column_view::offset; using column_view::size; - using offset_iterator = offset_type const*; ///< offsets iterator type - using chars_iterator = char const*; ///< character iterator type + using offset_iterator = size_type const*; ///< offsets iterator type + using chars_iterator = char const*; ///< character iterator type /** * @brief Returns the parent column. diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.hpp b/cpp/include/cudf/tdigest/tdigest_column_view.hpp index 89903c24c21..f2f493cbbe4 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.hpp +++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -67,9 +67,7 @@ class tdigest_column_view : private column_view { tdigest_column_view& operator=(tdigest_column_view&&) = default; using column_view::size; - static_assert(std::is_same_v, - "offset_type is expected to be the same as size_type."); - using offset_iterator = offset_type const*; ///< Iterator over offsets + using offset_iterator = size_type const*; ///< Iterator over offsets // mean and weight column indices within tdigest inner struct columns static constexpr size_type mean_column_index{0}; ///< Mean column index diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 6991a90b31b..addab160b6e 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -80,7 +80,6 @@ class mutable_table_view; using size_type = int32_t; ///< Row index type for columns and tables using bitmask_type = uint32_t; ///< Bitmask type stored as 32-bit unsigned integer using valid_type = uint8_t; ///< Valid type in host memory -using offset_type = int32_t; ///< Offset type for column offsets using thread_index_type = int64_t; ///< Thread index type in kernels /** diff --git a/cpp/include/cudf_test/column_utilities.hpp 
b/cpp/include/cudf_test/column_utilities.hpp index f288c30e313..059bd10eae1 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -261,8 +261,8 @@ inline std::pair, std::vector> to cudf::device_span(scv.chars().data(), scv.chars().size()), cudf::get_default_stream()); auto const h_offsets = cudf::detail::make_std_vector_sync( - cudf::device_span( - scv.offsets().data() + scv.offset(), scv.size() + 1), + cudf::device_span(scv.offsets().data() + scv.offset(), + scv.size() + 1), cudf::get_default_stream()); // build std::string vector from chars and offsets diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 9c1afc64550..1e311322de1 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1507,7 +1507,7 @@ class lists_column_wrapper : public detail::column_wrapper { */ static lists_column_wrapper make_one_empty_row_column(bool valid = true) { - cudf::test::fixed_width_column_wrapper offsets{0, 0}; + cudf::test::fixed_width_column_wrapper offsets{0, 0}; cudf::test::fixed_width_column_wrapper values{}; return lists_column_wrapper( 1, diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 170eccbcb09..a53ec295512 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -408,15 +408,15 @@ void traverse_children::operator()(host_span 0 - ? cudf::detail::get_value( + ? cudf::detail::get_value( scv.offsets(), scv.offset() + scv.size(), stream) - - cudf::detail::get_value(scv.offsets(), scv.offset(), stream) + cudf::detail::get_value(scv.offsets(), scv.offset(), stream) // if the offset() is 0, it can still be sliced to a shorter length. in this case // we only need to read a single offset. otherwise just return the full length // (chars_size()) : scv.size() + 1 == scv.offsets().size() ? 
scv.chars_size() - : cudf::detail::get_value(scv.offsets(), scv.size(), stream)); + : cudf::detail::get_value(scv.offsets(), scv.size(), stream)); }); CUDF_EXPECTS(total_char_count <= static_cast(std::numeric_limits::max()), "Total number of concatenated chars exceeds the column size limit", diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 6c61af0050d..e1a55ec5419 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -939,7 +939,7 @@ struct batch_byte_size_function { * @brief Get the input buffer index given the output buffer index. */ struct out_to_in_index_function { - offset_type const* const batch_offsets; + size_type const* const batch_offsets; int const num_bufs; __device__ int operator()(size_type i) const { @@ -1312,7 +1312,7 @@ std::unique_ptr compute_splits( */ struct chunk_iteration_state { chunk_iteration_state(rmm::device_uvector _d_batched_dst_buf_info, - rmm::device_uvector _d_batch_offsets, + rmm::device_uvector _d_batch_offsets, std::vector&& _h_num_buffs_per_iteration, std::vector&& _h_size_of_buffs_per_iteration, std::size_t total_size) @@ -1375,11 +1375,10 @@ struct chunk_iteration_state { bool has_more_copies() const { return current_iteration < num_iterations; } rmm::device_uvector d_batched_dst_buf_info; ///< dst_buf_info per 1MB batch - rmm::device_uvector const - d_batch_offsets; ///< Offset within a batch per dst_buf_info - std::size_t const total_size; ///< The aggregate size of all iterations - int const num_iterations; ///< The total number of iterations - int current_iteration; ///< Marks the current iteration being worked on + rmm::device_uvector const d_batch_offsets; ///< Offset within a batch per dst_buf_info + std::size_t const total_size; ///< The aggregate size of all iterations + int const num_iterations; ///< The total number of iterations + int current_iteration; ///< Marks the current iteration being worked on private: std::size_t 
starting_batch; ///< Starting batch index for the current iteration @@ -1398,7 +1397,7 @@ std::unique_ptr chunk_iteration_state::create( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* temp_mr) { - rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, temp_mr); + rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, temp_mr); auto const buf_count_iter = cudf::detail::make_counting_transform_iterator( 0, [num_bufs, num_batches = num_batches_func{batches.begin()}] __device__(size_type i) { diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index eddc748df7c..ce1fc71968f 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -107,7 +107,7 @@ struct empty_column_constructor { if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) { return make_lists_column( - 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); + 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); } if constexpr (k == aggregation::Kind::RANK) { diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index c61a998a40c..f95ad72f453 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -96,12 +96,12 @@ std::unique_ptr group_collect(column_view const& values, auto [child_column, offsets_column] = [null_handling, num_groups, &values, &group_offsets, stream, mr] { auto offsets_column = make_numeric_column( - data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); + data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); thrust::copy(rmm::exec_policy(stream), group_offsets.begin(), group_offsets.end(), - offsets_column->mutable_view().template begin()); + offsets_column->mutable_view().template begin()); // If column of grouped values contains null elements, and null_policy == EXCLUDE, // those elements must be filtered out, and offsets recomputed. 
diff --git a/cpp/src/groupby/sort/group_merge_lists.cu b/cpp/src/groupby/sort/group_merge_lists.cu index 3043d107635..2c72128dbfb 100644 --- a/cpp/src/groupby/sort/group_merge_lists.cu +++ b/cpp/src/groupby/sort/group_merge_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,7 +38,7 @@ std::unique_ptr group_merge_lists(column_view const& values, "Input to `group_merge_lists` must be a non-nullable lists column."); auto offsets_column = make_numeric_column( - data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); + data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); // Generate offsets of the output lists column by gathering from the provided group offsets and // the input list offsets. @@ -54,7 +54,7 @@ std::unique_ptr group_merge_lists(column_view const& values, group_offsets.begin(), group_offsets.end(), lists_column_view(values).offsets_begin(), - offsets_column->mutable_view().template begin()); + offsets_column->mutable_view().template begin()); // The child column of the output lists column is just copied from the input column. 
auto child_column = diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 12e9fccdee7..8c586306ad5 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -75,10 +75,10 @@ namespace { struct escape_strings_fn { column_device_view const d_column; string_view const d_delimiter; // check for column delimiter - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; - __device__ void write_char(char_utf8 chr, char*& d_buffer, offset_type& bytes) + __device__ void write_char(char_utf8 chr, char*& d_buffer, size_type& bytes) { if (d_buffer) d_buffer += cudf::strings::detail::from_char_utf8(chr, d_buffer); @@ -105,8 +105,8 @@ struct escape_strings_fn { return chr == quote || chr == new_line || chr == d_delimiter[0]; }); - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + size_type bytes = 0; if (quote_row) write_char(quote, d_buffer, bytes); for (auto chr : d_str) { diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3a79d832d06..b18637c86d7 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -363,8 +363,8 @@ std::vector copy_strings_to_host(device_span input, cudf::device_span(scv.chars().data(), scv.chars().size()), cudf::get_default_stream()); auto const h_offsets = cudf::detail::make_std_vector_sync( - cudf::device_span( - scv.offsets().data() + scv.offset(), scv.size() + 1), + cudf::device_span(scv.offsets().data() + scv.offset(), + scv.size() + 1), cudf::get_default_stream()); // build std::string vector from chars and offsets diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 9ecf77a798a..6ee16f8866e 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -75,10 +75,10 @@ namespace { struct escape_strings_fn { column_device_view const d_column; bool const append_colon{false}; - 
offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; - __device__ void write_char(char_utf8 chr, char*& d_buffer, offset_type& bytes) + __device__ void write_char(char_utf8 chr, char*& d_buffer, size_type& bytes) { if (d_buffer) d_buffer += cudf::strings::detail::from_char_utf8(chr, d_buffer); @@ -91,7 +91,7 @@ struct escape_strings_fn { return nibble < 10 ? '0' + nibble : 'a' + nibble - 10; } - __device__ void write_utf8_codepoint(uint16_t codepoint, char*& d_buffer, offset_type& bytes) + __device__ void write_utf8_codepoint(uint16_t codepoint, char*& d_buffer, size_type& bytes) { if (d_buffer) { d_buffer[0] = '\\'; @@ -106,7 +106,7 @@ struct escape_strings_fn { } } - __device__ void write_utf16_codepoint(uint32_t codepoint, char*& d_buffer, offset_type& bytes) + __device__ void write_utf16_codepoint(uint32_t codepoint, char*& d_buffer, size_type& bytes) { constexpr uint16_t UTF16_HIGH_SURROGATE_BEGIN = 0xD800; constexpr uint16_t UTF16_LOW_SURROGATE_BEGIN = 0xDC00; @@ -130,8 +130,8 @@ struct escape_strings_fn { constexpr char_utf8 const quote = '\"'; // wrap quotes bool constexpr quote_row = true; - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + size_type bytes = 0; if (quote_row) write_char(quote, d_buffer, bytes); for (auto utf8_char : d_str) { diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8c3bdabe6b4..c7e3de03312 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1016,7 +1016,7 @@ struct row_size_functor { template <> __device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) { - auto const offset_size = sizeof(offset_type); + auto const offset_size = sizeof(size_type); // NOTE: Adding the + 1 offset here isn't strictly correct. 
There will only be 1 extra offset // for the entire column, whereas this is adding an extra offset per page. So we will get a // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better @@ -1036,7 +1036,7 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo { // only returns the size of offsets and validity. the size of the actual string chars // is tracked separately. - auto const offset_size = sizeof(offset_type); + auto const offset_size = sizeof(size_type); // see note about offsets in the list_view template. return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); } diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index 89b26fd731a..805ca43553e 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -132,7 +132,7 @@ __device__ T get_element(column_device_view const& col, uint32_t row) { using et = typename T::element_type; size_type const index = row + col.offset(); // account for this view's _offset - auto const* d_offsets = col.child(lists_column_view::offsets_column_index).data(); + auto const* d_offsets = col.child(lists_column_view::offsets_column_index).data(); auto const* d_data = col.child(lists_column_view::child_column_index).data(); auto const offset = d_offsets[index]; return T(d_data + offset, d_offsets[index + 1] - offset); diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fc3d8a9728a..3b00d7bd26e 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -53,11 +53,10 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, { auto const num_rows = input.size(); - static_assert(std::is_same_v && std::is_same_v); auto out_offsets = make_numeric_column( - data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + 
data_type{type_to_id()}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_out_offsets = out_offsets->mutable_view().template begin(); auto const d_row_offsets = lists_column_view(input).offsets_begin(); auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); @@ -121,13 +120,12 @@ generate_list_offsets_and_validities(column_view const& input, { auto const num_rows = input.size(); - static_assert(std::is_same_v && std::is_same_v); auto out_offsets = make_numeric_column( - data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); auto const lists_of_lists_dv_ptr = column_device_view::create(input, stream); auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child(), stream); - auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_out_offsets = out_offsets->mutable_view().template begin(); auto const d_row_offsets = lists_column_view(input).offsets_begin(); auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); @@ -198,7 +196,7 @@ std::unique_ptr gather_list_entries(column_view const& input, d_list_offsets, d_indices = gather_map.begin(), d_out_list_offsets = - output_list_offsets.template begin()] __device__(size_type const idx) { + output_list_offsets.template begin()] __device__(size_type const idx) { // The output row has been identified as a null/empty list during list size computation. 
if (d_out_list_offsets[idx + 1] == d_out_list_offsets[idx]) { return; } diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 993d5e3fc78..658538b0195 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -77,11 +77,8 @@ generate_regrouped_offsets_and_null_mask(table_device_view const& input, rmm::mr::device_memory_resource* mr) { // outgoing offsets. - auto offsets = cudf::make_fixed_width_column(data_type{type_to_id()}, - input.num_rows() + 1, - mask_state::UNALLOCATED, - stream, - mr); + auto offsets = cudf::make_fixed_width_column( + data_type{type_to_id()}, input.num_rows() + 1, mask_state::UNALLOCATED, stream, mr); auto keys = thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), [num_columns = input.num_columns()] __device__( @@ -91,7 +88,7 @@ generate_regrouped_offsets_and_null_mask(table_device_view const& input, auto values = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), [input, row_null_counts = row_null_counts.data(), null_policy] __device__( - size_t i) -> offset_type { + size_t i) -> size_type { auto const col_index = i % input.num_columns(); auto const row_index = i / input.num_columns(); @@ -105,7 +102,7 @@ generate_regrouped_offsets_and_null_mask(table_device_view const& input, } } auto offsets = - input.column(col_index).child(lists_column_view::offsets_column_index).data() + + input.column(col_index).child(lists_column_view::offsets_column_index).data() + input.column(col_index).offset(); return offsets[row_index + 1] - offsets[row_index]; }); @@ -115,13 +112,13 @@ generate_regrouped_offsets_and_null_mask(table_device_view const& input, keys + (input.num_rows() * input.num_columns()), values, thrust::make_discard_iterator(), - offsets->mutable_view().begin()); + offsets->mutable_view().begin()); // convert to offsets thrust::exclusive_scan(rmm::exec_policy(stream), - offsets->view().begin(), - 
offsets->view().begin() + input.num_rows() + 1, - offsets->mutable_view().begin(), + offsets->view().begin(), + offsets->view().begin() + input.num_rows() + 1, + offsets->mutable_view().begin(), 0); // generate appropriate null mask diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index 2cb0671c2dc..ca5358798c0 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -189,7 +189,7 @@ struct list_child_constructor { thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_column->size()), child_column->mutable_view().begin(), - [offset_begin = list_offsets.begin(), + [offset_begin = list_offsets.begin(), offset_size = list_offsets.size(), d_list_vector = list_vector.begin(), source_lists, @@ -241,7 +241,7 @@ struct list_child_constructor { thrust::make_counting_iterator(0), thrust::make_counting_iterator(string_views.size()), string_views.begin(), - [offset_begin = list_offsets.begin(), + [offset_begin = list_offsets.begin(), offset_size = list_offsets.size(), d_list_vector = list_vector.begin(), source_lists, @@ -255,7 +255,7 @@ struct list_child_constructor { auto row_index = d_list_vector[list_index].row_index(); auto actual_list_row = d_list_vector[list_index].bind_to_column(source_lists, target_lists); auto lists_column = actual_list_row.get_column(); - auto lists_offsets_ptr = lists_column.offsets().template data(); + auto lists_offsets_ptr = lists_column.offsets().template data(); auto child_strings_column = lists_column.child(); auto strings_offset = lists_offsets_ptr[row_index] + intra_index; @@ -308,7 +308,7 @@ struct list_child_constructor { thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_list_views.size()), child_list_views.begin(), - [offset_begin = list_offsets.begin(), + [offset_begin = list_offsets.begin(), offset_size = list_offsets.size(), d_list_vector = list_vector.begin(), source_lists, @@ -323,10 +323,10 @@ struct 
list_child_constructor { auto actual_list_row = d_list_vector[list_index].bind_to_column(source_lists, target_lists); auto lists_column = actual_list_row.get_column(); auto child_lists_column = lists_column.child(); - auto lists_offsets_ptr = lists_column.offsets().template data(); + auto lists_offsets_ptr = lists_column.offsets().template data(); auto child_lists_offsets_ptr = child_lists_column.child(lists_column_view::offsets_column_index) - .template data(); + .template data(); auto child_row_index = lists_offsets_ptr[row_index] + intra_index; auto size = child_lists_offsets_ptr[child_row_index + 1] - child_lists_offsets_ptr[child_row_index]; diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index f76aaadaf7b..e80d63939ea 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -58,11 +58,9 @@ generate_list_offsets_and_validities(table_view const& input, auto const table_dv_ptr = table_device_view::create(input, stream); // The output offsets column. - static_assert(sizeof(offset_type) == sizeof(int32_t)); - static_assert(sizeof(size_type) == sizeof(int32_t)); auto list_offsets = make_numeric_column( - data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_offsets = list_offsets->mutable_view().template begin(); + data_type{type_to_id()}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); + auto const d_offsets = list_offsets->mutable_view().template begin(); // The array of int8_t to store validities for list elements. auto validities = rmm::device_uvector(has_null_mask ? 
num_output_lists : 0, stream); @@ -82,7 +80,7 @@ generate_list_offsets_and_validities(table_view const& input, auto const& lists_col = table_dv.column(col_id); if (has_null_mask) { d_validities[idx] = static_cast(lists_col.is_valid(list_id)); } auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + + lists_col.child(lists_column_view::offsets_column_index).template data() + lists_col.offset(); return list_offsets[list_id + 1] - list_offsets[list_id]; }); @@ -139,13 +137,13 @@ struct compute_string_sizes_and_interleave_lists_fn { table_device_view const table_dv; // Store list offsets of the output lists column. - offset_type const* const dst_list_offsets; + size_type const* const dst_list_offsets; // Flag to specify whether to compute string validities. bool const has_null_mask; // Store offsets of the strings. - offset_type* d_offsets{nullptr}; + size_type* d_offsets{nullptr}; // If d_chars == nullptr: only compute sizes and validities of the output strings. // If d_chars != nullptr: only interleave lists of strings. @@ -164,11 +162,11 @@ struct compute_string_sizes_and_interleave_lists_fn { if (has_null_mask and lists_col.is_null(list_id)) { return; } auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + + lists_col.child(lists_column_view::offsets_column_index).template data() + lists_col.offset(); auto const& str_col = lists_col.child(lists_column_view::child_column_index); auto const str_offsets = - str_col.child(strings_column_view::offsets_column_index).template data(); + str_col.child(strings_column_view::offsets_column_index).template data(); // The range of indices of the strings within the source list. 
auto const start_str_idx = list_offsets[list_id]; @@ -224,7 +222,7 @@ struct interleave_list_entries_impl(), data_has_null_mask}; + *table_dv_ptr, output_list_offsets.template begin(), data_has_null_mask}; auto validities = rmm::device_uvector(data_has_null_mask ? num_output_entries : 0, stream); @@ -276,14 +274,14 @@ struct interleave_list_entries_impl( [num_cols, table_dv = *table_dv_ptr, d_validities = validities.begin(), - d_offsets = output_list_offsets.template begin(), + d_offsets = output_list_offsets.template begin(), d_output = output_dv_ptr->template begin(), data_has_null_mask] __device__(size_type const idx) { auto const col_id = idx % num_cols; auto const list_id = idx / num_cols; auto const& lists_col = table_dv.column(col_id); auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + + lists_col.child(lists_column_view::offsets_column_index).template data() + lists_col.offset(); auto const& data_col = lists_col.child(lists_column_view::child_column_index); @@ -384,7 +382,7 @@ std::unique_ptr interleave_columns(table_view const& input, // specialized for different types. 
auto const num_output_lists = input.num_rows() * input.num_columns(); auto const num_output_entries = - cudf::detail::get_value(offsets_view, num_output_lists, stream); + cudf::detail::get_value(offsets_view, num_output_lists, stream); auto const data_has_null_mask = std::any_of(std::cbegin(input), std::cend(input), [](auto const& col) { return col.child(lists_column_view::child_column_index).nullable(); diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index 754735f5a5b..7f82d32d327 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -39,7 +39,7 @@ std::unique_ptr make_lists_column_from_scalar(list_scalar const& v { if (size == 0) { return make_lists_column(0, - make_empty_column(type_to_id()), + make_empty_column(type_to_id()), empty_like(value.view()), 0, cudf::detail::create_null_mask(0, mask_state::UNALLOCATED, stream, mr), @@ -50,7 +50,7 @@ std::unique_ptr make_lists_column_from_scalar(list_scalar const& v // Handcraft a 1-row column auto offsets = make_numeric_column( - data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr_final); + data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr_final); auto m_offsets = offsets->mutable_view(); thrust::sequence(rmm::exec_policy(stream), m_offsets.begin(), @@ -90,7 +90,7 @@ std::unique_ptr make_empty_lists_column(data_type child_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto offsets = make_empty_column(data_type(type_to_id())); + auto offsets = make_empty_column(data_type(type_to_id())); auto child = make_empty_column(child_type); return make_lists_column( 0, std::move(offsets), std::move(child), 0, rmm::device_buffer{}, stream, mr); @@ -103,7 +103,7 @@ std::unique_ptr make_all_nulls_lists_column(size_type size, { auto offsets = [&] { auto offsets_buff = - cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); + 
cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); return std::make_unique(std::move(offsets_buff), rmm::device_buffer{}, 0); }(); auto child = make_empty_column(child_type); diff --git a/cpp/src/lists/reverse.cu b/cpp/src/lists/reverse.cu index d606f11bdb9..a2af85b5dad 100644 --- a/cpp/src/lists/reverse.cu +++ b/cpp/src/lists/reverse.cu @@ -56,7 +56,7 @@ std::unique_ptr reverse(lists_column_view const& input, thrust::for_each_n(rmm::exec_policy(stream), thrust::counting_iterator(0), child.size(), - [list_offsets = out_offsets->view().begin(), + [list_offsets = out_offsets->view().begin(), list_indices = labels->view().begin(), gather_map = gather_map.begin()] __device__(auto const idx) { auto const list_idx = list_indices[idx]; diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index 8e1e6c37a95..aaee5608cc3 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -47,7 +47,7 @@ struct tabulator { T const* const starts; T const* const steps; - offset_type const* const offsets; + size_type const* const offsets; template static std::enable_if_t(), T> __device__ multiply(U x, size_type times) @@ -86,7 +86,7 @@ struct sequences_dispatcher { size_type n_elements, column_view const& starts, std::optional const& steps, - offset_type const* offsets, + size_type const* offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -106,7 +106,7 @@ struct sequences_functor()>> { size_type n_elements, column_view const& starts, std::optional const& steps, - offset_type const* offsets, + size_type const* offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -154,8 +154,8 @@ std::unique_ptr sequences(column_view const& starts, // Generate list offsets for the output. 
auto list_offsets = make_numeric_column( - data_type(type_to_id()), n_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const offsets_begin = list_offsets->mutable_view().template begin(); + data_type(type_to_id()), n_lists + 1, mask_state::UNALLOCATED, stream, mr); + auto const offsets_begin = list_offsets->mutable_view().template begin(); auto const sizes_input_it = cudf::detail::indexalator_factory::make_input_iterator(sizes); // First copy the sizes since the exclusive_scan tries to read (n_lists+1) values thrust::copy_n(rmm::exec_policy(stream), sizes_input_it, sizes.size(), offsets_begin); diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu index 0aaa8356304..ad43fbd5b00 100644 --- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu @@ -74,7 +74,7 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, stream, rmm::mr::get_current_device_resource()); auto const d_sizes = column_device_view::create(*sizes, stream); - auto const sizes_begin = cudf::detail::make_null_replacement_iterator(*d_sizes, offset_type{0}); + auto const sizes_begin = cudf::detail::make_null_replacement_iterator(*d_sizes, size_type{0}); auto const sizes_end = sizes_begin + sizes->size(); auto output_offsets = cudf::make_numeric_column( offset_data_type, num_rows + 1, mask_state::UNALLOCATED, stream, mr); @@ -82,12 +82,10 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, // Could have attempted an exclusive_scan(), but it would not compute the last entry. // Instead, inclusive_scan(), followed by writing `0` to the head of the offsets column. 
- thrust::inclusive_scan(rmm::exec_policy(stream), - sizes_begin, - sizes_end, - output_offsets_view.begin() + 1); + thrust::inclusive_scan( + rmm::exec_policy(stream), sizes_begin, sizes_end, output_offsets_view.begin() + 1); CUDF_CUDA_TRY(cudaMemsetAsync( - output_offsets_view.begin(), 0, sizeof(offset_type), stream.value())); + output_offsets_view.begin(), 0, sizeof(size_type), stream.value())); return output_offsets; }; diff --git a/cpp/src/lists/utilities.cu b/cpp/src/lists/utilities.cu index 50a41c51f76..2c4966c969e 100644 --- a/cpp/src/lists/utilities.cu +++ b/cpp/src/lists/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,10 +42,10 @@ std::unique_ptr reconstruct_offsets(column_view const& labels, { auto out_offsets = make_numeric_column( - data_type{type_to_id()}, n_lists + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, n_lists + 1, mask_state::UNALLOCATED, stream, mr); auto const labels_begin = labels.template begin(); - auto const offsets_begin = out_offsets->mutable_view().template begin(); + auto const offsets_begin = out_offsets->mutable_view().template begin(); cudf::detail::labels_to_offsets(labels_begin, labels_begin + labels.size(), offsets_begin, @@ -60,7 +60,7 @@ std::unique_ptr get_normalized_offsets(lists_column_view const& input, { if (input.is_empty()) { return empty_like(input.offsets()); } - auto out_offsets = make_numeric_column(data_type(type_to_id()), + auto out_offsets = make_numeric_column(data_type(type_to_id()), input.size() + 1, cudf::mask_state::UNALLOCATED, stream, @@ -68,7 +68,7 @@ std::unique_ptr get_normalized_offsets(lists_column_view const& input, thrust::transform(rmm::exec_policy(stream), input.offsets_begin(), input.offsets_end(), - out_offsets->mutable_view().begin(), + 
out_offsets->mutable_view().begin(), [d_offsets = input.offsets_begin()] __device__(auto const offset_val) { // The first offset value, used for zero-normalizing offsets. return offset_val - *d_offsets; diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 0c90b0af8d2..cfdb386ff64 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,7 @@ struct make_centroid { // kernel for computing percentiles on input tdigest (mean, weight) centroid data. template -__global__ void compute_percentiles_kernel(device_span tdigest_offsets, +__global__ void compute_percentiles_kernel(device_span tdigest_offsets, column_device_view percentiles, CentroidIter centroids_, double const* min_, @@ -199,8 +199,8 @@ std::unique_ptr compute_approx_percentiles(tdigest_column_view const& in rmm::mr::get_current_device_resource()); auto keys = cudf::detail::make_counting_transform_iterator( 0, - [offsets_begin = offsets.begin(), - offsets_end = offsets.end()] __device__(size_type i) { + [offsets_begin = offsets.begin(), + offsets_end = offsets.end()] __device__(size_type i) { return thrust::distance( offsets_begin, thrust::prev(thrust::upper_bound(thrust::seq, offsets_begin, offsets_end, i))); @@ -239,7 +239,7 @@ std::unique_ptr compute_approx_percentiles(tdigest_column_view const& in constexpr size_type block_size = 256; cudf::detail::grid_1d const grid(percentiles.size() * input.size(), block_size); compute_percentiles_kernel<<>>( - {offsets.begin(), static_cast(offsets.size())}, + {offsets.begin(), static_cast(offsets.size())}, *percentiles_cdv, centroids, tdv.min_begin(), @@ -294,8 +294,8 @@ std::unique_ptr 
make_empty_tdigest_column(rmm::cuda_stream_view stream, auto offsets = cudf::make_fixed_width_column( data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), - offsets->mutable_view().begin(), - offsets->mutable_view().end(), + offsets->mutable_view().begin(), + offsets->mutable_view().end(), 0); auto min_col = @@ -362,7 +362,7 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + input.size() + 1, - offsets->mutable_view().begin()); + offsets->mutable_view().begin()); if (percentiles.size() == 0 || all_empty_rows) { return cudf::make_lists_column( diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 35f2ce05bb6..2ce55e10fb1 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -127,7 +127,7 @@ struct merge_centroids { * nearest whole number <= it is floor(3.56) == 3. 
*/ struct nearest_value_scalar_weights_grouped { - offset_type const* group_offsets; + size_type const* group_offsets; thrust::pair operator() __device__(double next_limit, size_type group_index) const { @@ -167,8 +167,8 @@ struct nearest_value_scalar_weights { template struct nearest_value_centroid_weights { double const* cumulative_weights; - GroupOffsetsIter outer_offsets; // groups - offset_type const* inner_offsets; // tdigests within a group + GroupOffsetsIter outer_offsets; // groups + size_type const* inner_offsets; // tdigests within a group thrust::pair operator() __device__(double next_limit, size_type group_index) const { @@ -234,8 +234,8 @@ template struct cumulative_centroid_weight { double const* cumulative_weights; GroupLabelsIter group_labels; - GroupOffsetsIter outer_offsets; // groups - cudf::device_span inner_offsets; // tdigests with a group + GroupOffsetsIter outer_offsets; // groups + cudf::device_span inner_offsets; // tdigests with a group std::tuple operator() __device__(size_type value_index) const { @@ -257,7 +257,7 @@ struct cumulative_centroid_weight { // retrieve group info (total weight, size, start offset) of scalar inputs by group index. 
struct scalar_group_info_grouped { size_type const* group_valid_counts; - offset_type const* group_offsets; + size_type const* group_offsets; __device__ thrust::tuple operator()(size_type group_index) const { @@ -283,7 +283,7 @@ template struct centroid_group_info { double const* cumulative_weights; GroupOffsetsIter outer_offsets; - offset_type const* inner_offsets; + size_type const* inner_offsets; __device__ thrust::tuple operator()(size_type group_index) const { @@ -375,7 +375,7 @@ __global__ void generate_cluster_limits_kernel(int delta, CumulativeWeight cumulative_weight, double* group_cluster_wl, size_type* group_num_clusters, - offset_type const* group_cluster_offsets, + size_type const* group_cluster_offsets, bool has_nulls) { int const tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -544,12 +544,12 @@ generate_group_cluster_info(int delta, thrust::exclusive_scan(rmm::exec_policy(stream), cluster_size, cluster_size + num_groups + 1, - group_cluster_offsets->mutable_view().begin(), + group_cluster_offsets->mutable_view().begin(), 0); // total # of clusters - offset_type total_clusters = - cudf::detail::get_value(group_cluster_offsets->view(), num_groups, stream); + size_type total_clusters = + cudf::detail::get_value(group_cluster_offsets->view(), num_groups, stream); // fill in the actual cluster weight limits rmm::device_uvector group_cluster_wl(total_clusters, stream); @@ -561,7 +561,7 @@ generate_group_cluster_info(int delta, cumulative_weight, group_cluster_wl.begin(), group_num_clusters.begin(), - group_cluster_offsets->view().begin(), + group_cluster_offsets->view().begin(), has_nulls); return {std::move(group_cluster_wl), @@ -584,7 +584,7 @@ std::unique_ptr build_output_column(size_type num_rows, return weights[i] == 0; }; // whether or not this particular tdigest is a stub - auto is_stub_digest = [offsets = offsets->view().begin(), is_stub_weight] __device__( + auto is_stub_digest = [offsets = offsets->view().begin(), is_stub_weight] __device__( 
size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; }; size_type const num_stubs = [&]() { @@ -622,12 +622,12 @@ std::unique_ptr build_output_column(size_type num_rows, auto _weights = remove_stubs(*weights, num_stubs); // adjust offsets. - rmm::device_uvector sizes(num_rows, stream); + rmm::device_uvector sizes(num_rows, stream); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_rows, sizes.begin(), - [offsets = offsets->view().begin()] __device__(size_type i) { + [offsets = offsets->view().begin()] __device__(size_type i) { return offsets[i + 1] - offsets[i]; }); auto iter = cudf::detail::make_counting_transform_iterator( @@ -637,7 +637,7 @@ std::unique_ptr build_output_column(size_type num_rows, thrust::exclusive_scan(rmm::exec_policy(stream), iter, iter + num_rows + 1, - offsets->mutable_view().begin(), + offsets->mutable_view().begin(), 0); // assemble final column @@ -717,7 +717,7 @@ std::unique_ptr compute_tdigests(int delta, thrust::make_counting_iterator(0), [delta, group_cluster_wl = group_cluster_wl.data(), - group_cluster_offsets = group_cluster_offsets->view().begin(), + group_cluster_offsets = group_cluster_offsets->view().begin(), group_cumulative_weight] __device__(size_type value_index) -> size_type { // get group index, relative value index within the group and cumulative weight. 
[[maybe_unused]] auto [group_index, relative_value_index, cumulative_weight] = @@ -1018,10 +1018,10 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, // bring tdigest offsets back to the host auto tdigest_offsets = tdv.centroids().offsets(); - std::vector h_inner_offsets(tdigest_offsets.size()); + std::vector h_inner_offsets(tdigest_offsets.size()); cudaMemcpyAsync(h_inner_offsets.data(), - tdigest_offsets.begin(), - sizeof(offset_type) * tdigest_offsets.size(), + tdigest_offsets.begin(), + sizeof(size_type) * tdigest_offsets.size(), cudaMemcpyDefault, stream); @@ -1154,7 +1154,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, cumulative_weights->view().begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, false, stream, mr); @@ -1174,7 +1174,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, cumulative_weights->view().begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, std::move(merged_min_col), std::move(merged_max_col), group_cluster_wl, diff --git a/cpp/src/rolling/detail/nth_element.cuh b/cpp/src/rolling/detail/nth_element.cuh index c28d96e7793..bd3cbb39168 100644 --- a/cpp/src/rolling/detail/nth_element.cuh +++ b/cpp/src/rolling/detail/nth_element.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -157,7 +157,7 @@ std::unique_ptr nth_element(size_type n, gather_index_calculator{ n, input, preceding, following, min_periods, stream}); - auto gather_map = rmm::device_uvector(input.size(), stream); + auto gather_map = rmm::device_uvector(input.size(), stream); thrust::copy( rmm::exec_policy(stream), gather_iter, gather_iter + input.size(), gather_map.begin()); diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 84352e36550..3b6d53f43c4 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -454,7 +454,7 @@ struct agg_specific_empty_output { if constexpr (op == aggregation::COLLECT_LIST) { return cudf::make_lists_column( - 0, make_empty_column(type_to_id()), empty_like(input), 0, {}); + 0, make_empty_column(type_to_id()), empty_like(input), 0, {}); } return empty_like(input); diff --git a/cpp/src/rolling/detail/rolling_collect_list.cu b/cpp/src/rolling/detail/rolling_collect_list.cu index f7544e81ba5..85dced0efe3 100644 --- a/cpp/src/rolling/detail/rolling_collect_list.cu +++ b/cpp/src/rolling/detail/rolling_collect_list.cu @@ -140,8 +140,8 @@ std::pair, std::unique_ptr> purge_null_entries( thrust::tabulate(rmm::exec_policy(stream), new_sizes->mutable_view().template begin(), new_sizes->mutable_view().template end(), - [d_gather_map = gather_map.template begin(), - d_old_offsets = offsets.template begin(), + [d_gather_map = gather_map.template begin(), + d_old_offsets = offsets.template begin(), input_row_not_null] __device__(auto i) { return thrust::count_if(thrust::seq, d_gather_map + d_old_offsets[i], diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 899cd8b6c86..ca5c04d1c4f 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -327,12 +327,12 @@ std::unique_ptr expand_to_column(Calculator const& calc, rmm::cuda_stream_view stream) { auto window_column = cudf::make_numeric_column( - 
cudf::data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream); + cudf::data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream); auto begin = cudf::detail::make_counting_transform_iterator(0, calc); thrust::copy_n( - rmm::exec_policy(stream), begin, num_rows, window_column->mutable_view().data()); + rmm::exec_policy(stream), begin, num_rows, window_column->mutable_view().data()); return window_column; } diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index fcb0bacad9a..4e248922702 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -63,7 +63,7 @@ struct base_fn { character_cases_table_type const* d_case_table; special_case_mapping const* d_special_case_mapping; column_device_view const d_column; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; base_fn(column_device_view const& d_column) @@ -111,11 +111,11 @@ struct base_fn { return; } - auto& derived = static_cast(*this); - auto const d_str = d_column.element(idx); - offset_type bytes = 0; - auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - bool capitalize = true; + auto& derived = static_cast(*this); + auto const d_str = d_column.element(idx); + size_type bytes = 0; + auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + bool capitalize = true; for (auto const chr : d_str) { auto const info = get_char_info(d_flags, chr); auto const flag = info.second; diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 29023fbb139..ba8acd23467 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -48,7 +48,7 @@ struct concat_strings_base { table_device_view const d_table; string_scalar_device_view const d_narep; separator_on_nulls separate_nulls; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; /** @@ -72,7 +72,7 @@ struct concat_strings_base { } char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; + size_type bytes = 0; bool write_separator = false; for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 7c9acbfbc58..eee59e37478 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,13 +53,13 @@ template struct compute_size_and_concatenate_fn { Functor const func; column_device_view const lists_dv; - offset_type const* const list_offsets; + size_type const* const list_offsets; column_device_view const strings_dv; string_scalar_device_view const string_narep_dv; separator_on_nulls const separate_nulls; output_if_empty_list const empty_list_policy; - offset_type* d_offsets{nullptr}; + size_type* d_offsets{nullptr}; // If d_chars == nullptr: only compute sizes and validities of the output strings. // If d_chars != nullptr: only concatenate strings. 
diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index 4f446c8c1cf..0d04fc74b0c 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -95,7 +95,7 @@ struct from_booleans_fn { column_device_view const d_column; string_view d_true; string_view d_false; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; __device__ void operator()(size_type idx) const diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 0a0f197c8a1..863f76b9b98 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -191,7 +191,7 @@ struct from_durations_fn { column_device_view d_durations; format_item const* d_format_items; size_type items_count; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; __device__ int8_t format_length(char format_char, duration_component const* const timeparts) const diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index cb061d03e5a..a3336258d3e 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -197,7 +197,7 @@ namespace { template struct from_fixed_point_fn { column_device_view d_decimals; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; /** diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 8728ad06964..bed682aba71 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -129,7 +129,7 @@ void dispatch_hex_to_integers_fn::operator()(column_device_view const&, template struct integer_to_hex_fn { column_device_view const d_column; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; __device__ void byte_to_hex(uint8_t byte, char* hex) @@ -173,7 +173,7 @@ struct 
integer_to_hex_fn { --byte_index; } } else { - d_offsets[idx] = static_cast(bytes) * 2; // 2 hex characters per byte + d_offsets[idx] = static_cast(bytes) * 2; // 2 hex characters per byte } } }; diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu index 609ced97c26..3aef37914fd 100644 --- a/cpp/src/strings/convert/convert_lists.cu +++ b/cpp/src/strings/convert/convert_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,7 +134,7 @@ struct format_lists_fn { auto const view = get_nested_child(stack_idx); auto offsets = view.child(cudf::lists_column_view::offsets_column_index); - auto d_offsets = offsets.data() + view.offset(); + auto d_offsets = offsets.data() + view.offset(); // add pending separator if (item.separator == item_separator::LIST) { diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 8b6305b68e2..401a04cdc9d 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -197,7 +197,7 @@ __forceinline__ __device__ char escaped_sequence_to_byte(char const* const ptr) */ template __global__ void url_decode_char_counter(column_device_view const in_strings, - offset_type* const out_counts) + size_type* const out_counts) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size]; @@ -221,7 +221,7 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, auto const in_chars = in_string.data(); auto const string_length = in_string.size_bytes(); int const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); - offset_type escape_char_count = 0; + size_type escape_char_count = 0; for (int block_idx = 0; block_idx < 
nblocks; block_idx++) { int const string_length_block = @@ -280,7 +280,7 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, template __global__ void url_decode_char_replacer(column_device_view const in_strings, char* const out_chars, - offset_type const* const out_offsets) + size_type const* const out_offsets) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size * 2]; @@ -393,18 +393,17 @@ std::unique_ptr url_decode(strings_column_view const& strings, auto offsets_mutable_view = offsets_column->mutable_view(); url_decode_char_counter <<>>( - *d_strings, offsets_mutable_view.begin()); + *d_strings, offsets_mutable_view.begin()); // use scan to transform number of bytes into offsets thrust::exclusive_scan(rmm::exec_policy(stream), - offsets_view.begin(), - offsets_view.end(), - offsets_mutable_view.begin()); + offsets_view.begin(), + offsets_view.end(), + offsets_mutable_view.begin()); // copy the total number of characters of all strings combined (last element of the offset column) // to the host memory - auto out_chars_bytes = - cudf::detail::get_value(offsets_view, offset_count - 1, stream); + auto out_chars_bytes = cudf::detail::get_value(offsets_view, offset_count - 1, stream); // create the chars column auto chars_column = create_chars_child_column(out_chars_bytes, stream, mr); @@ -413,7 +412,7 @@ std::unique_ptr url_decode(strings_column_view const& strings, // decode and copy the characters from the input column to the output column url_decode_char_replacer <<>>( - *d_strings, d_out_chars, offsets_column->view().begin()); + *d_strings, d_out_chars, offsets_column->view().begin()); // copy null mask rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index c5dfd4a8b93..287910c9a6f 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ 
b/cpp/src/strings/copying/concatenate.cu @@ -287,12 +287,12 @@ std::unique_ptr concatenate(host_span columns, column_view chars_child = column->child(strings_column_view::chars_column_index); auto bytes_offset = - cudf::detail::get_value(offsets_child, column_offset, stream); + cudf::detail::get_value(offsets_child, column_offset, stream); // copy the chars column data auto d_chars = chars_child.data() + bytes_offset; auto const bytes = - cudf::detail::get_value(offsets_child, column_size + column_offset, stream) - + cudf::detail::get_value(offsets_child, column_size + column_offset, stream) - bytes_offset; CUDF_CUDA_TRY( diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index bdcf01bd336..5f8fc483a34 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,15 +36,15 @@ struct adjust_offsets_fn { string_view const d_filler; size_type const offset; - __device__ offset_type operator()(size_type idx) + __device__ size_type operator()(size_type idx) { if (offset < 0) { - auto const first = d_column.element(-offset); + auto const first = d_column.element(-offset); auto const last_index = d_column.size() + offset; if (idx < last_index) { - return d_column.element(idx - offset) - first; + return d_column.element(idx - offset) - first; } else { - auto const last = d_column.element(d_column.size() - 1); + auto const last = d_column.element(d_column.size() - 1); return (last - first) + ((idx - last_index + 1) * d_filler.size_bytes()); } } else { @@ -52,7 +52,7 @@ struct adjust_offsets_fn { return idx * d_filler.size_bytes(); } else { auto const total_filler = d_filler.size_bytes() * offset; - return total_filler + d_column.element(idx - offset); + return total_filler + d_column.element(idx - offset); } } } @@ -112,19 +112,19 @@ std::unique_ptr shift(strings_column_view const& input, thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(offsets_size), - d_offsets->data(), + d_offsets->data(), adjust_offsets_fn{*d_input_offsets, d_fill_str, offset}); // compute the shift-offset for the output characters child column auto const shift_offset = [&] { auto const index = (offset >= 0) ? offset : offsets_size - 1 + offset; return (offset < 0 ? 
-1 : 1) * - cudf::detail::get_value(offsets_column->view(), index, stream); + cudf::detail::get_value(offsets_column->view(), index, stream); }(); // create output chars child column auto const chars_size = - cudf::detail::get_value(offsets_column->view(), offsets_size - 1, stream); + cudf::detail::get_value(offsets_column->view(), offsets_size - 1, stream); auto chars_column = create_chars_child_column(chars_size, stream, mr); auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); auto const d_input_chars = column_device_view::create(input.chars(), stream); diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index fcd05ee9dc6..8a2f8f0cbfc 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -50,7 +50,7 @@ namespace { */ struct extract_fn { column_device_view const d_strings; - offset_type const* d_offsets; + size_type const* d_offsets; string_index_pair* d_indices; __device__ void operator()(size_type const idx, @@ -119,7 +119,7 @@ std::unique_ptr extract_all_record(strings_column_view const& input, // Get the match counts for each string. // This column will become the output lists child offsets column. 
auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); - auto d_offsets = offsets->mutable_view().data(); + auto d_offsets = offsets->mutable_view().data(); // Compute null output rows auto [null_mask, null_count] = cudf::detail::valid_if( @@ -138,10 +138,10 @@ std::unique_ptr extract_all_record(strings_column_view const& input, d_offsets + strings_count + 1, d_offsets, [groups] __device__(auto v) { return v * groups; }, - offset_type{0}, + size_type{0}, thrust::plus{}); auto const total_groups = - cudf::detail::get_value(offsets->view(), strings_count, stream); + cudf::detail::get_value(offsets->view(), strings_count, stream); rmm::device_uvector indices(total_groups, stream); diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index f4dfafeb51f..be5b089c6e0 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -901,7 +901,7 @@ template __launch_bounds__(block_size) __global__ void get_json_object_kernel(column_device_view col, path_operator const* const commands, - offset_type* output_offsets, + size_type* output_offsets, thrust::optional out_buf, thrust::optional out_validity, thrust::optional out_valid_count, @@ -932,7 +932,7 @@ __launch_bounds__(block_size) __global__ // filled in only during the precompute step. 
during the compute step, the offsets // are fed back in so we do -not- want to write them out - if (!out_buf.has_value()) { output_offsets[tid] = static_cast(output_size); } + if (!out_buf.has_value()) { output_offsets[tid] = static_cast(output_size); } // validity filled in only during the output step if (out_validity.has_value()) { @@ -995,7 +995,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c <<>>( *cdv, std::get<0>(preprocess).value().data(), - offsets_view.head(), + offsets_view.head(), thrust::nullopt, thrust::nullopt, thrust::nullopt, @@ -1003,12 +1003,12 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c // convert sizes to offsets thrust::exclusive_scan(rmm::exec_policy(stream), - offsets_view.head(), - offsets_view.head() + col.size() + 1, - offsets_view.head(), + offsets_view.head(), + offsets_view.head() + col.size() + 1, + offsets_view.head(), 0); size_type const output_size = - cudf::detail::get_value(offsets_view, col.size(), stream); + cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column auto chars = create_chars_child_column(output_size, stream, mr); @@ -1026,7 +1026,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c <<>>( *cdv, std::get<0>(preprocess).value().data(), - offsets_view.head(), + offsets_view.head(), chars_view.head(), static_cast(validity.data()), d_valid_count.data(), diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index da6d01c92dc..c501a8bf7b4 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -46,7 +46,7 @@ struct base_fn { column_device_view const d_column; size_type const width; size_type const fill_char_size; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; base_fn(column_device_view const& d_column, size_type width, size_type fill_char_size) diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 4e0294f188c..396e1e6a2ac 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -84,10 +84,10 @@ auto generate_empty_output(strings_column_view const& input, auto chars_column = create_chars_child_column(0, stream, mr); auto offsets_column = make_numeric_column( - data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(), + data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(), 0, - offsets_column->size() * sizeof(offset_type), + offsets_column->size() * sizeof(size_type), stream.value())); return make_strings_column(strings_count, @@ -109,7 +109,7 @@ struct compute_size_and_repeat_fn { size_type const repeat_times; bool const has_nulls; - offset_type* d_offsets{nullptr}; + size_type* d_offsets{nullptr}; // If d_chars == nullptr: only compute sizes of the output strings. // If d_chars != nullptr: only repeat strings. @@ -184,7 +184,7 @@ struct compute_sizes_and_repeat_fn { bool const strings_has_nulls; bool const rtimes_has_nulls; - offset_type* d_offsets{nullptr}; + size_type* d_offsets{nullptr}; // If d_chars == nullptr: only compute sizes of the output strings. // If d_chars != nullptr: only repeat strings. diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu index 3c1fae7a00f..090705ac25d 100644 --- a/cpp/src/strings/reverse.cu +++ b/cpp/src/strings/reverse.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,7 +37,7 @@ namespace { */ struct reverse_characters_fn { column_device_view const d_strings; - offset_type const* d_offsets; + size_type const* d_offsets; char* d_chars; __device__ void operator()(size_type idx) @@ -64,7 +64,7 @@ std::unique_ptr reverse(strings_column_view const& input, // copy the column; replace data in the chars column auto result = std::make_unique(input.parent(), stream, mr); auto const d_offsets = - result->view().child(strings_column_view::offsets_column_index).data(); + result->view().child(strings_column_view::offsets_column_index).data(); auto d_chars = result->mutable_view().child(strings_column_view::chars_column_index).data(); auto const d_column = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index 1907c0d749b..4a823ad1dcb 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -70,8 +70,8 @@ std::unique_ptr find_multiple(strings_column_view const& input, results->set_null_count(0); auto offsets = cudf::detail::sequence(strings_count + 1, - numeric_scalar(0), - numeric_scalar(targets_count), + numeric_scalar(0), + numeric_scalar(targets_count), stream, mr); return make_lists_column(strings_count, diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 596fbb39d15..2df64c6a0a7 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -50,7 +50,7 @@ namespace { */ struct findall_fn { column_device_view const d_strings; - offset_type const* d_offsets; + size_type const* d_offsets; string_index_pair* d_indices; __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx) @@ -78,7 +78,7 @@ struct findall_fn { std::unique_ptr findall_util(column_device_view const& d_strings, reprog_device& d_prog, size_type total_matches, - offset_type const* d_offsets, + size_type const* d_offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -106,7 +106,7 @@ std::unique_ptr findall(strings_column_view const& input, // Create lists offsets column auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); - auto d_offsets = offsets->mutable_view().data(); + auto d_offsets = offsets->mutable_view().data(); // Convert counts into offsets thrust::exclusive_scan( diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 0dc92cf343c..9aeb6b69bdc 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -60,7 +60,7 @@ enum class split_direction { struct token_reader_fn { column_device_view const d_strings; split_direction const direction; - offset_type const* d_token_offsets; + size_type const* d_token_offsets; string_index_pair* d_tokens; __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx) @@ -143,17 
+143,17 @@ rmm::device_uvector generate_tokens(column_device_view const& auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); - auto const d_offsets = offsets.data(); + auto const d_offsets = offsets.data(); // convert match counts to token offsets auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) { return d_strings.is_null(idx) ? 0 : std::min(d_offsets[idx], max_tokens) + 1; }; thrust::transform_exclusive_scan( - rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); + rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); // the last offset entry is the total number of tokens to be generated - auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); + auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); rmm::device_uvector tokens(total_tokens, stream); if (total_tokens == 0) { return tokens; } @@ -176,7 +176,7 @@ rmm::device_uvector generate_tokens(column_device_view const& struct tokens_transform_fn { column_device_view const d_strings; string_index_pair const* d_tokens; - offset_type const* d_token_offsets; + size_type const* d_token_offsets; size_type const column_index; __device__ string_index_pair operator()(size_type idx) const @@ -215,7 +215,7 @@ std::unique_ptr split_re(strings_column_view const& input, auto offsets = count_matches( *d_strings, *d_prog, strings_count + 1, stream, rmm::mr::get_current_device_resource()); auto offsets_view = offsets->mutable_view(); - auto d_offsets = offsets_view.data(); + auto d_offsets = offsets_view.data(); // get the split tokens from the input column; this also converts the counts into offsets auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); diff --git a/cpp/src/strings/strings_column_view.cpp b/cpp/src/strings/strings_column_view.cpp index 6de478d3e1e..4b206666d4b 100644 --- 
a/cpp/src/strings/strings_column_view.cpp +++ b/cpp/src/strings/strings_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ column_view strings_column_view::offsets() const strings_column_view::offset_iterator strings_column_view::offsets_begin() const { - return offsets().begin() + offset(); + return offsets().begin() + offset(); } strings_column_view::offset_iterator strings_column_view::offsets_end() const diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 413fb2497c0..ac55fe76db1 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -199,7 +199,7 @@ struct byte_pair_encoding_fn { } auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); + .element(idx); auto const d_indices = d_byte_indices + offset; // initialize the byte indices for this string; @@ -304,7 +304,7 @@ struct byte_pair_encoding_fn { struct build_encoding_fn { cudf::column_device_view const d_strings; cudf::size_type const* d_byte_indices; - cudf::offset_type const* d_offsets; + cudf::size_type const* d_offsets; char* d_chars{}; __device__ void operator()(cudf::size_type idx) @@ -314,7 +314,7 @@ struct build_encoding_fn { if (d_str.empty()) { return; } auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); + .element(idx); auto const d_indices = d_byte_indices + offset; auto d_output = d_chars ? 
d_chars + d_offsets[idx] : nullptr; @@ -362,12 +362,12 @@ std::unique_ptr byte_pair_encoding( auto const d_merges = cudf::column_device_view::create(merge_pairs.get_merge_pairs(), stream); auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, static_cast(input.size() + 1), cudf::mask_state::UNALLOCATED, stream, rmm::mr::get_current_device_resource()); - auto d_offsets = offsets->mutable_view().data(); + auto d_offsets = offsets->mutable_view().data(); byte_pair_encoding_fn fn{*d_merges, *d_strings, @@ -406,14 +406,14 @@ std::unique_ptr byte_pair_encoding( */ struct edge_of_space_fn { cudf::column_device_view const d_strings; - __device__ bool operator()(cudf::offset_type offset) + __device__ bool operator()(cudf::size_type offset) { auto const d_chars = d_strings.child(cudf::strings_column_view::chars_column_index).data(); if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; } auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); - auto const d_offsets = offsets.data() + d_strings.offset(); + auto const d_offsets = offsets.data() + d_strings.offset(); // ignore offsets outside sliced range if (offset < d_offsets[0] || offset >= d_offsets[d_strings.size()]) { return false; } @@ -452,12 +452,12 @@ std::unique_ptr space_offsets(cudf::strings_column_view const& inp auto const space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space); // copy space offsets - rmm::device_uvector space_offsets(space_count, stream); + rmm::device_uvector space_offsets(space_count, stream); thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space); // create output offsets auto result = - cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + 
cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, static_cast(space_count + input.size() + 1), cudf::mask_state::UNALLOCATED, stream, @@ -469,7 +469,7 @@ std::unique_ptr space_offsets(cudf::strings_column_view const& inp input.offsets_end(), space_offsets.begin(), space_offsets.end(), - result->mutable_view().begin()); + result->mutable_view().begin()); return result; } diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index dffe035ad35..b39413af98f 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -78,7 +78,7 @@ std::unique_ptr load_file_to_column(std::string const& filename_me CUDF_EXPECTS(merges_file.good(), "Could not open " + filename_merges); std::vector chars{}; - std::vector offsets(1, 0); + std::vector offsets(1, 0); std::string line; std::getline(merges_file, line); diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 1507a8ce7c6..b151b44565d 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -352,10 +352,10 @@ __device__ size_type row_size_functor::operator()(column_device_vie return 0; } - auto const offsets_size = sizeof(offset_type) * CHAR_BIT; + auto const offsets_size = sizeof(size_type) * CHAR_BIT; auto const validity_size = col.nullable() ? 1 : 0; auto const chars_size = - (offsets.data()[row_end] - offsets.data()[row_start]) * CHAR_BIT; + (offsets.data()[row_end] - offsets.data()[row_start]) * CHAR_BIT; return ((offsets_size + validity_size) * num_rows) + chars_size; } @@ -372,7 +372,7 @@ __device__ size_type row_size_functor::operator()(column_device_view { auto const num_rows{span.row_end - span.row_start}; - auto const offsets_size = sizeof(offset_type) * CHAR_BIT; + auto const offsets_size = sizeof(size_type) * CHAR_BIT; auto const validity_size = col.nullable() ? 
1 : 0; return (offsets_size + validity_size) * num_rows; } @@ -451,10 +451,10 @@ __global__ void compute_row_sizes(device_span cols, // if this is a list column, update the working span from our offsets if (col.type().id() == type_id::LIST && col.size() > 0) { column_device_view const& offsets = col.child(lists_column_view::offsets_column_index); - auto const base_offset = offsets.data()[col.offset()]; + auto const base_offset = offsets.data()[col.offset()]; cur_span.row_start = - offsets.data()[cur_span.row_start + col.offset()] - base_offset; - cur_span.row_end = offsets.data()[cur_span.row_end + col.offset()] - base_offset; + offsets.data()[cur_span.row_start + col.offset()] - base_offset; + cur_span.row_end = offsets.data()[cur_span.row_end + col.offset()] - base_offset; } last_branch_depth = info[idx].branch_depth_end; diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 3eccb6b2a55..66de4e19b27 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -508,7 +508,7 @@ TYPED_TEST_SUITE(ListsDictionaryLeafTest, cudf::test::FixedWidthTypes); TYPED_TEST(ListsDictionaryLeafTest, FromNonNested) { using DCW = cudf::test::dictionary_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto s = cudf::make_list_scalar(DCW({1, 3, -1, 1, 3}, {1, 1, 0, 1, 1})); auto col = cudf::make_column_from_scalar(*s, 2); @@ -524,7 +524,7 @@ TYPED_TEST(ListsDictionaryLeafTest, FromNonNested) TYPED_TEST(ListsDictionaryLeafTest, FromNested) { using DCW = cudf::test::dictionary_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; DCW leaf({1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, {1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); offset_t offsets{0, 3, 3, 6, 6, 10}; @@ -617,7 +617,7 @@ TYPED_TEST(ListsStructsLeafTest, FromNonNested) { using LCWinner_t = 
cudf::test::lists_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; using valid_t = std::vector; auto data = this->make_test_structs_column( @@ -648,7 +648,7 @@ TYPED_TEST(ListsStructsLeafTest, FromNested) { using LCWinner_t = cudf::test::lists_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; using valid_t = std::vector; auto leaf = this->make_test_structs_column( {{1, 2}, {0, 1}}, @@ -702,7 +702,7 @@ TEST_F(ListsZeroLengthColumnTest, MixedTypes) using FCW = cudf::test::fixed_width_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; { auto s = cudf::make_list_scalar(FCW{1, 2, 3}); auto got = cudf::make_column_from_scalar(*s, 0); @@ -759,7 +759,7 @@ TEST_F(ListsZeroLengthColumnTest, SuperimposeNulls) using FCW = cudf::test::fixed_width_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto const lists = [&] { auto child = this diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 99ef1df1c2b..7701ca1ba56 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -377,7 +377,7 @@ TEST_F(OverflowTest, OverflowTest) constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 6 string columns of with 1 billion chars in each - auto offsets = cudf::test::fixed_width_column_wrapper{0, size}; + auto offsets = 
cudf::test::fixed_width_column_wrapper{0, size}; auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size); auto col = cudf::make_strings_column( 1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); @@ -418,7 +418,7 @@ TEST_F(OverflowTest, OverflowTest) cudf::make_structs_column(inner_size, std::move(children), 0, rmm::device_buffer{}); // list - auto offsets = cudf::test::fixed_width_column_wrapper{0, inner_size}; + auto offsets = cudf::test::fixed_width_column_wrapper{0, inner_size}; auto col = cudf::make_lists_column(1, offsets.release(), std::move(struct_col), 0, rmm::device_buffer{}); @@ -435,7 +435,7 @@ TEST_F(OverflowTest, OverflowTest) constexpr cudf::size_type size = 3; // list - auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, inner_size}; + auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, inner_size}; auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size); auto list_col = @@ -643,7 +643,7 @@ TEST_F(OverflowTest, Presliced) constexpr cudf::size_type list_size = inner_size / num_rows; // list - auto offsets = cudf::test::fixed_width_column_wrapper{ + auto offsets = cudf::test::fixed_width_column_wrapper{ 0, list_size, (list_size * 2) - 1, list_size * 3, inner_size}; auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size); diff --git a/cpp/tests/copying/copy_if_else_nested_tests.cpp b/cpp/tests/copying/copy_if_else_nested_tests.cpp index ff28156ef1d..579e1bdce8a 100644 --- a/cpp/tests/copying/copy_if_else_nested_tests.cpp +++ b/cpp/tests/copying/copy_if_else_nested_tests.cpp @@ -332,7 +332,7 @@ TYPED_TEST(TypedCopyIfElseNestedTest, ListsWithStructs) using strings = cudf::test::strings_column_wrapper; using structs = cudf::test::structs_column_wrapper; using bools = cudf::test::fixed_width_column_wrapper; - using offsets = cudf::test::fixed_width_column_wrapper; + using offsets = 
cudf::test::fixed_width_column_wrapper; auto const null_at_0 = null_at(0); auto const null_at_3 = null_at(3); diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp index ebfd950df4d..2bc18c706db 100644 --- a/cpp/tests/copying/gather_struct_tests.cpp +++ b/cpp/tests/copying/gather_struct_tests.cpp @@ -37,7 +37,7 @@ #include using vector_of_columns = std::vector>; -using gather_map_t = std::vector; +using gather_map_t = std::vector; using offsets = cudf::test::fixed_width_column_wrapper; using structs = cudf::test::structs_column_wrapper; using strings = cudf::test::strings_column_wrapper; @@ -54,7 +54,7 @@ using numerics = cudf::test::fixed_width_column_wrapper; template using lists = cudf::test::lists_column_wrapper; -auto constexpr null_index = std::numeric_limits::max(); +auto constexpr null_index = std::numeric_limits::max(); struct StructGatherTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index a18ed6a1ccf..d322fbe11f2 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -311,7 +311,7 @@ TYPED_TEST(ListGetFixedWidthValueTest, NestedGetNull) { using LCW = cudf::test::lists_column_wrapper; using FCW = cudf::test::fixed_width_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; std::vector valid{1, 0, 1, 0}; // clang-format off @@ -466,7 +466,7 @@ TEST_F(ListGetStringValueTest, NestedGetNonNullEmpty) TEST_F(ListGetStringValueTest, NestedGetNull) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; std::vector valid{0, 0, 1, 1}; @@ -508,7 +508,7 @@ struct ListGetStructValueTest : public cudf::test::BaseFixture { */ std::unique_ptr 
make_test_lists_column( cudf::size_type num_lists, - cudf::test::fixed_width_column_wrapper offsets, + cudf::test::fixed_width_column_wrapper offsets, std::unique_ptr child, std::initializer_list null_mask) { @@ -776,7 +776,7 @@ TYPED_TEST(ListGetStructValueTest, NestedGetNull) // NULL <- cudf::get_element(2) using valid_t = std::vector; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto list_column = this->make_test_lists_column(2, {0, 2, 3}, this->leaf_data(), {1, 1}); auto list_column_nested = @@ -900,12 +900,12 @@ TEST_F(StructGetValueTest, multi_level_nested) // col fields LCW l3({LCW{1, 1, 1}, LCW{2, 2}, LCW{3}}, validity_mask_t{false, true, true}.begin()); cudf::test::structs_column_wrapper l2{l3}; - auto l1 = cudf::make_lists_column( - 1, - cudf::test::fixed_width_column_wrapper{0, 3}.release(), - l2.release(), - 0, - cudf::create_null_mask(1, cudf::mask_state::UNALLOCATED)); + auto l1 = + cudf::make_lists_column(1, + cudf::test::fixed_width_column_wrapper{0, 3}.release(), + l2.release(), + 0, + cudf::create_null_mask(1, cudf::mask_state::UNALLOCATED)); std::vector> l0_fields; l0_fields.emplace_back(std::move(l1)); cudf::test::structs_column_wrapper l0(std::move(l0_fields)); diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp index 9dda3c12edf..42d2e004d6b 100644 --- a/cpp/tests/copying/scatter_list_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp @@ -307,7 +307,7 @@ TYPED_TEST_SUITE(ScatterListOfStructScalarTest, cudf::test::FixedWidthTypesWitho TYPED_TEST(ScatterListOfStructScalarTest, Basic) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto data = this->make_test_structs({{42, 42, 42}, {1, 0, 1}}, @@ -346,7 +346,7 @@ TYPED_TEST(ScatterListOfStructScalarTest, Basic) 
TYPED_TEST(ScatterListOfStructScalarTest, EmptyValidScalar) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto data = this->make_test_structs({}, {}, LCW{}, {}); auto slr = std::make_unique(data, true); @@ -379,7 +379,7 @@ TYPED_TEST(ScatterListOfStructScalarTest, EmptyValidScalar) TYPED_TEST(ScatterListOfStructScalarTest, NullScalar) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto data = this->make_test_structs({}, {}, {}, {}); auto slr = std::make_unique(data, false); @@ -411,7 +411,7 @@ TYPED_TEST(ScatterListOfStructScalarTest, NullScalar) TYPED_TEST(ScatterListOfStructScalarTest, NullableTargetRow) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto data = this->make_test_structs({{42, 42, 42}, {1, 0, 1}}, diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index c9a53d6ebe0..da85242410b 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -2072,8 +2072,7 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitList) // list> { - cudf::test::fixed_width_column_wrapper offsets{ - 0, 2, 5, 7, 10, 12, 14, 17, 20}; + cudf::test::fixed_width_column_wrapper offsets{0, 2, 5, 7, 10, 12, 14, 17, 20}; cudf::test::fixed_width_column_wrapper floats{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; cudf::test::structs_column_wrapper data({floats}); @@ -2131,8 +2130,7 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitStructs) // struct> { - cudf::test::fixed_width_column_wrapper offsets{ - 0, 2, 5, 7, 10, 12, 14, 17, 20}; + cudf::test::fixed_width_column_wrapper offsets{0, 2, 5, 7, 10, 12, 14, 17, 20}; 
cudf::test::fixed_width_column_wrapper floats{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; cudf::test::structs_column_wrapper data({floats}); diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 485febe0d92..749f4013013 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -146,7 +146,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists) using LCW = cudf::test::lists_column_wrapper; - auto offsets = cudf::data_type{cudf::type_to_id()}; + auto offsets = cudf::data_type{cudf::type_to_id()}; cudf::test::fixed_width_column_wrapper keys{}; auto values = @@ -176,7 +176,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) auto values = cudf::make_lists_column(0, - cudf::make_empty_column(cudf::type_to_id()), + cudf::make_empty_column(cudf::type_to_id()), struct_column.release(), 0, {}); @@ -188,13 +188,13 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) auto expect_child = cudf::make_lists_column(0, - cudf::make_empty_column(cudf::type_to_id()), + cudf::make_empty_column(cudf::type_to_id()), expect_struct_column.release(), 0, {}); auto expect_values = cudf::make_lists_column(0, - cudf::make_empty_column(cudf::type_to_id()), + cudf::make_empty_column(cudf::type_to_id()), std::move(expect_child), 0, {}); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index a5054daed19..ea2bad0cabf 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -2687,8 +2687,8 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) constexpr int floats_per_row = 4; auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( 0, [floats_per_row](cudf::size_type idx) { return idx * floats_per_row; }); - cudf::test::fixed_width_column_wrapper c1_offsets( - c1_offset_iter, c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper 
c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); cudf::test::fixed_width_column_wrapper c1_floats( values, values + (num_rows * floats_per_row), valids); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); @@ -2711,8 +2711,8 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) cudf::test::strings_column_wrapper string_col{string_iter, string_iter + num_string_rows}; auto offset_iter = cudf::detail::make_counting_transform_iterator( 0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; }); - cudf::test::fixed_width_column_wrapper offsets(offset_iter, - offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper offsets(offset_iter, + offset_iter + num_rows + 1); auto _c3_valids = cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 200; }); @@ -5034,8 +5034,8 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); total_values_produced += (num_rows + 1); - cudf::test::fixed_width_column_wrapper offsets(offsets_iter, - offsets_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper offsets(offsets_iter, + offsets_iter + num_rows + 1); auto c = cudf::make_lists_column(num_rows, offsets.release(), std::move(prev_col), 0, {}); prev_col = std::move(c); } diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp index 2c2b3c8b29c..017cd471e01 100644 --- a/cpp/tests/lists/extract_tests.cpp +++ b/cpp/tests/lists/extract_tests.cpp @@ -269,7 +269,7 @@ TYPED_TEST(ListsExtractColumnIndicesTypedTest, ExtractElement) { using LCW = cudf::test::lists_column_wrapper; using FWCW = cudf::test::fixed_width_column_wrapper; - using indices = cudf::test::fixed_width_column_wrapper; + using indices = cudf::test::fixed_width_column_wrapper; auto input_column = LCW({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}, LCW{}}, 
cudf::test::iterators::null_at(1)); @@ -329,7 +329,7 @@ TYPED_TEST(ListsExtractColumnIndicesTypedTest, ExtractElement) TYPED_TEST(ListsExtractColumnIndicesTypedTest, FailureCases) { using LCW = cudf::test::lists_column_wrapper; - using indices = cudf::test::fixed_width_column_wrapper; + using indices = cudf::test::fixed_width_column_wrapper; { // Non-empty input, with mismatched size of indices. @@ -361,7 +361,7 @@ TEST_F(ListsExtractColumnIndicesTest, ExtractStrings) { using LCW = cudf::test::lists_column_wrapper; using strings = cudf::test::strings_column_wrapper; - using indices = cudf::test::fixed_width_column_wrapper; + using indices = cudf::test::fixed_width_column_wrapper; auto input_column = LCW( {LCW{"3", "2", "1"}, LCW{}, LCW{"30", "20", "10", "50"}, LCW{"100", "120"}, LCW{"0"}, LCW{}}, diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index c6069acad8a..46d4066ddff 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -383,7 +383,7 @@ TEST_F(PercentileApproxTest, EmptyInput) cudf::tdigest::tdigest_column_view tdv(*empty); auto result = cudf::percentile_approx(tdv, percentiles); - cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; std::vector nulls{0, 0, 0}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(nulls.begin(), nulls.end()); @@ -415,7 +415,7 @@ TEST_F(PercentileApproxTest, EmptyPercentiles) cudf::tdigest::tdigest_column_view tdv(*tdigest_column.second[0].results[0]); auto result = cudf::percentile_approx(tdv, percentiles); - cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; std::vector nulls{0, 0}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(nulls.begin(), nulls.end()); diff --git a/cpp/tests/reductions/tdigest_tests.cu 
b/cpp/tests/reductions/tdigest_tests.cu index b0087766c0c..c8fec51e1c9 100644 --- a/cpp/tests/reductions/tdigest_tests.cu +++ b/cpp/tests/reductions/tdigest_tests.cu @@ -94,7 +94,7 @@ TEST_F(ReductionTDigestMerge, FewHeavyCentroids) cudf::test::fixed_width_column_wrapper c0c{1.0, 2.0}; cudf::test::fixed_width_column_wrapper c0w{100.0, 50.0}; cudf::test::structs_column_wrapper c0s({c0c, c0w}); - cudf::test::fixed_width_column_wrapper c0_offsets{0, 2}; + cudf::test::fixed_width_column_wrapper c0_offsets{0, 2}; auto c0l = cudf::make_lists_column(1, c0_offsets.release(), c0s.release(), 0, rmm::device_buffer{}); cudf::test::fixed_width_column_wrapper c0min{1.0}; @@ -111,7 +111,7 @@ TEST_F(ReductionTDigestMerge, FewHeavyCentroids) cudf::test::fixed_width_column_wrapper c1c{3.0, 4.0}; cudf::test::fixed_width_column_wrapper c1w{200.0, 50.0}; cudf::test::structs_column_wrapper c1s({c1c, c1w}); - cudf::test::fixed_width_column_wrapper c1_offsets{0, 2}; + cudf::test::fixed_width_column_wrapper c1_offsets{0, 2}; auto c1l = cudf::make_lists_column(1, c1_offsets.release(), c1s.release(), 0, rmm::device_buffer{}); cudf::test::fixed_width_column_wrapper c1min{3.0}; @@ -147,7 +147,7 @@ TEST_F(ReductionTDigestMerge, FewHeavyCentroids) cudf::test::fixed_width_column_wrapper ec{1.0, 2.0, 3.0, 4.0}; cudf::test::fixed_width_column_wrapper ew{100.0, 50.0, 200.0, 50.0}; cudf::test::structs_column_wrapper es({ec, ew}); - cudf::test::fixed_width_column_wrapper e_offsets{0, 4}; + cudf::test::fixed_width_column_wrapper e_offsets{0, 4}; auto el = cudf::make_lists_column(1, e_offsets.release(), es.release(), 0, rmm::device_buffer{}); cudf::test::fixed_width_column_wrapper emin{1.0}; cudf::test::fixed_width_column_wrapper emax{4.0}; diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index e8e603f8533..ecc38dfd26e 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -152,7 +152,7 @@ TEST_F(StringsColumnTest, GatherTooBig) { 
std::vector h_chars(3000000); cudf::test::fixed_width_column_wrapper chars(h_chars.begin(), h_chars.end()); - cudf::test::fixed_width_column_wrapper offsets({0, 3000000}); + cudf::test::fixed_width_column_wrapper offsets({0, 3000000}); auto input = cudf::column_view( cudf::data_type{cudf::type_id::STRING}, 1, nullptr, nullptr, 0, 0, {offsets, chars}); auto map = thrust::constant_iterator(0); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 868785b4612..0cb5023a32e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -294,9 +294,9 @@ TEST_F(StringsContainsTests, HexTest) std::vector ascii_chars( // all possible matchable chars {thrust::make_counting_iterator(0), thrust::make_counting_iterator(127)}); auto const count = static_cast(ascii_chars.size()); - std::vector offsets( - {thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + count + 1}); + std::vector offsets( + {thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + count + 1}); auto d_chars = cudf::detail::make_device_uvector_sync( ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = cudf::detail::make_device_uvector_sync( diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index d8594fa4923..a3d392cfed0 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -99,8 +99,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) cudf::device_span(strings_view.chars().data(), strings_view.chars().size()), cudf::get_default_stream()); auto h_offsets_data = cudf::detail::make_std_vector_sync( - cudf::device_span( - strings_view.offsets().data() + strings_view.offset(), + cudf::device_span( + strings_view.offsets().data() + strings_view.offset(), strings_view.size() + 1), cudf::get_default_stream()); EXPECT_EQ(memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0); 
@@ -164,8 +164,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) cudf::device_span(strings_view.chars().data(), strings_view.chars().size()), cudf::get_default_stream()); auto h_offsets_data = cudf::detail::make_std_vector_sync( - cudf::device_span( - strings_view.offsets().data() + strings_view.offset(), + cudf::device_span( + strings_view.offsets().data() + strings_view.offset(), strings_view.size() + 1), cudf::get_default_stream()); EXPECT_EQ(memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0); diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 095495456e9..4832cdf816f 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -98,10 +98,10 @@ std::pair, std::unique_ptr> build_li // } cudf::test::fixed_width_column_wrapper values{ 1, 2, 3, 4, 5, 10, 6, 7, 8, 9, -1, -2, -3, -4, -5, -6, -7, -8, -9}; - cudf::test::fixed_width_column_wrapper inner_offsets{ + cudf::test::fixed_width_column_wrapper inner_offsets{ 0, 2, 5, 6, 9, 10, 12, 14, 17, 19}; auto inner_list = cudf::make_lists_column(9, inner_offsets.release(), values.release(), 0, {}); - cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5, 7, 9}; + cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5, 7, 9}; auto list = cudf::make_lists_column(6, outer_offsets.release(), std::move(inner_list), 0, {}); // expected size = (num rows at level 1 + num_rows at level 2) + # values in the leaf @@ -142,13 +142,13 @@ TYPED_TEST(RowBitCountTyped, ListsWithNulls) // } cudf::test::fixed_width_column_wrapper values{{1, 2, 3, 4, 5, 10, 6, 7, 8}, {1, 1, 1, 0, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper inner_offsets{0, 2, 5, 6, 9, 9}; + cudf::test::fixed_width_column_wrapper inner_offsets{0, 2, 5, 6, 9, 9}; std::vector inner_list_validity{1, 1, 1, 1, 0}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(inner_list_validity.begin(), 
inner_list_validity.end()); auto inner_list = cudf::make_lists_column( 5, inner_offsets.release(), values.release(), null_count, std::move(null_mask)); - cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5}; + cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5}; auto list = cudf::make_lists_column(4, outer_offsets.release(), std::move(inner_list), 0, {}); cudf::table_view t({*list}); @@ -177,7 +177,7 @@ TEST_F(RowBitCount, Strings) // expect 1 offset (4 bytes) + length of string per row auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) { - return (static_cast(strings[i].size()) + sizeof(cudf::offset_type)) * CHAR_BIT; + return (static_cast(strings[i].size()) + sizeof(cudf::size_type)) * CHAR_BIT; }); cudf::test::fixed_width_column_wrapper expected(size_iter, size_iter + strings.size()); @@ -200,7 +200,7 @@ TEST_F(RowBitCount, StringsWithNulls) // expect 1 offset (4 bytes) + (length of string, or 0 if null) + 1 validity bit per row auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings, &valids](int i) { return ((static_cast(valids[i] ? 
strings[i].size() : 0) + - sizeof(cudf::offset_type)) * + sizeof(cudf::size_type)) * CHAR_BIT) + 1; }); @@ -247,8 +247,8 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, num_rows + 1); auto list_offsets_view = list_offsets->mutable_view(); thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()), - list_offsets_view.begin(), - list_offsets_view.end(), + list_offsets_view.begin(), + list_offsets_view.end(), times_2{}); // List = {{0,1}, {2,3}, {4,5}, ..., {2*(num_rows-1), 2*num_rows-1}}; @@ -267,7 +267,7 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) thrust::fill_n(rmm::exec_policy(cudf::get_default_stream()), expected_row_bit_counts->mutable_view().begin(), num_rows, - CHAR_BIT * (2 * sizeof(int32_t) + sizeof(cudf::offset_type))); + CHAR_BIT * (2 * sizeof(int32_t) + sizeof(cudf::size_type))); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(row_bit_counts->view(), expected_row_bit_counts->view()); } @@ -309,8 +309,7 @@ TEST_F(RowBitCount, StructsNoNulls) // expect 1 offset (4 bytes) + (length of string) + 1 float + 1 int16_t auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) { return ((sizeof(float) + sizeof(int16_t)) * CHAR_BIT) + - ((static_cast(strings[i].size()) + sizeof(cudf::offset_type)) * - CHAR_BIT); + ((static_cast(strings[i].size()) + sizeof(cudf::size_type)) * CHAR_BIT); }); cudf::test::fixed_width_column_wrapper expected(size_iter, size_iter + t.num_rows()); @@ -534,7 +533,7 @@ TEST_F(RowBitCount, NestedTypes) TEST_F(RowBitCount, NullsInStringsList) { - using offsets_wrapper = cudf::test::fixed_width_column_wrapper; + using offsets_wrapper = cudf::test::fixed_width_column_wrapper; // clang-format off auto strings = std::vector{ "daïs", "def", "", "z", "bananas", "warp", "", "zing" }; @@ -552,7 +551,7 @@ TEST_F(RowBitCount, NullsInStringsList) {}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT( 
cudf::row_bit_count(cudf::table_view{{lists_col->view()}})->view(), - cudf::test::fixed_width_column_wrapper{138, 106, 130, 130}); + cudf::test::fixed_width_column_wrapper{138, 106, 130, 130}); } TEST_F(RowBitCount, EmptyChildColumnInListOfStrings) @@ -560,13 +559,13 @@ TEST_F(RowBitCount, EmptyChildColumnInListOfStrings) // Test with a list column with 4 empty list rows. // Note: Since there are no strings in any of the lists, // the lists column's child can be empty. - auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; + auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; auto lists_col = cudf::make_lists_column( 4, offsets.release(), cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), 0, {}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT( cudf::row_bit_count(cudf::table_view{{lists_col->view()}})->view(), - cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); + cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); } TEST_F(RowBitCount, EmptyChildColumnInListOfLists) @@ -579,12 +578,12 @@ TEST_F(RowBitCount, EmptyChildColumnInListOfLists) return cudf::empty_like(exemplar); }; - auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; + auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; auto lists_col = cudf::make_lists_column(4, offsets.release(), empty_child_lists_column(), 0, {}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT( cudf::row_bit_count(cudf::table_view{{lists_col->view()}})->view(), - cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); + cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); } struct sum_functor { @@ -639,12 +638,12 @@ TEST_F(RowBitCount, DepthJump) // the jump occurs from depth 2 (the leafmost int column) // to depth 0 (the topmost int column) cudf::test::fixed_width_column_wrapper ____c0{1, 2, 3, 5, 5, 6, 7, 8}; - cudf::test::fixed_width_column_wrapper ___offsets{0, 2, 4, 6, 8}; + cudf::test::fixed_width_column_wrapper ___offsets{0, 2, 4, 6, 8}; auto ___c0 
= cudf::make_lists_column(4, ___offsets.release(), ____c0.release(), 0, {}); std::vector> __children; __children.push_back(std::move(___c0)); cudf::test::structs_column_wrapper __c0(std::move(__children)); - cudf::test::fixed_width_column_wrapper _offsets{0, 3, 4}; + cudf::test::fixed_width_column_wrapper _offsets{0, 3, 4}; auto _c0 = cudf::make_lists_column(2, _offsets.release(), __c0.release(), 0, {}); cudf::test::fixed_width_column_wrapper _c1{3, 4}; std::vector> children; @@ -657,7 +656,7 @@ TEST_F(RowBitCount, DepthJump) // expected size = (num rows at level 1 + num_rows at level 2) + (# values the leaf int column) + // 1 (value in topmost int column) - constexpr cudf::size_type offset_size = sizeof(cudf::offset_type) * CHAR_BIT; + constexpr cudf::size_type offset_size = sizeof(cudf::size_type) * CHAR_BIT; constexpr cudf::size_type type_size = sizeof(T) * CHAR_BIT; cudf::test::fixed_width_column_wrapper expected{ ((1 + 3) * offset_size) + (6 * type_size) + (1 * type_size), @@ -693,7 +692,7 @@ TEST_F(RowBitCount, SlicedColumnsStrings) // expect 1 offset (4 bytes) + length of string per row auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) { - return (static_cast(strings[i].size()) + sizeof(cudf::offset_type)) * CHAR_BIT; + return (static_cast(strings[i].size()) + sizeof(cudf::size_type)) * CHAR_BIT; }); cudf::test::fixed_width_column_wrapper expected(size_iter + 3, size_iter + 3 + slice_size); @@ -736,7 +735,7 @@ TEST_F(RowBitCount, SlicedColumnsStructs) // expect 1 offset (4 bytes) + length of string per row + 1 int16_t per row auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) { - return (static_cast(strings[i].size()) + sizeof(cudf::offset_type) + + return (static_cast(strings[i].size()) + sizeof(cudf::size_type) + sizeof(int16_t)) * CHAR_BIT; }); diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 3e0545935ef..fcaf23fd456 100644 --- 
a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -123,7 +123,7 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, 0, [row_indices = row_indices.begin(), validity = c.null_mask(), - offsets = c.offsets().begin(), + offsets = c.offsets().begin(), offset = c.offset()] __device__(int index) { // both null mask and offsets data are not pre-sliced. so we need to add the column offset to // every incoming index. @@ -168,9 +168,9 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, auto output_row_iter = cudf::detail::make_counting_transform_iterator( 0, [row_indices = row_indices.begin(), - offsets = c.offsets().begin(), + offsets = c.offsets().begin(), offset = c.offset(), - first_offset = cudf::detail::get_value( + first_offset = cudf::detail::get_value( c.offsets(), c.offset(), cudf::test::get_default_stream())] __device__(int index) { auto const true_index = row_indices[index] + offset; return offsets[true_index] - first_offset; diff --git a/cpp/tests/utilities/tdigest_utilities.cu b/cpp/tests/utilities/tdigest_utilities.cu index d2e95812894..9294aa0f681 100644 --- a/cpp/tests/utilities/tdigest_utilities.cu +++ b/cpp/tests/utilities/tdigest_utilities.cu @@ -110,12 +110,12 @@ std::unique_ptr make_expected_tdigest_column(std::vector h_offsets{0, tdigest.mean.size()}; + std::vector h_offsets{0, tdigest.mean.size()}; auto offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED); - CUDF_CUDA_TRY(cudaMemcpy(offsets->mutable_view().begin(), + CUDF_CUDA_TRY(cudaMemcpy(offsets->mutable_view().begin(), h_offsets.data(), - sizeof(offset_type) * 2, + sizeof(size_type) * 2, cudaMemcpyDefault)); auto list = cudf::make_lists_column(1, std::move(offsets), std::move(tdigests), 0, {}); diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 6cdcdd22dd7..e90a3f9ac6e 100644 --- 
a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -385,7 +385,7 @@ TEST_F(ColumnUtilitiesListsTest, UnsanitaryLists) // 0, 1, 2 std::vector> children; children.emplace_back( - std::move(cudf::test::fixed_width_column_wrapper{0, 3}.release())); + std::move(cudf::test::fixed_width_column_wrapper{0, 3}.release())); children.emplace_back(std::move(cudf::test::fixed_width_column_wrapper{0, 1, 2}.release())); auto l0 = std::make_unique(cudf::data_type{cudf::type_id::LIST}, diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 580f63d73b2..56aea0b45e2 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -208,10 +208,10 @@ std::unique_ptr lists_distinct_by_key(cudf::lists_column_view cons cudf::make_structs_column(out_labels.size(), std::move(out_structs_members), 0, {}); // Assemble a lists column of structs. - auto out_offsets = make_numeric_column(data_type{type_to_id()}, input.size() + 1, + auto out_offsets = make_numeric_column(data_type{type_to_id()}, input.size() + 1, mask_state::UNALLOCATED, stream); - auto const offsets_begin = out_offsets->mutable_view().template begin(); - auto const labels_begin = out_labels.template begin(); + auto const offsets_begin = out_offsets->mutable_view().template begin(); + auto const labels_begin = out_labels.template begin(); cudf::detail::labels_to_offsets(labels_begin, labels_begin + out_labels.size(), offsets_begin, offsets_begin + out_offsets->size(), stream); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index a0dbfb3b38c..d93d38c7758 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -179,9 +179,9 @@ struct tile_info { * */ struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the 
batch - device_uvector row_offsets; // offsets column of output cudf column + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column }; /** diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index ee871f06231..11480d774ef 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -5,7 +5,6 @@ from libc.stdint cimport int32_t, uint32_t cdef extern from "cudf/types.hpp" namespace "cudf" nogil: ctypedef int32_t size_type - ctypedef int32_t offset_type ctypedef uint32_t bitmask_type ctypedef uint32_t char_utf8 diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index 740db51db6c..2b08e6863a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -5,7 +5,7 @@ from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.types cimport bitmask_type, offset_type, size_type +from cudf._lib.cpp.types cimport bitmask_type, size_type from .gpumemoryview cimport gpumemoryview from .types cimport DataType @@ -20,7 +20,7 @@ cdef class Column: gpumemoryview data gpumemoryview mask size_type null_count - offset_type offset + size_type offset # children: List[Column] list children diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 35c30b280c9..be4eff4c49d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -6,7 +6,7 @@ from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer from cudf._lib.cpp.column.column cimport column, column_contents -from cudf._lib.cpp.types cimport offset_type, size_type +from cudf._lib.cpp.types cimport size_type from .gpumemoryview 
cimport gpumemoryview from .types cimport DataType @@ -42,7 +42,7 @@ cdef class Column: """ def __init__( self, DataType data_type not None, size_type size, gpumemoryview data, - gpumemoryview mask, size_type null_count, offset_type offset, + gpumemoryview mask, size_type null_count, size_type offset, list children ): self.data_type = data_type From b7994bc16b1b1743b0743860b4f02ac4da8245d5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 3 Aug 2023 07:54:33 -1000 Subject: [PATCH 5/5] Raise NotImplementedError for pd.SparseDtype (#13798) Currently cuDF seems to cast `pd.SparseDtype` to its subtype instead of maintaining the sparse data type from pandas. Since `pd.SparseDtype` is not supported in cuDF, it is better to raise and tell users to cast directly to the sparse subtype. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13798 --- python/cudf/cudf/core/column/column.py | 10 ++++++++++ python/cudf/cudf/tests/test_dataframe.py | 6 ++++++ python/cudf/cudf/tests/test_series.py | 6 ++++++ 3 files changed, 22 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b4f3f533d44..da3d04c15c0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2010,6 +2010,11 @@ def as_column( return as_column(arbitrary.array) elif PANDAS_GE_150 and isinstance(arbitrary.dtype, pd.ArrowDtype): return as_column(pa.array(arbitrary.array, from_pandas=True)) + elif isinstance(arbitrary.dtype, pd.SparseDtype): + raise NotImplementedError( + f"{arbitrary.dtype} is not supported. Convert first to " + f"{arbitrary.dtype.subtype}."
+ ) if is_categorical_dtype(arbitrary): data = as_column(pa.array(arbitrary, from_pandas=True)) elif is_interval_dtype(arbitrary.dtype): @@ -2214,6 +2219,11 @@ def as_column( ) if dtype is not None: data = data.astype(dtype) + elif isinstance(arbitrary, pd.arrays.SparseArray): + raise NotImplementedError( + f"{arbitrary.dtype} is not supported. Convert first to " + f"{arbitrary.dtype.subtype}." + ) elif isinstance(arbitrary, memoryview): data = as_column( np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e35ab147bf4..0898cb2ef3d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10245,6 +10245,12 @@ def test_dataframe_init_columns_named_index(): assert_eq(gdf, pdf) +def test_dataframe_from_pandas_sparse(): + pdf = pd.DataFrame(range(2), dtype=pd.SparseDtype(np.int64, 0)) + with pytest.raises(NotImplementedError): + cudf.DataFrame(pdf) + + def test_dataframe_constructor_unbounded_sequence(): class A: def __getitem__(self, key): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 83d22bbca2d..58eaebae925 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2206,6 +2206,12 @@ def test_series_contains(data, index): assert_eq(False in ps, False in gs) +def test_series_from_pandas_sparse(): + pser = pd.Series(range(2), dtype=pd.SparseDtype(np.int64, 0)) + with pytest.raises(NotImplementedError): + cudf.Series(pser) + + def test_series_constructor_unbounded_sequence(): class A: def __getitem__(self, key):