diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 8190b5d0297..315a389339a 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -31,6 +31,6 @@ ENV PYTHONDONTWRITEBYTECODE="1" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache" diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml new file mode 100644 index 00000000000..1bf4ac08b69 --- /dev/null +++ b/.github/workflows/auto-assign.yml @@ -0,0 +1,17 @@ +name: "Auto Assign PR" + +on: + pull_request_target: + types: + - opened + - reopened + - synchronize + +jobs: + add_assignees: + runs-on: ubuntu-latest + steps: + - uses: actions-ecosystem/action-add-assignees@v1 + with: + github_token: "${{ secrets.GITHUB_TOKEN }}" + assignees: ${{ github.actor }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 31e78f82a62..f5cb71bfc14 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -1,4 +1,5 @@ name: "Pull Request Labeler" + on: - pull_request_target diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 174dc72bf02..f5234f58efe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,17 +16,6 @@ repos: ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - hooks: - - id: isort - # Use the config file specific to each subproject so that each - # project can specify its own first/third-party packages. - args: ["--config-root=python/", "--resolve-all-configs"] - files: python/.* - exclude: | - (?x)^(^python/cudf_polars/.*) - types_or: [python, cython, pyi] - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.16.2 hooks: @@ -150,6 +139,7 @@ repos: rev: v0.4.8 hooks: - id: ruff + args: ["--fix"] files: python/.*$ - id: ruff-format files: python/.*$ @@ -165,7 +155,7 @@ repos: ) - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9cdde7c2b7..3db1ed35294 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -38,6 +38,7 @@ conduct. More information can be found at: 8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks). Fix if needed. 9. Wait for other developers to review your code and update code as needed. + Changes to any C++ files require at least 2 approvals from the cudf-cpp-codeowners before merging. 10. Once reviewed and approved, a RAPIDS developer will merge your pull request. If you are unsure about anything, don't hesitate to comment on issues and ask for clarification! @@ -293,8 +294,8 @@ In order to run doxygen as a linter on C++/CUDA code, run ./ci/checks/doxygen.sh ``` -Python code runs several linters including [Black](https://black.readthedocs.io/en/stable/), -[isort](https://pycqa.github.io/isort/), and [flake8](https://flake8.pycqa.org/en/latest/). +Python code runs several linters including [Ruff](https://docs.astral.sh/ruff/) +with its various rules for Black-like formatting or Isort. cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling mistakes, and this check is run as a pre-commit hook. To apply the suggested spelling fixes, diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index e5fcef17a83..3d06eacf9ff 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -15,8 +15,12 @@ rapids-print-env rapids-logger "Begin cpp build" +sccache --zero-stats + # With boa installed conda build forward to boa RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \ conda/recipes/libcudf +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 823d7f62290..ed90041cc77 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -19,6 +19,8 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +sccache --zero-stats + # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly # With boa installed conda build forwards to the boa builder @@ -28,12 +30,18 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcudf +sccache --show-adv-stats +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf +sccache --show-adv-stats +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ @@ -46,6 +54,8 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka +sccache --show-adv-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index bf76f4ed29a..78b8a8a08cf 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -3,7 +3,8 @@ set -euo pipefail -package_dir=$1 +package_name=$1 +package_dir=$2 source rapids-configure-sccache source rapids-date-string @@ -12,4 +13,14 @@ rapids-generate-version > ./VERSION cd "${package_dir}" -python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check +sccache --zero-stats + +rapids-logger "Building '${package_name}' wheel" +python -m pip wheel \ + -w dist \ + -v \ + --no-deps \ + --disable-pip-version-check \ + . + +sccache --show-adv-stats diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index fb93b06dbe2..fef4416a366 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -18,7 +18,7 @@ echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh cudf ${package_dir} python -m auditwheel repair \ --exclude libcudf.so \ diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh index 9c945e11c00..79853cdbdb2 100755 --- a/ci/build_wheel_cudf_polars.sh +++ b/ci/build_wheel_cudf_polars.sh @@ -5,7 +5,7 @@ set -euo pipefail package_dir="python/cudf_polars" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh cudf-polars ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index eb2a91289f7..00c64afa2ef 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -5,7 +5,7 @@ set -euo pipefail package_dir="python/dask_cudf" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh dask-cudf ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index 91bc071583e..b3d6778ea04 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -3,10 +3,30 @@ set -euo pipefail +package_name="libcudf" package_dir="python/libcudf" +rapids-logger "Generating build requirements" + +rapids-dependency-file-generator \ + --output requirements \ + --file-key "py_build_${package_name}" \ + --file-key "py_rapids_build_${package_name}" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \ +| tee /tmp/requirements-build.txt + +rapids-logger "Installing build requirements" +python -m pip install \ + -v \ + --prefer-binary \ + -r /tmp/requirements-build.txt + +# build with '--no-build-isolation', for better sccache hit rate +# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) +export PIP_NO_BUILD_ISOLATION=0 + export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh "${package_name}" "${package_dir}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -16,4 +36,4 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* -RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 5e9f7f8a0c4..839d98846fe 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -16,7 +16,7 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh pylibcudf ${package_dir} python -m auditwheel repair \ --exclude libcudf.so \ @@ -24,4 +24,4 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bd5e6c3d569..c3716c4759a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -65,7 +65,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<18.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 565a3ebfa3c..38e131e79cb 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -63,7 +63,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index e8fef715c60..edf92b930d9 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.8,<1.9 + - polars >=1.11,<1.12 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4b9cbf8921..60132f651d2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,6 +369,8 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_mapping_indices.cu + src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index f013b31b3de..2a4ac789046 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -345,11 +345,11 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) +ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index f44f26e4d2c..2533ea9611c 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -16,16 +16,29 @@ #include +#include + +#include +#include +#include +#include #include #include +#include +#include #include #include #include +#include #include +#include +#include +#include +#include #include #include #include @@ -39,14 +52,14 @@ enum class TreeType { template static void BM_ast_transform(nvbench::state& state) { - auto const table_size = static_cast(state.get_int64("table_size")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const tree_levels = static_cast(state.get_int64("tree_levels")); // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; auto const source_table = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, n_cols), - row_count{table_size}, + row_count{num_rows}, Nullable ? std::optional{0.5} : std::nullopt); auto table = source_table->view(); @@ -86,7 +99,71 @@ static void BM_ast_transform(nvbench::state& state) auto const& expression_tree_root = expressions.back(); // Use the number of bytes read from global memory - state.add_global_memory_reads(table_size * (tree_levels + 1)); + state.add_global_memory_reads(static_cast(num_rows) * (tree_levels + 1)); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); +} + +template +static void BM_string_compare_ast_transform(nvbench::state& state) +{ + auto const string_width = static_cast(state.get_int64("string_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons"); + + // Create table data + auto const num_cols = tree_levels * 2; + std::vector> columns; + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) { + columns.emplace_back(create_string_column(num_rows, string_width, hit_rate)); + }); + + cudf::table table{std::move(columns)}; + cudf::table_view const table_view = table.view(); + + int64_t const chars_size = std::accumulate( + table_view.begin(), + table_view.end(), + static_cast(0), + [](int64_t size, auto& column) -> int64_t { + return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream()); + }); + + // Create column references + auto column_refs = std::vector(); + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_cols), + std::back_inserter(column_refs), + [](auto const& column_id) { return cudf::ast::column_reference(column_id); }); + + // Create expression trees + std::list expressions; + + // Construct AST tree (a == b && c == d && e == f && ...) + + expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1])); + + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(tree_levels), + [&](size_t idx) { + auto const& lhs = expressions.back(); + auto const& rhs = expressions.emplace_back( + cudf::ast::operation(cmp_op, column_refs[idx * 2], column_refs[idx * 2 + 1])); + expressions.emplace_back(cudf::ast::operation(reduce_op, lhs, rhs)); + }); + + auto const& expression_tree_root = expressions.back(); + + // Use the number of bytes read from global memory + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); @@ -100,7 +177,7 @@ static void BM_ast_transform(nvbench::state& state) NVBENCH_BENCH(name) \ .set_name(#name) \ .add_int64_axis("tree_levels", {1, 5, 10}) \ - .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); @@ -115,3 +192,19 @@ AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true); AST_TRANSFORM_BENCHMARK_DEFINE( ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true); + +#define AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_string_compare_ast_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("string_width", {32, 64, 128, 256}) \ + .add_int64_axis("num_rows", {32768, 262144, 2097152}) \ + .add_int64_axis("tree_levels", {1, 2, 3, 4}) \ + .add_int64_axis("hit_rate", {50, 100}) + +AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and, + cudf::ast::ast_operator::EQUAL, + cudf::ast::ast_operator::LOGICAL_AND); diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index 7d267a88764..75c91d270a7 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -17,12 +17,18 @@ #include #include +#include +#include #include #include +#include + #include #include +#include +#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -34,17 +40,18 @@ enum class TreeType { template static void BM_binaryop_transform(nvbench::state& state) { - auto const table_size{static_cast(state.get_int64("table_size"))}; + auto const num_rows{static_cast(state.get_int64("num_rows"))}; auto const tree_levels{static_cast(state.get_int64("tree_levels"))}; // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; auto const source_table = create_sequence_table( - cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); + cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{num_rows}); cudf::table_view table{*source_table}; // Use the number of bytes read from global memory - state.add_global_memory_reads(table_size * (tree_levels + 1)); + state.add_global_memory_reads(static_cast(num_rows) * (tree_levels + 1)); + state.add_global_memory_writes(num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) @@ -64,15 +71,69 @@ static void BM_binaryop_transform(nvbench::state& state) }); } +template +static void BM_string_compare_binaryop_transform(nvbench::state& state) +{ + auto const string_width = static_cast(state.get_int64("string_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons"); + + // Create table data + auto const num_cols = tree_levels * 2; + std::vector> columns; + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) { + columns.emplace_back(create_string_column(num_rows, string_width, hit_rate)); + }); + + cudf::table table{std::move(columns)}; + cudf::table_view const table_view = table.view(); + + int64_t const chars_size = std::accumulate( + table_view.begin(), table_view.end(), static_cast(0), [](int64_t size, auto& column) { + return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream()); + }); + + // Create column references + + // Use the number of bytes read from global memory + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); + + // Construct binary operations (a == b && c == d && e == f && ...) + auto constexpr bool_type = cudf::data_type{cudf::type_id::BOOL8}; + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream{launch.get_stream().get_stream()}; + std::unique_ptr reduction = + cudf::binary_operation(table.get_column(0), table.get_column(1), cmp_op, bool_type, stream); + std::for_each( + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(tree_levels), + [&](size_t idx) { + std::unique_ptr comparison = cudf::binary_operation( + table.get_column(idx * 2), table.get_column(idx * 2 + 1), cmp_op, bool_type, stream); + std::unique_ptr reduced = + cudf::binary_operation(*comparison, *reduction, reduce_op, bool_type, stream); + stream.synchronize(); + reduction = std::move(reduced); + }); + }); +} + #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ \ static void name(::nvbench::state& st) \ { \ - BM_binaryop_transform(st); \ + ::BM_binaryop_transform(st); \ } \ NVBENCH_BENCH(name) \ .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ - .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique, int32_t, @@ -86,3 +147,20 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false); + +#define STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \ + \ + static void name(::nvbench::state& st) \ + { \ + ::BM_string_compare_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("string_width", {32, 64, 128, 256}) \ + .add_int64_axis("num_rows", {32768, 262144, 2097152}) \ + .add_int64_axis("tree_levels", {1, 2, 3, 4}) \ + .add_int64_axis("hit_rate", {50, 100}) + +STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(string_compare_binaryop_transform, + cudf::binary_operator::EQUAL, + cudf::binary_operator::LOGICAL_AND); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index bc0ff69bce9..426f44a4fa1 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -23,10 +23,10 @@ template void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) { - auto const table_size = static_cast(state.get_int64("table_size")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const source_table = create_random_table( - {cudf::type_to_id(), cudf::type_to_id()}, row_count{table_size}); + {cudf::type_to_id(), cudf::type_to_id()}, row_count{num_rows}); auto lhs = cudf::column_view(source_table->get_column(0)); auto rhs = cudf::column_view(source_table->get_column(1)); @@ -37,9 +37,9 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) cudf::binary_operation(lhs, rhs, binop, output_dtype); // use number of bytes read and written to global memory - state.add_global_memory_reads(table_size); - state.add_global_memory_reads(table_size); - state.add_global_memory_reads(table_size); + state.add_global_memory_reads(num_rows); + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); @@ -55,7 +55,7 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) } \ NVBENCH_BENCH(name) \ .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \ - .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) #define build_name(a, b, c, d) a##_##b##_##c##_##d diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index dc258e32dc5..bdce8a31176 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -17,13 +17,17 @@ #include "generate_input.hpp" #include "random_distribution_factory.cuh" +#include + #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -918,6 +922,58 @@ std::unique_ptr create_sequence_table(std::vector co return std::make_unique(std::move(columns)); } +std::unique_ptr create_string_column(cudf::size_type num_rows, + cudf::size_type row_width, + int32_t hit_rate) +{ + // build input table using the following data + auto raw_data = cudf::test::strings_column_wrapper( + { + "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; + "012345 6789 01234 56789 0123 456", // the rest do not match + "abc 4567890 DEFGHI 0987 Wxyz 123", + "abcdefghijklmnopqrstuvwxyz 01234", + "", + "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", + "9876543210,abcdefghijklmnopqrstU", + "9876543210,abcdefghijklmnopqrstU", + "123 édf 4567890 DéFG 0987 X5", + "1", + }) + .release(); + + if (row_width / 32 > 1) { + std::vector columns; + for (int i = 0; i < row_width / 32; ++i) { + columns.push_back(raw_data->view()); + } + raw_data = cudf::strings::concatenate(cudf::table_view(columns)); + } + auto data_view = raw_data->view(); + + // compute number of rows in n_rows that should match + auto const num_matches = (static_cast(num_rows) * hit_rate) / 100; + + // Create a randomized gather-map to build a column out of the strings in data. + data_profile gather_profile = + data_profile_builder().cardinality(0).null_probability(0.0).distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); + auto gather_table = + create_random_table({cudf::type_id::INT32}, row_count{num_rows}, gather_profile); + gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); + + // Create scatter map by placing 0-index values throughout the gather-map + auto scatter_data = cudf::sequence(num_matches, + cudf::numeric_scalar(0), + cudf::numeric_scalar(num_rows / num_matches)); + auto zero_scalar = cudf::numeric_scalar(0); + auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); + auto gather_map = table->view().column(0); + table = cudf::gather(cudf::table_view({data_view}), gather_map); + + return std::move(table->release().front()); +} + std::pair create_random_null_mask( cudf::size_type size, std::optional null_probability, unsigned seed) { diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 68d3dc492f5..57834fd11d2 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -670,6 +670,18 @@ std::unique_ptr create_random_column(cudf::type_id dtype_id, data_profile const& data_params = data_profile{}, unsigned seed = 1); +/** + * @brief Deterministically generates a large string column filled with data with the given + * parameters. + * + * @param num_rows Number of rows in the output column + * @param row_width Width of each string in the column + * @param hit_rate The hit rate percentage, ranging from 0 to 100 + */ +std::unique_ptr create_string_column(cudf::size_type num_rows, + cudf::size_type row_width, + int32_t hit_rate); + /** * @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in * subsequent rows. diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index fe24fb58728..45b46005c47 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -186,7 +186,7 @@ std::string exec_cmd(std::string_view cmd) std::fflush(nullptr); // Switch stderr and stdout to only capture stderr auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null"); - std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); + std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); CUDF_EXPECTS(pipe != nullptr, "popen() failed"); std::array buffer; diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index ae6c8b844c8..a73017dda18 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -17,10 +17,6 @@ #include #include -#include - -#include -#include #include #include #include @@ -28,57 +24,6 @@ #include -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate) -{ - // build input table using the following data - auto raw_data = cudf::test::strings_column_wrapper( - { - "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }) - .release(); - - if (row_width / 32 > 1) { - std::vector columns; - for (int i = 0; i < row_width / 32; ++i) { - columns.push_back(raw_data->view()); - } - raw_data = cudf::strings::concatenate(cudf::table_view(columns)); - } - auto data_view = raw_data->view(); - - // compute number of rows in n_rows that should match - auto matches = static_cast(n_rows * hit_rate) / 100; - - // Create a randomized gather-map to build a column out of the strings in data. - data_profile gather_profile = - data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); - auto gather_table = - create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile); - gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - // Create scatter map by placing 0-index values throughout the gather-map - auto scatter_data = cudf::sequence( - matches, cudf::numeric_scalar(0), cudf::numeric_scalar(n_rows / matches)); - auto zero_scalar = cudf::numeric_scalar(0); - auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); - auto gather_map = table->view().column(0); - table = cudf::gather(cudf::table_view({data_view}), gather_map); - - return std::move(table->release().front()); -} - // longer pattern lengths demand more working memory per string std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"}; @@ -94,7 +39,7 @@ static void bench_contains(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto col = build_input_column(n_rows, row_width, hit_rate); + auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index a9c620e4bf0..996bdcf0332 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -29,10 +28,6 @@ #include -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate); - static void bench_find_string(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); @@ -46,7 +41,7 @@ static void bench_find_string(nvbench::state& state) } auto const stream = cudf::get_default_stream(); - auto const col = build_input_column(n_rows, row_width, hit_rate); + auto const col = create_string_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); std::vector h_targets({"5W", "5W43", "0987 5W43"}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 99cef640dc3..105ae65cbe8 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -18,68 +18,12 @@ #include -#include -#include -#include #include #include #include #include -namespace { -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate) -{ - // build input table using the following data - auto raw_data = cudf::test::strings_column_wrapper( - { - "123 abc 4567890 DEFGHI 0987 5W43", // matches always; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }) - .release(); - if (row_width / 32 > 1) { - std::vector columns; - for (int i = 0; i < row_width / 32; ++i) { - columns.push_back(raw_data->view()); - } - raw_data = cudf::strings::concatenate(cudf::table_view(columns)); - } - auto data_view = raw_data->view(); - - // compute number of rows in n_rows that should match - auto matches = static_cast(n_rows * hit_rate) / 100; - - // Create a randomized gather-map to build a column out of the strings in data. - data_profile gather_profile = - data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); - auto gather_table = - create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile); - gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - // Create scatter map by placing 0-index values throughout the gather-map - auto scatter_data = cudf::sequence( - matches, cudf::numeric_scalar(0), cudf::numeric_scalar(n_rows / matches)); - auto zero_scalar = cudf::numeric_scalar(0); - auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); - auto gather_map = table->view().column(0); - table = cudf::gather(cudf::table_view({data_view}), gather_map); - - return std::move(table->release().front()); -} - -} // namespace - static void bench_like(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); @@ -91,7 +35,7 @@ static void bench_like(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto col = build_input_column(n_rows, row_width, hit_rate); + auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); // This pattern forces reading the entire target string (when matched expected) diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 8e48f8e9a05..43d57201b20 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -15,58 +15,45 @@ */ #include -#include -#include -#include #include #include #include -class TextNGrams : public cudf::benchmark {}; +#include -enum class ngrams_type { tokens, characters }; - -static void BM_ngrams(benchmark::State& state, ngrams_type nt) +static void bench_ngrams(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const ngram_type = state.get_string("type"); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto const separator = cudf::string_scalar("_"); - for (auto _ : state) { - cuda_event_timer raii(state, true); - switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; - case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; - } - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size * 2); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 5; - int const max_rowlen = 40; - int const len_mult = 2; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (ngram_type == "chars") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_character_ngrams(input); + }); + } else { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_ngrams(input, 2, separator); + }); + } } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextNGrams, name) \ - (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \ - BENCHMARK_REGISTER_F(TextNGrams, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(tokens) -NVTEXT_BENCHMARK_DEFINE(characters) +NVBENCH_BENCH(bench_ngrams) + .set_name("ngrams") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048578}) + .add_int64_axis("row_width", {10, 20, 40, 100}) + .add_string_axis("type", {"chars", "tokens"}); diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index a254171ef11..f4cce8e6da6 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -17,12 +17,8 @@ #include #include -#include #include #include -#include - -#include #include #include diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 6bbe32de134..e72661ce49a 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -24,8 +24,6 @@ #include -#include - namespace CUDF_EXPORT cudf { /** * @addtogroup column_factories diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 48f89b8be25..6db5c8b3c7b 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index ec5a511bb7c..486808ebe18 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -19,7 +19,6 @@ #include #include #include -#include #include diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp index 4b74d12f306..26b1bec2ced 100644 --- a/cpp/include/cudf/detail/is_element_valid.hpp +++ b/cpp/include/cudf/detail/is_element_valid.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 482265d633e..025e2ccc3ec 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -166,16 +166,9 @@ size_type inplace_bitmask_binop(Binop op, rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); cudf::detail::device_scalar d_counter{0, stream, mr}; - rmm::device_uvector d_masks(masks.size(), stream, mr); - rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyDefault, stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(), - masks_begin_bits.data(), - masks_begin_bits.size_bytes(), - cudaMemcpyDefault, - stream.value())); + + auto d_masks = cudf::detail::make_device_uvector_async(masks, stream, mr); + auto d_begin_bits = cudf::detail::make_device_uvector_async(masks_begin_bits, stream, mr); auto constexpr block_size = 256; cudf::detail::grid_1d config(dest_mask.size(), block_size); diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 5596f78a90b..0a799f27d00 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include /** diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp index 11eb4518210..5659f86b0c4 100644 --- a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp +++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp @@ -16,16 +16,10 @@ #pragma once -#include #include #include -#include - -#include -#include #include -#include namespace CUDF_EXPORT cudf { namespace io::text::detail::bgzip { diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp index 97a42243250..3e740b81cc9 100644 --- a/cpp/include/cudf/utilities/default_stream.hpp +++ b/cpp/include/cudf/utilities/default_stream.hpp @@ -16,10 +16,8 @@ #pragma once -#include #include -#include #include namespace CUDF_EXPORT cudf { diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 3f37ae02151..cf8413b597f 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -22,8 +22,6 @@ #include #include -#include - namespace CUDF_EXPORT cudf { /** diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 15b5f921c1b..6351a84e38f 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7c909f1a948..42124461cdf 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -41,6 +41,8 @@ namespace CUDF_EXPORT nvtext { * * This function uses MurmurHash3_x86_32 for the hash algorithm. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -71,6 +73,8 @@ std::unique_ptr minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -83,7 +87,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -102,6 +106,8 @@ std::unique_ptr minhash( * The hash function returns 2 uint64 values but only the first value * is used with the minhash calculation. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -112,7 +118,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values as UINT64 for each string in input */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -132,6 +138,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -144,7 +152,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -164,6 +172,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -173,7 +183,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash( +[[deprecated]] std::unique_ptr word_minhash( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -193,6 +203,8 @@ std::unique_ptr word_minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -202,7 +214,7 @@ std::unique_ptr word_minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash64( +[[deprecated]] std::unique_ptr word_minhash64( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp index 55a4124bfd0..e5b2a4cc21b 100644 --- a/cpp/include/nvtext/stemmer.hpp +++ b/cpp/include/nvtext/stemmer.hpp @@ -51,7 +51,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b1 = is_letter(st, VOWEL, 1) * b1 is now [false, true, true] * @endcode @@ -62,7 +62,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string * b2 is now [false, true, false] * @endcode @@ -99,7 +99,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, 1, 4] * b1 = is_letter(st, VOWEL, ix) * b1 is now [true, true, false] @@ -111,7 +111,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, -2, 4] // 2nd to last character in st[1] is checked * b2 = is_letter(st, CONSONANT, ix) * b2 is now [false, false, true] diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp index 3b650d791aa..5815ce33e33 100644 --- a/cpp/src/ast/expression_parser.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,6 @@ #include #include #include -#include -#include -#include #include #include #include diff --git a/cpp/src/ast/expressions.cpp b/cpp/src/ast/expressions.cpp index b45b9d0c78c..4c2b56dd4f5 100644 --- a/cpp/src/ast/expressions.cpp +++ b/cpp/src/ast/expressions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,6 @@ #include #include #include -#include -#include -#include #include #include diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index a6c878efbbc..1b23ea12a5e 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -27,15 +27,10 @@ #include #include #include -#include #include -#include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 482413d0ccb..972f97e8668 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -15,19 +15,13 @@ */ #include -#include #include #include #include -#include #include -#include #include -#include #include -#include - namespace cudf { namespace { struct size_of_helper { diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 386c5ebe478..e831aa9645d 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -27,9 +26,7 @@ #include #include -#include #include -#include #include namespace cudf { diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index d60fb5ce110..5e2065ba844 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -20,16 +20,11 @@ #include #include #include -#include #include -#include -#include #include #include -#include - #include namespace cudf { diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 1282eec6c44..a001807c82b 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/cpp/src/copying/split.cpp b/cpp/src/copying/split.cpp index 832a72ed5b0..116e3516460 100644 --- a/cpp/src/copying/split.cpp +++ b/cpp/src/copying/split.cpp @@ -14,10 +14,8 @@ * limitations under the License. */ -#include #include #include -#include #include #include diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 2196ee97fee..f786624680c 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -13,12 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include #include #include #include -#include #include #include diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu new file mode 100644 index 00000000000..519d7cd2f1c --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type max_occupancy_grid_size>( + cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh new file mode 100644 index 00000000000..d353830780f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "compute_mapping_indices.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +__device__ void find_local_mapping(cooperative_groups::thread_block const& block, + cudf::size_type idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + auto const is_valid_input = + idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)); + auto const [result_idx, inserted] = [&]() { + if (is_valid_input) { + auto const result = shared_set.insert_and_find(idx); + auto const matched_idx = *result.first; + auto const inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; + } + return cuda::std::pair{matched_idx, inserted}; + } + return cuda::std::pair{0, false}; // dummy values + }(); + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. + block.sync(); + if (is_valid_input) { + // element was already in set + if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cooperative_groups::thread_block const& block, + cudf::size_type cardinality, + SetRef global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + // for all unique keys in shared memory hash set, stores their matches in + // global hash set to `global_mapping_index` + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const input_idx = shared_set_indices[idx]; + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = + *global_set.insert_and_find(input_idx).first; + } +} + +/* + * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given + * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds + * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. + */ +template +CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback) +{ + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ cuco::window windows[window_extent.value()]; + + auto raw_set = cuco::static_set_ref{ + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + cuco::thread_scope_block, + cuco::aow_storage_ref{ + window_extent, windows}}; + auto shared_set = raw_set.rebind_operators(cuco::insert_and_find); + + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto idx = cudf::detail::grid_1d::global_thread_id(); + idx - block.thread_rank() < num_input_rows; + idx += stride) { + find_local_mapping(block, + idx, + num_input_rows, + shared_set, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { needs_global_memory_fallback->test_and_set(); } + break; + } + } + + // Insert unique keys from shared to global hash set if block-cardinality + // doesn't exceed the threshold upper-limit + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +cudf::size_type max_occupancy_grid_size(cudf::size_type n) +{ + cudf::size_type max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, mapping_indices_kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream) +{ + mapping_indices_kernel<<>>( + num, + global_set, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + needs_global_memory_fallback); +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp new file mode 100644 index 00000000000..473ad99e650 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +/* + * @brief Computes the maximum number of active blocks of the given kernel that can be executed on + * the underlying device + */ +template +[[nodiscard]] cudf::size_type max_occupancy_grid_size(cudf::size_type n); + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu new file mode 100644 index 00000000000..81c3c9e456f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type +max_occupancy_grid_size>(cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + nullable_hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp index 2bf983e5e90..dfad51f27d4 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index a9085a1f1fd..3041e261945 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index a99262fb3bf..c69ebe12d2c 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -20,11 +20,6 @@ #include #include -#include - -#include -#include - #include namespace cudf { diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 1b79fbf9eda..e4bdedf6603 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index a1be6aade4e..4395b741e53 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -16,11 +16,8 @@ #include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index d5caa4720ac..b3fcca62314 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -17,7 +17,6 @@ #include "avro.hpp" #include -#include #include namespace cudf { diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp index 2e992546ccc..fd2c781b8a1 100644 --- a/cpp/src/io/avro/avro.hpp +++ b/cpp/src/io/avro/avro.hpp @@ -18,11 +18,9 @@ #include "avro_common.hpp" -#include #include #include #include -#include #include #include #include diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 583bd6a3523..2e1cda2d6b7 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -18,9 +18,7 @@ #include "gpuinflate.hpp" -#include #include -#include #include #include diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index d4d6f46b99a..fb8c308065d 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -24,8 +24,6 @@ #include #include -#include - #include // uncompress #include // memset diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index a8682e6a760..ceaeb5d8f85 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -32,10 +32,8 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index d06338c6f69..570a00cbfc2 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index f6be4539d7f..7b3b04dea16 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -19,10 +19,7 @@ #include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index c42348a165f..0081ed30d17 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp index 4c1079cffe8..7e5db4b7617 100644 --- a/cpp/src/io/orc/reader_impl_helpers.cpp +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -16,8 +16,6 @@ #include "reader_impl_helpers.hpp" -#include - namespace cudf::io::orc::detail { std::unique_ptr create_empty_column(size_type orc_col_id, diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index 5528b2ee763..4cded30d89b 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -20,9 +20,6 @@ #include "io/orc/orc.hpp" #include "io/utilities/column_buffer.hpp" -#include -#include - #include #include diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index ddf65e9020f..d15435b2553 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -27,7 +27,6 @@ #include "ipc/Schema_generated.h" #include "writer_impl_helpers.hpp" -#include #include #include diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 9bc435bf6c8..66810ee163a 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -22,10 +22,9 @@ #pragma once #include -#include -#include +#include +#include #include -#include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 12c24e2b848..b87f2e9c692 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -22,10 +22,7 @@ #include #include -#include -#include #include -#include namespace CUDF_EXPORT cudf { namespace io::parquet::detail { diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index d4778b1ea15..05859d60c03 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -17,7 +17,6 @@ #pragma once #include "parquet.hpp" -#include "parquet_common.hpp" #include #include diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 347325edb1d..1ca2c404071 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -37,7 +37,14 @@ struct block_scan_results { }; template -__device__ inline static void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) +using block_scan_temp_storage = int[decode_block_size / cudf::detail::warp_size]; + +// Similar to CUB, must __syncthreads() after calling if reusing temp_storage +template +__device__ inline static void scan_block_exclusive_sum( + int thread_bit, + block_scan_results& results, + block_scan_temp_storage& temp_storage) { int const t = threadIdx.x; int const warp_index = t / cudf::detail::warp_size; @@ -45,15 +52,19 @@ __device__ inline static void scan_block_exclusive_sum(int thread_bit, block_sca uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; uint32_t warp_bits = ballot(thread_bit); - scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); + scan_block_exclusive_sum( + warp_bits, warp_lane, warp_index, lane_mask, results, temp_storage); } +// Similar to CUB, must __syncthreads() after calling if reusing temp_storage template -__device__ static void scan_block_exclusive_sum(uint32_t warp_bits, - int warp_lane, - int warp_index, - uint32_t lane_mask, - block_scan_results& results) +__device__ static void scan_block_exclusive_sum( + uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results, + block_scan_temp_storage& temp_storage) { // Compute # warps constexpr int num_warps = decode_block_size / cudf::detail::warp_size; @@ -64,16 +75,15 @@ __device__ static void scan_block_exclusive_sum(uint32_t warp_bits, results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); // Share the warp counts amongst the block threads - __shared__ int warp_counts[num_warps]; - if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } - __syncthreads(); + if (warp_lane == 0) { temp_storage[warp_index] = results.warp_count; } + __syncthreads(); // Sync to share counts between threads/warps // Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { - results.block_count += warp_counts[warp_idx]; - if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + results.block_count += temp_storage[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += temp_storage[warp_idx]; } } } @@ -603,6 +613,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c bool const is_first_lane = (warp_lane == 0); __syncthreads(); + __shared__ block_scan_temp_storage temp_storage; while (value_count < target_value_count) { bool const within_batch = value_count + t < target_value_count; @@ -612,15 +623,15 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c auto const [def_level, start_depth, end_depth] = [&]() { if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); } - int const index = rolling_index(value_count + t); - int const rep_level = static_cast(rep[index]); + int const level_index = rolling_index(value_count + t); + int const rep_level = static_cast(rep[level_index]); int const start_depth = s->nesting_info[rep_level].start_depth; if constexpr (!nullable) { return cuda::std::make_tuple(-1, start_depth, max_depth); } else { if (def != nullptr) { - int const def_level = static_cast(def[index]); + int const def_level = static_cast(def[level_index]); return cuda::std::make_tuple( def_level, start_depth, s->nesting_info[def_level].end_depth); } else { @@ -636,23 +647,26 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c int num_prior_new_rows, total_num_new_rows; { block_scan_results new_row_scan_results; - scan_block_exclusive_sum(is_new_row, new_row_scan_results); + scan_block_exclusive_sum(is_new_row, new_row_scan_results, temp_storage); + __syncthreads(); num_prior_new_rows = new_row_scan_results.thread_count_within_block; total_num_new_rows = new_row_scan_results.block_count; } - int const row_index = input_row_count + (num_prior_new_rows + is_new_row - 1); + int const row_index = input_row_count + ((num_prior_new_rows + is_new_row) - 1); input_row_count += total_num_new_rows; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); // VALUE COUNT: - // if we are within the range of nesting levels we should be adding value indices for - // is from/in current rep level to/in the rep level AT the depth with the def value + // in_nesting_bounds: if at a nesting level where we need to add value indices + // the bounds: from current rep to the rep AT the def depth int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; { block_scan_results value_count_scan_results; - scan_block_exclusive_sum(in_nesting_bounds, value_count_scan_results); + scan_block_exclusive_sum( + in_nesting_bounds, value_count_scan_results, temp_storage); + __syncthreads(); thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; warp_value_count = value_count_scan_results.warp_count; @@ -684,8 +698,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; block_scan_results valid_count_scan_results; - scan_block_exclusive_sum( - warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); + scan_block_exclusive_sum(warp_valid_mask, + warp_lane, + warp_index, + thread_mask, + valid_count_scan_results, + temp_storage); + __syncthreads(); thread_valid_count = valid_count_scan_results.thread_count_within_block; block_valid_count = valid_count_scan_results.block_count; } @@ -702,8 +721,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0; { block_scan_results next_value_count_scan_results; - scan_block_exclusive_sum(next_in_nesting_bounds, - next_value_count_scan_results); + scan_block_exclusive_sum( + next_in_nesting_bounds, next_value_count_scan_results, temp_storage); + __syncthreads(); next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; @@ -726,7 +746,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c } } - // validity is processed per-warp (on lane 0's), because writes are 32-bit atomic ops + // validity is processed per-warp (on lane 0's) + // thi is because when atomic writes are needed, they are 32-bit operations // // lists always read and write to the same bounds // (that is, read and write positions are already pre-bounded by first_row/num_rows). @@ -857,6 +878,7 @@ __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) // modulo 2 * block_size of course, since that's as many as we process at once int num_skipped = parquet_stream.skip_decode(t, num_to_skip); while (num_skipped < num_to_skip) { + // TODO: Instead of decoding, skip within the run to the appropriate location auto const to_decode = min(2 * decode_block_size_t, num_to_skip - num_skipped); num_skipped += parquet_stream.decode_next(t, to_decode); __syncthreads(); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8124be52552..1c1185d8afd 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -994,6 +994,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. * @param[in] is_list Whether or not the data contains list data. + * @param[in] is_list Whether or not the data contains list data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -1004,6 +1005,7 @@ void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages int level_type_size, bool has_nesting, bool is_list, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 32e922b04bb..a965f3325d5 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index dd354b905f3..170c6e8857f 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,8 +16,6 @@ #include "reader_impl.hpp" -#include - namespace cudf::io::parquet::detail { reader::reader() = default; diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index f367e28de1e..a9ef9d1d100 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -21,11 +21,9 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 1e1afc3d92d..76a2b050b9e 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -216,7 +216,7 @@ struct rle_stream { decode_index = -1; // signals the first iteration. Nothing to decode. } - __device__ inline int get_rle_run_info(rle_run& run) + __device__ inline int get_rle_run_info(rle_run& run) { run.start = cur; run.level_run = get_vlq32(run.start, end); @@ -225,7 +225,6 @@ struct rle_stream { int run_bytes = run.start - cur; if (is_literal_run(run.level_run)) { // from the parquet spec: literal runs always come in multiples of 8 values. - // bit shift level_run down because low bit indicates whether literal/repeated run.size = (run.level_run >> 1) * 8; run_bytes += util::div_rounding_up_unsafe(run.size * level_bits, 8); } else { @@ -248,10 +247,12 @@ struct rle_stream { fill_index < decode_index + run_buffer_size) && cur < end) { // Encoding::RLE + // Pass by reference to fill the runs shared memory with the run data auto& run = runs[rolling_index(fill_index)]; int const run_bytes = get_rle_run_info(run); - run.remaining = run.size; - run.output_pos = output_pos; + + run.remaining = run.size; + run.output_pos = output_pos; cur += run_bytes; output_pos += run.size; @@ -382,7 +383,7 @@ struct rle_stream { // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for // the first time while (cur < end) { - rle_run run; + rle_run run; int run_bytes = get_rle_run_info(run); if ((output_pos + run.size) > target_count) { diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 4baea8655e0..f4a2f29026a 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -22,10 +22,6 @@ #include #include -#include - -#include - #include namespace cudf::io::text { diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 249dc3b5875..6d954753af8 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -21,12 +21,12 @@ #include "column_buffer.hpp" +#include #include #include #include #include -#include #include namespace cudf::io::detail { diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index e73b2bc88de..31c8b781e77 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -22,12 +22,9 @@ #pragma once #include -#include #include -#include #include #include -#include #include #include @@ -35,6 +32,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 813743fa7b4..b66742569d9 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -16,14 +16,10 @@ #include "getenv_or.hpp" -#include #include #include -#include -#include -#include #include namespace cudf::io { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 4e8908a8942..9668b30e9a9 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -33,7 +33,6 @@ #include #include -#include #include namespace cudf { diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 98ed9b28f0a..93cdccfbb9f 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -22,8 +22,6 @@ #include #include -#include - #include #include diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp index c0bbca39167..cf252fe63af 100644 --- a/cpp/src/io/utilities/row_selection.cpp +++ b/cpp/src/io/utilities/row_selection.cpp @@ -16,10 +16,7 @@ #include "io/utilities/row_selection.hpp" -#include - #include -#include namespace cudf::io::detail { diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp index 7c607099cdc..e826feff201 100644 --- a/cpp/src/io/utilities/row_selection.hpp +++ b/cpp/src/io/utilities/row_selection.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 89c47d246d0..34a0bdce124 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -16,11 +16,8 @@ #include -#include - #include -#include #include namespace cudf { diff --git a/cpp/src/jit/util.cpp b/cpp/src/jit/util.cpp index 0585e02a031..d9a29203133 100644 --- a/cpp/src/jit/util.cpp +++ b/cpp/src/jit/util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,6 @@ #include #include -#include - namespace cudf { namespace jit { struct get_data_ptr_functor { diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index b0a84a6d50c..d27420658d6 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1126,12 +1126,8 @@ std::pair, rmm::device_uvector> generate_mer * `max` of 0. * * @param tdv input tdigests. The tdigests within this column are grouped by key. - * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is - * counted as one even when the cluster is empty in it. The offsets should have the same values as - * the ones in `group_offsets`. * @param group_offsets a device iterator of the offsets to the start of each group. A group is - * counted as one even when the cluster is empty in it. The offsets should have the same values as - * the ones in `h_group_offsets`. + * counted as one even when the cluster is empty in it. * @param group_labels a device iterator of the the group label for each tdigest cluster including * empty clusters. * @param num_group_labels the number of unique group labels. @@ -1142,9 +1138,8 @@ std::pair, rmm::device_uvector> generate_mer * * @return A column containing the merged tdigests. */ -template +template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, - HGroupOffsetIter h_group_offsets, GroupOffsetIter group_offsets, GroupLabelIter group_labels, size_t num_group_labels, @@ -1313,21 +1308,13 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); } - auto group_offsets_ = group_offsets_fn{input.size()}; - auto h_group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_labels = thrust::make_constant_iterator(0); - return to_tdigest_scalar(merge_tdigests(tdv, - h_group_offsets, - group_offsets, - group_labels, - input.size(), - 1, - max_centroids, - stream, - mr), - stream, - mr); + auto group_offsets_ = group_offsets_fn{input.size()}; + auto group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); + auto group_labels = thrust::make_constant_iterator(0); + return to_tdigest_scalar( + merge_tdigests(tdv, group_offsets, group_labels, input.size(), 1, max_centroids, stream, mr), + stream, + mr); } std::unique_ptr group_tdigest(column_view const& col, @@ -1376,16 +1363,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr); } - // bring group offsets back to the host - std::vector h_group_offsets(group_offsets.size()); - cudaMemcpyAsync(h_group_offsets.data(), - group_offsets.begin(), - sizeof(size_type) * group_offsets.size(), - cudaMemcpyDefault, - stream); - return merge_tdigests(tdv, - h_group_offsets.begin(), group_offsets.data(), group_labels.data(), group_labels.size(), diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp index a9f86ac1b5f..17844b6bb0a 100644 --- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp +++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index d187375b69f..75ebc078930 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -26,8 +26,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index d3c0b54f286..b91ae19b51a 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -14,13 +14,10 @@ * limitations under the License. */ -#include #include #include #include #include -#include -#include namespace cudf { diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 40d1d8a0a53..c4f6c135dde 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -13,16 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include #include #include #include #include #include -#include #include -#include #include #include diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp index 72c23395a93..7cad31c0658 100644 --- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp +++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp @@ -18,13 +18,10 @@ #include #include #include -#include #include #include #include #include -#include -#include #include namespace cudf::detail { diff --git a/cpp/src/rolling/detail/range_window_bounds.hpp b/cpp/src/rolling/detail/range_window_bounds.hpp index 8a53e937f98..77cb2a8c7f5 100644 --- a/cpp/src/rolling/detail/range_window_bounds.hpp +++ b/cpp/src/rolling/detail/range_window_bounds.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,7 @@ #pragma once #include -#include #include -#include -#include namespace cudf { namespace detail { diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp index 69792136c64..7f698dfcd6b 100644 --- a/cpp/src/rolling/range_window_bounds.cpp +++ b/cpp/src/rolling/range_window_bounds.cpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace cudf { namespace { diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 31535198c58..4ec2174a96f 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -26,8 +26,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index 656fe61fbbe..9f242bdffe0 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -16,10 +16,8 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 0db1adf1223..f5d052c6657 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -152,12 +153,8 @@ struct format_compiler { } // create program in device memory - d_items.resize(items.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items.data(), - items.size() * sizeof(items[0]), - cudaMemcpyDefault, - stream.value())); + d_items = cudf::detail::make_device_uvector_sync( + items, stream, cudf::get_current_device_resource_ref()); } format_item const* compiled_format_items() { return d_items.data(); } diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp index d1990733e81..60ad714dfec 100644 --- a/cpp/src/strings/regex/regexec.cpp +++ b/cpp/src/strings/regex/regexec.cpp @@ -24,7 +24,6 @@ #include #include -#include #include #include diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp index 219d1174d42..1cc405234b2 100644 --- a/cpp/src/strings/strings_scalar_factories.cpp +++ b/cpp/src/strings/strings_scalar_factories.cpp @@ -16,7 +16,6 @@ #include #include -#include #include diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index b0284e9cb96..e14142a9ad1 100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 5df9943303d..4012ee3d21c 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -21,13 +21,10 @@ #include #include #include -#include #include #include #include -#include #include -#include #include #include diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index cb707c94288..41c64c6decb 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 8a5340dc20d..659beb749af 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -20,10 +20,7 @@ #include #include -#include - #include -#include #include namespace cudf { diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 52b96bc9039..b919ac16956 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -23,8 +23,6 @@ #include #include #include -#include -#include #include #include diff --git a/cpp/src/utilities/cuda.cpp b/cpp/src/utilities/cuda.cpp index 53ca0608170..d979bda41d0 100644 --- a/cpp/src/utilities/cuda.cpp +++ b/cpp/src/utilities/cuda.cpp @@ -18,8 +18,6 @@ #include #include -#include - namespace cudf::detail { cudf::size_type num_multiprocessors() diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 9d8e3cf2fa6..e30806a5011 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp index 58971552758..000526723c4 100644 --- a/cpp/src/utilities/prefetch.cpp +++ b/cpp/src/utilities/prefetch.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 8c29182bfb5..7069b59be26 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp index a68dc84e340..c1e71f5f8f9 100644 --- a/cpp/src/utilities/traits.cpp +++ b/cpp/src/utilities/traits.cpp @@ -19,8 +19,6 @@ #include #include -#include - namespace cudf { namespace { diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp index 3095b342748..84c8529641d 100644 --- a/cpp/src/utilities/type_checks.cpp +++ b/cpp/src/utilities/type_checks.cpp @@ -21,8 +21,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a4213dcbe94..b78a64d0e55 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -611,7 +611,6 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp - text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index a4bde50a21e..7af88d8aa34 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -26,14 +25,8 @@ #include #include #include -#include -#include -#include #include #include -#include - -#include #include @@ -41,7 +34,6 @@ #include #include #include -#include #include template diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index aa5b49567e6..3bd67001c16 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -26,9 +26,7 @@ #include #include #include -#include #include -#include #include diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp index 03cc87a1968..e9a2761db4a 100644 --- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp +++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index fe221fb1c48..799bf646e52 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp index ab230ab036e..5570a7d498c 100644 --- a/cpp/tests/column/bit_cast_test.cpp +++ b/cpp/tests/column/bit_cast_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -26,8 +25,6 @@ #include -#include - template struct rep_type_impl { using type = void; diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 631f5150829..d700adaebd5 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp index 6de9121158b..470437f4112 100644 --- a/cpp/tests/column/column_view_device_span_test.cpp +++ b/cpp/tests/column/column_view_device_span_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index 37ab4b8f387..ad344476332 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -15,9 +15,7 @@ */ #include -#include #include -#include #include #include diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 603187f0330..aa9d508b6aa 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -26,11 +26,8 @@ #include #include #include -#include #include -#include - #include class ColumnFactoryTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 18140c34abd..aedc498964a 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -34,8 +34,6 @@ #include #include -#include - #include #include #include diff --git a/cpp/tests/copying/copy_if_else_nested_tests.cpp b/cpp/tests/copying/copy_if_else_nested_tests.cpp index cfbd181f944..e1cdfe9beed 100644 --- a/cpp/tests/copying/copy_if_else_nested_tests.cpp +++ b/cpp/tests/copying/copy_if_else_nested_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp index 25d93da277b..e2133a546e4 100644 --- a/cpp/tests/copying/copy_range_tests.cpp +++ b/cpp/tests/copying/copy_range_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 4124f749012..9c00725d5d2 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/gather_list_tests.cpp b/cpp/tests/copying/gather_list_tests.cpp index 247090aac90..93f71345c5c 100644 --- a/cpp/tests/copying/gather_list_tests.cpp +++ b/cpp/tests/copying/gather_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +17,6 @@ #include #include #include -#include -#include #include #include diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp index 28098878086..795e3f30aa1 100644 --- a/cpp/tests/copying/gather_str_tests.cpp +++ b/cpp/tests/copying/gather_str_tests.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp index 1598ab2646a..b2c0f7acc3a 100644 --- a/cpp/tests/copying/gather_struct_tests.cpp +++ b/cpp/tests/copying/gather_struct_tests.cpp @@ -17,20 +17,15 @@ #include #include #include -#include #include #include #include #include #include -#include -#include -#include #include #include #include -#include #include diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp index 07ce672b14d..908dcd67673 100644 --- a/cpp/tests/copying/gather_tests.cpp +++ b/cpp/tests/copying/gather_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index 90ff97e7355..b2d64dac7c8 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -16,10 +16,8 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp index 4f28ff12941..1f76efdc4c3 100644 --- a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -16,13 +16,10 @@ #include #include #include -#include #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/copying/reverse_tests.cpp b/cpp/tests/copying/reverse_tests.cpp index e4b2d319ddf..46516436901 100644 --- a/cpp/tests/copying/reverse_tests.cpp +++ b/cpp/tests/copying/reverse_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,17 +17,13 @@ #include #include #include -#include #include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/tests/copying/sample_tests.cpp b/cpp/tests/copying/sample_tests.cpp index 2f76e3f1fcd..8be5d8c1fbb 100644 --- a/cpp/tests/copying/sample_tests.cpp +++ b/cpp/tests/copying/sample_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,9 @@ */ #include -#include #include -#include #include -#include #include #include #include diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp index 42d2e004d6b..23faa6e5b86 100644 --- a/cpp/tests/copying/scatter_list_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include -#include using mask_vector = std::vector; using size_column = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/copying/scatter_list_tests.cpp b/cpp/tests/copying/scatter_list_tests.cpp index a82860a3eec..1f87fcfcc99 100644 --- a/cpp/tests/copying/scatter_list_tests.cpp +++ b/cpp/tests/copying/scatter_list_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/scatter_struct_scalar_tests.cpp b/cpp/tests/copying/scatter_struct_scalar_tests.cpp index 78572b0bb37..1d1da8a1b1e 100644 --- a/cpp/tests/copying/scatter_struct_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_struct_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/copying/scatter_struct_tests.cpp b/cpp/tests/copying/scatter_struct_tests.cpp index c92244d047b..7d88e9af85f 100644 --- a/cpp/tests/copying/scatter_struct_tests.cpp +++ b/cpp/tests/copying/scatter_struct_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include using namespace cudf::test::iterators; diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp index 41a753cd0ac..74c04446bdd 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -23,7 +22,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp index 8881fb344a2..a133ae43872 100644 --- a/cpp/tests/copying/segmented_gather_list_tests.cpp +++ b/cpp/tests/copying/segmented_gather_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index ff6808d9a79..72a8e7357bc 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -30,7 +29,6 @@ #include #include -#include using TestTypes = cudf::test::Types; diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index aef0d4ad78a..3868a147fa8 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -22,12 +22,8 @@ #include #include -#include #include #include -#include -#include -#include #include #include diff --git a/cpp/tests/copying/utility_tests.cpp b/cpp/tests/copying/utility_tests.cpp index 0905f9babdc..90457f8d74c 100644 --- a/cpp/tests/copying/utility_tests.cpp +++ b/cpp/tests/copying/utility_tests.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 603edb27c7c..44f99adc0e9 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -23,14 +23,11 @@ #include #include -#include #include #include #include #include -#include - #define XXX false // stub for null values constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp index 46bf5468922..ebc8c11e86c 100644 --- a/cpp/tests/dictionary/add_keys_test.cpp +++ b/cpp/tests/dictionary/add_keys_test.cpp @@ -24,8 +24,6 @@ #include #include -#include - struct DictionaryAddKeysTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryAddKeysTest, StringsColumn) diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp index 5db0e9fa1e4..dfa3ede5d46 100644 --- a/cpp/tests/dictionary/encode_test.cpp +++ b/cpp/tests/dictionary/encode_test.cpp @@ -21,8 +21,6 @@ #include #include -#include - struct DictionaryEncodeTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryEncodeTest, EncodeStringColumn) diff --git a/cpp/tests/dictionary/fill_test.cpp b/cpp/tests/dictionary/fill_test.cpp index 18696b66e48..bc7d19201aa 100644 --- a/cpp/tests/dictionary/fill_test.cpp +++ b/cpp/tests/dictionary/fill_test.cpp @@ -18,13 +18,10 @@ #include #include -#include #include #include #include -#include - struct DictionaryFillTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryFillTest, StringsColumn) diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp index 25501b4fde7..2774173b80a 100644 --- a/cpp/tests/dictionary/search_test.cpp +++ b/cpp/tests/dictionary/search_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/dictionary/slice_test.cpp b/cpp/tests/dictionary/slice_test.cpp index d80f8dee079..8c15d6dbecd 100644 --- a/cpp/tests/dictionary/slice_test.cpp +++ b/cpp/tests/dictionary/slice_test.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp index 26badefe698..a5e2db6a005 100644 --- a/cpp/tests/filling/fill_tests.cpp +++ b/cpp/tests/filling/fill_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp index 6326765c68b..c856984a4a3 100644 --- a/cpp/tests/filling/repeat_tests.cpp +++ b/cpp/tests/filling/repeat_tests.cpp @@ -17,14 +17,11 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include @@ -33,7 +30,6 @@ #include #include -#include constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index 0783b4e5bbb..53782c90c26 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index a222289216d..b96c6909e55 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -18,17 +18,14 @@ #include #include #include -#include #include #include -#include #include #include #include #include -#include #include using namespace numeric; diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index a79b6a32916..ba456084a7c 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -20,8 +20,6 @@ #include #include -#include - template struct groupby_collect_list_test : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index 61d2838590b..dfd7eb82c4a 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp index 26f714632dd..f8cc813e877 100644 --- a/cpp/tests/groupby/correlation_tests.cpp +++ b/cpp/tests/groupby/correlation_tests.cpp @@ -25,7 +25,6 @@ #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp index e3eb2da201f..81378bb91e8 100644 --- a/cpp/tests/groupby/covariance_tests.cpp +++ b/cpp/tests/groupby/covariance_tests.cpp @@ -23,10 +23,8 @@ #include #include -#include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/groupby_test_util.cpp b/cpp/tests/groupby/groupby_test_util.cpp index 5d99d15ae77..df0375d6a09 100644 --- a/cpp/tests/groupby/groupby_test_util.cpp +++ b/cpp/tests/groupby/groupby_test_util.cpp @@ -17,8 +17,8 @@ #include "groupby_test_util.hpp" #include -#include #include +#include #include #include @@ -27,9 +27,6 @@ #include #include #include -#include - -#include void test_single_agg(cudf::column_view const& keys, cudf::column_view const& values, diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 755b0c20f17..9d2e613be3e 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,11 +16,8 @@ #pragma once -#include #include -#include #include -#include enum class force_use_sort_impl : bool { NO, YES }; diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp index 2d447025919..783cfb17e49 100644 --- a/cpp/tests/groupby/histogram_tests.cpp +++ b/cpp/tests/groupby/histogram_tests.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp index d86de798844..6195e0179ec 100644 --- a/cpp/tests/groupby/max_scan_tests.cpp +++ b/cpp/tests/groupby/max_scan_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 279d71560b4..4481e2dc022 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 9736bb84dd6..1bfba265478 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 7f31bc9089f..f2a50248b4a 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -22,8 +22,6 @@ #include #include -#include - using namespace cudf::test::iterators; template diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp index 14c9ceb4508..49f9d7cb10a 100644 --- a/cpp/tests/groupby/shift_tests.cpp +++ b/cpp/tests/groupby/shift_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include template diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp index 69e518cbf8d..b54adb52496 100644 --- a/cpp/tests/hashing/md5_test.cpp +++ b/cpp/tests/hashing/md5_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp index c1a6e6ff6e1..b4622f5eb81 100644 --- a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp @@ -17,11 +17,9 @@ #include #include #include -#include #include #include -#include #include constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index e28e71442a6..1e86751bb4c 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index 61b584f94df..259e7102ee2 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index 8bc47c92c6b..a4affc87874 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index 4c79934f98d..8a5c090eeea 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 0eb1c60b8fc..77fc56b5f13 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/xxhash_64_test.cpp b/cpp/tests/hashing/xxhash_64_test.cpp index ab4ed829681..d8694a72d94 100644 --- a/cpp/tests/hashing/xxhash_64_test.cpp +++ b/cpp/tests/hashing/xxhash_64_test.cpp @@ -17,11 +17,8 @@ #include #include #include -#include #include -#include -#include #include using NumericTypesNoBools = diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index 2151ec6e22f..1ddc33e749a 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -17,17 +17,13 @@ #include "nanoarrow_utils.hpp" #include -#include #include #include -#include #include #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index ef9936b214c..d93ef28aab8 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -28,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp index 80a2e4b2ffd..3916025bf22 100644 --- a/cpp/tests/interop/from_arrow_stream_test.cpp +++ b/cpp/tests/interop/from_arrow_stream_test.cpp @@ -17,27 +17,14 @@ #include "nanoarrow_utils.hpp" #include -#include -#include #include -#include -#include -#include -#include #include -#include -#include -#include -#include #include #include #include -#include #include -#include - struct VectorOfArrays { std::vector arrays; nanoarrow::UniqueSchema schema; diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 6e742b9e4cf..18efae75cb1 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -25,9 +25,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -37,8 +35,6 @@ #include #include -#include -#include std::unique_ptr get_cudf_table() { diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 7ba586461dc..29aa928c277 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -17,21 +17,15 @@ #include "nanoarrow_utils.hpp" #include -#include #include -#include -#include #include #include -#include -#include #include #include #include #include #include -#include #include #include diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp index fcb4433b42e..fa3aa82fee2 100644 --- a/cpp/tests/interop/to_arrow_host_test.cpp +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -17,20 +17,14 @@ #include "nanoarrow_utils.hpp" #include -#include #include -#include -#include #include #include #include #include -#include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index a6aa4b22eca..86295d8efb1 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -19,14 +19,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index b265dcf9273..cc1e367d114 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -17,14 +17,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -32,18 +30,12 @@ #include #include #include -#include -#include #include -#include #include -#include - #include #include -#include #include #include #include diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp index 3c41f21b0a4..1b85541687a 100644 --- a/cpp/tests/io/file_io_test.cpp +++ b/cpp/tests/io/file_io_test.cpp @@ -15,13 +15,10 @@ */ #include -#include #include #include -#include - // Base test fixture for tests struct CuFileIOTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index d23acf3ae00..c8c2d18903f 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -29,7 +28,6 @@ #include #include -#include #include diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index cb6716f4a18..5f070bd53b9 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -39,8 +39,6 @@ #include -#include - #include #include #include diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 15682c6ae6b..887d4fa783f 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -15,12 +15,8 @@ */ #include "io/json/nested_json.hpp" -#include "io/utilities/hostdevice_vector.hpp" #include -#include -#include -#include #include #include @@ -29,9 +25,9 @@ #include #include -#include #include +#include #include #include #include diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index f32aba0e632..e0e955c4f48 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -21,24 +21,16 @@ #include #include #include -#include #include -#include #include -#include #include -#include #include -#include #include #include #include #include -#include - -#include #include #include diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cce0adbf317..fce99187516 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index c90b81ed27a..d66aa3bde9d 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -22,13 +22,11 @@ #include #include -#include #include #include #include #include -#include #include #include diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index f1286a00d22..d66f685cd9c 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -20,8 +20,6 @@ #include #include -#include -#include #include diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 7986a3c6d70..177e6163d4f 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include TEST_F(ParquetReaderTest, UserBounds) diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index be2ecd56424..5c3c8342cd2 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include // NOTE: this file exists to define the parquet test's `main()` function. diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp index ebadd870091..c40d3bbd299 100644 --- a/cpp/tests/io/row_selection_test.cpp +++ b/cpp/tests/io/row_selection_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp index 6f46df20633..79ce908f3e0 100644 --- a/cpp/tests/io/text/data_chunk_source_test.cpp +++ b/cpp/tests/io/text/data_chunk_source_test.cpp @@ -15,14 +15,11 @@ */ #include -#include #include #include #include -#include - #include #include diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 74d08061df9..60244462e2c 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -19,16 +19,12 @@ #include #include #include -#include -#include #include -#include #include #include #include #include -#include #include using cudf::test::strings_column_wrapper; diff --git a/cpp/tests/iterator/value_iterator.cpp b/cpp/tests/iterator/value_iterator.cpp index 22bc7475dbe..f7f7c0f2721 100644 --- a/cpp/tests/iterator/value_iterator.cpp +++ b/cpp/tests/iterator/value_iterator.cpp @@ -13,7 +13,6 @@ * the License. */ -#include #include CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp index 6f9dfd06730..c9bb691907a 100644 --- a/cpp/tests/jit/parse_ptx_function.cpp +++ b/cpp/tests/jit/parse_ptx_function.cpp @@ -16,7 +16,6 @@ #include "jit/parser.hpp" -#include #include #include diff --git a/cpp/tests/join/cross_join_tests.cpp b/cpp/tests/join/cross_join_tests.cpp index d87f5e54153..971913443e5 100644 --- a/cpp/tests/join/cross_join_tests.cpp +++ b/cpp/tests/join/cross_join_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 178edc52dd3..9070efa38fe 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -15,12 +15,8 @@ */ #include -#include #include -#include #include -#include -#include #include #include @@ -31,7 +27,6 @@ #include #include -#include #include template diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 3431e941359..6a8a54c8465 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -20,17 +20,12 @@ #include #include #include -#include #include -#include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 554d5754e39..ddc65c3f379 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index 42a574ac5c0..53166e04173 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp index 7b61be113f9..f1404990354 100644 --- a/cpp/tests/large_strings/large_strings_fixture.cpp +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -16,12 +16,10 @@ #include "large_strings_fixture.hpp" -#include #include #include #include -#include #include #include diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp index 007c08ce0fb..f47782a2d02 100644 --- a/cpp/tests/large_strings/parquet_tests.cpp +++ b/cpp/tests/large_strings/parquet_tests.cpp @@ -16,8 +16,6 @@ #include "large_strings_fixture.hpp" -#include -#include #include #include diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp index 8fb2b403051..7ae7a6a7414 100644 --- a/cpp/tests/lists/contains_tests.cpp +++ b/cpp/tests/lists/contains_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp index 92dd5df5ec7..2c24f695c29 100644 --- a/cpp/tests/lists/extract_tests.cpp +++ b/cpp/tests/lists/extract_tests.cpp @@ -21,12 +21,8 @@ #include #include -#include -#include #include -#include - #include #include #include diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp index 74545903eb3..dcb906cd2ef 100644 --- a/cpp/tests/lists/sequences_tests.cpp +++ b/cpp/tests/lists/sequences_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp index 5625b47e7ea..18aa118bb81 100644 --- a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp @@ -20,8 +20,6 @@ #include #include -#include -#include #include namespace cudf::test { diff --git a/cpp/tests/merge/merge_dictionary_test.cpp b/cpp/tests/merge/merge_dictionary_test.cpp index dd528c19e4e..1d7a31fd797 100644 --- a/cpp/tests/merge/merge_dictionary_test.cpp +++ b/cpp/tests/merge/merge_dictionary_test.cpp @@ -17,9 +17,7 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index bea044496b3..d9fdb6099f0 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -17,10 +17,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -30,10 +28,6 @@ #include -#include -#include -#include -#include #include #include diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 6208d395f0a..fad390105d7 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -34,7 +33,6 @@ #include #include -#include #include diff --git a/cpp/tests/partitioning/round_robin_test.cpp b/cpp/tests/partitioning/round_robin_test.cpp index 89d23c39dca..3693cfbcc72 100644 --- a/cpp/tests/partitioning/round_robin_test.cpp +++ b/cpp/tests/partitioning/round_robin_test.cpp @@ -17,10 +17,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -30,12 +28,7 @@ #include -#include -#include -#include -#include #include -#include #include using cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp index 6e88365b6e8..23b58618fe1 100644 --- a/cpp/tests/quantiles/quantile_test.cpp +++ b/cpp/tests/quantiles/quantile_test.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index 44d4ec61852..c7e11af8c85 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/reductions/ewm_tests.cpp b/cpp/tests/reductions/ewm_tests.cpp index 09cec688509..1117b0d1acf 100644 --- a/cpp/tests/reductions/ewm_tests.cpp +++ b/cpp/tests/reductions/ewm_tests.cpp @@ -18,9 +18,7 @@ #include #include -#include -#include #include template diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp index f5470f7d881..736b5081d8f 100644 --- a/cpp/tests/reductions/list_rank_test.cpp +++ b/cpp/tests/reductions/list_rank_test.cpp @@ -14,14 +14,9 @@ * limitations under the License. */ -#include - #include #include -#include -#include -#include #include struct ListRankScanTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp index 3ab1fc01eaa..19633211192 100644 --- a/cpp/tests/reductions/rank_tests.cpp +++ b/cpp/tests/reductions/rank_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index bdb98372836..c09cde8f9e4 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -22,9 +22,7 @@ #include #include -#include #include -#include #include #include #include @@ -33,11 +31,9 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index c4463d68a68..72d92c5ac53 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -20,13 +20,11 @@ #include #include -#include #include #include #include #include -#include #include #include diff --git a/cpp/tests/reductions/scan_tests.hpp b/cpp/tests/reductions/scan_tests.hpp index 858697d8ef5..c2cce4bbbfa 100644 --- a/cpp/tests/reductions/scan_tests.hpp +++ b/cpp/tests/reductions/scan_tests.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,9 +20,7 @@ #include #include -#include #include -#include #include #include @@ -30,7 +28,6 @@ #include #include -#include template struct TypeParam_to_host_type { diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp index 239c9ce6ddd..e972ea35ed0 100644 --- a/cpp/tests/replace/clamp_test.cpp +++ b/cpp/tests/replace/clamp_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/replace/normalize_replace_tests.cpp b/cpp/tests/replace/normalize_replace_tests.cpp index 2de17388ee8..c35f385329a 100644 --- a/cpp/tests/replace/normalize_replace_tests.cpp +++ b/cpp/tests/replace/normalize_replace_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include // This is the main test fixture diff --git a/cpp/tests/replace/replace_nans_tests.cpp b/cpp/tests/replace/replace_nans_tests.cpp index 35232204db7..1b9fe92066a 100644 --- a/cpp/tests/replace/replace_nans_tests.cpp +++ b/cpp/tests/replace/replace_nans_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp index fcee27305f2..0c8ccea52a6 100644 --- a/cpp/tests/replace/replace_nulls_tests.cpp +++ b/cpp/tests/replace/replace_nulls_tests.cpp @@ -20,13 +20,11 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index b12bf08520f..ae4041bcfaf 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -20,20 +20,16 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include #include diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp index b3d9b2e2f5f..59585c0e947 100644 --- a/cpp/tests/reshape/byte_cast_tests.cpp +++ b/cpp/tests/reshape/byte_cast_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/reshape/tile_tests.cpp b/cpp/tests/reshape/tile_tests.cpp index ed76b9d2ea5..25cfc5c5108 100644 --- a/cpp/tests/reshape/tile_tests.cpp +++ b/cpp/tests/reshape/tile_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index 165e0347785..e8a36d9ab48 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/empty_input_test.cpp b/cpp/tests/rolling/empty_input_test.cpp index e7d1e3f0b10..2e1815671a9 100644 --- a/cpp/tests/rolling/empty_input_test.cpp +++ b/cpp/tests/rolling/empty_input_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,7 @@ */ #include -#include #include -#include #include #include diff --git a/cpp/tests/rolling/grouped_rolling_range_test.cpp b/cpp/tests/rolling/grouped_rolling_range_test.cpp index fcfbd0eee78..2cb9b60000b 100644 --- a/cpp/tests/rolling/grouped_rolling_range_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_range_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,21 +17,16 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include -#include #include #include diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index 78d5daf7e83..78b444bcd93 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp index de057e96320..6519b0ed4ee 100644 --- a/cpp/tests/rolling/lead_lag_test.cpp +++ b/cpp/tests/rolling/lead_lag_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -26,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp index 2444992e68f..5f2b383ed55 100644 --- a/cpp/tests/rolling/nth_element_test.cpp +++ b/cpp/tests/rolling/nth_element_test.cpp @@ -17,22 +17,15 @@ #include #include #include -#include #include #include #include -#include -#include #include -#include - #include #include -#include - #include #include diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index 0eaab0c9f7a..dcaa47e722b 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -17,14 +17,10 @@ #include #include #include -#include #include #include -#include -#include #include -#include template using fwcw = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp b/cpp/tests/rolling/range_rolling_window_test.cpp index 461c41025e9..daf5fcc1d96 100644 --- a/cpp/tests/rolling/range_rolling_window_test.cpp +++ b/cpp/tests/rolling/range_rolling_window_test.cpp @@ -17,22 +17,17 @@ #include #include #include -#include #include #include -#include #include #include -#include -#include #include #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp index b77451bf0bc..a67555280f4 100644 --- a/cpp/tests/rolling/range_window_bounds_test.cpp +++ b/cpp/tests/rolling/range_window_bounds_test.cpp @@ -15,9 +15,6 @@ */ #include -#include -#include -#include #include #include @@ -25,8 +22,6 @@ #include -#include - struct RangeWindowBoundsTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index 6e0dc16dca9..72a511fd5f1 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -30,7 +29,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index 5f132f3ace9..26987ea1b7b 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -22,11 +22,8 @@ #include #include -#include #include -#include - class ScalarFactoryTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/search/search_dictionary_test.cpp b/cpp/tests/search/search_dictionary_test.cpp index 78f79ccc648..a3bb1dfda10 100644 --- a/cpp/tests/search/search_dictionary_test.cpp +++ b/cpp/tests/search/search_dictionary_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 7584003e800..fb5d0fcc889 100644 --- a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index c35d359e75c..05b9deb3463 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp index 7550cc27161..8d750be5677 100644 --- a/cpp/tests/search/search_test.cpp +++ b/cpp/tests/search/search_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp index 109095192f9..e3c9f8d349e 100644 --- a/cpp/tests/sort/is_sorted_tests.cpp +++ b/cpp/tests/sort/is_sorted_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp index e08a2105aea..ded46cb1f31 100644 --- a/cpp/tests/sort/rank_test.cpp +++ b/cpp/tests/sort/rank_test.cpp @@ -18,10 +18,8 @@ #include #include #include -#include #include -#include #include #include #include diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp index 8ab23936ceb..ce4148a941e 100644 --- a/cpp/tests/sort/sort_nested_types_tests.cpp +++ b/cpp/tests/sort/sort_nested_types_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index 6a35e977b46..e1505c7a474 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -28,7 +28,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp index 655166e0d62..88de9d51523 100644 --- a/cpp/tests/sort/stable_sort_tests.cpp +++ b/cpp/tests/sort/stable_sort_tests.cpp @@ -25,9 +25,6 @@ #include #include -#include -#include - #include #include diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 6c0582fb846..1204b019739 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -20,9 +20,7 @@ #include #include #include -#include -#include #include #include #include @@ -31,8 +29,6 @@ #include #include -#include -#include #include struct ApplyBooleanMask : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index a2dab649961..ee1bb3ead92 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -15,16 +15,11 @@ */ #include -#include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp index 14d7d8789ac..c618ff68cbb 100644 --- a/cpp/tests/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -27,8 +26,6 @@ #include #include -#include - auto constexpr null{0}; // null at current level auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); diff --git a/cpp/tests/stream_compaction/drop_nans_tests.cpp b/cpp/tests/stream_compaction/drop_nans_tests.cpp index bf72da5c840..71321361564 100644 --- a/cpp/tests/stream_compaction/drop_nans_tests.cpp +++ b/cpp/tests/stream_compaction/drop_nans_tests.cpp @@ -15,12 +15,9 @@ */ #include -#include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/stream_compaction/drop_nulls_tests.cpp b/cpp/tests/stream_compaction/drop_nulls_tests.cpp index dbac1d58195..d3b45c2323e 100644 --- a/cpp/tests/stream_compaction/drop_nulls_tests.cpp +++ b/cpp/tests/stream_compaction/drop_nulls_tests.cpp @@ -15,12 +15,10 @@ */ #include -#include #include #include #include -#include #include #include #include diff --git a/cpp/tests/stream_compaction/stable_distinct_tests.cpp b/cpp/tests/stream_compaction/stable_distinct_tests.cpp index 6c6c53331d4..cc847da6340 100644 --- a/cpp/tests/stream_compaction/stable_distinct_tests.cpp +++ b/cpp/tests/stream_compaction/stable_distinct_tests.cpp @@ -15,20 +15,16 @@ */ #include -#include #include #include #include #include -#include #include #include #include #include -#include - auto constexpr null{0}; // null at current level auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp index 640d159fc4f..bad93e92712 100644 --- a/cpp/tests/stream_compaction/unique_count_tests.cpp +++ b/cpp/tests/stream_compaction/unique_count_tests.cpp @@ -15,16 +15,11 @@ */ #include -#include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp index d5b6915b520..e2b32b898b3 100644 --- a/cpp/tests/stream_compaction/unique_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -15,22 +15,16 @@ */ #include -#include #include #include #include -#include #include -#include #include #include #include #include -#include -#include - using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp index 2a7b52b1b6b..3dcc6f9e632 100644 --- a/cpp/tests/streams/binaryop_test.cpp +++ b/cpp/tests/streams/binaryop_test.cpp @@ -21,7 +21,6 @@ #include #include -#include #include class BinaryopTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp index 42894a0ebcb..a74ee64f8de 100644 --- a/cpp/tests/streams/io/csv_test.cpp +++ b/cpp/tests/streams/io/csv_test.cpp @@ -17,13 +17,9 @@ #include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp index f98e685ed0c..d352c6c3b2a 100644 --- a/cpp/tests/streams/io/json_test.cpp +++ b/cpp/tests/streams/io/json_test.cpp @@ -19,9 +19,7 @@ #include #include -#include #include -#include #include #include diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp index b0eff1d3340..5bb17226029 100644 --- a/cpp/tests/streams/io/multibyte_split_test.cpp +++ b/cpp/tests/streams/io/multibyte_split_test.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp index cc43bf15b5d..10722557e6a 100644 --- a/cpp/tests/streams/io/orc_test.cpp +++ b/cpp/tests/streams/io/orc_test.cpp @@ -17,19 +17,11 @@ #include #include #include -#include -#include #include #include -#include #include -#include -#include -#include -#include -#include #include #include diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp index 9d2dec2d697..18bb80e64af 100644 --- a/cpp/tests/streams/io/parquet_test.cpp +++ b/cpp/tests/streams/io/parquet_test.cpp @@ -17,13 +17,9 @@ #include #include #include -#include -#include #include #include -#include -#include #include #include diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp index 2811bb676fa..27bd7e080c9 100644 --- a/cpp/tests/streams/join_test.cpp +++ b/cpp/tests/streams/join_test.cpp @@ -19,11 +19,9 @@ #include #include -#include #include #include #include -#include #include #include diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp index e96224003f4..ed37a72545f 100644 --- a/cpp/tests/streams/null_mask_test.cpp +++ b/cpp/tests/streams/null_mask_test.cpp @@ -14,15 +14,12 @@ * limitations under the License. */ -#include - #include #include #include #include #include -#include class NullMaskTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/reduction_test.cpp b/cpp/tests/streams/reduction_test.cpp index b4f013fc960..9ab972302e4 100644 --- a/cpp/tests/streams/reduction_test.cpp +++ b/cpp/tests/streams/reduction_test.cpp @@ -17,11 +17,8 @@ #include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/streams/rolling_test.cpp b/cpp/tests/streams/rolling_test.cpp index b352ad2c0d2..4d9899870b4 100644 --- a/cpp/tests/streams/rolling_test.cpp +++ b/cpp/tests/streams/rolling_test.cpp @@ -17,12 +17,10 @@ #include #include #include -#include #include #include #include -#include class RollingTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 07b2d77cc04..e7b282601e1 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -15,20 +15,16 @@ */ #include -#include #include #include #include -#include #include #include #include #include #include -#include - auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp index 36e595ab9fa..449e0830b0c 100644 --- a/cpp/tests/streams/strings/factory_test.cpp +++ b/cpp/tests/streams/strings/factory_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp index 4b4d0a7aff5..154e1c1b715 100644 --- a/cpp/tests/streams/strings/reverse_test.cpp +++ b/cpp/tests/streams/strings/reverse_test.cpp @@ -21,7 +21,6 @@ #include #include -#include class StringsReverseTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp index cf81dc6fb42..9f168abcb31 100644 --- a/cpp/tests/streams/transform_test.cpp +++ b/cpp/tests/streams/transform_test.cpp @@ -15,17 +15,11 @@ */ #include -#include #include #include -#include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index 9c0ecaa52c0..06b9c2fa3c1 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -23,10 +23,8 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index bb57d6f5e8a..e53adcf373a 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/strings/combine/join_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp index 00317146088..c92f1cfc8f8 100644 --- a/cpp/tests/strings/combine/join_list_elements_tests.cpp +++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -22,7 +22,6 @@ #include #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/strings/concatenate_tests.cpp b/cpp/tests/strings/concatenate_tests.cpp index 5cf4015b9e9..51dcc60d95e 100644 --- a/cpp/tests/strings/concatenate_tests.cpp +++ b/cpp/tests/strings/concatenate_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ #include #include -#include #include diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index b3dc3010c67..da0db0fc056 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 7e0338f1bf4..37b25d9b287 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4821a7fa999..7eb4b32d078 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -28,8 +27,6 @@ #include -#include - struct StringsFindallTests : public cudf::test::BaseFixture {}; TEST_F(StringsFindallTests, FindallTest) diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 79054551498..b788c05c152 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -23,8 +23,6 @@ #include #include -#include - struct StringsConvertTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 26bcfe8028d..c08effdb969 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -24,9 +24,6 @@ #include #include -#include -#include - #include #include diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index 219bd6d8b01..a34ff25cb69 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -17,28 +17,18 @@ #include #include #include -#include #include #include #include -#include #include -#include #include -#include -#include -#include -#include #include #include #include -#include #include -#include -#include #include #include diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp index c33eedf9bd9..c0df2f01a63 100644 --- a/cpp/tests/structs/utilities_tests.cpp +++ b/cpp/tests/structs/utilities_tests.cpp @@ -14,21 +14,15 @@ * limitations under the License. */ -#include "cudf_test/default_stream.hpp" - #include #include #include -#include #include #include #include -#include -#include #include #include -#include #include #include diff --git a/cpp/tests/table/row_operators_tests.cpp b/cpp/tests/table/row_operators_tests.cpp index 5fa63c47cf0..216c4d7b6bb 100644 --- a/cpp/tests/table/row_operators_tests.cpp +++ b/cpp/tests/table/row_operators_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/table/table_tests.cpp b/cpp/tests/table/table_tests.cpp index 1637ba7d7d3..363f1a0ba5d 100644 --- a/cpp/tests/table/table_tests.cpp +++ b/cpp/tests/table/table_tests.cpp @@ -17,17 +17,14 @@ #include #include #include -#include #include #include #include -#include #include #include #include -#include template using column_wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index e23f3f6e7d8..ef35a4472cf 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -21,13 +21,9 @@ #include #include -#include #include -#include -#include - #include struct MinHashTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 1acb4fc4265..c72c7cfc80e 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -28,8 +28,6 @@ #include -#include - struct TextGenerateNgramsTest : public cudf::test::BaseFixture {}; TEST_F(TextGenerateNgramsTest, Ngrams) diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index b0d41004e7e..2515cc917fa 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/text/stemmer_tests.cpp b/cpp/tests/text/stemmer_tests.cpp index a343913411c..82c4bf53cfc 100644 --- a/cpp/tests/text/stemmer_tests.cpp +++ b/cpp/tests/text/stemmer_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index a615780c02a..782551ad66e 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -19,13 +19,11 @@ #include #include -#include #include #include #include -#include #include // Global environment for temporary files diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index 2684123c08a..9437440f34d 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -20,10 +20,8 @@ #include #include -#include #include #include -#include #include diff --git a/cpp/tests/transform/nans_to_null_test.cpp b/cpp/tests/transform/nans_to_null_test.cpp index ba16c100e7a..42ca872a936 100644 --- a/cpp/tests/transform/nans_to_null_test.cpp +++ b/cpp/tests/transform/nans_to_null_test.cpp @@ -17,12 +17,10 @@ #include #include #include -#include #include #include #include -#include template struct NaNsToNullTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp index 5a88c402b8c..7797b2b2cf8 100644 --- a/cpp/tests/transpose/transpose_test.cpp +++ b/cpp/tests/transpose/transpose_test.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/types/traits_test.cpp b/cpp/tests/types/traits_test.cpp index 0d9092c33da..46468af515d 100644 --- a/cpp/tests/types/traits_test.cpp +++ b/cpp/tests/types/traits_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index 45b89b76070..ed4c1340dbb 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -20,18 +20,15 @@ #include #include -#include #include #include #include #include #include -#include #include #include -#include #include static auto const test_timestamps_D = std::vector{ diff --git a/cpp/tests/unary/math_ops_test.cpp b/cpp/tests/unary/math_ops_test.cpp index 5bfbf70d5f9..663a919f3f4 100644 --- a/cpp/tests/unary/math_ops_test.cpp +++ b/cpp/tests/unary/math_ops_test.cpp @@ -22,10 +22,6 @@ #include #include #include -#include -#include - -#include #include diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp index e7477c34642..3c616461c74 100644 --- a/cpp/tests/unary/unary_ops_test.cpp +++ b/cpp/tests/unary/unary_ops_test.cpp @@ -23,7 +23,6 @@ #include #include -#include #include template diff --git a/cpp/tests/utilities/random_seed.cpp b/cpp/tests/utilities/random_seed.cpp index ab5a31ce161..555d89b7dc5 100644 --- a/cpp/tests/utilities/random_seed.cpp +++ b/cpp/tests/utilities/random_seed.cpp @@ -13,8 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include -#include +#include namespace cudf { namespace test { diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp index 7aa05af4591..2a57d678d07 100644 --- a/cpp/tests/utilities_tests/column_debug_tests.cpp +++ b/cpp/tests/utilities_tests/column_debug_tests.cpp @@ -16,12 +16,9 @@ #include #include -#include #include #include -#include - #include #include diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 9d6d5ccb9b5..a13ce825d0b 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -17,20 +17,16 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include -#include - template struct ColumnUtilitiesTest : public cudf::test::BaseFixture { cudf::test::UniformRandomGenerator random; diff --git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp index 479c6687e75..339678f3be8 100644 --- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp index 5e3fda5e6f7..ff50dc39979 100644 --- a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp index fecb896f95a..c1c5776be74 100644 --- a/cpp/tests/utilities_tests/type_check_tests.cpp +++ b/cpp/tests/utilities_tests/type_check_tests.cpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/utilities_tests/type_list_tests.cpp b/cpp/tests/utilities_tests/type_list_tests.cpp index 849457056e4..6c3a84763a0 100644 --- a/cpp/tests/utilities_tests/type_list_tests.cpp +++ b/cpp/tests/utilities_tests/type_list_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include using namespace cudf::test; // this will make reading code way easier @@ -23,6 +22,7 @@ namespace { // Work around to remove parentheses surrounding a type template struct argument_type; + template struct argument_type { using type = U; diff --git a/dependencies.yaml b/dependencies.yaml index ff97b67f0ce..7c7aa43fa41 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -232,7 +232,7 @@ files: key: cudf-pandas-tests includes: - test_python_cudf_pandas - py_rapids_build_cudf_polars: + py_build_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars extras: @@ -727,7 +727,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.8,<1.9 + - polars>=1.11,<1.12 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index ecf619ddc44..5942cc16850 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,10 +342,7 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - # TODO: Replace the first entry in a follow-up with rmm.pylibrmm.device_buffer.DeviceBuffer - # when the RMM objects inventory is generated from branch-24.12. The RMM objects inventory - # can be accessed here : https://docs.rapids.ai/api/rmm/nightly/objects.inv - "DeviceBuffer": ("rmm.DeviceBuffer", "rmm.DeviceBuffer"), + "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), } diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md index 6fce268f309..f4d2c7319b3 100644 --- a/docs/cudf/source/developer_guide/contributing_guide.md +++ b/docs/cudf/source/developer_guide/contributing_guide.md @@ -15,8 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth linting. Specifically, cuDF uses the following tools: -- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance. -- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently. +- [`ruff`](https://docs.astral.sh/ruff/) checks for general code formatting compliance. - [`mypy`](http://mypy-lang.org/) performs static type checking. In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find. diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 95f5f9734dd..46221b6015b 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -38,10 +38,10 @@ "import os\n", "\n", "import cupy as cp\n", + "import dask_cudf\n", "import pandas as pd\n", "\n", "import cudf\n", - "import dask_cudf\n", "\n", "cp.random.seed(12)\n", "\n", diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index c5b9533597a..e0735a197fd 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -11,3 +11,4 @@ nvtext ngrams_tokenize normalize replace + stemmer diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst new file mode 100644 index 00000000000..b407ff8451a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst @@ -0,0 +1,6 @@ +======= +stemmer +======= + +.. automodule:: pylibcudf.nvtext.stemmer + :members: diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 75eafcc5387..abfe5a1b178 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -101,6 +101,8 @@ "outputs": [], "source": [ "# define a scalar function\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -247,6 +249,8 @@ "outputs": [], "source": [ "# redefine the same function from above\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -1622,6 +1626,8 @@ "outputs": [], "source": [ "# a user defined aggregation function.\n", + "\n", + "\n", "def udaf(df):\n", " return df[\"b\"].max() - df[\"b\"].min() / 2" ] diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java index e4106574a19..d792459901c 100644 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -155,6 +155,16 @@ public static HostMemoryBuffer allocate(long bytes) { return allocate(bytes, defaultPreferPinned); } + /** + * Allocate host memory bypassing the default allocator. This is intended to only be used by other allocators. + * Pinned memory will not be used for these allocations. + * @param bytes size in bytes to allocate + * @return the newly created buffer + */ + public static HostMemoryBuffer allocateRaw(long bytes) { + return new HostMemoryBuffer(UnsafeMemoryAccessor.allocate(bytes), bytes); + } + /** * Create a host buffer that is memory-mapped to a file. * @param path path to the file to map into host memory diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java b/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java deleted file mode 100644 index 72c2e659372..00000000000 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package ai.rapids.cudf; - -/** - * Represents some amount of host memory that has been reserved. A reservation guarantees that one - * or more allocations up to the reserved amount, minus padding for alignment will succeed. A - * reservation typically guarantees the amount can be allocated one, meaning when a buffer - * allocated from a reservation is freed it is not returned to the reservation, but to the pool of - * memory the reservation originally came from. If more memory is allocated from the reservation - * an OutOfMemoryError may be thrown, but it is not guaranteed to happen. - * - * When the reservation is closed any unused reservation will be returned to the pool of memory - * the reservation came from. - */ -public interface HostMemoryReservation extends HostMemoryAllocator, AutoCloseable {} diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java index 76b2799aad6..6da591d659f 100644 --- a/java/src/main/java/ai/rapids/cudf/Schema.java +++ b/java/src/main/java/ai/rapids/cudf/Schema.java @@ -29,26 +29,52 @@ public class Schema { public static final Schema INFERRED = new Schema(); private final DType topLevelType; + + /** + * Default value for precision value, when it is not specified or the column type is not decimal. + */ + private static final int UNKNOWN_PRECISION = -1; + + /** + * Store precision for the top level column, only applicable if the column is a decimal type. + *

+ * This variable is not designed to be used by any libcudf's APIs since libcudf does not support + * precisions for fixed point numbers. + * Instead, it is used only to pass down the precision values from Spark's DecimalType to the + * JNI level, where some JNI functions require these values to perform their operations. + */ + private final int topLevelPrecision; + private final List childNames; private final List childSchemas; private boolean flattened = false; private String[] flattenedNames; private DType[] flattenedTypes; + private int[] flattenedPrecisions; private int[] flattenedCounts; private Schema(DType topLevelType, + int topLevelPrecision, List childNames, List childSchemas) { this.topLevelType = topLevelType; + this.topLevelPrecision = topLevelPrecision; this.childNames = childNames; this.childSchemas = childSchemas; } + private Schema(DType topLevelType, + List childNames, + List childSchemas) { + this(topLevelType, UNKNOWN_PRECISION, childNames, childSchemas); + } + /** * Inferred schema. */ private Schema() { topLevelType = null; + topLevelPrecision = UNKNOWN_PRECISION; childNames = null; childSchemas = null; } @@ -104,14 +130,17 @@ private void flattenIfNeeded() { if (flatLen == 0) { flattenedNames = null; flattenedTypes = null; + flattenedPrecisions = null; flattenedCounts = null; } else { String[] names = new String[flatLen]; DType[] types = new DType[flatLen]; + int[] precisions = new int[flatLen]; int[] counts = new int[flatLen]; - collectFlattened(names, types, counts, 0); + collectFlattened(names, types, precisions, counts, 0); flattenedNames = names; flattenedTypes = types; + flattenedPrecisions = precisions; flattenedCounts = counts; } flattened = true; @@ -128,19 +157,20 @@ private int flattenedLength(int startingLength) { return startingLength; } - private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) { + private int collectFlattened(String[] names, DType[] types, int[] precisions, int[] counts, int offset) { if (childSchemas != null) { for (int i = 0; i < childSchemas.size(); i++) { Schema child = childSchemas.get(i); names[offset] = childNames.get(i); types[offset] = child.topLevelType; + precisions[offset] = child.topLevelPrecision; if (child.childNames != null) { counts[offset] = child.childNames.size(); } else { counts[offset] = 0; } offset++; - offset = this.childSchemas.get(i).collectFlattened(names, types, counts, offset); + offset = this.childSchemas.get(i).collectFlattened(names, types, precisions, counts, offset); } } return offset; @@ -226,6 +256,22 @@ public int[] getFlattenedTypeScales() { return ret; } + /** + * Get decimal precisions of the columns' types flattened from all levels in schema by + * depth-first traversal. + *

+ * This is used to pass down the decimal precisions from Spark to only the JNI layer, where + * some JNI functions require precision values to perform their operations. + * Decimal precisions should not be consumed by any libcudf's APIs since libcudf does not + * support precisions for fixed point numbers. + * + * @return An array containing decimal precision of all columns in schema. + */ + public int[] getFlattenedDecimalPrecisions() { + flattenIfNeeded(); + return flattenedPrecisions; + } + /** * Get the types of the columns in schema flattened from all levels by depth-first traversal. * @return An array containing types of all columns in schema. @@ -307,11 +353,13 @@ public HostColumnVector.DataType asHostDataType() { public static class Builder { private final DType topLevelType; + private final int topLevelPrecision; private final List names; private final List types; - private Builder(DType topLevelType) { + private Builder(DType topLevelType, int topLevelPrecision) { this.topLevelType = topLevelType; + this.topLevelPrecision = topLevelPrecision; if (topLevelType == DType.STRUCT || topLevelType == DType.LIST) { // There can be children names = new ArrayList<>(); @@ -322,14 +370,19 @@ private Builder(DType topLevelType) { } } + private Builder(DType topLevelType) { + this(topLevelType, UNKNOWN_PRECISION); + } + /** * Add a new column * @param type the type of column to add * @param name the name of the column to add (Ignored for list types) + * @param precision the decimal precision, only applicable for decimal types * @return the builder for the new column. This should really only be used when the type * passed in is a LIST or a STRUCT. */ - public Builder addColumn(DType type, String name) { + public Builder addColumn(DType type, String name, int precision) { if (names == null) { throw new IllegalStateException("A column of type " + topLevelType + " cannot have children"); @@ -340,21 +393,31 @@ public Builder addColumn(DType type, String name) { if (names.contains(name)) { throw new IllegalStateException("Cannot add duplicate names to a schema"); } - Builder ret = new Builder(type); + Builder ret = new Builder(type, precision); types.add(ret); names.add(name); return ret; } + public Builder addColumn(DType type, String name) { + return addColumn(type, name, UNKNOWN_PRECISION); + } + /** * Adds a single column to the current schema. addColumn is preferred as it can be used * to support nested types. * @param type the type of the column. * @param name the name of the column. + * @param precision the decimal precision, only applicable for decimal types. * @return this for chaining. */ + public Builder column(DType type, String name, int precision) { + addColumn(type, name, precision); + return this; + } + public Builder column(DType type, String name) { - addColumn(type, name); + addColumn(type, name, UNKNOWN_PRECISION); return this; } diff --git a/pyproject.toml b/pyproject.toml index 661c68ee62e..6933484f4e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ select = [ "F", # pycodestyle Warning "W", + # isort + "I", # no-blank-line-before-function "D201", # one-blank-line-after-class diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 7b2b71cf216..0e4afadccf5 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -56,27 +56,23 @@ # into the main repo. sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from config import cudf # noqa: W0611, E402, F401 -from utils import ( # noqa: E402 - OrderedSet, - collapse_fixtures, - column_generators, - make_fixture, -) - # Turn off isort until we upgrade to 5.8.0 # https://github.com/pycqa/isort/issues/1594 -# isort: off from config import ( # noqa: W0611, E402, F401 NUM_COLS, NUM_ROWS, collect_ignore, + cudf, # noqa: W0611, E402, F401 pytest_collection_modifyitems, pytest_sessionfinish, pytest_sessionstart, ) - -# isort: on +from utils import ( # noqa: E402 + OrderedSet, + collapse_fixtures, + column_generators, + make_fixture, +) @pytest_cases.fixture(params=[0, 1], ids=["AxisIndex", "AxisColumn"]) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 065655505b8..94dbdf5534d 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -688,15 +688,18 @@ cdef class Column: # special case for string column is_string_column = (cv.type().id() == libcudf_types.type_id.STRING) if is_string_column: - # get the size from offset child column (device to host copy) - offsets_column_index = 0 - offset_child_column = cv.child(offsets_column_index) - if offset_child_column.size() == 0: + if cv.num_children() == 0: base_nbytes = 0 else: - chars_size = get_element( - offset_child_column, offset_child_column.size()-1).value - base_nbytes = chars_size + # get the size from offset child column (device to host copy) + offsets_column_index = 0 + offset_child_column = cv.child(offsets_column_index) + if offset_child_column.size() == 0: + base_nbytes = 0 + else: + chars_size = get_element( + offset_child_column, offset_child_column.size()-1).value + base_nbytes = chars_size if data_ptr: if data_owner is None: diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 30353c4be6c..4221e745e65 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -4,7 +4,7 @@ import pickle from libc.stdint cimport uint8_t, uintptr_t from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -30,10 +30,6 @@ from libcpp.memory cimport make_unique cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.gather cimport ( - segmented_gather as cpp_segmented_gather, -) -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type @@ -339,26 +335,6 @@ def get_element(Column input_column, size_type index): ) -@acquire_spill_lock() -def segmented_gather(Column source_column, Column gather_map): - cdef shared_ptr[lists_column_view] source_LCV = ( - make_shared[lists_column_view](source_column.view()) - ) - cdef shared_ptr[lists_column_view] gather_map_LCV = ( - make_shared[lists_column_view](gather_map.view()) - ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_segmented_gather( - source_LCV.get()[0], gather_map_LCV.get()[0]) - ) - - result = Column.from_unique_ptr(move(c_result)) - return result - - cdef class _CPackedColumns: @staticmethod diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 1dc586bb257..1c9d3a01b80 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,49 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cpython cimport pycapsule -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - import pylibcudf -from pylibcudf.libcudf.interop cimport ( - DLManagedTensor, - from_dlpack as cpp_from_dlpack, - to_dlpack as cpp_to_dlpack, -) -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - columns_from_unique_ptr, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf.core.buffer import acquire_spill_lock from cudf.core.dtypes import ListDtype, StructDtype -def from_dlpack(dlpack_capsule): +def from_dlpack(object dlpack_capsule): """ Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". """ - cdef DLManagedTensor* dlpack_tensor = pycapsule.\ - PyCapsule_GetPointer(dlpack_capsule, 'dltensor') - pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor') - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_from_dlpack(dlpack_tensor) - ) - - res = columns_from_unique_ptr(move(c_result)) - dlpack_tensor.deleter(dlpack_tensor) - return res + return columns_from_pylibcudf_table( + pylibcudf.interop.from_dlpack(dlpack_capsule) + ) def to_dlpack(list source_columns): @@ -52,39 +25,13 @@ def to_dlpack(list source_columns): DLPack Tensor PyCapsule will have the name "dltensor". """ - if any(column.null_count for column in source_columns): - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." - ) - - cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_columns(source_columns) - - with nogil: - dlpack_tensor = cpp_to_dlpack( - source_table_view + return pylibcudf.interop.to_dlpack( + pylibcudf.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] ) - - return pycapsule.PyCapsule_New( - dlpack_tensor, - 'dltensor', - dlmanaged_tensor_pycapsule_deleter ) -cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: - cdef DLManagedTensor* dlpack_tensor = 0 - try: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'used_dltensor') - return # we do not call a used capsule's deleter - except Exception: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'dltensor') - dlpack_tensor.deleter(dlpack_tensor) - - def gather_metadata(object cols_dtypes): """ Generates a ColumnMetadata vector for each column. diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7e8710bedb6..12432ac6d5d 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport null_order, size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table -import pylibcudf +import pylibcudf as plc from pylibcudf cimport Scalar @@ -17,7 +17,7 @@ from pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): return Column.from_pylibcudf( - pylibcudf.lists.count_elements( + plc.lists.count_elements( col.to_pylibcudf(mode="read")) ) @@ -25,8 +25,8 @@ def count_elements(Column col): @acquire_spill_lock() def explode_outer(list source_columns, int explode_column_idx): return columns_from_pylibcudf_table( - pylibcudf.lists.explode_outer( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), + plc.lists.explode_outer( + plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]), explode_column_idx, ) ) @@ -35,7 +35,7 @@ def explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( - pylibcudf.lists.distinct( + plc.lists.distinct( col.to_pylibcudf(mode="read"), nulls_equal, nans_all_equal, @@ -46,7 +46,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( - pylibcudf.lists.sort_lists( + plc.lists.sort_lists( col.to_pylibcudf(mode="read"), ascending, null_order.BEFORE if na_position == "first" else null_order.AFTER, @@ -58,7 +58,7 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, size_type index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index, ) @@ -68,7 +68,7 @@ def extract_element_scalar(Column col, size_type index): @acquire_spill_lock() def extract_element_column(Column col, Column index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index.to_pylibcudf(mode="read"), ) @@ -78,7 +78,7 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() def contains_scalar(Column col, py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.contains( + plc.lists.contains( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, ) @@ -88,7 +88,7 @@ def contains_scalar(Column col, py_search_key): @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, True, @@ -99,7 +99,7 @@ def index_of_scalar(Column col, object py_search_key): @acquire_spill_lock() def index_of_column(Column col, Column search_keys): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), True, @@ -110,8 +110,8 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_rows( - pylibcudf.Table([ + plc.lists.concatenate_rows( + plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]) ) @@ -121,8 +121,18 @@ def concatenate_rows(list source_columns): @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_list_elements( + plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), dropna, ) ) + + +@acquire_spill_lock() +def segmented_gather(Column source_column, Column gather_map): + return Column.from_pylibcudf( + plc.lists.segmented_gather( + source_column.to_pylibcudf(mode="read"), + gather_map.to_pylibcudf(mode="read"), + ) + ) diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx index 5bf25562fed..63a389b64d5 100644 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx @@ -1,24 +1,19 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from enum import IntEnum -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view +from cudf.core.buffer import acquire_spill_lock + from pylibcudf.libcudf.nvtext.stemmer cimport ( - is_letter as cpp_is_letter, letter_type, - porter_stemmer_measure as cpp_porter_stemmer_measure, underlying_type_t_letter_type, ) from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +from pylibcudf import nvtext + class LetterType(IntEnum): CONSONANT = letter_type.CONSONANT @@ -27,43 +22,34 @@ class LetterType(IntEnum): @acquire_spill_lock() def porter_stemmer_measure(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_porter_stemmer_measure(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + nvtext.stemmer.porter_stemmer_measure( + strings.to_pylibcudf(mode="read"), + ) + ) @acquire_spill_lock() def is_letter(Column strings, object ltype, size_type index): - cdef column_view c_strings = strings.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + index, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, index)) - - return Column.from_unique_ptr(move(c_result)) @acquire_spill_lock() def is_letter_multi(Column strings, object ltype, Column indices): - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + indices.to_pylibcudf(mode="read"), + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 6e8ad556b08..3b13cc258ab 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. import sys -from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union +from collections.abc import Callable, Iterable +from typing import TYPE_CHECKING, Any, TypeVar, Union import numpy as np from pandas import Period, Timedelta, Timestamp @@ -42,7 +42,7 @@ SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation -AggType = Union[str, Callable] -MultiColumnAggType = Union[ - AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] +AggType = Union[str, Callable] # noqa: UP007 +MultiColumnAggType = Union[ # noqa: UP007 + AggType, Iterable[AggType], dict[Any, Iterable[AggType]] ] diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index caff019f575..ffa306bf93f 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle import weakref from types import SimpleNamespace -from typing import Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, Literal import numpy from typing_extensions import Self @@ -18,6 +18,9 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from collections.abc import Mapping + def host_memory_allocation(nbytes: int) -> memoryview: """Allocate host memory using NumPy diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 0bd8d6054b3..ecf9807cfc2 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,13 +2,16 @@ from __future__ import annotations -from typing import Literal, Mapping +from typing import TYPE_CHECKING, Literal from typing_extensions import Self import cudf from cudf.core.buffer.buffer import Buffer, BufferOwner +if TYPE_CHECKING: + from collections.abc import Mapping + class ExposureTrackedBuffer(Buffer): """An exposure tracked buffer. diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 06791df7dc0..a1e87d04bc9 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -29,4 +29,3 @@ Decimal128Column, DecimalBaseColumn, ) -from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 864e87b5377..087d0ed65f5 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -4,7 +4,7 @@ import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -26,6 +26,7 @@ if TYPE_CHECKING: from collections import abc + from collections.abc import Mapping, Sequence import numba.cuda diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7674565e2c3..d2cd6e8ac8f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,10 +4,11 @@ import pickle from collections import abc +from collections.abc import MutableSequence, Sequence from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2c9b0baa9b6..b6dc250e64d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Literal, Sequence, cast +from typing import TYPE_CHECKING, Literal, cast import numpy as np import pandas as pd @@ -31,6 +31,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ( ColumnBinaryOperand, DatetimeLikeScalar, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 8803ebd6791..8ae06f72d1e 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -3,8 +3,9 @@ from __future__ import annotations import warnings +from collections.abc import Sequence from decimal import Decimal -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import cupy as cp import numpy as np diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c6a39199e3b..6b25e568f00 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -11,7 +11,6 @@ from typing_extensions import Self import cudf -from cudf._lib.copying import segmented_gather from cudf._lib.lists import ( concatenate_list_elements, concatenate_rows, @@ -22,6 +21,7 @@ extract_element_scalar, index_of_column, index_of_scalar, + segmented_gather, sort_lists, ) from cudf._lib.strings.convert.convert_lists import format_list_column @@ -34,6 +34,8 @@ from cudf.core.missing import NA if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 05a0ab2e09a..a91c080fe21 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,9 +2,7 @@ from __future__ import annotations -from typing import Union, overload - -from typing_extensions import Literal +from typing import Literal, Union, overload import cudf import cudf.core.column diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 78d2814ed26..620cae65374 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -28,7 +28,7 @@ from .numerical_base import NumericalBaseColumn if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Sequence from cudf._typing import ( ColumnBinaryOperand, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b25e486d855..856ce0f75de 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,7 +5,7 @@ import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast, overload +from typing import TYPE_CHECKING, cast, overload import numpy as np import pandas as pd @@ -35,6 +35,8 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + from collections.abc import Sequence + import cupy import numba.cuda diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6b6f3e517a8..087d6474e7f 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -19,6 +19,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype _unit_to_nanoseconds_conversion = { diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bc093fdaa9a..496e86ed709 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -5,8 +5,9 @@ import itertools import sys from collections import abc +from collections.abc import Mapping from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Mapping, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7d4d34f5b04..bf1c39b23da 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -13,8 +13,8 @@ import textwrap import warnings from collections import abc, defaultdict -from collections.abc import Callable, Iterator -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from collections.abc import Callable, Iterator, MutableMapping +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numba diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 5250a741d3d..aa601a2b322 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -3,7 +3,7 @@ import enum from collections import abc -from typing import Any, Iterable, Mapping, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, cast import cupy as cp import numpy as np @@ -20,6 +20,9 @@ build_column, ) +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping, Sequence + # Implementation of interchange protocol classes # ---------------------------------------------- @@ -61,7 +64,7 @@ class _MaskKind(enum.IntEnum): _DtypeKind.BOOL, _DtypeKind.STRING, } -ProtoDtype = Tuple[_DtypeKind, int, str, str] +ProtoDtype = tuple[_DtypeKind, int, str, str] class _CuDFBuffer: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 37ad6b8fabb..205edd91d9d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import pickle import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. @@ -36,6 +36,7 @@ from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import MutableMapping from types import ModuleType from cudf._typing import Dtype, ScalarLike diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 81b20488d8d..6630bd96c01 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable, Literal +from typing import TYPE_CHECKING, Any, Literal import cupy as cp import numpy as np @@ -36,6 +36,8 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: + from collections.abc import Iterable + from cudf._typing import ( AggType, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd07c58c5d9..1b90e9f9df0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,10 +5,10 @@ import operator import pickle import warnings -from collections.abc import Hashable +from collections.abc import Hashable, MutableMapping from functools import cache, cached_property from numbers import Number -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5952815deef..e031f2a4e8e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -10,9 +10,7 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, - MutableMapping, TypeVar, cast, ) @@ -63,6 +61,8 @@ from cudf.utils.utils import _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import Callable, MutableMapping + from cudf._typing import ( ColumnLike, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 8182e5cede2..ce6a5c960dd 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,9 +3,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Union - -from typing_extensions import TypeAlias +from typing import Any, TypeAlias import cudf from cudf.api.types import _is_scalar_or_zero_d_array, is_integer @@ -46,11 +44,11 @@ class ScalarIndexer: key: GatherMap -IndexingSpec: TypeAlias = Union[ - EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer -] +IndexingSpec: TypeAlias = ( + EmptyIndexer | MapIndexer | MaskIndexer | ScalarIndexer | SliceIndexer +) -ColumnLabels: TypeAlias = List[str] +ColumnLabels: TypeAlias = list[str] def destructure_iloc_key( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 92d094d9de5..bfff62f0a89 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,7 +8,7 @@ import pickle import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, MutableMapping +from typing import TYPE_CHECKING, Any import cupy as cp import numpy as np @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator, Hashable + from collections.abc import Generator, Hashable, MutableMapping from typing_extensions import Self diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 29ed18ac0ce..9b60424c924 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,7 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal import cupy import numpy as np @@ -71,6 +71,8 @@ from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + from collections.abc import MutableMapping + import pyarrow as pa from cudf._typing import ( diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 68f34fa28ff..885e7b16644 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,7 +4,7 @@ import math import re import warnings -from typing import Literal, Sequence +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -20,6 +20,9 @@ from cudf.core import column from cudf.core.index import ensure_index +if TYPE_CHECKING: + from collections.abc import Sequence + # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { "year": "year", diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index c364d55e677..73afde407db 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -10,9 +10,9 @@ import pickle import types import warnings -from collections.abc import Callable, Iterator +from collections.abc import Callable, Iterator, Mapping from enum import IntEnum -from typing import Any, Literal, Mapping +from typing import Any, Literal import numpy as np diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index f82e300e83d..38103a71908 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, NamedTuple +from typing import Any, ContextManager, NamedTuple # noqa: UP035 from typing_extensions import Self diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index 8870fbc5c28..bb2fc00d9fc 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -9,6 +9,7 @@ python analyze-test-failures.py Example: +------- python analyze-test-failures.py log.json frame/* """ diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index d12d2697729..59966a5ff0c 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -35,7 +35,7 @@ def null_assert_warnings(*args, **kwargs): @pytest.fixture(scope="session", autouse=True) # type: ignore def patch_testing_functions(): - tm.assert_produces_warning = null_assert_warnings + tm.assert_produces_warning = null_assert_warnings # noqa: F821 pytest.raises = replace_kwargs({"match": None})(pytest.raises) diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index 4ea0b3b4413..a0ad872e4c7 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -5,7 +5,8 @@ """ Summarizes the test results per module. -Examples: +Examples +-------- python summarize-test-results.py log.json python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index a75a20a4681..63fd9601fc1 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -387,7 +387,8 @@ def test_dir_bound_method( ): """This test will fail because dir for bound methods is currently incorrect, but we have no way to fix it without materializing the slow - type, which is unnecessarily expensive.""" + type, which is unnecessarily expensive. + """ Fast, FastIntermediate = fast_and_intermediate_with_doc Slow, SlowIntermediate = slow_and_intermediate_with_doc diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index feab04ffadc..80201dd84db 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -81,50 +81,6 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "pylibcudf" -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] - [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" @@ -174,3 +130,18 @@ wheel.packages = ["cudf"] provider = "scikit_build_core.metadata.regex" input = "cudf/VERSION" regex = "(?P.*)" + +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "pylibcudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 87e19a2bccf..667cd7b1db8 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -32,51 +32,20 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", - "streamz", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "cudf_kafka", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf_kafka"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda", "streamz"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index eb93929cf61..f79e229d3f3 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -666,11 +666,11 @@ def __init__( raise NotImplementedError( "rolling window/groupby" ) # pragma: no cover; rollingwindow constructor has already raised + if self.options.dynamic: + raise NotImplementedError("dynamic group by") if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - if len(self.keys) == 0: - raise NotImplementedError("dynamic groupby") @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -802,10 +802,10 @@ class Join(IR): right_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "semi", "anti", "cross"], bool, tuple[int, int] | None, - str | None, + str, bool, ] """ @@ -840,7 +840,7 @@ def __init__( @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "semi", "anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -862,13 +862,13 @@ def _joiners( plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "leftsemi": + elif how == "semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "leftanti": + elif how == "anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -933,7 +933,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left, right = (c.evaluate(cache=cache) for c in self.children) how, join_nulls, zlice, suffix, coalesce = self.options - suffix = "_right" if suffix is None else suffix if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps @@ -955,7 +954,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: columns[left.num_columns :], right.column_names, strict=True ) ] - return DataFrame([*left_cols, *right_cols]) + return DataFrame([*left_cols, *right_cols]).slice(zlice) # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 522c4a6729c..c28f2c2651a 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,10 +5,11 @@ from __future__ import annotations +import functools import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch -from typing import Any +from typing import TYPE_CHECKING, Any import pyarrow as pa import pylibcudf as plc @@ -19,9 +20,13 @@ from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes, sorting +if TYPE_CHECKING: + from cudf_polars.typing import ExprTransformer + __all__ = ["translate_ir", "translate_named_expr"] @@ -182,7 +187,71 @@ def _( with set_node(visitor, node.input_right): inp_right = translate_ir(visitor, n=None) right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + if (how := node.options[0]) in { + "inner", + "left", + "right", + "full", + "cross", + "semi", + "anti", + }: + return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + else: + how, op1, op2 = how + if how != "ie_join": + raise NotImplementedError( + f"Unsupported join type {how}" + ) # pragma: no cover; asof joins not yet exposed + # No exposure of mixed/conditional joins in pylibcudf yet, so in + # the first instance, implement by doing a cross join followed by + # a filter. + _, join_nulls, zlice, suffix, coalesce = node.options + cross = ir.Join( + schema, + [], + [], + ("cross", join_nulls, None, suffix, coalesce), + inp_left, + inp_right, + ) + dtype = plc.DataType(plc.TypeId.BOOL8) + if op2 is None: + ops = [op1] + else: + ops = [op1, op2] + suffix = cross.options[3] + + # Column references in the right table refer to the post-join + # names, so with suffixes. + def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(e, expr.Col) and e.name in inp_left.schema: + return type(e)(e.dtype, f"{e.name}{suffix}") + return reuse_if_unchanged(e, rec) + + mapper = make_recursive(_rename) + right_on = [ + expr.NamedExpr( + f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new + ) + for new, old in zip( + (mapper(e.value) for e in right_on), right_on, strict=True + ) + ] + mask = functools.reduce( + functools.partial( + expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND + ), + ( + expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value) + for op, left, right in zip(ops, left_on, right_on, strict=True) + ), + ) + filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross) + if zlice is not None: + offset, length = zlice + return ir.Slice(schema, offset, length, filtered) + return filtered @_translate_ir.register @@ -319,8 +388,7 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - # Polars 1.7 changes definition of the CSV reader options schema name. - if (version := visitor.version()) >= (3, 0): + if (version := visitor.version()) >= (4, 0): raise NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 05b76d76808..a3607159e01 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -53,12 +53,34 @@ def pytest_configure(config: pytest.Config): "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", @@ -107,6 +129,14 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", @@ -124,13 +154,6 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", @@ -140,6 +163,7 @@ def pytest_configure(config: pytest.Config): "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", # Maybe flaky, order-dependent? "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index a8bb634732f..2afdab1be4b 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.8,<1.9", + "polars>=1.11,<1.12", "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 7d9ec98db97..501560d15b8 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -2,9 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from contextlib import nullcontext + import pytest import polars as pl +from polars.testing import assert_frame_equal from cudf_polars.testing.asserts import ( assert_gpu_result_equal, @@ -22,6 +25,11 @@ def how(request): return request.param +@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)]) +def zlice(request): + return request.param + + @pytest.fixture def left(): return pl.LazyFrame( @@ -37,8 +45,9 @@ def left(): def right(): return pl.LazyFrame( { - "a": [1, 4, 3, 7, None, None], - "c": [2, 3, 4, 5, 6, 7], + "a": [1, 4, 3, 7, None, None, 1], + "c": [2, 3, 4, 5, 6, 7, 8], + "d": [6, None, 7, 8, -1, 2, 4], } ) @@ -70,11 +79,31 @@ def test_coalesce_join(left, right, how, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") -def test_cross_join(left, right): +def test_left_join_with_slice(left, right, join_nulls, zlice): + q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) + ctx = nullcontext() + if zlice is not None: + q_expect = q.collect().slice(*zlice) + q = q.slice(*zlice) + if zlice == (1, 5) or zlice == (0, 2): + # https://github.com/pola-rs/polars/issues/19403 + # https://github.com/pola-rs/polars/issues/19405 + ctx = pytest.raises(AssertionError) + assert_frame_equal( + q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) + ) + + with ctx: + assert_gpu_result_equal(q) + + +def test_cross_join(left, right, zlice): q = left.join(right, how="cross") + if zlice is not None: + q = q.slice(*zlice) assert_gpu_result_equal(q) @@ -86,3 +115,26 @@ def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "conditions", + [ + [pl.col("a") < pl.col("a_right")], + [pl.col("a_right") <= pl.col("a") * 2], + [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], + [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + ], +) +def test_join_where(left, right, conditions, zlice): + q = left.join_where(right, *conditions) + + assert_gpu_result_equal(q, check_row_order=False) + + if zlice is not None: + q_len = q.slice(*zlice).select(pl.len()) + # Can't compare result, since row order is not guaranteed and + # therefore we only check the length + + assert_gpu_result_equal(q_len) diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 1cda9b71387..c5135bc6414 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -2,6 +2,7 @@ import socket import pytest + from custreamz import kafka diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index af45f49d9b4..a8ab05a3922 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -65,50 +65,20 @@ include = [ ] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "streamz", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["streamz"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 04c2ad65b99..f9df22cc436 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -7,15 +7,15 @@ # do anything for dask==2024.2.0) config.set({"dataframe.query-planning-warning": False}) -import dask.dataframe as dd -from dask.dataframe import from_delayed +import dask.dataframe as dd # noqa: E402 +from dask.dataframe import from_delayed # noqa: E402 -import cudf +import cudf # noqa: E402 -from . import backends -from ._version import __git_commit__, __version__ -from .core import concat, from_cudf, from_dask_dataframe -from .expr import QUERY_PLANNING_ON +from . import backends # noqa: E402, F401 +from ._version import __git_commit__, __version__ # noqa: E402, F401 +from .core import concat, from_cudf, from_dask_dataframe # noqa: E402 +from .expr import QUERY_PLANNING_ON # noqa: E402 def read_csv(*args, **kwargs): @@ -55,9 +55,9 @@ def inner_func(*args, **kwargs): to_orc = raise_not_implemented_error("to_orc") else: - from .core import DataFrame, Index, Series - from .groupby import groupby_agg - from .io import read_text, to_orc + from .core import DataFrame, Index, Series # noqa: F401 + from .groupby import groupby_agg # noqa: F401 + from .io import read_text, to_orc # noqa: F401 __all__ = [ diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py index a76b655ef42..6dadadd5263 100644 --- a/python/dask_cudf/dask_cudf/expr/__init__.py +++ b/python/dask_cudf/dask_cudf/expr/__init__.py @@ -12,8 +12,8 @@ config.set({"dataframe.shuffle.method": "tasks"}) try: - import dask_cudf.expr._collection - import dask_cudf.expr._expr + import dask_cudf.expr._collection # noqa: F401 + import dask_cudf.expr._expr # noqa: F401 except ImportError as err: # Dask *should* raise an error before this. diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 76bb2ea99b4..0421bd755f4 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. -from .csv import read_csv -from .json import read_json -from .orc import read_orc, to_orc -from .text import read_text +from .csv import read_csv # noqa: F401 +from .json import read_json # noqa: F401 +from .orc import read_orc, to_orc # noqa: F401 +from .text import read_text # noqa: F401 try: - from .parquet import read_parquet, to_parquet + from .parquet import read_parquet, to_parquet # noqa: F401 except ImportError: pass diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 705865d083b..862e8f36eaa 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -69,50 +69,17 @@ version = {file = "dask_cudf/VERSION"} [tool.setuptools.packages.find] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true +[tool.ruff] +extend = "../../pyproject.toml" -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", -] -known_first_party = [ - "dask_cudf", -] +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["dask_cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", -] +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index aa67b4b1149..9bdfdab97c2 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -13,6 +13,7 @@ from . cimport ( expressions, filling, groupby, + interop, join, json, labeling, @@ -62,6 +63,7 @@ __all__ = [ "filling", "gpumemoryview", "groupby", + "interop", "join", "json", "lists", diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd new file mode 100644 index 00000000000..2a0a8c15fdd --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table cimport Table + + +cpdef Table from_dlpack(object managed_tensor) + +cpdef object to_dlpack(Table input) diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 642516a1b90..61e812353b7 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -1,6 +1,11 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New +from cpython.pycapsule cimport ( + PyCapsule_GetPointer, + PyCapsule_IsValid, + PyCapsule_New, + PyCapsule_SetName, +) from libc.stdlib cimport free from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -16,11 +21,14 @@ from pylibcudf.libcudf.interop cimport ( ArrowArray, ArrowArrayStream, ArrowSchema, + DLManagedTensor, column_metadata, from_arrow_column as cpp_from_arrow_column, from_arrow_stream as cpp_from_arrow_stream, + from_dlpack as cpp_from_dlpack, to_arrow_host_raw, to_arrow_schema_raw, + to_dlpack as cpp_to_dlpack, ) from pylibcudf.libcudf.table.table cimport table @@ -315,3 +323,87 @@ def _to_arrow_scalar(cudf_object, metadata=None): # Note that metadata for scalars is primarily important for preserving # information on nested types since names are otherwise irrelevant. return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0] + + +cpdef Table from_dlpack(object managed_tensor): + """ + Convert a DLPack DLTensor into a cudf table. + + For details, see :cpp:func:`cudf::from_dlpack` + + Parameters + ---------- + managed_tensor : PyCapsule + A 1D or 2D column-major (Fortran order) tensor. + + Returns + ------- + Table + Table with a copy of the tensor data. + """ + if not PyCapsule_IsValid(managed_tensor, "dltensor"): + raise ValueError("Invalid PyCapsule object") + cdef unique_ptr[table] c_result + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + managed_tensor, "dltensor" + ) + if dlpack_tensor is NULL: + raise ValueError("PyCapsule object contained a NULL pointer") + PyCapsule_SetName(managed_tensor, "used_dltensor") + + # Note: A copy is always performed when converting the dlpack + # data to a libcudf table. We also delete the dlpack_tensor pointer + # as the pointer is not deleted by libcudf's from_dlpack function. + # TODO: https://github.com/rapidsai/cudf/issues/10874 + # TODO: https://github.com/rapidsai/cudf/issues/10849 + with nogil: + c_result = cpp_from_dlpack(dlpack_tensor) + + cdef Table result = Table.from_libcudf(move(c_result)) + dlpack_tensor.deleter(dlpack_tensor) + return result + + +cpdef object to_dlpack(Table input): + """ + Convert a cudf table into a DLPack DLTensor. + + For details, see :cpp:func:`cudf::to_dlpack` + + Parameters + ---------- + input : Table + A 1D or 2D column-major (Fortran order) tensor. + + Returns + ------- + PyCapsule + 1D or 2D DLPack tensor with a copy of the table data, or nullptr. + """ + for col in input._columns: + if col.null_count(): + raise ValueError( + "Cannot create a DLPack tensor with null values. " + "Input is required to have null count as zero." + ) + cdef DLManagedTensor *dlpack_tensor + + with nogil: + dlpack_tensor = cpp_to_dlpack(input.view()) + + return PyCapsule_New( + dlpack_tensor, + "dltensor", + dlmanaged_tensor_pycapsule_deleter + ) + + +cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: + if PyCapsule_IsValid(pycap_obj, "used_dltensor"): + # we do not call a used capsule's deleter + return + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + pycap_obj, "dltensor" + ) + if dlpack_tensor is not NULL: + dlpack_tensor.deleter(dlpack_tensor) diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd index 30b97fdec34..b75e9ca7001 100644 --- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -32,11 +32,13 @@ cdef extern from "cudf/interop.hpp" nogil: cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: - cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor - ) except + + cdef unique_ptr[table] from_dlpack( + const DLManagedTensor* managed_tensor + ) except + - DLManagedTensor* to_dlpack(table_view input_table - ) except + + DLManagedTensor* to_dlpack( + const table_view& input + ) except + cdef cppclass column_metadata: column_metadata() except + diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd index 673bffa28ae..be3a2d75718 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -8,9 +9,9 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil: - ctypedef enum letter_type: - CONSONANT 'nvtext::letter_type::CONSONANT' - VOWEL 'nvtext::letter_type::VOWEL' + cpdef enum class letter_type: + CONSONANT + VOWEL cdef unique_ptr[column] porter_stemmer_measure( const column_view & strings diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 7a94490998a..d97c0a73267 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx normalize.pyx replace.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 5a5e665d309..a658e57018e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -8,6 +8,7 @@ from . cimport ( ngrams_tokenize, normalize, replace, + stemmer, ) __all__ = [ @@ -18,4 +19,5 @@ __all__ = [ "ngrams_tokenize", "normalize", "replace", + "stemmer", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 77187f0845d..2c1feb089a2 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -8,6 +8,7 @@ ngrams_tokenize, normalize, replace, + stemmer, ) __all__ = [ @@ -18,4 +19,5 @@ "ngrams_tokenize", "normalize", "replace", + "stemmer", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd new file mode 100644 index 00000000000..48762efc01f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.stemmer cimport letter_type +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnOrSize: + Column + size_type + +cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices) + +cpdef Column porter_stemmer_measure(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx new file mode 100644 index 00000000000..854d1053624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.stemmer cimport ( + is_letter as cpp_is_letter, + letter_type, + porter_stemmer_measure as cpp_porter_stemmer_measure, +) +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column is_letter( + Column input, + bool check_vowels, + ColumnOrSize indices +): + """ + Returns boolean column indicating if the character + or characters at the provided character index or + indices (respectively) are consonants or vowels + + For details, see :cpp:func:`is_letter` + + Parameters + ---------- + input : Column + Input strings + check_vowels : bool + If true, the check is for vowels. Otherwise the check is + for consonants. + indices : Union[Column, size_type] + The character position(s) to check in each string + + Returns + ------- + Column + New boolean column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_is_letter( + input.view(), + letter_type.VOWEL if check_vowels else letter_type.CONSONANT, + indices if ColumnOrSize is size_type else indices.view() + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column porter_stemmer_measure(Column input): + """ + Returns the Porter Stemmer measurements of a strings column. + + For details, see :cpp:func:`porter_stemmer_measure` + + Parameters + ---------- + input : Column + Strings column of words to measure + + Returns + ------- + Column + New column of measure values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_porter_stemmer_measure(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py index 9f389fa42c4..d95849ef371 100644 --- a/python/pylibcudf/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -7,10 +7,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.orc import write_table as orc_write_table from pyarrow.parquet import write_table as pq_write_table + +import pylibcudf as plc from pylibcudf.io.types import CompressionType diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index fdce6f353ca..a19a8835498 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -8,8 +8,9 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest + +import pylibcudf as plc from pylibcudf.io.types import CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py index 0cd5064a697..3d9d99ffa61 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py @@ -5,10 +5,11 @@ import fastavro import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_and_meta_eq +import pylibcudf as plc + avro_dtype_pairs = [ ("boolean", pa.bool_()), ("int", pa.int32()), diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index ab26f23418d..22c83acc47c 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -5,9 +5,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( _convert_types, assert_table_and_meta_eq, @@ -15,6 +13,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_CSV_SOURCE_KWARGS = { "format": "csv", diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 9d976fedf00..453e5ce32a8 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -3,9 +3,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( assert_table_and_meta_eq, make_source, @@ -13,6 +11,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 42b14b1feff..5ed660ba6cf 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source +import pylibcudf as plc + # Shared kwargs to pass to make_source _COMMON_ORC_SOURCE_KWARGS = {"format": "orc"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index f6e843ccf66..41298601539 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -1,9 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.parquet import read_table +from utils import assert_table_and_meta_eq, make_source + +import pylibcudf as plc from pylibcudf.expressions import ( ASTOperator, ColumnNameReference, @@ -11,7 +13,6 @@ Literal, Operation, ) -from utils import assert_table_and_meta_eq, make_source # Shared kwargs to pass to make_source _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py index 747f58ec8cf..0c43c363e55 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py @@ -2,9 +2,10 @@ import io -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) def io_class(request): diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py index 76b0424b2af..b3555013927 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import zoneinfo -import pylibcudf as plc import pytest +import pylibcudf as plc + def test_make_timezone_transition_table(): if len(zoneinfo.TZPATH) == 0: diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index f784cb3c191..bbb08e8b95a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -4,10 +4,11 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + def idfn(param): ltype, rtype, outtype, plc_op, _ = param diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py index 8cedbc6d42f..e317362a76b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq +import pylibcudf as plc + EMPTY_COL_SIZE = 3 NUMERIC_TYPES = [ diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py index 0e129fdf0ef..24cd6b9e35f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq import rmm +import pylibcudf as plc + VALID_TYPES = [ pa.int8(), pa.int16(), diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 7a5c1664eed..6d8b5993964 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + param_pyarrow_tables = [ pa.table([]), pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py index 628682d0a66..c0a41b96b1a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_copying.py +++ b/python/pylibcudf/pylibcudf/tests/test_copying.py @@ -2,7 +2,6 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, @@ -16,6 +15,8 @@ metadata_from_arrow_type, ) +import pylibcudf as plc + # TODO: consider moving this to conftest and "pairing" # it with pa_type, so that they don't get out of sync diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index 75930d59058..a80ab8d9f65 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -4,10 +4,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) def datetime_column(has_nulls, request): diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py index 5894ef4624c..6eabd6db617 100644 --- a/python/pylibcudf/pylibcudf/tests/test_expressions.py +++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + # We can't really evaluate these expressions, so just make sure # construction works properly diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py index 01c998f16d4..af80b6e5978 100644 --- a/python/pylibcudf/pylibcudf/tests/test_interop.py +++ b/python/pylibcudf/pylibcudf/tests/test_interop.py @@ -1,8 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +import cupy as cp +import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest +from utils import assert_table_eq + +import pylibcudf as plc def test_list_dtype_roundtrip(): @@ -66,3 +70,31 @@ def test_decimal_other(data_type): arrow_type = plc.interop.to_arrow(data_type, precision=precision) assert arrow_type == pa.decimal128(precision, 0) + + +def test_round_trip_dlpack_plc_table(): + expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]}) + plc_table = plc.interop.from_arrow(expected) + result = plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table)) + assert_table_eq(expected, result) + + +@pytest.mark.parametrize("array", [np.array, cp.array]) +def test_round_trip_dlpack_array(array): + arr = array([1, 2, 3]) + result = plc.interop.from_dlpack(arr.__dlpack__()) + expected = pa.table({"a": [1, 2, 3]}) + assert_table_eq(expected, result) + + +def test_to_dlpack_error(): + plc_table = plc.interop.from_arrow( + pa.table({"a": [1, None, 3], "b": [5, 6, 7]}) + ) + with pytest.raises(ValueError, match="Cannot create a DLPack tensor"): + plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table)) + + +def test_from_dlpack_error(): + with pytest.raises(ValueError, match="Invalid PyCapsule object"): + plc.interop.from_dlpack(1) diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py index 61e02f4d28d..f43a56046a4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_join.py +++ b/python/pylibcudf/pylibcudf/tests/test_join.py @@ -2,9 +2,10 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc from utils import assert_table_eq +import pylibcudf as plc + def test_cross_join(): left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py index 3d2955211f8..486a9524e92 100644 --- a/python/pylibcudf/pylibcudf/tests/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/test_json.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def plc_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py index f7fb7463b50..beacfc63ce5 100644 --- a/python/pylibcudf/pylibcudf/tests/test_labeling.py +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("left_inclusive", [True, False]) @pytest.mark.parametrize("right_inclusive", [True, False]) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index 2353a6ff8f9..f3ef555f11d 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def test_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py index 3edcae59edc..cd3da856de2 100644 --- a/python/pylibcudf/pylibcudf/tests/test_null_mask.py +++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.null_mask import MaskState import rmm +import pylibcudf as plc +from pylibcudf.null_mask import MaskState + @pytest.fixture(params=[False, True]) def nullable(request): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py index 7d93c471cc4..8b14e0db576 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def edit_distance_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py index 5cf9874d595..fae4685f81b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py index d5a168426b1..05fe7b53c16 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index 4e389a63f90..ead9ee094af 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) def minhash_input_data(request): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py index 283a009288d..84748b5597e 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index fe28b83c09a..25b6d1389ec 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def norm_spaces_input_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py index 0fb54bb4ee1..65687f31c85 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py new file mode 100644 index 00000000000..e7f4a971f08 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["trouble", "toy", "syzygy"] + return pa.array(arr) + + +@pytest.mark.parametrize("check_vowels", [True, False]) +@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) +def test_is_letter(input_col, check_vowels, indices): + def is_letter(s, i, check): + vowels = "aeiouy" + return (s[i] in vowels) == check + + result = plc.nvtext.stemmer.is_letter( + plc.interop.from_arrow(input_col), + check_vowels, + plc.interop.from_arrow(pa.array(indices)) + if isinstance(indices, list) + else indices, + ) + expected = pa.array( + [ + is_letter( + s, + indices[i] if isinstance(indices, list) else indices, + check_vowels, + ) + for i, s in enumerate(input_col.to_pylist()) + ] + ) + assert_column_eq(result, expected) + + +def test_porter_stemmer_measure(input_col): + result = plc.nvtext.stemmer.porter_stemmer_measure( + plc.interop.from_arrow(input_col), + ) + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py index 444d0089d2c..c55e54cebc6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def partitioning_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py index bac56691306..e4a24fb1c98 100644 --- a/python/pylibcudf/pylibcudf/tests/test_quantiles.py +++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + # Map pylibcudf interpolation options to pyarrow options interp_mapping = { plc.types.Interpolation.LINEAR: "linear", diff --git a/python/pylibcudf/pylibcudf/tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py index 777315df538..52598f2c462 100644 --- a/python/pylibcudf/pylibcudf/tests/test_regex_program.py +++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("pat", ["(", "*", "\\"]) def test_regex_program_invalid(pat): diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py index 01115bc363a..ef23e23766a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_reshape.py +++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def reshape_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py index 0b30316b9a0..2526580bc13 100644 --- a/python/pylibcudf/pylibcudf/tests/test_round.py +++ b/python/pylibcudf/pylibcudf/tests/test_round.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(params=["float32", "float64"]) def column(request, has_nulls): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index a1820def0b1..f461657281a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture() def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py index 176ccc55b96..3e31c75c38a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py index 233cc253b14..08ac371fd96 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_case.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def string_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py index bcd030c019e..06b44210d74 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_all_characters_of_type(): pa_array = pa.array(["1", "A"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py index 4a7007a0d6b..eea3ac68e84 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_combine.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + def test_concatenate_scalar_seperator(): plc_table = plc.interop.from_arrow( diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py index 4e4dd7cbb00..ba9a4a7d3b8 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def target_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index 69f7a0fdd33..3f3f452c4f6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture( scope="module", diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py index 117c59ff1b8..b391d2b290e 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_booleans(): pa_array = pa.array(["true", None, "True"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py index f3e84286a36..c9368d858a4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -3,10 +3,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def fmt(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py index 6d704309bfd..2d3578e4e71 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -3,10 +3,11 @@ from datetime import datetime, timedelta import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture( params=[ diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py index b1c4d729604..012e722038e 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -2,9 +2,10 @@ import decimal import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_fixed_point(): typ = pa.decimal128(38, 2) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py index e9918fab559..8ee2b5075af 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_floats(): typ = pa.float32() diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py index 6d1d565af30..01192c2d1f8 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_integers(): typ = pa.int8() diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py index 4dc3e512624..b533809f106 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_ipv4_to_integers(): arr = pa.array(["123.45.67.890", None]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py index 8591732b39e..737036a4f0f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.mark.parametrize("na_rep", [None, pa.scalar("")]) @pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py index fee8c3fb8f6..528736798c7 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py @@ -2,9 +2,10 @@ import urllib import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_url_encode(): data = ["/home/nfs", None] diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py index 788b86423c4..e70edf4fb33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_extract.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc + import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py index db3b13a5aae..82ec18832a9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_find.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py index d6b37a388f0..fa9eee3594b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_find_multiple(): arr = pa.array(["abc", "def"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py index debfad92d00..b73d812c898 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -2,9 +2,10 @@ import re import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_findall(): arr = pa.array(["bunny", "rabbit", "hare", "dog"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py index 2ba775d17ae..79498132097 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_padding.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc + import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py index 18b5d8bf4d0..c06c06be7c6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) def test_repeat_strings(repeats): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py index 5a9c2007b73..2c7d25133de 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_replace.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py index ff2ce348d3b..511f826441a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.mark.parametrize("max_replace_count", [-1, 1]) def test_replace_re_regex_program_scalar(max_replace_count): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py index d9ce5591b98..1759f739e31 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_slice.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def pa_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py index 80cae8d1c6b..4e80f19b814 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py index 2aeffac8209..450b336ce65 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py index 005e5e4a405..5869e5f4920 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_strip.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + data_strings = [ "AbC", "123abc", diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py index 2ae893e69fb..84fd3354ac6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_translate.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py index a1c820cd586..00442d866e9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -2,9 +2,10 @@ import textwrap import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_wrap(): width = 12 diff --git a/python/pylibcudf/pylibcudf/tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py index e822d6a97a8..ac39ef4c5c9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_table.py +++ b/python/pylibcudf/pylibcudf/tests/test_table.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize( "arrow_tbl", diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index d5c618f07e4..49802fe64ac 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -3,9 +3,10 @@ import math import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_nans_to_nulls(has_nans): if has_nans: diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py index ac11123f680..b0c0bc72ead 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transpose.py +++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from packaging.version import parse +import pylibcudf as plc + @pytest.mark.skipif( parse(pa.__version__) < parse("16.0.0"), diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index ea5b3065896..a80c85a1fa8 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -53,48 +53,20 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module