Merge branch 'branch-24.12' into rjzamora/refactor-evaluate

rapidsai · Oct 29, 2024 · 3f4ae3f · 3f4ae3f
2 parents ea94242 + eeb4d27
commit 3f4ae3f
Show file tree

Hide file tree

Showing 295 changed files with 1,387 additions and 961 deletions.
diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml
@@ -0,0 +1,17 @@
+name: "Auto Assign PR"
+
+on:
+  pull_request_target:
+    types:
+      - opened
+      - reopened
+      - synchronize
+
+jobs:
+  add_assignees:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions-ecosystem/action-add-assignees@v1
+        with:
+          github_token: "${{ secrets.GITHUB_TOKEN }}"
+          assignees: ${{ github.actor }}
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
@@ -1,4 +1,5 @@
 name: "Pull Request Labeler"
+
 on:
 - pull_request_target
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -155,7 +155,7 @@ repos:
           )
       - id: verify-alpha-spec
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.11
+    rev: v1.16.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -38,6 +38,7 @@ conduct. More information can be found at:
 8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks).
    Fix if needed.
 9. Wait for other developers to review your code and update code as needed.
+   Changes to any C++ files require at least 2 approvals from the cudf-cpp-codeowners before merging.
 10. Once reviewed and approved, a RAPIDS developer will merge your pull request.
 
 If you are unsure about anything, don't hesitate to comment on issues and ask for clarification!

diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
@@ -15,8 +15,12 @@ rapids-print-env
 
 rapids-logger "Begin cpp build"
 
+sccache --zero-stats
+
 # With boa installed conda build forward to boa
 RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \
     conda/recipes/libcudf
 
+sccache --show-adv-stats
+
 rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_python.sh b/ci/build_python.sh
@@ -19,6 +19,8 @@ rapids-logger "Begin py build"
 
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 
+sccache --zero-stats
+
 # TODO: Remove `--no-test` flag once importing on a CPU
 # node works correctly
 # With boa installed conda build forwards to the boa builder
@@ -28,12 +30,18 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --channel "${CPP_CHANNEL}" \
   conda/recipes/pylibcudf
 
+sccache --show-adv-stats
+sccache --zero-stats
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cudf
 
+sccache --show-adv-stats
+sccache --zero-stats
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
@@ -46,6 +54,8 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cudf_kafka
 
+sccache --show-adv-stats
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
@@ -3,7 +3,8 @@
 
 set -euo pipefail
 
-package_dir=$1
+package_name=$1
+package_dir=$2
 
 source rapids-configure-sccache
 source rapids-date-string
@@ -12,4 +13,14 @@ rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
-python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
+sccache --zero-stats
+
+rapids-logger "Building '${package_name}' wheel"
+python -m pip wheel \
+    -w dist \
+    -v \
+    --no-deps \
+    --disable-pip-version-check \
+    .
+
+sccache --show-adv-stats
diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
@@ -18,7 +18,7 @@ echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf
 echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt
 export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh cudf ${package_dir}
 
 python -m auditwheel repair \
     --exclude libcudf.so \

diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh
@@ -5,7 +5,7 @@ set -euo pipefail
 
 package_dir="python/cudf_polars"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh cudf-polars ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
+RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh
@@ -5,7 +5,7 @@ set -euo pipefail
 
 package_dir="python/dask_cudf"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh dask-cudf ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
+RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
@@ -3,10 +3,30 @@
 
 set -euo pipefail
 
+package_name="libcudf"
 package_dir="python/libcudf"
 
+rapids-logger "Generating build requirements"
+
+rapids-dependency-file-generator \
+  --output requirements \
+  --file-key "py_build_${package_name}" \
+  --file-key "py_rapids_build_${package_name}" \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \
+| tee /tmp/requirements-build.txt
+
+rapids-logger "Installing build requirements"
+python -m pip install \
+    -v \
+    --prefer-binary \
+    -r /tmp/requirements-build.txt
+
+# build with '--no-build-isolation', for better sccache hit rate
+# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
+export PIP_NO_BUILD_ISOLATION=0
+
 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON"
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh "${package_name}" "${package_dir}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
@@ -16,4 +36,4 @@ python -m auditwheel repair \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
-RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
+RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist"
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
@@ -16,12 +16,12 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
 echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt
 export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh pylibcudf ${package_dir}
 
 python -m auditwheel repair \
     --exclude libcudf.so \
     --exclude libnvcomp.so \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -369,6 +369,8 @@ add_library(
   src/filling/sequence.cu
   src/groupby/groupby.cu
   src/groupby/hash/compute_groupby.cu
+  src/groupby/hash/compute_mapping_indices.cu
+  src/groupby/hash/compute_mapping_indices_null.cu
   src/groupby/hash/compute_single_pass_aggs.cu
   src/groupby/hash/create_sparse_results_table.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
@@ -16,16 +16,29 @@
 
 #include <benchmarks/common/generate_input.hpp>
 
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/ast/expressions.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
 #include <nvbench/nvbench.cuh>
+#include <nvbench/types.cuh>
 
 #include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <iterator>
 #include <list>
 #include <memory>
 #include <optional>
@@ -39,14 +52,14 @@ enum class TreeType {
 template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
 static void BM_ast_transform(nvbench::state& state)
 {
-  auto const table_size  = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const num_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
   auto const source_table =
     create_sequence_table(cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols),
-                          row_count{table_size},
+                          row_count{num_rows},
                           Nullable ? std::optional<double>{0.5} : std::nullopt);
   auto table = source_table->view();
 
@@ -86,7 +99,71 @@ static void BM_ast_transform(nvbench::state& state)
   auto const& expression_tree_root = expressions.back();
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+  state.add_global_memory_reads<key_type>(static_cast<size_t>(num_rows) * (tree_levels + 1));
+  state.add_global_memory_writes<key_type>(num_rows);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
+}
+
+template <cudf::ast::ast_operator cmp_op, cudf::ast::ast_operator reduce_op>
+static void BM_string_compare_ast_transform(nvbench::state& state)
+{
+  auto const string_width = static_cast<cudf::size_type>(state.get_int64("string_width"));
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const tree_levels  = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
+  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+
+  CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons");
+
+  // Create table data
+  auto const num_cols = tree_levels * 2;
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  std::for_each(
+    thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
+      columns.emplace_back(create_string_column(num_rows, string_width, hit_rate));
+    });
+
+  cudf::table table{std::move(columns)};
+  cudf::table_view const table_view = table.view();
+
+  int64_t const chars_size = std::accumulate(
+    table_view.begin(),
+    table_view.end(),
+    static_cast<int64_t>(0),
+    [](int64_t size, auto& column) -> int64_t {
+      return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream());
+    });
+
+  // Create column references
+  auto column_refs = std::vector<cudf::ast::column_reference>();
+  std::transform(thrust::make_counting_iterator(0),
+                 thrust::make_counting_iterator(num_cols),
+                 std::back_inserter(column_refs),
+                 [](auto const& column_id) { return cudf::ast::column_reference(column_id); });
+
+  // Create expression trees
+  std::list<cudf::ast::operation> expressions;
+
+  // Construct AST tree (a == b && c == d && e == f && ...)
+
+  expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1]));
+
+  std::for_each(thrust::make_counting_iterator(1),
+                thrust::make_counting_iterator(tree_levels),
+                [&](size_t idx) {
+                  auto const& lhs = expressions.back();
+                  auto const& rhs = expressions.emplace_back(
+                    cudf::ast::operation(cmp_op, column_refs[idx * 2], column_refs[idx * 2 + 1]));
+                  expressions.emplace_back(cudf::ast::operation(reduce_op, lhs, rhs));
+                });
+
+  auto const& expression_tree_root = expressions.back();
+
+  // Use the number of bytes read from global memory
+  state.add_element_count(chars_size, "chars_size");
+  state.add_global_memory_reads<nvbench::uint8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int32_t>(num_rows);
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
@@ -100,7 +177,7 @@ static void BM_ast_transform(nvbench::state& state)
   NVBENCH_BENCH(name)                                                                      \
     .set_name(#name)                                                                       \
     .add_int64_axis("tree_levels", {1, 5, 10})                                             \
-    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
@@ -115,3 +192,19 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true);
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true);
+
+#define AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \
+  static void name(::nvbench::state& st)                                       \
+  {                                                                            \
+    ::BM_string_compare_ast_transform<cmp_op, reduce_op>(st);                  \
+  }                                                                            \
+  NVBENCH_BENCH(name)                                                          \
+    .set_name(#name)                                                           \
+    .add_int64_axis("string_width", {32, 64, 128, 256})                        \
+    .add_int64_axis("num_rows", {32768, 262144, 2097152})                      \
+    .add_int64_axis("tree_levels", {1, 2, 3, 4})                               \
+    .add_int64_axis("hit_rate", {50, 100})
+
+AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and,
+                                              cudf::ast::ast_operator::EQUAL,
+                                              cudf::ast::ast_operator::LOGICAL_AND);