Merge remote-tracking branch 'upstream/branch-24.06' into drop-centos7
bdice committed Mar 19, 2024
2 parents fd71e48 + ae60f1d commit 07cb2fe
Showing 26 changed files with 159 additions and 141 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/test.yaml
@@ -125,5 +125,4 @@ jobs:
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
- # pr mode uses the HEAD of the branch, which is also correct for nightlies
- script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
+ script: ci/cudf_pandas_scripts/pandas-tests/run.sh main
12 changes: 2 additions & 10 deletions .pre-commit-config.yaml
@@ -23,13 +23,6 @@ repos:
args: ["--config-root=python/", "--resolve-all-configs"]
files: python/.*
types_or: [python, cython, pyi]
- - repo: https://github.com/psf/black
- rev: 23.12.1
- hooks:
- - id: black
- files: python/.*
- # Explicitly specify the pyproject.toml at the repo root, not per-project.
- args: ["--config", "pyproject.toml"]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.16.0
hooks:
@@ -64,9 +57,6 @@ repos:
# Use the cudf_kafka isort orderings in notebooks so that dask
# and RAPIDS packages have their own sections.
args: ["--settings-file=python/cudf_kafka/pyproject.toml"]
- - id: nbqa-black
- # Explicitly specify the pyproject.toml at the repo root, not per-project.
- args: ["--config=pyproject.toml"]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v16.0.6
hooks:
@@ -155,6 +145,8 @@ repos:
hooks:
- id: ruff
files: python/.*$
+ - id: ruff-format
+ files: python/.*$
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v0.0.1
hooks:
1 change: 1 addition & 0 deletions ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -32,3 +32,4 @@ python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json p
RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"}
mkdir -p "${RAPIDS_ARTIFACTS_DIR}"
mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/
+ rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${PANDAS_TESTS_BRANCH}-results.json "${RAPIDS_ARTIFACTS_DIR}"
11 changes: 5 additions & 6 deletions ci/test_python_other.sh
@@ -19,8 +19,8 @@ EXITCODE=0
trap "EXITCODE=1" ERR
set +e

rapids-logger "pytest dask_cudf"
./ci/run_dask_cudf_pytests.sh \
rapids-logger "pytest dask_cudf (dask-expr)"
DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
--junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
--numprocesses=8 \
--dist=worksteal \
@@ -29,10 +29,9 @@ rapids-logger "pytest dask_cudf"
--cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \
--cov-report=term

- # Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr
- rapids-logger "pytest dask_cudf + dask_expr"
- DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
- --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \
+ rapids-logger "pytest dask_cudf (legacy)"
+ DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \
+ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
--numprocesses=8 \
--dist=loadscope \
.
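A note on the toggle these CI blocks flip: DASK_DATAFRAME__QUERY_PLANNING is the environment-variable spelling of Dask's dataframe.query-planning config key, which selects the dask-expr implementation (True) or the legacy one (False). A minimal Python sketch of the same switch, assuming dask and dask_cudf are installed; the key must be set before dask.dataframe is first imported:

    import dask

    # Equivalent to exporting DASK_DATAFRAME__QUERY_PLANNING=True in the shell:
    # True selects the dask-expr implementation, False the legacy one.
    dask.config.set({"dataframe.query-planning": True})

    import dask_cudf  # noqa: E402  (imported only after the flag is set)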
12 changes: 6 additions & 6 deletions ci/test_wheel_dask_cudf.sh
@@ -18,19 +18,19 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
mkdir -p "${RAPIDS_TESTS_DIR}"

# Run tests in dask_cudf/tests and dask_cudf/io/tests
rapids-logger "pytest dask_cudf"
rapids-logger "pytest dask_cudf (dask-expr)"
pushd python/dask_cudf/dask_cudf
- python -m pytest \
+ DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
--junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
--numprocesses=8 \
.
popd

- # Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr
- rapids-logger "pytest dask_cudf + dask_expr"
+ # Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy)
+ rapids-logger "pytest dask_cudf (legacy)"
pushd python/dask_cudf/dask_cudf
- DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
- --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \
+ DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \
+ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
--numprocesses=8 \
.
popd
3 changes: 1 addition & 2 deletions cpp/src/io/comp/gpuinflate.cu
@@ -804,8 +804,7 @@ __device__ void process_symbols(inflate_state_s* s, int t)
dist = symbol >> 16;
for (int i = t; i < len; i += 32) {
uint8_t const* src = out + ((i >= dist) ? (i % dist) : i) - dist;
- uint8_t b = (src < outbase) ? 0 : *src;
- if (out + i < outend) { out[i] = b; }
+ if (out + i < outend and src >= outbase) { out[i] = *src; }
}
out += len;
pos++;
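The hunk above folds the out-of-bounds handling of the LZ77 back-reference copy into a single guard: a source byte lying before the start of the output buffer is now skipped instead of being written as 0. A toy scalar model of that loop, in Python purely for illustration (the real code is a warp-strided CUDA loop; copy_match and its parameters are hypothetical names mirroring the kernel's):

    def copy_match(out: bytearray, outbase: int, pos: int, dist: int, length: int) -> None:
        """Scalar sketch of the per-thread copy in process_symbols."""
        for i in range(length):
            # Mirrors: src = out + ((i >= dist) ? (i % dist) : i) - dist
            src = pos + (i % dist if i >= dist else i) - dist
            # Combined guard from the diff: write only if the destination is in
            # bounds AND the source is not before the buffer base.
            if pos + i < len(out) and src >= outbase:
                out[pos + i] = out[src]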
5 changes: 5 additions & 0 deletions cpp/src/io/utilities/datasource.cpp
@@ -44,6 +44,11 @@ class file_source : public datasource {
explicit file_source(char const* filepath) : _file(filepath, O_RDONLY)
{
if (detail::cufile_integration::is_kvikio_enabled()) {
+ // Workaround for https://github.com/rapidsai/cudf/issues/14140, where cuFileDriverOpen errors
+ // out if no CUDA calls have been made before it. This is a no-op if the CUDA context is
+ // already initialized
+ cudaFree(0);
+
_kvikio_file = kvikio::FileHandle(filepath);
CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.",
_kvikio_file.is_compat_mode_on() ? "on" : "off");
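cudaFree(0) is the long-standing no-op idiom for forcing lazy CUDA context creation. The same effect is available from Python by touching any CUDA runtime API before a context-dependent library is used; a hypothetical sketch with numba:

    from numba import cuda

    # Any runtime call creates the CUDA context if none exists yet, mirroring
    # the cudaFree(0) workaround above; it is a no-op otherwise.
    cuda.current_context()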
7 changes: 5 additions & 2 deletions cpp/tests/error/error_handling_test.cu
@@ -97,7 +97,8 @@ TEST(DebugAssertDeathTest, cudf_assert_false)
testing::FLAGS_gtest_death_test_style = "threadsafe";

auto call_kernel = []() {
- assert_false_kernel<<<1, 1>>>();
+ auto const stream = cudf::get_default_stream().value();
+ assert_false_kernel<<<1, 1, 0, stream>>>();

// Kernel should fail with `cudaErrorAssert`
// This error invalidates the current device context, so we need to kill
@@ -114,7 +115,8 @@

TEST(DebugAssert, cudf_assert_true)
{
- assert_true_kernel<<<1, 1>>>();
+ auto const stream = cudf::get_default_stream().value();
+ assert_true_kernel<<<1, 1, 0, stream>>>();
ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
}

@@ -136,6 +138,7 @@ int main(int argc, char** argv)
auto adaptor = make_stream_checking_resource_adaptor(
resource, error_on_invalid_stream, check_default_stream);
rmm::mr::set_current_device_resource(&adaptor);
+ return RUN_ALL_TESTS();
}
return RUN_ALL_TESTS();
}
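Both kernels now launch on cudf's default stream so that the stream-checking resource adaptor installed in main() (last hunk above) does not flag a bare stream-0 launch. For illustration only, the analogous explicit-stream launch in numba, with hypothetical names:

    import numpy as np
    from numba import cuda

    @cuda.jit
    def touch(arr):
        arr[0] = 1

    stream = cuda.stream()
    d_arr = cuda.to_device(np.zeros(1, dtype=np.int64), stream=stream)
    # (griddim, blockdim, stream, dynamic shared memory): the numba analogue of
    # the CUDA C++ launch configuration <<<1, 1, 0, stream>>> used above.
    touch[1, 1, stream, 0](d_arr)
    stream.synchronize()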
24 changes: 4 additions & 20 deletions pyproject.toml
@@ -1,22 +1,4 @@
- [tool.black]
- line-length = 79
- target-version = ["py39"]
- include = '\.py?$'
- force-exclude = '''
- /(
- thirdparty |
- \.eggs |
- \.git |
- \.hg |
- \.mypy_cache |
- \.tox |
- \.venv |
- _build |
- buck-out |
- build |
- dist
- )/
- '''
+ # Copyright (c) 2019-2024, NVIDIA CORPORATION.

[tool.pydocstyle]
# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather
@@ -60,13 +42,15 @@ select = ["E", "F", "W"]
ignore = [
# whitespace before :
"E203",
+ # line-too-long (due to Copyright header)
+ "E501",
]
fixable = ["ALL"]
exclude = [
# TODO: Remove this in a follow-up where we fix __all__.
"__init__.py",
]
- line-length = 88
+ line-length = 79

[tool.ruff.per-file-ignores]
# Lots of pytest implicitly injected attributes in conftest-patch.py
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/_internals/timezones.py
@@ -85,8 +85,9 @@ def _read_tzfile_as_frame(tzdir, zone_name):
if not transition_times_and_offsets:
# this happens for UTC-like zones
min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
- transition_times_and_offsets = as_column([min_date]), as_column(
- [np.timedelta64(0, "s")]
+ transition_times_and_offsets = (
+ as_column([min_date]),
+ as_column([np.timedelta64(0, "s")]),
)

return DataFrame._from_data(
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/column.py
@@ -1731,7 +1731,8 @@ def as_column(
If None (default), treats NaN values in arbitrary as null if there is
no mask passed along with it. If True, combines the mask and NaNs to
form a new validity mask. If False, leaves NaN values as is.
- Only applies when arbitrary is not a cudf object (Index, Series, Column).
+ Only applies when arbitrary is not a cudf object
+ (Index, Series, Column).
dtype : optional
Optionally typecast the constructed Column to the given
dtype.
33 changes: 21 additions & 12 deletions python/cudf/cudf/core/dataframe.py
@@ -470,9 +470,12 @@ class _DataFrameIlocIndexer(_DataFrameIndexer):
_frame: DataFrame

def __getitem__(self, arg):
- row_key, (
- col_is_scalar,
- column_names,
+ (
+ row_key,
+ (
+ col_is_scalar,
+ column_names,
+ ),
) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame)
row_spec = indexing_utils.parse_row_iloc_indexer(
row_key, len(self._frame)
@@ -6901,16 +6904,18 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
if future_stack:
if dropna is not no_default:
raise ValueError(
"dropna must be unspecified with future_stack=True as the new "
"implementation does not introduce rows of NA values. This "
"argument will be removed in a future version of cudf."
"dropna must be unspecified with future_stack=True as "
"the new implementation does not introduce rows of NA "
"values. This argument will be removed in a future "
"version of cudf."
)
else:
if dropna is not no_default or self._data.nlevels > 1:
warnings.warn(
"The previous implementation of stack is deprecated and will be "
"removed in a future version of cudf. Specify future_stack=True "
"to adopt the new implementation and silence this warning.",
"The previous implementation of stack is deprecated and "
"will be removed in a future version of cudf. Specify "
"future_stack=True to adopt the new implementation and "
"silence this warning.",
FutureWarning,
)
if dropna is no_default:
@@ -7028,9 +7033,13 @@ def unnamed_group_generator():
unique_named_levels, axis=0, fill_value=-1
).values
else:
- yield grpdf.reindex(
- unique_named_levels, axis=0, fill_value=-1
- ).sort_index().values
+ yield (
+ grpdf.reindex(
+ unique_named_levels, axis=0, fill_value=-1
+ )
+ .sort_index()
+ .values
+ )
else:
if future_stack:
yield column_idx_df.values
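The rewrapped messages above document the two stack code paths. A small usage sketch of that API, illustrative only:

    import cudf

    df = cudf.DataFrame({"a": [1, 2], "b": [3, 4]})

    # New implementation: introduces no NA rows, so dropna must stay unspecified.
    new_style = df.stack(future_stack=True)

    # Legacy implementation: emits the FutureWarning quoted above when dropna is
    # passed explicitly or the columns have multiple levels.
    legacy = df.stack(future_stack=False)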
42 changes: 32 additions & 10 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -22,7 +22,12 @@
from cudf._lib.types import size_type_dtype
from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
from cudf.api.extensions import no_default
- from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like
+ from cudf.api.types import (
+ is_bool_dtype,
+ is_float_dtype,
+ is_list_like,
+ is_numeric_dtype,
+ )
from cudf.core._compat import PANDAS_LT_300
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, StructDtype, as_column
@@ -282,9 +287,12 @@ def __iter__(self):
if isinstance(group_names, cudf.BaseIndex):
group_names = group_names.to_pandas()
for i, name in enumerate(group_names):
- yield (name,) if isinstance(self._by, list) and len(
- self._by
- ) == 1 else name, grouped_values[offsets[i] : offsets[i + 1]]
+ yield (
+ (name,)
+ if isinstance(self._by, list) and len(self._by) == 1
+ else name,
+ grouped_values[offsets[i] : offsets[i + 1]],
+ )

@property
def dtypes(self):
@@ -698,6 +706,11 @@ def agg(self, func):

return result

+ def _reduce_numeric_only(self, op: str):
+ raise NotImplementedError(
+ f"numeric_only is not implemented for {type(self)}"
+ )
+
def _reduce(
self,
op: str,
@@ -728,14 +741,12 @@ def _reduce(
The numeric_only, min_count
"""
- if numeric_only:
- raise NotImplementedError(
- "numeric_only parameter is not implemented yet"
- )
if min_count != 0:
raise NotImplementedError(
"min_count parameter is not implemented yet"
)
+ if numeric_only:
+ return self._reduce_numeric_only(op)
return self.agg(op)

def _scan(self, op: str, *args, **kwargs):
@@ -2269,8 +2280,8 @@ def fillna(
"""
warnings.warn(
"groupby fillna is deprecated and "
"will be removed in a future version. Use groupby ffill or groupby bfill "
"for forward or backward filling instead.",
"will be removed in a future version. Use groupby ffill "
"or groupby bfill for forward or backward filling instead.",
FutureWarning,
)
if inplace:
@@ -2645,6 +2656,17 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):

_PROTECTED_KEYS = frozenset(("obj",))

+ def _reduce_numeric_only(self, op: str):
+ columns = list(
+ name
+ for name in self.obj._data.names
+ if (
+ is_numeric_dtype(self.obj._data[name].dtype)
+ and name not in self.grouping.names
+ )
+ )
+ return self[columns].agg(op)
+
def __getitem__(self, key):
return self.obj[key].groupby(
by=self.grouping.keys,
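The new _reduce_numeric_only hook makes numeric_only reductions work on DataFrameGroupBy by restricting the aggregation to numeric, non-grouping columns, instead of raising NotImplementedError (the base-class fallback added above). A short sketch of the behavior this enables, illustrative only:

    import cudf

    df = cudf.DataFrame(
        {"key": ["a", "a", "b"], "x": [1, 2, 3], "label": ["u", "v", "w"]}
    )

    # Aggregates only the numeric, non-grouping column "x"; the string column
    # "label" is excluded instead of triggering NotImplementedError.
    print(df.groupby("key").sum(numeric_only=True))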