diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 1f27ffcffe3..075825e852e 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -125,5 +125,4 @@ jobs:
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
-      # pr mode uses the HEAD of the branch, which is also correct for nightlies
-      script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
+      script: ci/cudf_pandas_scripts/pandas-tests/run.sh main
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9235c80bdc9..67a71021a63 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,13 +23,6 @@ repos:
         args: ["--config-root=python/", "--resolve-all-configs"]
         files: python/.*
         types_or: [python, cython, pyi]
-  - repo: https://github.com/psf/black
-    rev: 23.12.1
-    hooks:
-      - id: black
-        files: python/.*
-        # Explicitly specify the pyproject.toml at the repo root, not per-project.
-        args: ["--config", "pyproject.toml"]
   - repo: https://github.com/MarcoGorelli/cython-lint
     rev: v0.16.0
     hooks:
@@ -64,9 +57,6 @@ repos:
         # Use the cudf_kafka isort orderings in notebooks so that dask
         # and RAPIDS packages have their own sections.
        args: ["--settings-file=python/cudf_kafka/pyproject.toml"]
-      - id: nbqa-black
-        # Explicitly specify the pyproject.toml at the repo root, not per-project.
-        args: ["--config=pyproject.toml"]
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v16.0.6
     hooks:
@@ -155,6 +145,8 @@ repos:
     hooks:
       - id: ruff
         files: python/.*$
+      - id: ruff-format
+        files: python/.*$
   - repo: https://github.com/rapidsai/pre-commit-hooks
     rev: v0.0.1
     hooks:
diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index 667ca35163b..1f70ca78c41 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -32,3 +32,4 @@ python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json p
 RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"}
 mkdir -p "${RAPIDS_ARTIFACTS_DIR}"
 mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/
+rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${PANDAS_TESTS_BRANCH}-results.json "${RAPIDS_ARTIFACTS_DIR}"
diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh
index 8ecd02f70a1..cbc1dc1cb87 100755
--- a/ci/test_python_other.sh
+++ b/ci/test_python_other.sh
@@ -19,8 +19,8 @@ EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
-rapids-logger "pytest dask_cudf"
-./ci/run_dask_cudf_pytests.sh \
+rapids-logger "pytest dask_cudf (dask-expr)"
+DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
   --dist=worksteal \
@@ -29,10 +29,9 @@ rapids-logger "pytest dask_cudf"
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \
   --cov-report=term
 
-# Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr
-rapids-logger "pytest dask_cudf + dask_expr"
-DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
-  --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \
+rapids-logger "pytest dask_cudf (legacy)"
+DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \
+  --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
   --numprocesses=8 \
   --dist=loadscope \
   .
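Note: the CI scripts above select between dask's two dataframe backends with the `DASK_DATAFRAME__QUERY_PLANNING` environment variable, which dask maps to the `dataframe.query-planning` config key. A minimal sketch of the same toggle from Python, for illustration only; the key must be set before `dask.dataframe` is first imported, which is why the scripts use the environment variable:

```python
# Rough Python equivalent of DASK_DATAFRAME__QUERY_PLANNING=False.
import dask

# Must happen before the first `import dask.dataframe`.
dask.config.set({"dataframe.query-planning": False})

import dask.dataframe as dd  # legacy (non-dask-expr) backend

df = dd.from_dict({"a": [1, 2, 3]}, npartitions=1)
print(df.a.sum().compute())
```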
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index af5e062a8bd..2b20b9d9ce4 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -18,19 +18,19 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
 mkdir -p "${RAPIDS_TESTS_DIR}"
 
 # Run tests in dask_cudf/tests and dask_cudf/io/tests
-rapids-logger "pytest dask_cudf"
+rapids-logger "pytest dask_cudf (dask-expr)"
 pushd python/dask_cudf/dask_cudf
-python -m pytest \
+DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
   .
 popd
 
-# Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr
-rapids-logger "pytest dask_cudf + dask_expr"
+# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy)
+rapids-logger "pytest dask_cudf (legacy)"
 pushd python/dask_cudf/dask_cudf
-DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
-  --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \
+DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \
+  --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
   --numprocesses=8 \
   .
 popd
diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu
index f29e830eb41..fff1cf0c96a 100644
--- a/cpp/src/io/comp/gpuinflate.cu
+++ b/cpp/src/io/comp/gpuinflate.cu
@@ -804,8 +804,7 @@ __device__ void process_symbols(inflate_state_s* s, int t)
       dist = symbol >> 16;
       for (int i = t; i < len; i += 32) {
         uint8_t const* src = out + ((i >= dist) ? (i % dist) : i) - dist;
-        uint8_t b          = (src < outbase) ? 0 : *src;
-        if (out + i < outend) { out[i] = b; }
+        if (out + i < outend and src >= outbase) { out[i] = *src; }
       }
       out += len;
       pos++;
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index d2026473b6c..54e7c6bf1d6 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -44,6 +44,11 @@ class file_source : public datasource {
   explicit file_source(char const* filepath) : _file(filepath, O_RDONLY)
   {
     if (detail::cufile_integration::is_kvikio_enabled()) {
+      // Workaround for https://github.com/rapidsai/cudf/issues/14140, where cuFileDriverOpen errors
+      // out if no CUDA calls have been made before it. This is a no-op if the CUDA context is
+      // already initialized
+      cudaFree(0);
+
       _kvikio_file = kvikio::FileHandle(filepath);
       CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.",
                     _kvikio_file.is_compat_mode_on() ? "on" : "off");
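Note: `cudaFree(0)` above is the standard idiom for forcing lazy CUDA context creation; freeing a null pointer is documented as a no-op, but it still initializes the runtime. A hedged sketch of the same trick from Python via `ctypes`; the library name `libcudart.so` and its presence on the loader path are assumptions about the environment:

```python
# Illustration: trigger CUDA context initialization with the cudaFree(0) idiom.
import ctypes

libcudart = ctypes.CDLL("libcudart.so")  # assumed discoverable; adjust per install
err = libcudart.cudaFree(ctypes.c_void_p(0))  # no-op free, still creates the context
assert err == 0  # cudaSuccess
```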
"on" : "off"); diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 5cb2d729f3d..674d2e0a6ea 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -97,7 +97,8 @@ TEST(DebugAssertDeathTest, cudf_assert_false) testing::FLAGS_gtest_death_test_style = "threadsafe"; auto call_kernel = []() { - assert_false_kernel<<<1, 1>>>(); + auto const stream = cudf::get_default_stream().value(); + assert_false_kernel<<<1, 1, 0, stream>>>(); // Kernel should fail with `cudaErrorAssert` // This error invalidates the current device context, so we need to kill @@ -114,7 +115,8 @@ TEST(DebugAssertDeathTest, cudf_assert_false) TEST(DebugAssert, cudf_assert_true) { - assert_true_kernel<<<1, 1>>>(); + auto const stream = cudf::get_default_stream().value(); + assert_true_kernel<<<1, 1, 0, stream>>>(); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); } @@ -136,6 +138,7 @@ int main(int argc, char** argv) auto adaptor = make_stream_checking_resource_adaptor( resource, error_on_invalid_stream, check_default_stream); rmm::mr::set_current_device_resource(&adaptor); + return RUN_ALL_TESTS(); } return RUN_ALL_TESTS(); } diff --git a/pyproject.toml b/pyproject.toml index 4048eb9452c..c71394058df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,22 +1,4 @@ -[tool.black] -line-length = 79 -target-version = ["py39"] -include = '\.py?$' -force-exclude = ''' -/( - thirdparty | - \.eggs | - \.git | - \.hg | - \.mypy_cache | - \.tox | - \.venv | - _build | - buck-out | - build | - dist -)/ -''' +# Copyright (c) 2019-2024, NVIDIA CORPORATION. [tool.pydocstyle] # Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather @@ -60,13 +42,15 @@ select = ["E", "F", "W"] ignore = [ # whitespace before : "E203", + # line-too-long (due to Copyright header) + "E501", ] fixable = ["ALL"] exclude = [ # TODO: Remove this in a follow-up where we fix __all__. "__init__.py", ] -line-length = 88 +line-length = 79 [tool.ruff.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 053425fff8d..4e2fad08d56 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -85,8 +85,9 @@ def _read_tzfile_as_frame(tzdir, zone_name): if not transition_times_and_offsets: # this happens for UTC-like zones min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") - transition_times_and_offsets = as_column([min_date]), as_column( - [np.timedelta64(0, "s")] + transition_times_and_offsets = ( + as_column([min_date]), + as_column([np.timedelta64(0, "s")]), ) return DataFrame._from_data( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3e0ec4b5cd7..f13d8cf12f7 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1731,7 +1731,8 @@ def as_column( If None (default), treats NaN values in arbitrary as null if there is no mask passed along with it. If True, combines the mask and NaNs to form a new validity mask. If False, leaves NaN values as is. - Only applies when arbitrary is not a cudf object (Index, Series, Column). + Only applies when arbitrary is not a cudf object + (Index, Series, Column). dtype : optional Optionally typecast the constructed Column to the given dtype. 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 0440512c467..35588725655 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -470,9 +470,12 @@ class _DataFrameIlocIndexer(_DataFrameIndexer):
     _frame: DataFrame
 
     def __getitem__(self, arg):
-        row_key, (
-            col_is_scalar,
-            column_names,
+        (
+            row_key,
+            (
+                col_is_scalar,
+                column_names,
+            ),
         ) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame)
         row_spec = indexing_utils.parse_row_iloc_indexer(
             row_key, len(self._frame)
@@ -6901,16 +6904,18 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
         if future_stack:
             if dropna is not no_default:
                 raise ValueError(
-                    "dropna must be unspecified with future_stack=True as the new "
-                    "implementation does not introduce rows of NA values. This "
-                    "argument will be removed in a future version of cudf."
+                    "dropna must be unspecified with future_stack=True as "
+                    "the new implementation does not introduce rows of NA "
+                    "values. This argument will be removed in a future "
+                    "version of cudf."
                 )
         else:
             if dropna is not no_default or self._data.nlevels > 1:
                 warnings.warn(
-                    "The previous implementation of stack is deprecated and will be "
-                    "removed in a future version of cudf. Specify future_stack=True "
-                    "to adopt the new implementation and silence this warning.",
+                    "The previous implementation of stack is deprecated and "
+                    "will be removed in a future version of cudf. Specify "
+                    "future_stack=True to adopt the new implementation and "
+                    "silence this warning.",
                     FutureWarning,
                 )
             if dropna is no_default:
@@ -7028,9 +7033,13 @@ def unnamed_group_generator():
                         unique_named_levels, axis=0, fill_value=-1
                     ).values
                 else:
-                    yield grpdf.reindex(
-                        unique_named_levels, axis=0, fill_value=-1
-                    ).sort_index().values
+                    yield (
+                        grpdf.reindex(
+                            unique_named_levels, axis=0, fill_value=-1
+                        )
+                        .sort_index()
+                        .values
+                    )
             else:
                 if future_stack:
                     yield column_idx_df.values
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index e5030eb634b..945e546af1a 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -22,7 +22,12 @@
 from cudf._lib.types import size_type_dtype
 from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
 from cudf.api.extensions import no_default
-from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like
+from cudf.api.types import (
+    is_bool_dtype,
+    is_float_dtype,
+    is_list_like,
+    is_numeric_dtype,
+)
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
 from cudf.core.column.column import ColumnBase, StructDtype, as_column
@@ -282,9 +287,12 @@ def __iter__(self):
         if isinstance(group_names, cudf.BaseIndex):
             group_names = group_names.to_pandas()
         for i, name in enumerate(group_names):
-            yield (name,) if isinstance(self._by, list) and len(
-                self._by
-            ) == 1 else name, grouped_values[offsets[i] : offsets[i + 1]]
+            yield (
+                (name,)
+                if isinstance(self._by, list) and len(self._by) == 1
+                else name,
+                grouped_values[offsets[i] : offsets[i + 1]],
+            )
 
     @property
     def dtypes(self):
@@ -698,6 +706,11 @@ def agg(self, func):
 
         return result
 
+    def _reduce_numeric_only(self, op: str):
+        raise NotImplementedError(
+            f"numeric_only is not implemented for {type(self)}"
+        )
+
     def _reduce(
         self,
         op: str,
@@ -728,14 +741,12 @@ def _reduce(
         The numeric_only, min_count
         """
-        if numeric_only:
-            raise NotImplementedError(
-                "numeric_only parameter is not implemented yet"
-            )
         if min_count != 0:
             raise NotImplementedError(
                 "min_count parameter is not implemented yet"
             )
+        if numeric_only:
+            return self._reduce_numeric_only(op)
         return self.agg(op)
 
     def _scan(self, op: str, *args, **kwargs):
@@ -2269,8 +2280,8 @@ def fillna(
         """
         warnings.warn(
             "groupby fillna is deprecated and "
-            "will be removed in a future version. Use groupby ffill or groupby bfill "
-            "for forward or backward filling instead.",
+            "will be removed in a future version. Use groupby ffill "
+            "or groupby bfill for forward or backward filling instead.",
             FutureWarning,
         )
         if inplace:
@@ -2645,6 +2656,17 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
 
     _PROTECTED_KEYS = frozenset(("obj",))
 
+    def _reduce_numeric_only(self, op: str):
+        columns = list(
+            name
+            for name in self.obj._data.names
+            if (
+                is_numeric_dtype(self.obj._data[name].dtype)
+                and name not in self.grouping.names
+            )
+        )
+        return self[columns].agg(op)
+
     def __getitem__(self, key):
         return self.obj[key].groupby(
             by=self.grouping.keys,
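Note: the new `DataFrameGroupBy._reduce_numeric_only` above selects the numeric, non-key columns and delegates to `agg`, so simple reductions now accept `numeric_only=True` instead of raising `NotImplementedError`. Usage, grounded in the test added later in this diff:

```python
import cudf

df = cudf.DataFrame({"id": [0, 0, 1], "a": [1, 2, 3], "b": ["A", "B", "C"]})
# Reduces column "a" only; the string column "b" is excluded from the result.
print(df.groupby("id").sum(numeric_only=True))
```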
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 94d862d52b4..ca9d5590044 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -211,8 +211,8 @@ def _get_label_range_or_mask(index, start, stop, step):
             return slice(start_loc, stop_loc)
         else:
             raise KeyError(
-                "Value based partial slicing on non-monotonic DatetimeIndexes "
-                "with non-existing keys is not allowed.",
+                "Value based partial slicing on non-monotonic "
+                "DatetimeIndexes with non-existing keys is not allowed.",
             )
     elif start is not None:
         boolean_mask = index >= start
@@ -2449,7 +2449,8 @@ def squeeze(self, axis: Literal["index", "columns", 0, 1, None] = None):
         ----------
         axis : {0 or 'index', 1 or 'columns', None}, default None
             A specific axis to squeeze. By default, all length-1 axes are
-            squeezed. For `Series` this parameter is unused and defaults to `None`.
+            squeezed. For `Series` this parameter is unused and defaults
+            to `None`.
 
         Returns
         -------
@@ -5835,9 +5836,7 @@ def floordiv(self, other, axis, level=None, fill_value=None):  # noqa: D102
             ),
         )
     )
-    def rfloordiv(
-        self, other, axis, level=None, fill_value=None
-    ):  # noqa: D102
+    def rfloordiv(self, other, axis, level=None, fill_value=None):  # noqa: D102
         if level is not None:
             raise NotImplementedError("level parameter is not supported yet.")
 
@@ -5967,9 +5966,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None):  # noqa: D102
             ),
         )
     )
-    def eq(
-        self, other, axis="columns", level=None, fill_value=None
-    ):  # noqa: D102
+    def eq(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
         return self._binaryop(
             other=other, op="__eq__", fill_value=fill_value, can_reindex=True
         )
@@ -6009,9 +6006,7 @@ def eq(
             ),
         )
     )
-    def ne(
-        self, other, axis="columns", level=None, fill_value=None
-    ):  # noqa: D102
+    def ne(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
         return self._binaryop(
             other=other, op="__ne__", fill_value=fill_value, can_reindex=True
         )
@@ -6051,9 +6046,7 @@ def ne(
             ),
         )
     )
-    def lt(
-        self, other, axis="columns", level=None, fill_value=None
-    ):  # noqa: D102
+    def lt(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
         return self._binaryop(
             other=other, op="__lt__", fill_value=fill_value, can_reindex=True
         )
@@ -6093,9 +6086,7 @@ def lt(
             ),
         )
     )
-    def le(
-        self, other, axis="columns", level=None, fill_value=None
-    ):  # noqa: D102
+    def le(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
         return self._binaryop(
             other=other, op="__le__", fill_value=fill_value, can_reindex=True
         )
@@ -6135,9 +6126,7 @@ def le(
             ),
         )
     )
-    def gt(
-        self, other, axis="columns", level=None, fill_value=None
-    ):  # noqa: D102
+    def gt(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
         return self._binaryop(
             other=other, op="__gt__", fill_value=fill_value, can_reindex=True
         )
@@ -6177,9 +6166,7 @@ def gt(
             ),
         )
     )
-    def ge(
-        self, other, axis="columns", level=None, fill_value=None
-    ):  # noqa: D102
+    def ge(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
         return self._binaryop(
             other=other, op="__ge__", fill_value=fill_value, can_reindex=True
         )
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index d182b7b4a7c..65f97c99934 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -164,9 +164,9 @@ def to_datetime(
 
     if errors == "ignore":
         warnings.warn(
-            "errors='ignore' is deprecated and will raise in a future version. "
-            "Use to_datetime without passing `errors` and catch exceptions "
-            "explicitly instead",
+            "errors='ignore' is deprecated and will raise in a "
+            "future version. Use to_datetime without passing `errors` "
+            "and catch exceptions explicitly instead",
             FutureWarning,
         )
 
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index e1424459c8f..68b23f1e059 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -97,9 +97,9 @@ def to_numeric(arg, errors="raise", downcast=None):
         raise ValueError("invalid error value specified")
     elif errors == "ignore":
         warnings.warn(
-            "errors='ignore' is deprecated and will raise in a future version. "
-            "Use to_numeric without passing `errors` and catch exceptions "
-            "explicitly instead",
+            "errors='ignore' is deprecated and will raise in "
+            "a future version. Use to_numeric without passing `errors` "
+            "and catch exceptions explicitly instead",
             FutureWarning,
         )
 
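Note: both deprecation warnings above point at the same migration: stop passing `errors="ignore"` and handle failures explicitly. A sketch of the recommended pattern; the exact exception types raised for unparsable input are an assumption here, not part of this diff:

```python
# Replacement for cudf.to_numeric(data, errors="ignore"), which returned the
# input unchanged on failure.
import cudf

data = cudf.Series(["1", "2", "not-a-number"])
try:
    result = cudf.to_numeric(data)
except (ValueError, TypeError):  # assumed failure modes, for illustration
    result = data  # keep the original values, as errors="ignore" did
```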
diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py
index fdce404d887..3c02ee52b25 100644
--- a/python/cudf/cudf/core/udf/strings_lowering.py
+++ b/python/cudf/cudf/core/udf/strings_lowering.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 import operator
 from functools import partial
@@ -249,7 +249,7 @@ def replace_impl(context, builder, sig, args):
     replacement_ptr = builder.alloca(args[2].type)
 
     builder.store(args[0], src_ptr)
-    builder.store(args[1], to_replace_ptr),
+    builder.store(args[1], to_replace_ptr)
     builder.store(args[2], replacement_ptr)
 
     udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type())
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index 12baf1ea6d1..bc1f4f2557e 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -41,9 +41,7 @@ from cudf.utils.utils import initfunc
 
 # Maximum size of a string column is 2 GiB
-_STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get(
-    "STRINGS_UDF_HEAP_SIZE", 2**31
-)
+_STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get("STRINGS_UDF_HEAP_SIZE", 2**31)
 _heap_size = 0
 
 _cudf_str_dtype = dtype(str)
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 3f5df18eae1..e811ba1351a 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.  # noqa: E501
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -437,9 +437,7 @@ def __get__(self, obj, owner=None) -> Any:
             # methods because dir for the method won't be the same as for
             # the pure unbound function, but the alternative is
             # materializing the slow object when we don't really want to.
-            result._fsproxy_slow_dir = dir(
-                slow_result_type
-            )  # type: ignore
+            result._fsproxy_slow_dir = dir(slow_result_type)  # type: ignore
             return result
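Note: the one-character fix in `strings_lowering.py` above (dropping the stray comma after `builder.store(args[1], to_replace_ptr)`) is worth calling out: a trailing comma turns a Python expression statement into a one-element tuple. Here the tuple was built and discarded, so behavior was unaffected, but the same slip bites hard when the value is kept. A self-contained illustration:

```python
def compute():
    return 42


a = compute()   # int: 42
b = compute(),  # stray comma builds a 1-tuple: (42,)
assert a == 42
assert b == (42,)
```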
diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py
index c5662d06e09..0124d411e3b 100644
--- a/python/cudf/cudf/pandas/profiler.py
+++ b/python/cudf/cudf/pandas/profiler.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -124,7 +124,7 @@ def get_namespaced_function_name(
         _MethodProxy,
         type[_FinalProxy],
         type[_IntermediateProxy],
-    ]
+    ],
 ):
     if isinstance(func_obj, _MethodProxy):
         # Extract classname from method object
@@ -177,17 +177,15 @@ def _tracefunc(self, frame, event, arg):
             if self._currkey is not None and arg is not None:
                 if arg[1]:  # fast
                     run_time = time.perf_counter() - self._timer[self._currkey]
-                    self._results[self._currkey][
-                        "gpu_time"
-                    ] = run_time + self._results[self._currkey].get(
-                        "gpu_time", 0
+                    self._results[self._currkey]["gpu_time"] = (
+                        run_time
+                        + self._results[self._currkey].get("gpu_time", 0)
                     )
                 else:
                     run_time = time.perf_counter() - self._timer[self._currkey]
-                    self._results[self._currkey][
-                        "cpu_time"
-                    ] = run_time + self._results[self._currkey].get(
-                        "cpu_time", 0
+                    self._results[self._currkey]["cpu_time"] = (
+                        run_time
+                        + self._results[self._currkey].get("cpu_time", 0)
                    )
 
             frame_locals = inspect.getargvalues(frame).locals
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index e034a3f5e10..ead1ab2da6c 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -2351,7 +2351,7 @@ def test_dataframe_reductions(data, axis, func, skipna):
     for kwargs in all_kwargs:
         if expected_exception is not None:
             with pytest.raises(expected_exception):
-                getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs),
+                (getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs),)
         else:
             expect = getattr(pdf, func)(axis=axis, skipna=skipna, **kwargs)
             with expect_warning_if(
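Note: the reformatted `_tracefunc` above accumulates per-key GPU/CPU timings rather than overwriting them, with `dict.get(key, 0)` supplying the zero the first time a key is seen. A minimal sketch of that accumulate-don't-overwrite pattern; the names here are illustrative, not the profiler's actual internals:

```python
import time

results: dict[str, dict[str, float]] = {}


def record(key: str, kind: str, started: float) -> None:
    run_time = time.perf_counter() - started
    bucket = results.setdefault(key, {})
    bucket[kind] = run_time + bucket.get(kind, 0)  # add to any prior total


t0 = time.perf_counter()
record("Series.sum", "gpu_time", t0)
record("Series.sum", "gpu_time", t0)  # totals accumulate across calls
```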
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 06516b6b4ea..c139b06d20f 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1259,7 +1259,7 @@ def test_groupby_unsupported_columns():
     pdg = pdf.groupby("x").sum(numeric_only=True)
     # cudf does not yet support numeric_only, so our default is False (unlike
     # pandas, which defaults to inferring and throws a warning about it).
-    gdg = gdf.groupby("x").sum()
+    gdg = gdf.groupby("x").sum(numeric_only=True)
     assert_groupby_results_equal(pdg, gdg)
 
 
@@ -2158,7 +2158,9 @@ def test_groupby_list_columns_excluded():
     pandas_agg_result = pdf.groupby("a").agg("mean", numeric_only=True)
 
     assert_groupby_results_equal(
-        pandas_result, gdf.groupby("a").mean(), check_dtype=False
+        pandas_result,
+        gdf.groupby("a").mean(numeric_only=True),
+        check_dtype=False,
     )
 
     assert_groupby_results_equal(
@@ -3826,3 +3828,27 @@ def test_groupby_shift_series_multiindex():
     result = ser.groupby(level=0).shift(1)
     expected = ser.to_pandas().groupby(level=0).shift(1)
     assert_eq(expected, result)
+
+
+@pytest.mark.parametrize(
+    "func", ["min", "max", "sum", "mean", "idxmin", "idxmax"]
+)
+@pytest.mark.parametrize(
+    "by,data",
+    [
+        ("a", {"a": [1, 2, 3]}),
+        (["a", "id"], {"id": [0, 0, 1], "a": [1, 2, 3]}),
+        ("a", {"a": [1, 2, 3], "b": ["A", "B", "C"]}),
+        ("id", {"id": [0, 0, 1], "a": [1, 2, 3], "b": ["A", "B", "C"]}),
+        (["b", "id"], {"id": [0, 0, 1], "b": ["A", "B", "C"]}),
+        ("b", {"b": ["A", "B", "C"]}),
+    ],
+)
+def test_group_by_reduce_numeric_only(by, data, func):
+    # Test that simple groupby reductions support numeric_only=True
+    df = cudf.DataFrame(data)
+    expected = getattr(df.to_pandas().groupby(by, sort=True), func)(
+        numeric_only=True
+    )
+    result = getattr(df.groupby(by, sort=True), func)(numeric_only=True)
+    assert_eq(expected, result)
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 51e9a3022f4..05213d7601c 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1721,8 +1721,7 @@ def test_get_indexer_single_unique_numeric(idx, key, method):
 
     if (
         # `method` only applicable to monotonic index
-        not pi.is_monotonic_increasing
-        and method is not None
+        not pi.is_monotonic_increasing and method is not None
     ):
         assert_exceptions_equal(
             lfunc=pi.get_loc,
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 69ddd936eee..a9bca7d8b98 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -608,7 +608,8 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq):
     from pyarrow import orc
 
     supported_stat_types = supported_numpy_dtypes + ["str"]
-    # Writing bool columns to multiple row groups is disabled until #6763 is fixed
+    # Writing bool columns to multiple row groups is disabled
+    # until #6763 is fixed
     if nrows == 100000:
         supported_stat_types.remove("bool")
 
@@ -683,7 +684,8 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
     np.random.seed(0)
     supported_stat_types = supported_numpy_dtypes + ["str"]
-    # Writing bool columns to multiple row groups is disabled until #6763 is fixed
+    # Writing bool columns to multiple row groups is disabled
+    # until #6763 is fixed
     if nrows == 200000:
         supported_stat_types.remove("bool")
 
@@ -697,8 +699,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
     # Make a dataframe
     gdf = cudf.DataFrame(
         {
-            "col_"
-            + str(dtype): gen_rand_series(
+            "col_" + str(dtype): gen_rand_series(
                 dtype,
                 nrows // 2,
                 has_nulls=True,
@@ -716,8 +717,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
     # write and no pointers are saved into the original table
     gdf = cudf.DataFrame(
         {
-            "col_"
-            + str(dtype): gen_rand_series(
+            "col_" + str(dtype): gen_rand_series(
                 dtype,
                 nrows // 2,
                 has_nulls=True,
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 18efd4417a1..8b72fe84359 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1087,8 +1087,9 @@ def struct_gen(gen, skip_rows, num_rows, include_validity=False):
     def R(first_val, num_fields):
         return {
-            "col"
-            + str(f): (gen[f](first_val, first_val) if f % 4 != 0 else None)
+            "col" + str(f): (
+                gen[f](first_val, first_val) if f % 4 != 0 else None
+            )
             if include_validity
             else (gen[f](first_val, first_val))
             for f in range(len(gen))
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 925fd24e6c8..85abf438efb 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -85,9 +85,7 @@
 0 10 hello
 1 20 rapids
 2 30 ai
-""".format(
-    remote_data_sources=_docstring_remote_sources
-)
+""".format(remote_data_sources=_docstring_remote_sources)
 doc_read_avro = docfmt_partial(docstring=_docstring_read_avro)
 
 _docstring_read_parquet_metadata = """
@@ -1416,9 +1414,7 @@
     list of Filepath strings or in-memory buffers of data.
 compression : str
     Type of compression algorithm for the content
-""".format(
-    bytes_per_thread=_BYTES_PER_THREAD_DEFAULT
-)
+""".format(bytes_per_thread=_BYTES_PER_THREAD_DEFAULT)
 doc_get_reader_filepath_or_buffer = docfmt_partial(