Merge remote-tracking branch 'upstream/branch-24.06' into drop-centos7
bdice committed Mar 19, 2024
2 parents fd71e48 + ae60f1d commit 07cb2fe
Showing 26 changed files with 159 additions and 141 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/test.yaml
@@ -125,5 +125,4 @@ jobs:
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
- # pr mode uses the HEAD of the branch, which is also correct for nightlies
- script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
+ script: ci/cudf_pandas_scripts/pandas-tests/run.sh main
12 changes: 2 additions & 10 deletions .pre-commit-config.yaml
@@ -23,13 +23,6 @@ repos:
args: ["--config-root=python/", "--resolve-all-configs"]
files: python/.*
types_or: [python, cython, pyi]
- - repo: https://github.com/psf/black
- rev: 23.12.1
- hooks:
- - id: black
- files: python/.*
- # Explicitly specify the pyproject.toml at the repo root, not per-project.
- args: ["--config", "pyproject.toml"]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.16.0
hooks:
@@ -64,9 +57,6 @@ repos:
# Use the cudf_kafka isort orderings in notebooks so that dask
# and RAPIDS packages have their own sections.
args: ["--settings-file=python/cudf_kafka/pyproject.toml"]
- - id: nbqa-black
- # Explicitly specify the pyproject.toml at the repo root, not per-project.
- args: ["--config=pyproject.toml"]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v16.0.6
hooks:
@@ -155,6 +145,8 @@ repos:
hooks:
- id: ruff
files: python/.*$
+ - id: ruff-format
+ files: python/.*$
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v0.0.1
hooks:
1 change: 1 addition & 0 deletions ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -32,3 +32,4 @@ python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json p
RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"}
mkdir -p "${RAPIDS_ARTIFACTS_DIR}"
mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/
+ rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${PANDAS_TESTS_BRANCH}-results.json "${RAPIDS_ARTIFACTS_DIR}"
11 changes: 5 additions & 6 deletions ci/test_python_other.sh
@@ -19,8 +19,8 @@ EXITCODE=0
trap "EXITCODE=1" ERR
set +e

rapids-logger "pytest dask_cudf"
./ci/run_dask_cudf_pytests.sh \
rapids-logger "pytest dask_cudf (dask-expr)"
DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
--junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
--numprocesses=8 \
--dist=worksteal \
@@ -29,10 +29,9 @@ rapids-logger "pytest dask_cudf"
--cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \
--cov-report=term

- # Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr
- rapids-logger "pytest dask_cudf + dask_expr"
- DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
- --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \
+ rapids-logger "pytest dask_cudf (legacy)"
+ DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \
+ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
--numprocesses=8 \
--dist=loadscope \
.
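A note on the toggle these CI blocks flip: DASK_DATAFRAME__QUERY_PLANNING is the environment-variable spelling of Dask's dataframe.query-planning config key, which selects the dask-expr implementation (True) or the legacy one (False). A minimal Python sketch of the same switch, assuming dask and dask_cudf are installed; the key must be set before dask.dataframe is first imported:

    import dask

    # Equivalent to exporting DASK_DATAFRAME__QUERY_PLANNING=True in the shell:
    # True selects the dask-expr implementation, False the legacy one.
    dask.config.set({"dataframe.query-planning": True})

    import dask_cudf  # noqa: E402  (imported only after the flag is set)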
12 changes: 6 additions & 6 deletions ci/test_wheel_dask_cudf.sh
@@ -18,19 +18,19 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
mkdir -p "${RAPIDS_TESTS_DIR}"

# Run tests in dask_cudf/tests and dask_cudf/io/tests
rapids-logger "pytest dask_cudf"
rapids-logger "pytest dask_cudf (dask-expr)"
pushd python/dask_cudf/dask_cudf
- python -m pytest \
+ DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
--junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
--numprocesses=8 \
.
popd

- # Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr
- rapids-logger "pytest dask_cudf + dask_expr"
+ # Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy)
+ rapids-logger "pytest dask_cudf (legacy)"
pushd python/dask_cudf/dask_cudf
- DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
- --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \
+ DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \
+ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
--numprocesses=8 \
.
popd
3 changes: 1 addition & 2 deletions cpp/src/io/comp/gpuinflate.cu
@@ -804,8 +804,7 @@ __device__ void process_symbols(inflate_state_s* s, int t)
dist = symbol >> 16;
for (int i = t; i < len; i += 32) {
uint8_t const* src = out + ((i >= dist) ? (i % dist) : i) - dist;
- uint8_t b = (src < outbase) ? 0 : *src;
- if (out + i < outend) { out[i] = b; }
+ if (out + i < outend and src >= outbase) { out[i] = *src; }
}
out += len;
pos++;
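The hunk above folds the out-of-bounds handling of the LZ77 back-reference copy into a single guard: a source byte lying before the start of the output buffer is now skipped instead of being written as 0. A toy scalar model of that loop, in Python purely for illustration (the real code is a warp-strided CUDA loop; copy_match and its parameters are hypothetical names mirroring the kernel's):

    def copy_match(out: bytearray, outbase: int, pos: int, dist: int, length: int) -> None:
        """Scalar sketch of the per-thread copy in process_symbols."""
        for i in range(length):
            # Mirrors: src = out + ((i >= dist) ? (i % dist) : i) - dist
            src = pos + (i % dist if i >= dist else i) - dist
            # Combined guard from the diff: write only if the destination is in
            # bounds AND the source is not before the buffer base.
            if pos + i < len(out) and src >= outbase:
                out[pos + i] = out[src]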
5 changes: 5 additions & 0 deletions cpp/src/io/utilities/datasource.cpp
@@ -44,6 +44,11 @@ class file_source : public datasource {
explicit file_source(char const* filepath) : _file(filepath, O_RDONLY)
{
if (detail::cufile_integration::is_kvikio_enabled()) {
+ // Workaround for https://github.com/rapidsai/cudf/issues/14140, where cuFileDriverOpen errors
+ // out if no CUDA calls have been made before it. This is a no-op if the CUDA context is
+ // already initialized
+ cudaFree(0);
+
_kvikio_file = kvikio::FileHandle(filepath);
CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.",
_kvikio_file.is_compat_mode_on() ? "on" : "off");
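cudaFree(0) is the long-standing no-op idiom for forcing lazy CUDA context creation. The same effect is available from Python by touching any CUDA runtime API before a context-dependent library is used; a hypothetical sketch with numba:

    from numba import cuda

    # Any runtime call creates the CUDA context if none exists yet, mirroring
    # the cudaFree(0) workaround above; it is a no-op otherwise.
    cuda.current_context()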
7 changes: 5 additions & 2 deletions cpp/tests/error/error_handling_test.cu
@@ -97,7 +97,8 @@ TEST(DebugAssertDeathTest, cudf_assert_false)
testing::FLAGS_gtest_death_test_style = "threadsafe";

auto call_kernel = []() {
- assert_false_kernel<<<1, 1>>>();
+ auto const stream = cudf::get_default_stream().value();
+ assert_false_kernel<<<1, 1, 0, stream>>>();

// Kernel should fail with `cudaErrorAssert`
// This error invalidates the current device context, so we need to kill
@@ -114,7 +115,8 @@

TEST(DebugAssert, cudf_assert_true)
{
- assert_true_kernel<<<1, 1>>>();
+ auto const stream = cudf::get_default_stream().value();
+ assert_true_kernel<<<1, 1, 0, stream>>>();
ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
}

@@ -136,6 +138,7 @@ int main(int argc, char** argv)
auto adaptor = make_stream_checking_resource_adaptor(
resource, error_on_invalid_stream, check_default_stream);
rmm::mr::set_current_device_resource(&adaptor);
+ return RUN_ALL_TESTS();
}
return RUN_ALL_TESTS();
}
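Both kernels now launch on cudf's default stream so that the stream-checking resource adaptor installed in main() (last hunk above) does not flag a bare stream-0 launch. For illustration only, the analogous explicit-stream launch in numba, with hypothetical names:

    import numpy as np
    from numba import cuda

    @cuda.jit
    def touch(arr):
        arr[0] = 1

    stream = cuda.stream()
    d_arr = cuda.to_device(np.zeros(1, dtype=np.int64), stream=stream)
    # (griddim, blockdim, stream, dynamic shared memory): the numba analogue of
    # the CUDA C++ launch configuration <<<1, 1, 0, stream>>> used above.
    touch[1, 1, stream, 0](d_arr)
    stream.synchronize()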
24 changes: 4 additions & 20 deletions pyproject.toml
@@ -1,22 +1,4 @@
- [tool.black]
- line-length = 79
- target-version = ["py39"]
- include = '\.py?$'
- force-exclude = '''
- /(
- thirdparty |
- \.eggs |
- \.git |
- \.hg |
- \.mypy_cache |
- \.tox |
- \.venv |
- _build |
- buck-out |
- build |
- dist
- )/
- '''
+ # Copyright (c) 2019-2024, NVIDIA CORPORATION.

[tool.pydocstyle]
# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather
@@ -60,13 +42,15 @@ select = ["E", "F", "W"]
ignore = [
# whitespace before :
"E203",
+ # line-too-long (due to Copyright header)
+ "E501",
]
fixable = ["ALL"]
exclude = [
# TODO: Remove this in a follow-up where we fix __all__.
"__init__.py",
]
- line-length = 88
+ line-length = 79

[tool.ruff.per-file-ignores]
# Lots of pytest implicitly injected attributes in conftest-patch.py
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/_internals/timezones.py
@@ -85,8 +85,9 @@ def _read_tzfile_as_frame(tzdir, zone_name):
if not transition_times_and_offsets:
# this happens for UTC-like zones
min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
- transition_times_and_offsets = as_column([min_date]), as_column(
- [np.timedelta64(0, "s")]
+ transition_times_and_offsets = (
+ as_column([min_date]),
+ as_column([np.timedelta64(0, "s")]),
)

return DataFrame._from_data(
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/column.py
@@ -1731,7 +1731,8 @@ def as_column(
If None (default), treats NaN values in arbitrary as null if there is
no mask passed along with it. If True, combines the mask and NaNs to
form a new validity mask. If False, leaves NaN values as is.
- Only applies when arbitrary is not a cudf object (Index, Series, Column).
+ Only applies when arbitrary is not a cudf object
+ (Index, Series, Column).
dtype : optional
Optionally typecast the constructed Column to the given
dtype.
33 changes: 21 additions & 12 deletions python/cudf/cudf/core/dataframe.py
@@ -470,9 +470,12 @@ class _DataFrameIlocIndexer(_DataFrameIndexer):
_frame: DataFrame

def __getitem__(self, arg):
- row_key, (
- col_is_scalar,
- column_names,
+ (
+ row_key,
+ (
+ col_is_scalar,
+ column_names,
+ ),
) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame)
row_spec = indexing_utils.parse_row_iloc_indexer(
row_key, len(self._frame)
@@ -6901,16 +6904,18 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
if future_stack:
if dropna is not no_default:
raise ValueError(
"dropna must be unspecified with future_stack=True as the new "
"implementation does not introduce rows of NA values. This "
"argument will be removed in a future version of cudf."
"dropna must be unspecified with future_stack=True as "
"the new implementation does not introduce rows of NA "
"values. This argument will be removed in a future "
"version of cudf."
)
else:
if dropna is not no_default or self._data.nlevels > 1:
warnings.warn(
"The previous implementation of stack is deprecated and will be "
"removed in a future version of cudf. Specify future_stack=True "
"to adopt the new implementation and silence this warning.",
"The previous implementation of stack is deprecated and "
"will be removed in a future version of cudf. Specify "
"future_stack=True to adopt the new implementation and "
"silence this warning.",
FutureWarning,
)
if dropna is no_default:
@@ -7028,9 +7033,13 @@ def unnamed_group_generator():
unique_named_levels, axis=0, fill_value=-1
).values
else:
- yield grpdf.reindex(
- unique_named_levels, axis=0, fill_value=-1
- ).sort_index().values
+ yield (
+ grpdf.reindex(
+ unique_named_levels, axis=0, fill_value=-1
+ )
+ .sort_index()
+ .values
+ )
else:
if future_stack:
yield column_idx_df.values
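The rewrapped messages above document the two stack code paths. A small usage sketch of that API, illustrative only:

    import cudf

    df = cudf.DataFrame({"a": [1, 2], "b": [3, 4]})

    # New implementation: introduces no NA rows, so dropna must stay unspecified.
    new_style = df.stack(future_stack=True)

    # Legacy implementation: emits the FutureWarning quoted above when dropna is
    # passed explicitly or the columns have multiple levels.
    legacy = df.stack(future_stack=False)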
42 changes: 32 additions & 10 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -22,7 +22,12 @@
from cudf._lib.types import size_type_dtype
from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
from cudf.api.extensions import no_default
- from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like
+ from cudf.api.types import (
+ is_bool_dtype,
+ is_float_dtype,
+ is_list_like,
+ is_numeric_dtype,
+ )
from cudf.core._compat import PANDAS_LT_300
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, StructDtype, as_column
@@ -282,9 +287,12 @@ def __iter__(self):
if isinstance(group_names, cudf.BaseIndex):
group_names = group_names.to_pandas()
for i, name in enumerate(group_names):
- yield (name,) if isinstance(self._by, list) and len(
- self._by
- ) == 1 else name, grouped_values[offsets[i] : offsets[i + 1]]
+ yield (
+ (name,)
+ if isinstance(self._by, list) and len(self._by) == 1
+ else name,
+ grouped_values[offsets[i] : offsets[i + 1]],
+ )

@property
def dtypes(self):
@@ -698,6 +706,11 @@ def agg(self, func):

return result

+ def _reduce_numeric_only(self, op: str):
+ raise NotImplementedError(
+ f"numeric_only is not implemented for {type(self)}"
+ )
+
def _reduce(
self,
op: str,
@@ -728,14 +741,12 @@ def _reduce(
The numeric_only, min_count
"""
- if numeric_only:
- raise NotImplementedError(
- "numeric_only parameter is not implemented yet"
- )
if min_count != 0:
raise NotImplementedError(
"min_count parameter is not implemented yet"
)
+ if numeric_only:
+ return self._reduce_numeric_only(op)
return self.agg(op)

def _scan(self, op: str, *args, **kwargs):
@@ -2269,8 +2280,8 @@ def fillna(
"""
warnings.warn(
"groupby fillna is deprecated and "
"will be removed in a future version. Use groupby ffill or groupby bfill "
"for forward or backward filling instead.",
"will be removed in a future version. Use groupby ffill "
"or groupby bfill for forward or backward filling instead.",
FutureWarning,
)
if inplace:
@@ -2645,6 +2656,17 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):

_PROTECTED_KEYS = frozenset(("obj",))

+ def _reduce_numeric_only(self, op: str):
+ columns = list(
+ name
+ for name in self.obj._data.names
+ if (
+ is_numeric_dtype(self.obj._data[name].dtype)
+ and name not in self.grouping.names
+ )
+ )
+ return self[columns].agg(op)
+
def __getitem__(self, key):
return self.obj[key].groupby(
by=self.grouping.keys,
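The new _reduce_numeric_only hook makes numeric_only reductions work on DataFrameGroupBy by restricting the aggregation to numeric, non-grouping columns, instead of raising NotImplementedError (the base-class fallback added above). A short sketch of the behavior this enables, illustrative only:

    import cudf

    df = cudf.DataFrame(
        {"key": ["a", "a", "b"], "x": [1, 2, 3], "label": ["u", "v", "w"]}
    )

    # Aggregates only the numeric, non-grouping column "x"; the string column
    # "label" is excluded instead of triggering NotImplementedError.
    print(df.groupby("key").sum(numeric_only=True))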