From bee2a38b63fb5e4ef90f243a3c51cf23fbf3c984 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 3 May 2024 11:58:14 -1000
Subject: [PATCH] Enable FutureWarnings/DeprecationWarnings as errors for
 dask_cudf (#15634)

Part of https://github.com/rapidsai/build-planning/issues/26

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15634
---
 python/cudf/cudf/core/index.py                  | 18 +++++++++++++++---
 python/cudf/cudf/tests/test_index.py            | 12 ++++++++----
 .../dask_cudf/dask_cudf/io/tests/test_json.py   |  5 ++---
 .../dask_cudf/dask_cudf/tests/test_accessor.py  |  6 +++---
 .../dask_cudf/dask_cudf/tests/test_groupby.py   | 10 +++++-----
 python/dask_cudf/dask_cudf/tests/test_join.py   | 16 ++++++++++++----
 python/dask_cudf/pyproject.toml                 | 10 ++++++++++
 7 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 583e5d74b56..b51751a1b55 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1119,14 +1119,26 @@ def _concat(cls, objs):
             assert (
                 PANDAS_LT_300
             ), "Need to drop after pandas-3.0 support is added."
-            warnings.warn(
+            warning_msg = (
                 "The behavior of array concatenation with empty entries is "
                 "deprecated. In a future version, this will no longer exclude "
                 "empty items when determining the result dtype. "
                 "To retain the old behavior, exclude the empty entries before "
-                "the concat operation.",
-                FutureWarning,
+                "the concat operation."
             )
+            # Warn only if the type might _actually_ change
+            if len(non_empties) == 0:
+                if not all(objs[0].dtype == index.dtype for index in objs[1:]):
+                    warnings.warn(warning_msg, FutureWarning)
+            else:
+                common_all_type = find_common_type(
+                    [index.dtype for index in objs]
+                )
+                common_non_empty_type = find_common_type(
+                    [index.dtype for index in non_empties]
+                )
+                if common_all_type != common_non_empty_type:
+                    warnings.warn(warning_msg, FutureWarning)
         if all(isinstance(obj, RangeIndex) for obj in non_empties):
             result = _concat_range_index(non_empties)
         else:
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index c7875b81440..104a5fc0ffa 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1039,7 +1039,9 @@ def test_index_append(data, other):
         (len(data) == 0 or len(other) == 0) and pd_data.dtype != pd_other.dtype
     ):
         expected = pd_data.append(pd_other)
-    with expect_warning_if(len(data) == 0 or len(other) == 0):
+    with expect_warning_if(
+        (len(data) == 0 or len(other) == 0) and gd_data.dtype != gd_other.dtype
+    ):
         actual = gd_data.append(gd_other)
     if len(data) == 0 and len(other) == 0:
         # Pandas default dtype to "object" for empty list
@@ -1237,7 +1239,10 @@ def test_index_append_list(data, other):
         and (any(d.dtype != data.dtype for d in other))
     ):
         expected = pd_data.append(pd_other)
-    with expect_warning_if(len(data) == 0 or any(len(d) == 0 for d in other)):
+    with expect_warning_if(
+        (len(data) == 0 or any(len(d) == 0 for d in other))
+        and (any(d.dtype != data.dtype for d in other))
+    ):
         actual = gd_data.append(gd_other)
 
     assert_eq(expected, actual)
@@ -2817,8 +2822,7 @@ def test_index_methods(index, func):
 
     if func == "append":
         expected = pidx.append(other=pidx)
-        with expect_warning_if(len(gidx) == 0):
-            actual = gidx.append(other=gidx)
+        actual = gidx.append(other=gidx)
     else:
         expected = getattr(pidx, func)()
         actual = getattr(gidx, func)()
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index f8e5be0a417..dc780478794 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -84,9 +84,8 @@ def test_read_json_nested(tmp_path):
         }
     )
     kwargs = dict(orient="records", lines=True)
-    with tmp_path / "data.json" as f, dask.config.set(
-        {"dataframe.convert-string": False}
-    ):
+    f = tmp_path / "data.json"
+    with dask.config.set({"dataframe.convert-string": False}):
         df.to_json(f, **kwargs)
         # Ensure engine='cudf' is tested.
         actual = dask_cudf.read_json(f, engine="cudf", **kwargs)
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index ae17b89832a..035b73094e7 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -543,7 +543,7 @@ def test_struct_explode(data):
 
 
 def test_tz_localize():
-    data = Series(date_range("2000-04-01", "2000-04-03", freq="H"))
+    data = Series(date_range("2000-04-01", "2000-04-03", freq="h"))
     expect = data.dt.tz_localize(
         "US/Eastern", ambiguous="NaT", nonexistent="NaT"
     )
@@ -560,8 +560,8 @@ def test_tz_localize():
 @pytest.mark.parametrize(
     "data",
     [
-        date_range("2000-04-01", "2000-04-03", freq="H").tz_localize("UTC"),
-        date_range("2000-04-01", "2000-04-03", freq="H").tz_localize(
+        date_range("2000-04-01", "2000-04-03", freq="h").tz_localize("UTC"),
+        date_range("2000-04-01", "2000-04-03", freq="h").tz_localize(
             "US/Eastern"
         ),
     ],
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index 67fa045d3d0..f96b5b760d8 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -233,7 +233,7 @@ def test_groupby_split_out(split_out, column):
     gddf = dask_cudf.from_cudf(gdf, npartitions=3)
 
     ddf_result = (
-        ddf.groupby(column)
+        ddf.groupby(column, observed=True)
         .a.mean(split_out=split_out)
         .compute()
         .sort_values()
@@ -368,10 +368,10 @@ def test_groupby_dropna_dask(dropna, by):
 
     if dropna is None:
         dask_cudf_result = gddf.groupby(by).e.sum()
-        dask_result = ddf.groupby(by).e.sum()
+        dask_result = ddf.groupby(by, observed=True).e.sum()
     else:
         dask_cudf_result = gddf.groupby(by, dropna=dropna).e.sum()
-        dask_result = ddf.groupby(by, dropna=dropna).e.sum()
+        dask_result = ddf.groupby(by, dropna=dropna, observed=True).e.sum()
 
     dd.assert_eq(dask_cudf_result, dask_result)
 
@@ -505,7 +505,7 @@ def test_groupby_reset_index_dtype():
     a = df.groupby("a").agg({"b": ["count"]})
 
     assert a.index.dtype == "int8"
-    assert a.reset_index().dtypes[0] == "int8"
+    assert a.reset_index().dtypes.iloc[0] == "int8"
 
 
 def test_groupby_reset_index_names():
@@ -563,7 +563,7 @@ def test_groupby_categorical_key():
     # (See: https://github.com/dask/dask/issues/9515)
     expect = (
         ddf.compute()
-        .groupby("name", sort=True)
+        .groupby("name", sort=True, observed=True)
         .agg({"x": ["mean", "max"], "y": ["mean", "count"]})
     )
     dd.assert_eq(expect, got)
diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py
index 42ecc130298..ed291ef31a7 100644
--- a/python/dask_cudf/dask_cudf/tests/test_join.py
+++ b/python/dask_cudf/dask_cudf/tests/test_join.py
@@ -66,8 +66,12 @@ def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys):
     def gather(df, grows):
         grows[df["x"].values[0]] = (set(df.al), set(df.ar))
 
-    expect.reset_index().groupby("x").apply(partial(gather, grows=expect_rows))
-    expect.reset_index().groupby("x").apply(partial(gather, grows=got_rows))
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=expect_rows)
+    )
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=got_rows)
+    )
 
     assert got_rows == expect_rows
 
@@ -127,9 +131,13 @@ def gather(df, grows):
 
     grows[df["x"].values[0]] = (cola, colb)
 
-    expect.reset_index().groupby("x").apply(partial(gather, grows=expect_rows))
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=expect_rows)
+    )
 
-    expect.reset_index().groupby("x").apply(partial(gather, grows=got_rows))
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=got_rows)
+    )
 
     for k in expect_rows:
         np.testing.assert_array_equal(expect_rows[k][0], got_rows[k][0])
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index fcf83e82989..5fbdd98225e 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -107,3 +107,13 @@ skip = [
   "build",
   "dist",
 ]
+
+[tool.pytest.ini_options]
+filterwarnings = [
+  "error::FutureWarning",
+  "error::DeprecationWarning",
+  "ignore:create_block_manager_from_blocks is deprecated and will be removed in a future version. Use public APIs instead.:DeprecationWarning",
+  # https://github.com/dask/partd/blob/main/partd/pandas.py#L198
+  "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning",
+  "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask",
+]