From c30626e2ae0542f3e27a19c814f405d5e93043f5 Mon Sep 17 00:00:00 2001 From: Austin Au-Yeung Date: Tue, 1 Aug 2023 20:32:17 -0500 Subject: [PATCH 1/8] BUG: pivot_table mean of integer input casted back to int (#54263) --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/reshape/pivot.py | 23 -------- pandas/tests/frame/methods/test_drop.py | 5 +- pandas/tests/groupby/test_groupby.py | 3 -- pandas/tests/reshape/test_pivot.py | 71 ++++++++++++++----------- 5 files changed, 42 insertions(+), 62 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6c91c4b512f41..c50e031c815a6 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -787,6 +787,7 @@ Other - Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`) - Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`) - Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`) +- Bug in :meth:`DataFrame.pivot_table` with casting the mean of ints back to an int (:issue:`16676`) - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`) - Bug in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`DataFrameGroupBy.shift` when passing both "freq" and "fill_value" silently ignoring "fill_value" instead of raising ``ValueError`` (:issue:`53832`) - Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`) @@ -798,7 +799,6 @@ Other - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - Bug in :meth:`period_range` the default behavior when freq was not 
passed as an argument was incorrect(:issue:`53687`) - Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`) -- .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 099bfde7af1d3..5c2e94735ddc5 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -20,7 +20,6 @@ from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - is_integer_dtype, is_list_like, is_nested_list_like, is_scalar, @@ -172,28 +171,6 @@ def __internal_pivot_table( if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") - # gh-21133 - # we want to down cast if - # the original values are ints - # as we grouped with a NaN value - # and then dropped, coercing to floats - for v in values: - if ( - v in data - and is_integer_dtype(data[v]) - and v in agged - and not is_integer_dtype(agged[v]) - ): - if not isinstance(agged[v], ABCDataFrame) and isinstance( - data[v].dtype, np.dtype - ): - # exclude DataFrame case bc maybe_downcast_to_dtype expects - # ArrayLike - # e.g. test_pivot_table_multiindex_columns_doctest_case - # agged.columns is a MultiIndex and 'v' is indexing only - # on its first level. 
- agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) - table = agged # GH17038, this check should only happen if index is defined (not None) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 0a796982e9fca..9a4882f11e961 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -187,10 +187,7 @@ def test_drop_multiindex_not_lexsorted(self): not_lexsorted_df = not_lexsorted_df.reset_index() assert not not_lexsorted_df.columns._is_lexsorted() - # compare the results - tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - - expected = lexsorted_df.drop("a", axis=1) + expected = lexsorted_df.drop("a", axis=1).astype(float) with tm.assert_produces_warning(PerformanceWarning): result = not_lexsorted_df.drop("a", axis=1) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index fc0efb74a9b62..e247c179b1fcb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1818,9 +1818,6 @@ def test_groupby_multiindex_not_lexsorted(): not_lexsorted_df = not_lexsorted_df.reset_index() assert not not_lexsorted_df.columns._is_lexsorted() - # compare the results - tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - expected = lexsorted_df.groupby("a").mean() with tm.assert_produces_warning(PerformanceWarning): result = not_lexsorted_df.groupby("a").mean() diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 9fed22d4d0cba..43786ee15d138 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -204,7 +204,7 @@ def test_pivot_table_categorical(self): result = pivot_table(df, values="values", index=["A", "B"], dropna=True) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) - expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) + expected = DataFrame({"values": [1.0, 2.0, 3.0, 4.0]}, index=exp_index) 
tm.assert_frame_equal(result, expected) def test_pivot_table_dropna_categoricals(self, dropna): @@ -225,7 +225,7 @@ def test_pivot_table_dropna_categoricals(self, dropna): expected_columns = expected_columns.astype(CDT(categories, ordered=False)) expected_index = Series([1, 2, 3], name="B") expected = DataFrame( - [[0, 3, 6], [1, 4, 7], [2, 5, 8]], + [[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]], index=expected_index, columns=expected_columns, ) @@ -283,7 +283,7 @@ def test_pivot_with_non_observable_dropna_multi_cat(self, dropna): result = df.pivot_table(index="A", values="B", dropna=dropna) expected = DataFrame( - {"B": [2, 3, 0]}, + {"B": [2.0, 3.0, 0.0]}, index=Index( Categorical.from_codes( [0, 1, 2], categories=["low", "high", "left"], ordered=True @@ -300,7 +300,9 @@ def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 df = DataFrame({"A": interval_values, "B": 1}) result = df.pivot_table(index="A", values="B", dropna=dropna) - expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) + expected = DataFrame( + {"B": 1.0}, index=Index(interval_values.unique(), name="A") + ) if not dropna: expected = expected.astype(float) tm.assert_frame_equal(result, expected) @@ -444,7 +446,7 @@ def test_pivot_no_values(self): index=Grouper(freq="A"), columns=Grouper(key="dt", freq="M") ) exp = DataFrame( - [3], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns + [3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns ) tm.assert_frame_equal(res, exp) @@ -1059,7 +1061,7 @@ def test_pivot_table_multiindex_only(self, cols): result = df2.pivot_table(values="v", columns=cols) expected = DataFrame( - [[4, 5, 6]], + [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), index=Index(["v"]), ) @@ -1564,7 +1566,9 @@ def test_pivot_datetime_tz(self): exp_col1 = Index(["value1", "value1"]) exp_col2 = Index(["a", "b"], name="label") exp_col = 
MultiIndex.from_arrays([exp_col1, exp_col2]) - expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) + expected = DataFrame( + [[0.0, 3.0], [1.0, 4.0], [2.0, 5.0]], index=exp_idx, columns=exp_col + ) result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"]) tm.assert_frame_equal(result, expected) @@ -1576,18 +1580,35 @@ def test_pivot_datetime_tz(self): name="dt2", ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) - expected = DataFrame( + expected1 = DataFrame( np.array( [ - [0, 3, 1, 2, 0, 3, 1, 2], - [1, 4, 2, 1, 1, 4, 2, 1], - [2, 5, 1, 2, 2, 5, 1, 2], + [ + 0, + 3, + 1, + 2, + ], + [1, 4, 2, 1], + [2, 5, 1, 2], ], dtype="int64", ), index=exp_idx, - columns=exp_col, + columns=exp_col[:4], + ) + expected2 = DataFrame( + np.array( + [ + [0.0, 3.0, 1.0, 2.0], + [1.0, 4.0, 2.0, 1.0], + [2.0, 5.0, 1.0, 2.0], + ], + ), + index=exp_idx, + columns=exp_col[4:], ) + expected = concat([expected1, expected2], axis=1) result = pivot_table( df, @@ -1634,7 +1655,7 @@ def test_pivot_dtaccessor(self): exp_idx = Index(["a", "b"], name="label") expected = DataFrame( - {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + {7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]}, index=exp_idx, columns=Index([7, 8, 9], dtype=np.int32, name="dt1"), ) @@ -1645,7 +1666,7 @@ def test_pivot_dtaccessor(self): ) expected = DataFrame( - {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + {7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]}, index=Index([1, 2], dtype=np.int32, name="dt2"), columns=Index([7, 8, 9], dtype=np.int32, name="dt1"), ) @@ -1666,7 +1687,7 @@ def test_pivot_dtaccessor(self): names=["dt1", "dt2"], ) expected = DataFrame( - np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), + np.array([[0.0, 3.0, 1.0, 4.0, 2.0, 5.0]]), index=Index([2013], dtype=np.int32), columns=exp_col, ) @@ -1770,13 +1791,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): expected = DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) - def 
test_categorical_margins(self, observed, request): - if observed: - request.node.add_marker( - pytest.mark.xfail( - reason="GH#17035 (np.mean of ints is casted back to ints)" - ) - ) + def test_categorical_margins(self, observed): # GH 10989 df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} @@ -1789,13 +1804,7 @@ def test_categorical_margins(self, observed, request): table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) - def test_categorical_margins_category(self, observed, request): - if observed: - request.node.add_marker( - pytest.mark.xfail( - reason="GH#17035 (np.mean of ints is casted back to ints)" - ) - ) + def test_categorical_margins_category(self, observed): df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} ) @@ -1822,7 +1831,7 @@ def test_margins_casted_to_float(self): result = pivot_table(df, index="D", margins=True) expected = DataFrame( - {"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]}, + {"A": [3.0, 7.0, 5], "B": [2.5, 6.5, 4.5], "C": [2.0, 5.0, 3.5]}, index=Index(["X", "Y", "All"], name="D"), ) tm.assert_frame_equal(result, expected) @@ -2255,7 +2264,7 @@ def test_pivot_table_sort_false_with_multiple_values(self): index=["lastname", "firstname"], values=["height", "age"], sort=False ) expected = DataFrame( - [[173, 47], [182, 33]], + [[173.0, 47.0], [182.0, 33.0]], columns=["height", "age"], index=MultiIndex.from_tuples( [("Foo", "John"), ("Bar", "Michael")], From e287acc1b80d032a7ee8b0bd195ff34678034b73 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 2 Aug 2023 05:22:31 -0400 Subject: [PATCH 2/8] CI: fix cython lint errors (#54370) fix pre-commit checks --- pandas/_libs/sparse.pyx | 2 +- pandas/_libs/tslibs/offsets.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 8d01fe4efe545..b42a9415fbefb 100644 --- a/pandas/_libs/sparse.pyx +++ 
b/pandas/_libs/sparse.pyx @@ -726,7 +726,7 @@ def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value): for i in range(new_length): value = arr[i] - if value == fill_value and type(value) == type(fill_value): + if value == fill_value and type(value) is type(fill_value): mask[i] = 0 return mask.view(dtype=bool) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 84b102bd4a262..028e79774607d 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -498,7 +498,7 @@ cdef class BaseOffset: def __sub__(self, other): if PyDateTime_Check(other): raise TypeError("Cannot subtract datetime from offset.") - elif type(other) == type(self): + elif type(other) is type(self): return type(self)(self.n - other.n, normalize=self.normalize, **self.kwds) elif not isinstance(self, BaseOffset): @@ -1047,7 +1047,7 @@ cdef class Tick(SingleConstructorOffset): return other.__add__(self) if isinstance(other, Tick): - if type(self) == type(other): + if type(self) is type(other): return type(self)(self.n + other.n) else: return delta_to_tick(self.delta + other.delta) From 67984e9789eabea6bc8141b62bd2d879c19465a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Wed, 2 Aug 2023 17:37:32 +0200 Subject: [PATCH 3/8] DOC: Fixing EX01 - Added examples (#54357) * Example for writer * updated code_checks.sh --- ci/code_checks.sh | 1 - pandas/io/stata.py | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 59e707c32ceb6..a793e03b8745f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -63,7 +63,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then MSG='Partially validate docstrings (EX01)' ; echo $MSG $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01 --ignore_functions \ - pandas.io.stata.StataWriter.write_file \ pandas.api.extensions.ExtensionArray \ RET=$(($RET + $?)) ; echo 
$MSG "DONE" diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 054d73a8aba42..698a2882ada39 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2737,6 +2737,30 @@ def _encode_strings(self) -> None: def write_file(self) -> None: """ Export DataFrame object to Stata dta format. + + Examples + -------- + >>> df = pd.DataFrame({"fully_labelled": [1, 2, 3, 3, 1], + ... "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan], + ... "Y": [7, 7, 9, 8, 10], + ... "Z": pd.Categorical(["j", "k", "l", "k", "j"]), + ... }) + >>> path = "/My_path/filename.dta" + >>> labels = {"fully_labelled": {1: "one", 2: "two", 3: "three"}, + ... "partially_labelled": {1.0: "one", 2.0: "two"}, + ... } + >>> writer = pd.io.stata.StataWriter(path, + ... df, + ... value_labels=labels) # doctest: +SKIP + >>> writer.write_file() # doctest: +SKIP + >>> df = pd.read_stata(path) # doctest: +SKIP + >>> df # doctest: +SKIP + index fully_labelled partially_labelled Y Z + 0 0 one one 7 j + 1 1 two two 7 k + 2 2 three NaN 9 l + 3 3 three 9.0 8 k + 4 4 one NaN 10 j """ with get_handle( self._fname, From 7cbf949817fef013d25af512467cd13358850bbb Mon Sep 17 00:00:00 2001 From: mrastgoo Date: Wed, 2 Aug 2023 17:39:28 +0200 Subject: [PATCH 4/8] ENH: explicit filters parameter in pd.read_parquet (#53212) * filters parameters in pd.read_parquet * linter * docstring validation * test for filter args in pd.read_parquet * black * addressing reviews --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/io/parquet.py | 31 ++++++++++++++++++++++++++++--- pandas/tests/io/test_parquet.py | 19 +++++++++++++++++++ 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index c50e031c815a6..b869859e0ca80 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -669,6 +669,7 @@ I/O ^^^ - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`) - 
:meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`) +- Added ``filters`` parameter to :func:`read_parquet` to filter out data, compatible with both ``engines`` (:issue:`53212`) - Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`) - Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) - Bug in :func:`read_csv`, with ``engine="pyarrow"`` erroring when specifying a ``dtype`` with ``index_col`` (:issue:`53229`) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 61112542fb9d8..90d59b0dfcfc8 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -228,6 +228,7 @@ def read( self, path, columns=None, + filters=None, use_nullable_dtypes: bool = False, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions | None = None, @@ -257,7 +258,11 @@ def read( ) try: pa_table = self.api.parquet.read_table( - path_or_handle, columns=columns, filesystem=filesystem, **kwargs + path_or_handle, + columns=columns, + filesystem=filesystem, + filters=filters, + **kwargs, ) result = pa_table.to_pandas(**to_pandas_kwargs) @@ -335,6 +340,7 @@ def read( self, path, columns=None, + filters=None, storage_options: StorageOptions | None = None, filesystem=None, **kwargs, @@ -375,7 +381,7 @@ def read( try: parquet_file = self.api.ParquetFile(path, **parquet_kwargs) - return parquet_file.to_pandas(columns=columns, **kwargs) + return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs) finally: if handles is not None: handles.close() @@ -487,6 +493,7 @@ def read_parquet( use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, filesystem: Any = None, + filters: list[tuple] | list[list[tuple]] | None = None, **kwargs, ) -> DataFrame: """ @@ 
-517,7 +524,6 @@ def read_parquet( if you wish to use its implementation. columns : list, default=None If not None, only these columns will be read from the file. - {storage_options} .. versionadded:: 1.3.0 @@ -550,6 +556,24 @@ def read_parquet( .. versionadded:: 2.1.0 + filters : List[Tuple] or List[List[Tuple]], default None + To filter out data. + Filter syntax: [[(column, op, val), ...],...] + where op is [==, =, >, >=, <, <=, !=, in, not in] + The innermost tuples are transposed into a set of filters applied + through an `AND` operation. + The outer list combines these sets of filters through an `OR` + operation. + A single list of tuples can also be used, meaning that no `OR` + operation between set of filters is to be conducted. + + Using this argument will NOT result in row-wise filtering of the final + partitions unless ``engine="pyarrow"`` is also specified. For + other engines, filtering is only performed at the partition level, that is, + to prevent the loading of some row-groups and/or files. + + .. versionadded:: 2.1.0 + **kwargs Any additional kwargs are passed to the engine. 
@@ -632,6 +656,7 @@ def read_parquet( return impl.read( path, columns=columns, + filters=filters, storage_options=storage_options, use_nullable_dtypes=use_nullable_dtypes, dtype_backend=dtype_backend, diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a1bc470bdb7a2..5399cabb61ec3 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -426,6 +426,25 @@ def test_read_columns(self, engine): df, engine, expected=expected, read_kwargs={"columns": ["string"]} ) + def test_read_filters(self, engine, tmp_path): + df = pd.DataFrame( + { + "int": list(range(4)), + "part": list("aabb"), + } + ) + + expected = pd.DataFrame({"int": [0, 1]}) + check_round_trip( + df, + engine, + path=tmp_path, + expected=expected, + write_kwargs={"partition_cols": ["part"]}, + read_kwargs={"filters": [("part", "==", "a")], "columns": ["int"]}, + repeat=1, + ) + def test_write_index(self, engine, using_copy_on_write, request): check_names = engine != "fastparquet" if using_copy_on_write and engine == "fastparquet": From fb6f704b4002f306a38e81f10986a6e12e997ed1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 2 Aug 2023 08:48:02 -0700 Subject: [PATCH 5/8] REF: implement BaseOpsUtil._cast_pointwise_result (#54366) * REF: implement BaseOpsUtil._cast_pointwise_result * REF: use _cast_pointwise_result in arrow tests --- pandas/tests/extension/base/ops.py | 20 ++++++ pandas/tests/extension/test_arrow.py | 14 ++-- pandas/tests/extension/test_boolean.py | 68 ++++++++++--------- pandas/tests/extension/test_masked_numeric.py | 60 ++++------------ 4 files changed, 75 insertions(+), 87 deletions(-) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 49598f014fbf6..bc2048f9c31bb 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import final + import numpy as np import pytest @@ -10,6 +12,15 @@ class 
BaseOpsUtil(BaseExtensionTests): + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): + # In _check_op we check that the result of a pointwise operation + # (found via _combine) matches the result of the vectorized + # operation obj.__op_name__(other). + # In some cases pandas dtype inference on the scalar result may not + # give a matching dtype even if both operations are behaving "correctly". + # In these cases, do extra required casting here. + return pointwise_result + def get_op_from_name(self, op_name: str): return tm.get_op_from_name(op_name) @@ -18,6 +29,12 @@ def check_opname(self, ser: pd.Series, op_name: str, other, exc=Exception): self._check_op(ser, op, other, op_name, exc) + # Subclasses are not expected to need to override _check_op or _combine. + # Ideally any relevant overriding can be done in _cast_pointwise_result, + # get_op_from_name, and the specification of `exc`. If you find a use + # case that still requires overriding _check_op or _combine, please let + # us know at github.com/pandas-dev/pandas/issues + @final def _combine(self, obj, other, op): if isinstance(obj, pd.DataFrame): if len(obj.columns) != 1: @@ -27,12 +44,15 @@ def _combine(self, obj, other, op): expected = obj.combine(other, op) return expected + # see comment on _combine + @final def _check_op( self, ser: pd.Series, op, other, op_name: str, exc=NotImplementedError ): if exc is None: result = op(ser, other) expected = self._combine(ser, other, op) + expected = self._cast_pointwise_result(op_name, ser, other, expected) assert isinstance(result, type(ser)) tm.assert_equal(result, expected) else: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 29fcd4e48db46..0bb9e02ca732a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -873,11 +873,11 @@ def rtruediv(x, y): return tm.get_op_from_name(op_name) - def _combine(self, obj, other, op): + def 
_cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # BaseOpsUtil._combine can upcast expected dtype # (because it generates expected on python scalars) # while ArrowExtensionArray maintains original type - expected = base.BaseArithmeticOpsTests._combine(self, obj, other, op) + expected = pointwise_result was_frame = False if isinstance(expected, pd.DataFrame): @@ -895,7 +895,7 @@ def _combine(self, obj, other, op): pa.types.is_floating(orig_pa_type) or ( pa.types.is_integer(orig_pa_type) - and op.__name__ not in ["truediv", "rtruediv"] + and op_name not in ["__truediv__", "__rtruediv__"] ) or pa.types.is_duration(orig_pa_type) or pa.types.is_timestamp(orig_pa_type) @@ -906,7 +906,7 @@ def _combine(self, obj, other, op): # ArrowExtensionArray does not upcast return expected elif not ( - (op is operator.floordiv and pa.types.is_integer(orig_pa_type)) + (op_name == "__floordiv__" and pa.types.is_integer(orig_pa_type)) or pa.types.is_duration(orig_pa_type) or pa.types.is_timestamp(orig_pa_type) or pa.types.is_date(orig_pa_type) @@ -943,14 +943,14 @@ def _combine(self, obj, other, op): ): # decimal precision can resize in the result type depending on data # just compare the float values - alt = op(obj, other) + alt = getattr(obj, op_name)(other) alt_dtype = tm.get_dtype(alt) assert isinstance(alt_dtype, ArrowDtype) - if op is operator.pow and isinstance(other, Decimal): + if op_name == "__pow__" and isinstance(other, Decimal): # TODO: would it make more sense to retain Decimal here? alt_dtype = ArrowDtype(pa.float64()) elif ( - op is operator.pow + op_name == "__pow__" and isinstance(other, pd.Series) and other.dtype == original_dtype ): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 24095f807d4ae..a4910f29df9b4 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -13,6 +13,8 @@ be added to the array-specific tests in `pandas/tests/arrays/`. 
""" +import operator + import numpy as np import pytest @@ -23,6 +25,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core import roperator from pandas.core.arrays.boolean import BooleanDtype from pandas.tests.extension import base @@ -125,41 +128,40 @@ def check_opname(self, s, op_name, other, exc=None): if op_name.strip("_").lstrip("r") in ["pow", "truediv", "floordiv"]: # match behavior with non-masked bool dtype exc = NotImplementedError + elif op_name in self.implements: + # exception message would include "numpy boolean subtract"" + exc = TypeError + super().check_opname(s, op_name, other, exc=exc) - def _check_op(self, obj, op, other, op_name, exc=NotImplementedError): - if exc is None: - if op_name in self.implements: - msg = r"numpy boolean subtract" - with pytest.raises(TypeError, match=msg): - op(obj, other) - return - - result = op(obj, other) - expected = self._combine(obj, other, op) - - if op_name in ( - "__floordiv__", - "__rfloordiv__", - "__pow__", - "__rpow__", - "__mod__", - "__rmod__", - ): - # combine keeps boolean type - expected = expected.astype("Int8") - elif op_name in ("__truediv__", "__rtruediv__"): - # combine with bools does not generate the correct result - # (numpy behaviour for div is to regard the bools as numeric) - expected = self._combine(obj.astype(float), other, op) - expected = expected.astype("Float64") - if op_name == "__rpow__": - # for rpow, combine does not propagate NaN - expected[result.isna()] = np.nan - tm.assert_equal(result, expected) - else: - with pytest.raises(exc): - op(obj, other) + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): + if op_name in ( + "__floordiv__", + "__rfloordiv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", + ): + # combine keeps boolean type + pointwise_result = pointwise_result.astype("Int8") + + elif op_name in ("__truediv__", "__rtruediv__"): + # combine with bools does not generate the correct result + # (numpy behaviour for 
div is to regard the bools as numeric) + if op_name == "__truediv__": + op = operator.truediv + else: + op = roperator.rtruediv + pointwise_result = self._combine(obj.astype(float), other, op) + pointwise_result = pointwise_result.astype("Float64") + + if op_name == "__rpow__": + # for rpow, combine does not propagate NaN + result = getattr(obj, op_name)(other) + pointwise_result[result.isna()] = np.nan + + return pointwise_result @pytest.mark.xfail( reason="Inconsistency between floordiv and divmod; we raise for floordiv " diff --git a/pandas/tests/extension/test_masked_numeric.py b/pandas/tests/extension/test_masked_numeric.py index 321321e0760d5..6262f9ceac561 100644 --- a/pandas/tests/extension/test_masked_numeric.py +++ b/pandas/tests/extension/test_masked_numeric.py @@ -146,45 +146,20 @@ class TestDtype(base.BaseDtypeTests): class TestArithmeticOps(base.BaseArithmeticOpsTests): - def _check_op(self, s, op, other, op_name, exc=NotImplementedError): - if exc is None: - sdtype = tm.get_dtype(s) - - if hasattr(other, "dtype") and isinstance(other.dtype, np.dtype): - if sdtype.kind == "f": - if other.dtype.kind == "f": - # other is np.float64 and would therefore always result - # in upcasting, so keeping other as same numpy_dtype - other = other.astype(sdtype.numpy_dtype) - - else: - # i.e. sdtype.kind in "iu"" - if other.dtype.kind in "iu" and sdtype.is_unsigned_integer: - # TODO: comment below is inaccurate; other can be int8 - # int16, ... - # and the trouble is that e.g. 
if s is UInt8 and other - # is int8, then result is UInt16 - # other is np.int64 and would therefore always result in - # upcasting, so keeping other as same numpy_dtype - other = other.astype(sdtype.numpy_dtype) - - result = op(s, other) - expected = self._combine(s, other, op) - - if sdtype.kind in "iu": - if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.fillna(np.nan).astype("Float64") - else: - # combine method result in 'biggest' (int64) dtype - expected = expected.astype(sdtype) + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): + sdtype = tm.get_dtype(obj) + expected = pointwise_result + + if sdtype.kind in "iu": + if op_name in ("__rtruediv__", "__truediv__", "__div__"): + expected = expected.fillna(np.nan).astype("Float64") else: - # combine method result in 'biggest' (float64) dtype + # combine method result in 'biggest' (int64) dtype expected = expected.astype(sdtype) - - tm.assert_equal(result, expected) else: - with pytest.raises(exc): - op(s, other) + # combine method result in 'biggest' (float64) dtype + expected = expected.astype(sdtype) + return expected def check_opname(self, ser: pd.Series, op_name: str, other, exc=None): # overwriting to indicate ops don't raise an error @@ -195,17 +170,8 @@ def _check_divmod_op(self, ser: pd.Series, op, other, exc=None): class TestComparisonOps(base.BaseComparisonOpsTests): - def _check_op( - self, ser: pd.Series, op, other, op_name: str, exc=NotImplementedError - ): - if exc is None: - result = op(ser, other) - # Override to do the astype to boolean - expected = ser.combine(other, op).astype("boolean") - tm.assert_series_equal(result, expected) - else: - with pytest.raises(exc): - op(ser, other) + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): + return pointwise_result.astype("boolean") def check_opname(self, ser: pd.Series, op_name: str, other, exc=None): super().check_opname(ser, op_name, other, exc=None) From 
292eb326f3762161ed1208bfd723210b77e2f432 Mon Sep 17 00:00:00 2001 From: Tim Hoffmann <2836374+timhoffm@users.noreply.github.com> Date: Wed, 2 Aug 2023 17:48:50 +0200 Subject: [PATCH 6/8] CLN: Remove None check in attrs property lookup (#54364) `self._attrs` is always initialized to an empty dict per https://github.com/pandas-dev/pandas/blob/c93e8034a13d3dbe2358b1b2f868a0d54d1034a7/pandas/core/generic.py#L275 The attribute `_attrs` is only written to in two other places a) in the attrs.setter property (which enforces dict as well) b) in __setstate__, which takes whatever the state is: https://github.com/pandas-dev/pandas/blob/c93e8034a13d3dbe2358b1b2f868a0d54d1034a7/pandas/core/generic.py#L2129C24-L2129C24 AFAICS (including code history) I do not see a reason that this could be None. But if we want to be very defensive, we should do the dict enforcing here. Co-authored-by: Tim Hoffmann --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index aa6578bbcaf66..8a3a105749800 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -381,8 +381,6 @@ def attrs(self) -> dict[Hashable, Any]: >>> df.attrs {'A': [10, 20, 30]} """ - if self._attrs is None: - self._attrs = {} return self._attrs @attrs.setter @@ -2126,6 +2124,8 @@ def __setstate__(self, state) -> None: typ = state.get("_typ") if typ is not None: attrs = state.get("_attrs", {}) + if attrs is None: # should not happen, but better be on the safe side + attrs = {} object.__setattr__(self, "_attrs", attrs) flags = state.get("_flags", {"allows_duplicate_labels": True}) object.__setattr__(self, "_flags", Flags(self, **flags)) From 46386f0e97c14d7d1daad662977a26402e0cf468 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Aug 2023 05:50:06 -1000 Subject: [PATCH 7/8] CI/TST: Cleanups (#54362) --- .circleci/setup_env.sh | 4 +- .github/workflows/unit-tests.yml | 2 - 
pandas/tests/arrays/test_datetimelike.py | 69 --------------------- pandas/tests/io/parser/test_upcast.py | 3 - pandas/tests/io/test_gcs.py | 10 ++-- pandas/tests/test_downstream.py | 76 ++++++++++++++++++++++++ 6 files changed, 83 insertions(+), 81 deletions(-) diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh index e41650870bd70..4f81acb6d2099 100755 --- a/.circleci/setup_env.sh +++ b/.circleci/setup_env.sh @@ -48,10 +48,10 @@ source activate pandas-dev # downstream CI jobs that may also build pandas from source. export PANDAS_CI=1 -if pip list | grep -q ^pandas; then +if pip show pandas 1>/dev/null; then echo echo "remove any installed pandas package w/o removing anything else" - pip uninstall -y pandas || true + pip uninstall -y pandas fi echo "Install pandas" diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a9651ae26934b..1770d18d4eb41 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -333,7 +333,6 @@ jobs: PYTEST_WORKERS: "auto" PANDAS_CI: 1 PATTERN: "not slow and not network and not clipboard and not single_cpu" - COVERAGE: true PYTEST_TARGET: pandas steps: @@ -351,7 +350,6 @@ jobs: python --version python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.0.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy - python -m pip install git+https://github.com/nedbat/coveragepy.git python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 python -m pip list diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index fd133b104b380..a4fbc8df4a8fa 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1,6 +1,5 @@ from __future__ import annotations -import array import re 
import warnings @@ -12,7 +11,6 @@ OutOfBoundsDatetime, Timestamp, ) -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -1328,70 +1326,3 @@ def test_from_pandas_array(dtype): result = idx_cls(arr) expected = idx_cls(data) tm.assert_index_equal(result, expected) - - -@pytest.fixture( - params=[ - "memoryview", - "array", - pytest.param("dask", marks=td.skip_if_no("dask.array")), - pytest.param("xarray", marks=td.skip_if_no("xarray")), - ] -) -def array_likes(request): - """ - Fixture giving a numpy array and a parametrized 'data' object, which can - be a memoryview, array, dask or xarray object created from the numpy array. - """ - # GH#24539 recognize e.g xarray, dask, ... - arr = np.array([1, 2, 3], dtype=np.int64) - - name = request.param - if name == "memoryview": - data = memoryview(arr) - elif name == "array": - data = array.array("i", arr) - elif name == "dask": - import dask.array - - data = dask.array.array(arr) - elif name == "xarray": - import xarray as xr - - data = xr.DataArray(arr) - - return arr, data - - -@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) -def test_from_obscure_array(dtype, array_likes): - # GH#24539 recognize e.g xarray, dask, ... 
- # Note: we dont do this for PeriodArray bc _from_sequence won't accept - # an array of integers - # TODO: could check with arraylike of Period objects - arr, data = array_likes - - cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype] - - expected = cls(arr) - result = cls._from_sequence(data) - tm.assert_extension_array_equal(result, expected) - - func = {"M8[ns]": _sequence_to_dt64ns, "m8[ns]": sequence_to_td64ns}[dtype] - result = func(arr)[0] - expected = func(data)[0] - tm.assert_equal(result, expected) - - if not isinstance(data, memoryview): - # FIXME(GH#44431) these raise on memoryview and attempted fix - # fails on py3.10 - func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype] - result = func(arr).array - expected = func(data).array - tm.assert_equal(result, expected) - - # Let's check the Indexes while we're here - idx_cls = {"M8[ns]": DatetimeIndex, "m8[ns]": TimedeltaIndex}[dtype] - result = idx_cls(arr) - expected = idx_cls(data) - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 558822b84620a..7cfaac997e3b1 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -38,9 +38,6 @@ def test_maybe_upcast(any_real_numpy_dtype): def test_maybe_upcast_no_na(any_real_numpy_dtype): # GH#36712 - if any_real_numpy_dtype == "float32": - pytest.skip() - arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype) result = _maybe_upcast(arr, use_dtype_backend=True) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index bdea24f7bb5aa..89655e8693d7f 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -22,7 +22,8 @@ @pytest.fixture def gcs_buffer(): """Emulate GCS using a binary buffer.""" - import fsspec + pytest.importorskip("gcsfs") + fsspec = pytest.importorskip("fsspec") gcs_buffer = BytesIO() gcs_buffer.close = lambda: True @@ -43,7 +44,6 @@ def ls(self, path, 
**kwargs): return gcs_buffer -@td.skip_if_no("gcsfs") # Patches pyarrow; other processes should not pick up change @pytest.mark.single_cpu @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) @@ -131,7 +131,6 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): assert result == expected -@td.skip_if_no("gcsfs") @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) def test_to_csv_compression_encoding_gcs( gcs_buffer, compression_only, encoding, compression_to_extension @@ -177,10 +176,11 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@td.skip_if_no("fastparquet") -@td.skip_if_no("gcsfs") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" + pytest.importorskip("fastparquet") + pytest.importorskip("gcsfs") + from fsspec import AbstractFileSystem df1 = DataFrame( diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 09594588be81c..01efb01e63e1c 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -1,6 +1,7 @@ """ Testing that we work in the downstream packages """ +import array import importlib import subprocess import sys @@ -14,9 +15,17 @@ import pandas as pd from pandas import ( DataFrame, + DatetimeIndex, Series, + TimedeltaIndex, ) import pandas._testing as tm +from pandas.core.arrays import ( + DatetimeArray, + TimedeltaArray, +) +from pandas.core.arrays.datetimes import _sequence_to_dt64ns +from pandas.core.arrays.timedeltas import sequence_to_td64ns def import_module(name): @@ -277,3 +286,70 @@ def __radd__(self, other): assert right.__add__(left) is NotImplemented assert right + left is left + + +@pytest.fixture( + params=[ + "memoryview", + "array", + pytest.param("dask", marks=td.skip_if_no("dask.array")), + pytest.param("xarray", marks=td.skip_if_no("xarray")), + ] +) +def array_likes(request): + """ + Fixture giving a 
numpy array and a parametrized 'data' object, which can + be a memoryview, array, dask or xarray object created from the numpy array. + """ + # GH#24539 recognize e.g xarray, dask, ... + arr = np.array([1, 2, 3], dtype=np.int64) + + name = request.param + if name == "memoryview": + data = memoryview(arr) + elif name == "array": + data = array.array("i", arr) + elif name == "dask": + import dask.array + + data = dask.array.array(arr) + elif name == "xarray": + import xarray as xr + + data = xr.DataArray(arr) + + return arr, data + + +@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) +def test_from_obscure_array(dtype, array_likes): + # GH#24539 recognize e.g xarray, dask, ... + # Note: we dont do this for PeriodArray bc _from_sequence won't accept + # an array of integers + # TODO: could check with arraylike of Period objects + arr, data = array_likes + + cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype] + + expected = cls(arr) + result = cls._from_sequence(data) + tm.assert_extension_array_equal(result, expected) + + func = {"M8[ns]": _sequence_to_dt64ns, "m8[ns]": sequence_to_td64ns}[dtype] + result = func(arr)[0] + expected = func(data)[0] + tm.assert_equal(result, expected) + + if not isinstance(data, memoryview): + # FIXME(GH#44431) these raise on memoryview and attempted fix + # fails on py3.10 + func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype] + result = func(arr).array + expected = func(data).array + tm.assert_equal(result, expected) + + # Let's check the Indexes while we're here + idx_cls = {"M8[ns]": DatetimeIndex, "m8[ns]": TimedeltaIndex}[dtype] + result = idx_cls(arr) + expected = idx_cls(data) + tm.assert_index_equal(result, expected) From 263828c164ae00fa8bd76f7c1eea6f1809acbeaf Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 2 Aug 2023 11:53:11 -0400 Subject: [PATCH 8/8] ENH: Add new implementation of DataFrame.stack (#53921) * DEPR: Add new 
implementation of DataFrame.stack and deprecate old * Merge cleanup * Revert filterwarnings in conf.py * Merge fixup * Rename inner function * v3->future_stack; other refinements * Fixup docstring * Docstring fixup --- .../comparison/comparison_with_r.rst | 2 +- doc/source/user_guide/10min.rst | 2 +- doc/source/user_guide/cookbook.rst | 4 +- doc/source/user_guide/groupby.rst | 2 +- doc/source/user_guide/reshaping.rst | 20 +- doc/source/whatsnew/v2.1.0.rst | 41 +- pandas/core/frame.py | 90 ++- pandas/core/groupby/generic.py | 2 +- pandas/core/indexes/multi.py | 4 + pandas/core/resample.py | 2 +- pandas/core/reshape/pivot.py | 2 +- pandas/core/reshape/reshape.py | 122 +++- pandas/tests/extension/base/reshaping.py | 7 +- pandas/tests/extension/json/test_json.py | 2 +- pandas/tests/extension/test_sparse.py | 5 +- .../tests/frame/methods/test_reset_index.py | 6 +- pandas/tests/frame/test_stack_unstack.py | 555 ++++++++++++------ pandas/tests/frame/test_subclass.py | 14 +- pandas/tests/groupby/aggregate/test_cython.py | 2 +- pandas/tests/groupby/test_categorical.py | 22 +- pandas/tests/groupby/test_function.py | 2 +- .../indexes/datetimes/test_partial_slicing.py | 2 +- pandas/tests/indexes/multi/test_indexing.py | 10 +- pandas/tests/indexes/multi/test_integrity.py | 5 +- pandas/tests/io/json/test_pandas.py | 4 +- pandas/tests/io/pytables/test_append.py | 2 +- .../tests/series/methods/test_reset_index.py | 2 +- pandas/tests/series/methods/test_unstack.py | 4 +- 28 files changed, 662 insertions(+), 275 deletions(-) diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 25ba237e8caf3..a6cfcd4614984 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -438,7 +438,7 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: ) pd.melt(cheese, id_vars=["first", "last"]) - cheese.set_index(["first", 
"last"]).stack() # alternative way + cheese.set_index(["first", "last"]).stack(future_stack=True) # alternative way For more details and examples see :ref:`the reshaping documentation `. diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index cb3c4ab3de658..51168f74c2657 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -579,7 +579,7 @@ columns: .. ipython:: python - stacked = df2.stack() + stacked = df2.stack(future_stack=True) stacked With a "stacked" DataFrame or Series (having a :class:`MultiIndex` as the diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 041061f32db3f..66ee571d6b5a5 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -311,7 +311,7 @@ The :ref:`multindexing ` docs. df.columns = pd.MultiIndex.from_tuples([tuple(c.split("_")) for c in df.columns]) df # Now stack & Reset - df = df.stack(0).reset_index(1) + df = df.stack(0, future_stack=True).reset_index(1) df # And fix the labels (Notice the label 'level_1' got added automatically) df.columns = ["Sample", "All_X", "All_Y"] @@ -688,7 +688,7 @@ The :ref:`Pivot ` docs. aggfunc="sum", margins=True, ) - table.stack("City") + table.stack("City", future_stack=True) `Frequency table like plyr in R `__ diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 482e3fe91ca09..75c816f66d5e4 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1713,4 +1713,4 @@ column index name will be used as the name of the inserted column: result - result.stack() + result.stack(future_stack=True) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 4df6996c4f66b..1e73b7672782e 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -127,7 +127,7 @@ stacked level becomes the new lowest level in a :class:`MultiIndex` on the colum .. 
ipython:: python - stacked = df2.stack() + stacked = df2.stack(future_stack=True) stacked With a "stacked" :class:`DataFrame` or :class:`Series` (having a :class:`MultiIndex` as the @@ -163,7 +163,7 @@ will result in a **sorted** copy of the original :class:`DataFrame` or :class:`S index = pd.MultiIndex.from_product([[2, 1], ["a", "b"]]) df = pd.DataFrame(np.random.randn(4), index=index, columns=["A"]) df - all(df.unstack().stack() == df.sort_index()) + all(df.unstack().stack(future_stack=True) == df.sort_index()) The above code will raise a ``TypeError`` if the call to :meth:`~DataFrame.sort_index` is removed. @@ -191,16 +191,16 @@ processed individually. df = pd.DataFrame(np.random.randn(4, 4), columns=columns) df - df.stack(level=["animal", "hair_length"]) + df.stack(level=["animal", "hair_length"], future_stack=True) The list of levels can contain either level names or level numbers (but not a mixture of the two). .. ipython:: python - # df.stack(level=['animal', 'hair_length']) + # df.stack(level=['animal', 'hair_length'], future_stack=True) # from above is equivalent to: - df.stack(level=[1, 2]) + df.stack(level=[1, 2], future_stack=True) Missing data ~~~~~~~~~~~~ @@ -233,8 +233,8 @@ which level in the columns to stack: .. ipython:: python - df2.stack("exp") - df2.stack("animal") + df2.stack("exp", future_stack=True) + df2.stack("animal", future_stack=True) Unstacking can result in missing values if subgroups do not have the same set of labels. By default, missing values will be replaced with the default @@ -345,12 +345,12 @@ some very expressive and fast data manipulations. .. ipython:: python df - df.stack().mean(1).unstack() + df.stack(future_stack=True).mean(1).unstack() # same result, another way df.T.groupby(level=1).mean() - df.stack().groupby(level=1).mean() + df.stack(future_stack=True).groupby(level=1).mean() df.mean().unstack(0) @@ -460,7 +460,7 @@ as having a multi-level index: .. 
ipython:: python - table.stack() + table.stack(future_stack=True) .. _reshaping.crosstabulations: diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b869859e0ca80..17894914b44d1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -78,7 +78,7 @@ Copy-on-Write improvements - DataFrame.fillna / Series.fillna - DataFrame.replace / Series.replace -.. _whatsnew_210.enhancements.enhancement2: +.. _whatsnew_210.enhancements.map_na_action: ``map(func, na_action="ignore")`` now works for all array types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -128,6 +128,45 @@ Also, note that :meth:`Categorical.map` implicitly has had its ``na_action`` set This has been deprecated and will :meth:`Categorical.map` in the future change the default to ``na_action=None``, like for all the other array types. +.. _whatsnew_210.enhancements.new_stack: + +New implementation of :meth:`DataFrame.stack` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pandas has reimplemented :meth:`DataFrame.stack`. To use the new implementation, pass the argument ``future_stack=True``. This will become the only option in pandas 3.0. + +The previous implementation had two main behavioral downsides. + + 1. The previous implementation would unnecessarily introduce NA values into the result. The user could have NA values automatically removed by passing ``dropna=True`` (the default), but doing this could also remove NA values from the result that existed in the input. See the examples below. + 2. The previous implementation with ``sort=True`` (the default) would sometimes sort part of the resulting index, and sometimes not. If the input's columns are *not* a :class:`MultiIndex`, then the resulting index would never be sorted. If the columns are a :class:`MultiIndex`, then in most cases the level(s) in the resulting index that come from stacking the column level(s) would be sorted. 
In rare cases such level(s) would be sorted in a non-standard order, depending on how the columns were created. + +The new implementation (``future_stack=True``) will no longer unnecessarily introduce NA values when stacking multiple levels and will never sort. As such, the arguments ``dropna`` and ``sort`` are not utilized and must remain unspecified when using ``future_stack=True``. These arguments will be removed in the next major release. + +.. ipython:: python + + columns = pd.MultiIndex.from_tuples([("B", "d"), ("A", "c")]) + df = pd.DataFrame([[0, 2], [1, 3]], index=["z", "y"], columns=columns) + df + +In the previous version (``future_stack=False``), the default of ``dropna=True`` would remove unnecessarily introduced NA values but still coerce the dtype to ``float64`` in the process. In the new version, no NAs are introduced and so there is no coercion of the dtype. + +.. ipython:: python + :okwarning: + + df.stack([0, 1], future_stack=False, dropna=True) + df.stack([0, 1], future_stack=True) + +If the input contains NA values, the previous version would drop those as well with ``dropna=True`` or introduce new NA values with ``dropna=False``. The new version persists all values from the input. + +.. ipython:: python + :okwarning: + + df = pd.DataFrame([[0, 2], [np.nan, np.nan]], columns=columns) + df + df.stack([0, 1], future_stack=False, dropna=True) + df.stack([0, 1], future_stack=False, dropna=False) + df.stack([0, 1], future_stack=True) + .. 
_whatsnew_210.enhancements.other: Other enhancements diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 331b06b42e7dc..3b2fe1699e996 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9166,7 +9166,13 @@ def pivot_table( sort=sort, ) - def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): + def stack( + self, + level: IndexLabel = -1, + dropna: bool | lib.NoDefault = lib.no_default, + sort: bool | lib.NoDefault = lib.no_default, + future_stack: bool = False, + ): """ Stack the prescribed level(s) from columns to index. @@ -9194,6 +9200,11 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): section. sort : bool, default True Whether to sort the levels of the resulting MultiIndex. + future_stack : bool, default False + Whether to use the new implementation that will replace the current + implementation in pandas 3.0. When True, dropna and sort have no impact + on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release + notes <whatsnew_210.enhancements.new_stack>` for more details. 
Returns ------- @@ -9233,7 +9244,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): weight height cat 0 1 dog 2 3 - >>> df_single_level_cols.stack() + >>> df_single_level_cols.stack(future_stack=True) cat weight 0 height 1 dog weight 2 @@ -9255,7 +9266,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): kg pounds cat 1 2 dog 2 4 - >>> df_multi_level_cols1.stack() + >>> df_multi_level_cols1.stack(future_stack=True) weight cat kg 1 pounds 2 @@ -9280,7 +9291,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): kg m cat 1.0 2.0 dog 3.0 4.0 - >>> df_multi_level_cols2.stack() + >>> df_multi_level_cols2.stack(future_stack=True) weight height cat kg 1.0 NaN m NaN 2.0 @@ -9291,17 +9302,17 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): The first parameter controls which level or levels are stacked: - >>> df_multi_level_cols2.stack(0) + >>> df_multi_level_cols2.stack(0, future_stack=True) kg m - cat height NaN 2.0 - weight 1.0 NaN - dog height NaN 4.0 - weight 3.0 NaN - >>> df_multi_level_cols2.stack([0, 1]) - cat height m 2.0 - weight kg 1.0 - dog height m 4.0 - weight kg 3.0 + cat weight 1.0 NaN + height NaN 2.0 + dog weight 3.0 NaN + height NaN 4.0 + >>> df_multi_level_cols2.stack([0, 1], future_stack=True) + cat weight kg 1.0 + height m 2.0 + dog weight kg 3.0 + height m 4.0 dtype: float64 **Dropping missing values** @@ -9331,15 +9342,52 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): dog kg 2.0 NaN m NaN 3.0 """ - from pandas.core.reshape.reshape import ( - stack, - stack_multiple, - ) + if not future_stack: + from pandas.core.reshape.reshape import ( + stack, + stack_multiple, + ) + + if dropna is lib.no_default: + dropna = True + if sort is lib.no_default: + sort = True - if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna, sort=sort) + if isinstance(level, 
(tuple, list)): + result = stack_multiple(self, level, dropna=dropna, sort=sort) + else: + result = stack(self, level, dropna=dropna, sort=sort) else: - result = stack(self, level, dropna=dropna, sort=sort) + from pandas.core.reshape.reshape import stack_v3 + + if dropna is not lib.no_default: + raise ValueError( + "dropna must be unspecified with future_stack=True as the new " + "implementation does not introduce rows of NA values. This " + "argument will be removed in a future version of pandas." + ) + + if sort is not lib.no_default: + raise ValueError( + "Cannot specify sort with future_stack=True, this argument will be " + "removed in a future version of pandas. Sort the result using " + ".sort_index instead." + ) + + if ( + isinstance(level, (tuple, list)) + and not all(lev in self.columns.names for lev in level) + and not all(isinstance(lev, int) for lev in level) + ): + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." 
+ ) + + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.columns._get_level_number(lev) for lev in level] + result = stack_v3(self, level) return result.__finalize__(self, method="stack") diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2ffdaa934e838..5c678adfe4970 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -416,7 +416,7 @@ def _wrap_applied_output( res_df = self._reindex_output(res_df) # if self.observed is False, # keep all-NaN rows created while re-indexing - res_ser = res_df.stack(dropna=self.observed) + res_ser = res_df.stack(future_stack=True) res_ser.name = self.obj.name return res_ser elif isinstance(values[0], (Series, DataFrame)): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1961bd83d2fed..33eb411374e67 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2440,6 +2440,10 @@ def reorder_levels(self, order) -> MultiIndex: names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] + result = self._reorder_ilevels(order) + return result + + def _reorder_ilevels(self, order) -> MultiIndex: if len(order) != self.nlevels: raise AssertionError( f"Length of order must be same as number of levels ({self.nlevels}), " diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 53d587cdde182..9b8d1c870091d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1497,7 +1497,7 @@ def size(self): # If the result is a non-empty DataFrame we stack to get a Series # GH 46826 if isinstance(result, ABCDataFrame) and not result.empty: - result = result.stack() + result = result.stack(future_stack=True) if not len(self.ax): from pandas import Series diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 5c2e94735ddc5..71e3ea5b2588e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -418,7 +418,7 @@ def _all_key(key): if 
len(cols) > 0: row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) - row_margin = row_margin.stack() + row_margin = row_margin.stack(future_stack=True) # slight hack new_order = [len(cols)] + list(range(len(cols))) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6845530c5fa2a..fc8d827cd31bb 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -28,14 +28,19 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos -from pandas.core.algorithms import unique +from pandas.core.algorithms import ( + factorize, + unique, +) from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, MultiIndex, + RangeIndex, ) +from pandas.core.reshape.concat import concat from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -498,7 +503,7 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value, sort=sort) else: - return obj.T.stack(dropna=False, sort=sort) + return obj.T.stack(future_stack=True) elif not isinstance(obj.index, MultiIndex): # GH 36113 # Give nicer error messages when unstack a Series whose @@ -581,7 +586,7 @@ def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True): stacked : Series or DataFrame """ - def factorize(index): + def stack_factorize(index): if index.is_unique: return index, np.arange(len(index)) codes, categories = factorize_from_iterable(index) @@ -600,7 +605,7 @@ def factorize(index): new_levels = list(frame.index.levels) new_codes = [lab.repeat(K) for lab in frame.index.codes] - clev, clab = factorize(frame.columns) + clev, clab = stack_factorize(frame.columns) new_levels.append(clev) 
new_codes.append(np.tile(clab, N).ravel()) @@ -610,7 +615,7 @@ def factorize(index): levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) else: - levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) + levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns))) codes = ilab.repeat(K), np.tile(clab, N).ravel() new_index = MultiIndex( levels=levels, @@ -875,3 +880,110 @@ def _reorder_for_extension_array_stack( # c0r1, c1r1, c2r1, ...] idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() return arr.take(idx) + + +def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: + if frame.columns.nunique() != len(frame.columns): + raise ValueError("Columns with duplicate values are not supported in stack") + + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level, reverse=True) + stack_cols = frame.columns._drop_level_numbers( + [k for k in range(frame.columns.nlevels) if k not in level][::-1] + ) + if len(level) > 1: + # Arrange columns in the order we want to take them, e.g. 
level=[2, 0, 1] + sorter = np.argsort(level) + ordered_stack_cols = stack_cols._reorder_ilevels(sorter) + else: + ordered_stack_cols = stack_cols + + stack_cols_unique = stack_cols.unique() + ordered_stack_cols_unique = ordered_stack_cols.unique() + + # Grab data for each unique index to be stacked + buf = [] + for idx in stack_cols_unique: + if len(frame.columns) == 1: + data = frame.copy() + else: + # Take the data from frame corresponding to this idx value + if not isinstance(idx, tuple): + idx = (idx,) + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in level else slice(None) + for k in range(frame.columns.nlevels) + ) + data = frame.loc[:, column_indexer] + + if len(level) < frame.columns.nlevels: + data.columns = data.columns._drop_level_numbers(drop_levnums) + elif stack_cols.nlevels == 1: + if data.ndim == 1: + data.name = 0 + else: + data.columns = RangeIndex(len(data.columns)) + buf.append(data) + + result: Series | DataFrame + if len(buf) > 0 and not frame.empty: + result = concat(buf) + ratio = len(result) // len(frame) + else: + # input is empty + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + new_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + else: + new_columns = [0] + result = DataFrame(columns=new_columns, dtype=frame._values.dtype) + ratio = 0 + + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + if not result.columns.equals(desired_columns): + result = result[desired_columns] + + # Construct the correct MultiIndex by combining the frame's index and + # stacked columns. 
+ index_levels: list | FrozenList + if isinstance(frame.index, MultiIndex): + index_levels = frame.index.levels + index_codes = list(np.tile(frame.index.codes, (1, ratio))) + else: + index_levels = [frame.index.unique()] + codes = factorize(frame.index)[0] + index_codes = list(np.tile(codes, (1, ratio))) + if isinstance(stack_cols, MultiIndex): + column_levels = ordered_stack_cols.levels + column_codes = ordered_stack_cols.drop_duplicates().codes + else: + column_levels = [ordered_stack_cols.unique()] + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] + result.index = MultiIndex( + levels=index_levels + column_levels, + codes=index_codes + column_codes, + names=frame.index.names + list(ordered_stack_cols.names), + verify_integrity=False, + ) + + # sort result, but faster than calling sort_index since we know the order we need + len_df = len(frame) + n_uniques = len(ordered_stack_cols_unique) + indexer = np.arange(n_uniques) + idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) + result = result.take(idxs) + + # Reshape/rename if needed and dropna + if result.ndim == 2 and frame.columns.nlevels == len(level): + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] + if result.ndim == 1: + result.name = None + + return result diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 9b150cf5054ee..3f89ef5395006 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -253,11 +253,12 @@ def test_merge_on_extension_array_duplicates(self, data): ), ], ) - def test_stack(self, data, columns): + @pytest.mark.parametrize("future_stack", [True, False]) + def test_stack(self, data, columns, future_stack): df = pd.DataFrame({"A": data[:5], "B": data[:5]}) df.columns = columns - result = df.stack() - 
expected = df.astype(object).stack() + result = df.stack(future_stack=future_stack) + expected = df.astype(object).stack(future_stack=future_stack) # we need a second astype(object), in case the constructor inferred # object -> specialized, as is done for period. expected = expected.astype(object) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 3a2ec1f2e6ce1..8a571d9295e1f 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -139,7 +139,7 @@ class TestReshaping(BaseJSON, base.BaseReshapingTests): @pytest.mark.xfail(reason="Different definitions of NA") def test_stack(self): """ - The test does .astype(object).stack(). If we happen to have + The test does .astype(object).stack(future_stack=True). If we happen to have any missing values in `data`, then we'll end up with different rows since we consider `{}` NA, but `.astype(object)` doesn't. """ diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 748d5cc65de1a..851a630dbc1f2 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -157,8 +157,9 @@ def test_concat_mixed_dtypes(self, data): ), ], ) - def test_stack(self, data, columns): - super().test_stack(data, columns) + @pytest.mark.parametrize("future_stack", [True, False]) + def test_stack(self, data, columns, future_stack): + super().test_stack(data, columns, future_stack) def test_concat_columns(self, data, na_value): self._check_unsupported(data) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index fa28ebc16e942..d99dd36f3a2e3 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -112,7 +112,7 @@ def test_reset_index_with_intervals(self): tm.assert_frame_equal(result2, original) def test_reset_index(self, float_frame): - stacked = 
float_frame.stack()[::2] + stacked = float_frame.stack(future_stack=True)[::2] stacked = DataFrame({"foo": stacked, "bar": stacked}) names = ["first", "second"] @@ -761,7 +761,7 @@ def test_reset_index_rename(float_frame): def test_reset_index_rename_multiindex(float_frame): # GH 6878 - stacked_df = float_frame.stack()[::2] + stacked_df = float_frame.stack(future_stack=True)[::2] stacked_df = DataFrame({"foo": stacked_df, "bar": stacked_df}) names = ["first", "second"] @@ -775,7 +775,7 @@ def test_reset_index_rename_multiindex(float_frame): def test_errorreset_index_rename(float_frame): # GH 6878 - stacked_df = float_frame.stack()[::2] + stacked_df = float_frame.stack(future_stack=True)[::2] stacked_df = DataFrame({"first": stacked_df, "second": stacked_df}) with pytest.raises( diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 3d11802694aef..cb8e8c5025e3b 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -6,6 +6,7 @@ import numpy as np import pytest +from pandas._libs import lib from pandas.errors import PerformanceWarning import pandas as pd @@ -22,12 +23,17 @@ from pandas.core.reshape import reshape as reshape_lib +@pytest.fixture(params=[True, False]) +def future_stack(request): + return request.param + + class TestDataFrameReshape: - def test_stack_unstack(self, float_frame): + def test_stack_unstack(self, float_frame, future_stack): df = float_frame.copy() df[:] = np.arange(np.prod(df.shape)).reshape(df.shape) - stacked = df.stack() + stacked = df.stack(future_stack=future_stack) stacked_df = DataFrame({"foo": stacked, "bar": stacked}) unstacked = stacked.unstack() @@ -41,26 +47,26 @@ def test_stack_unstack(self, float_frame): tm.assert_frame_equal(unstacked_cols.T, df) tm.assert_frame_equal(unstacked_cols_df["bar"].T, df) - def test_stack_mixed_level(self): + def test_stack_mixed_level(self, future_stack): # GH 18310 levels = [range(3), [3, "a", "b"], 
[1, 2]] # flat columns: df = DataFrame(1, index=levels[0], columns=levels[1]) - result = df.stack() + result = df.stack(future_stack=future_stack) expected = Series(1, index=MultiIndex.from_product(levels[:2])) tm.assert_series_equal(result, expected) # MultiIndex columns: df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) - result = df.stack(1) + result = df.stack(1, future_stack=future_stack) expected = DataFrame( 1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1] ) tm.assert_frame_equal(result, expected) # as above, but used labels in level are actually of homogeneous type - result = df[["a", "b"]].stack(1) + result = df[["a", "b"]].stack(1, future_stack=future_stack) expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) @@ -76,7 +82,7 @@ def test_unstack_not_consolidated(self, using_array_manager): expected = df.unstack() tm.assert_series_equal(res, expected) - def test_unstack_fill(self): + def test_unstack_fill(self, future_stack): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack @@ -109,7 +115,7 @@ def test_unstack_fill(self): result = Series([0, 0, 2], index=unstacked.index, name=key) tm.assert_series_equal(result, expected) - stacked = unstacked.stack(["x", "y"]) + stacked = unstacked.stack(["x", "y"], future_stack=future_stack) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) @@ -382,15 +388,23 @@ def unstack_and_compare(df, column_name): s = df1["A"] unstack_and_compare(s, "index") - def test_stack_ints(self): + def test_stack_ints(self, future_stack): columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3))) df = DataFrame( np.random.default_rng(2).standard_normal((30, 27)), columns=columns ) - tm.assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1)) tm.assert_frame_equal( - df.stack(level=[-2, -1]), 
df.stack(level=1).stack(level=1) + df.stack(level=[1, 2], future_stack=future_stack), + df.stack(level=1, future_stack=future_stack).stack( + level=1, future_stack=future_stack + ), + ) + tm.assert_frame_equal( + df.stack(level=[-2, -1], future_stack=future_stack), + df.stack(level=1, future_stack=future_stack).stack( + level=1, future_stack=future_stack + ), ) df_named = df.copy() @@ -398,10 +412,13 @@ def test_stack_ints(self): assert return_value is None tm.assert_frame_equal( - df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1) + df_named.stack(level=[1, 2], future_stack=future_stack), + df_named.stack(level=1, future_stack=future_stack).stack( + level=1, future_stack=future_stack + ), ) - def test_stack_mixed_levels(self): + def test_stack_mixed_levels(self, future_stack): columns = MultiIndex.from_tuples( [ ("A", "cat", "long"), @@ -415,8 +432,12 @@ def test_stack_mixed_levels(self): np.random.default_rng(2).standard_normal((4, 4)), columns=columns ) - animal_hair_stacked = df.stack(level=["animal", "hair_length"]) - exp_hair_stacked = df.stack(level=["exp", "hair_length"]) + animal_hair_stacked = df.stack( + level=["animal", "hair_length"], future_stack=future_stack + ) + exp_hair_stacked = df.stack( + level=["exp", "hair_length"], future_stack=future_stack + ) # GH #8584: Need to check that stacking works when a number # is passed that is both a level name and in the range of @@ -424,10 +445,14 @@ def test_stack_mixed_levels(self): df2 = df.copy() df2.columns.names = ["exp", "animal", 1] tm.assert_frame_equal( - df2.stack(level=["animal", 1]), animal_hair_stacked, check_names=False + df2.stack(level=["animal", 1], future_stack=future_stack), + animal_hair_stacked, + check_names=False, ) tm.assert_frame_equal( - df2.stack(level=["exp", 1]), exp_hair_stacked, check_names=False + df2.stack(level=["exp", 1], future_stack=future_stack), + exp_hair_stacked, + check_names=False, ) # When mixed types are passed and the ints are not level @@ 
-437,17 +462,19 @@ def test_stack_mixed_levels(self): "a mixture of the two" ) with pytest.raises(ValueError, match=msg): - df2.stack(level=["animal", 0]) + df2.stack(level=["animal", 0], future_stack=future_stack) # GH #8584: Having 0 in the level names could raise a # strange error about lexsort depth df3 = df.copy() df3.columns.names = ["exp", "animal", 0] tm.assert_frame_equal( - df3.stack(level=["animal", 0]), animal_hair_stacked, check_names=False + df3.stack(level=["animal", 0], future_stack=future_stack), + animal_hair_stacked, + check_names=False, ) - def test_stack_int_level_names(self): + def test_stack_int_level_names(self, future_stack): columns = MultiIndex.from_tuples( [ ("A", "cat", "long"), @@ -461,33 +488,51 @@ def test_stack_int_level_names(self): np.random.default_rng(2).standard_normal((4, 4)), columns=columns ) - exp_animal_stacked = df.stack(level=["exp", "animal"]) - animal_hair_stacked = df.stack(level=["animal", "hair_length"]) - exp_hair_stacked = df.stack(level=["exp", "hair_length"]) + exp_animal_stacked = df.stack( + level=["exp", "animal"], future_stack=future_stack + ) + animal_hair_stacked = df.stack( + level=["animal", "hair_length"], future_stack=future_stack + ) + exp_hair_stacked = df.stack( + level=["exp", "hair_length"], future_stack=future_stack + ) df2 = df.copy() df2.columns.names = [0, 1, 2] tm.assert_frame_equal( - df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False + df2.stack(level=[1, 2], future_stack=future_stack), + animal_hair_stacked, + check_names=False, ) tm.assert_frame_equal( - df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False + df2.stack(level=[0, 1], future_stack=future_stack), + exp_animal_stacked, + check_names=False, ) tm.assert_frame_equal( - df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False + df2.stack(level=[0, 2], future_stack=future_stack), + exp_hair_stacked, + check_names=False, ) # Out-of-order int column names df3 = df.copy() df3.columns.names = [2, 0, 1] 
tm.assert_frame_equal( - df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False + df3.stack(level=[0, 1], future_stack=future_stack), + animal_hair_stacked, + check_names=False, ) tm.assert_frame_equal( - df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False + df3.stack(level=[2, 0], future_stack=future_stack), + exp_animal_stacked, + check_names=False, ) tm.assert_frame_equal( - df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False + df3.stack(level=[2, 1], future_stack=future_stack), + exp_hair_stacked, + check_names=False, ) def test_unstack_bool(self): @@ -504,7 +549,7 @@ def test_unstack_bool(self): ) tm.assert_frame_equal(rs, xp) - def test_unstack_level_binding(self): + def test_unstack_level_binding(self, future_stack): # GH9856 mi = MultiIndex( levels=[["foo", "bar"], ["one", "two"], ["a", "b"]], @@ -512,7 +557,7 @@ def test_unstack_level_binding(self): names=["first", "second", "third"], ) s = Series(0, index=mi) - result = s.unstack([1, 2]).stack(0) + result = s.unstack([1, 2]).stack(0, future_stack=future_stack) expected_mi = MultiIndex( levels=[["foo", "bar"], ["one", "two"]], @@ -631,7 +676,7 @@ def test_unstack_dtypes_mixed_date(self, c, d): assert left.shape == (3, 2) tm.assert_frame_equal(left, right) - def test_unstack_non_unique_index_names(self): + def test_unstack_non_unique_index_names(self, future_stack): idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"]) df = DataFrame([1, 2], index=idx) msg = "The name c1 occurs multiple times, use a level number" @@ -639,7 +684,7 @@ def test_unstack_non_unique_index_names(self): df.unstack("c1") with pytest.raises(ValueError, match=msg): - df.T.stack("c1") + df.T.stack("c1", future_stack=future_stack) def test_unstack_unused_levels(self): # GH 17845: unused codes in index make unstack() cast int to float @@ -995,11 +1040,11 @@ def test_unstack_nan_index5(self): key = r["1st"], (col, r["2nd"], r["3rd"]) assert r[col] == left.loc[key] - def 
test_stack_datetime_column_multiIndex(self): + def test_stack_datetime_column_multiIndex(self, future_stack): # GH 8039 t = datetime(2014, 1, 1) df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) - result = df.stack() + result = df.stack(future_stack=future_stack) eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) @@ -1033,8 +1078,9 @@ def test_stack_datetime_column_multiIndex(self): ], ) @pytest.mark.parametrize("level", (-1, 0, 1, [0, 1], [1, 0])) - def test_stack_partial_multiIndex(self, multiindex_columns, level): + def test_stack_partial_multiIndex(self, multiindex_columns, level, future_stack): # GH 8844 + dropna = False if not future_stack else lib.no_default full_multiindex = MultiIndex.from_tuples( [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], names=["Upper", "Lower"], @@ -1044,13 +1090,13 @@ def test_stack_partial_multiIndex(self, multiindex_columns, level): np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), columns=multiindex, ) - result = df.stack(level=level, dropna=False) + result = df.stack(level=level, dropna=dropna, future_stack=future_stack) - if isinstance(level, int): + if isinstance(level, int) and not future_stack: # Stacking a single level should not make any all-NaN rows, # so df.stack(level=level, dropna=False) should be the same # as df.stack(level=level, dropna=True). 
- expected = df.stack(level=level, dropna=True) + expected = df.stack(level=level, dropna=True, future_stack=future_stack) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: @@ -1059,20 +1105,21 @@ def test_stack_partial_multiIndex(self, multiindex_columns, level): df.columns = MultiIndex.from_tuples( df.columns.to_numpy(), names=df.columns.names ) - expected = df.stack(level=level, dropna=False) + expected = df.stack(level=level, dropna=dropna, future_stack=future_stack) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: tm.assert_frame_equal(result, expected) - def test_stack_full_multiIndex(self): + def test_stack_full_multiIndex(self, future_stack): # GH 8844 full_multiindex = MultiIndex.from_tuples( [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], names=["Upper", "Lower"], ) df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]]) - result = df.stack(dropna=False) + dropna = False if not future_stack else lib.no_default + result = df.stack(dropna=dropna, future_stack=future_stack) expected = DataFrame( [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], index=MultiIndex( @@ -1086,12 +1133,11 @@ def test_stack_full_multiIndex(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) - @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")]) - def test_stack_preserve_categorical_dtype(self, ordered, labels): + def test_stack_preserve_categorical_dtype(self, ordered, future_stack): # GH13854 - cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) + cidx = pd.CategoricalIndex(list("yxz"), categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) - result = df.stack() + result = df.stack(future_stack=future_stack) # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. 
@@ -1108,24 +1154,30 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): (list("zyx"), [14, 15, 12, 13, 10, 11]), ], ) - def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data): + def test_stack_multi_preserve_categorical_dtype( + self, ordered, labels, data, future_stack + ): # GH-36991 cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered) cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered) midx = MultiIndex.from_product([cidx, cidx2]) df = DataFrame([sorted(data)], columns=midx) - result = df.stack([0, 1]) + result = df.stack([0, 1], future_stack=future_stack) - s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered) - expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2])) + labels = labels if future_stack else sorted(labels) + s_cidx = pd.CategoricalIndex(labels, ordered=ordered) + expected_data = sorted(data) if future_stack else data + expected = Series( + expected_data, index=MultiIndex.from_product([[0], s_cidx, cidx2]) + ) tm.assert_series_equal(result, expected) - def test_stack_preserve_categorical_dtype_values(self): + def test_stack_preserve_categorical_dtype_values(self, future_stack): # GH-23077 cat = pd.Categorical(["a", "a", "b", "c"]) df = DataFrame({"A": cat, "B": cat}) - result = df.stack() + result = df.stack(future_stack=future_stack) index = MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) expected = Series( pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index @@ -1140,10 +1192,10 @@ def test_stack_preserve_categorical_dtype_values(self): ([0, 1, 2, 3], MultiIndex.from_product([[1, 2], ["a", "b"]])), ], ) - def test_stack_multi_columns_non_unique_index(self, index, columns): + def test_stack_multi_columns_non_unique_index(self, index, columns, future_stack): # GH-28301 df = DataFrame(index=index, columns=columns).fillna(1) - stacked = df.stack() + stacked = df.stack(future_stack=future_stack) new_index = 
MultiIndex.from_tuples(stacked.index.to_numpy()) expected = DataFrame( stacked.to_numpy(), index=new_index, columns=stacked.columns @@ -1161,7 +1213,7 @@ def test_stack_multi_columns_non_unique_index(self, index, columns): ], ) def test_stack_multi_columns_mixed_extension_types( - self, vals1, vals2, dtype1, dtype2, expected_dtype + self, vals1, vals2, dtype1, dtype2, expected_dtype, future_stack ): # GH45740 df = DataFrame( @@ -1170,8 +1222,10 @@ def test_stack_multi_columns_mixed_extension_types( ("A", 2): Series(vals2, dtype=dtype2), } ) - result = df.stack() - expected = df.astype(object).stack().astype(expected_dtype) + result = df.stack(future_stack=future_stack) + expected = ( + df.astype(object).stack(future_stack=future_stack).astype(expected_dtype) + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("level", [0, 1]) @@ -1301,11 +1355,11 @@ def test_unstack_timezone_aware_values(): tm.assert_frame_equal(result, expected) -def test_stack_timezone_aware_values(): +def test_stack_timezone_aware_values(future_stack): # GH 19420 ts = date_range(freq="D", start="20180101", end="20180103", tz="America/New_York") df = DataFrame({"A": ts}, index=["a", "b", "c"]) - result = df.stack() + result = df.stack(future_stack=future_stack) expected = Series( ts, index=MultiIndex(levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]]), @@ -1313,24 +1367,38 @@ def test_stack_timezone_aware_values(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dropna", [True, False]) -def test_stack_empty_frame(dropna): +@pytest.mark.parametrize("dropna", [True, False, lib.no_default]) +def test_stack_empty_frame(dropna, future_stack): # GH 36113 levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)] expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []])) - result = DataFrame(dtype=np.float64).stack(dropna=dropna) - tm.assert_series_equal(result, expected) + if future_stack and dropna is not 
lib.no_default: + with pytest.raises(ValueError, match="dropna must be unspecified"): + DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack) + else: + result = DataFrame(dtype=np.float64).stack( + dropna=dropna, future_stack=future_stack + ) + tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("dropna", [True, False, lib.no_default]) @pytest.mark.parametrize("fill_value", [None, 0]) -def test_stack_unstack_empty_frame(dropna, fill_value): +def test_stack_unstack_empty_frame(dropna, fill_value, future_stack): # GH 36113 - result = ( - DataFrame(dtype=np.int64).stack(dropna=dropna).unstack(fill_value=fill_value) - ) - expected = DataFrame(dtype=np.int64) - tm.assert_frame_equal(result, expected) + if future_stack and dropna is not lib.no_default: + with pytest.raises(ValueError, match="dropna must be unspecified"): + DataFrame(dtype=np.int64).stack( + dropna=dropna, future_stack=future_stack + ).unstack(fill_value=fill_value) + else: + result = ( + DataFrame(dtype=np.int64) + .stack(dropna=dropna, future_stack=future_stack) + .unstack(fill_value=fill_value) + ) + expected = DataFrame(dtype=np.int64) + tm.assert_frame_equal(result, expected) def test_unstack_single_index_series(): @@ -1371,11 +1439,11 @@ def test_unstacking_multi_index_df(): tm.assert_frame_equal(result, expected) -def test_stack_positional_level_duplicate_column_names(): +def test_stack_positional_level_duplicate_column_names(future_stack): # https://github.com/pandas-dev/pandas/issues/36353 columns = MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"]) df = DataFrame([[1, 1, 1, 1]], columns=columns) - result = df.stack(0) + result = df.stack(0, future_stack=future_stack) new_columns = Index(["y", "z"], name="a") new_index = MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"]) @@ -1406,7 +1474,7 @@ def test_unstack_non_slice_like_blocks(using_array_manager): tm.assert_frame_equal(res, 
expected) -def test_stack_sort_false(): +def test_stack_sort_false(future_stack): # GH 15105 data = [[1, 2, 3.0, 4.0], [2, 3, 4.0, 5.0], [3, 4, np.nan, np.nan]] df = DataFrame( @@ -1415,11 +1483,23 @@ def test_stack_sort_false(): levels=[["B", "A"], ["x", "y"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] ), ) - result = df.stack(level=0, sort=False) - expected = DataFrame( - {"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]}, - index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]), - ) + kwargs = {} if future_stack else {"sort": False} + result = df.stack(level=0, future_stack=future_stack, **kwargs) + if future_stack: + expected = DataFrame( + { + "x": [1.0, 3.0, 2.0, 4.0, 3.0, np.nan], + "y": [2.0, 4.0, 3.0, 5.0, 4.0, np.nan], + }, + index=MultiIndex.from_arrays( + [[0, 0, 1, 1, 2, 2], ["B", "A", "B", "A", "B", "A"]] + ), + ) + else: + expected = DataFrame( + {"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]}, + index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]), + ) tm.assert_frame_equal(result, expected) # Codes sorted in this call @@ -1427,15 +1507,17 @@ def test_stack_sort_false(): data, columns=MultiIndex.from_arrays([["B", "B", "A", "A"], ["x", "y", "x", "y"]]), ) - result = df.stack(level=0, sort=False) + kwargs = {} if future_stack else {"sort": False} + result = df.stack(level=0, future_stack=future_stack, **kwargs) tm.assert_frame_equal(result, expected) -def test_stack_sort_false_multi_level(): +def test_stack_sort_false_multi_level(future_stack): # GH 15105 idx = MultiIndex.from_tuples([("weight", "kg"), ("height", "m")]) df = DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=idx) - result = df.stack([0, 1], sort=False) + kwargs = {} if future_stack else {"sort": False} + result = df.stack([0, 1], future_stack=future_stack, **kwargs) expected_index = MultiIndex.from_tuples( [ ("cat", "weight", "kg"), @@ -1516,75 +1598,85 @@ def 
test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) - def test_stack(self, multiindex_year_month_day_dataframe_random_data): + def test_stack(self, multiindex_year_month_day_dataframe_random_data, future_stack): ymd = multiindex_year_month_day_dataframe_random_data # regular roundtrip unstacked = ymd.unstack() - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") tm.assert_frame_equal(restacked, ymd) unlexsorted = ymd.sort_index(level=2) unstacked = unlexsorted.unstack(2) - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") tm.assert_frame_equal(restacked.sort_index(level=0), ymd) unlexsorted = unlexsorted[::-1] unstacked = unlexsorted.unstack(1) - restacked = unstacked.stack().swaplevel(1, 2) + restacked = unstacked.stack(future_stack=future_stack).swaplevel(1, 2) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") tm.assert_frame_equal(restacked.sort_index(level=0), ymd) unlexsorted = unlexsorted.swaplevel(0, 1) unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1) - restacked = unstacked.stack(0).swaplevel(1, 2) + restacked = unstacked.stack(0, future_stack=future_stack).swaplevel(1, 2) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") tm.assert_frame_equal(restacked.sort_index(level=0), ymd) # columns unsorted unstacked = ymd.unstack() - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = 
restacked.dropna(how="all") tm.assert_frame_equal(restacked, ymd) # more than 2 levels in the columns unstacked = ymd.unstack(1).unstack(1) - result = unstacked.stack(1) + result = unstacked.stack(1, future_stack=future_stack) expected = ymd.unstack() tm.assert_frame_equal(result, expected) - result = unstacked.stack(2) + result = unstacked.stack(2, future_stack=future_stack) expected = ymd.unstack(1) tm.assert_frame_equal(result, expected) - result = unstacked.stack(0) - expected = ymd.stack().unstack(1).unstack(1) + result = unstacked.stack(0, future_stack=future_stack) + expected = ymd.stack(future_stack=future_stack).unstack(1).unstack(1) tm.assert_frame_equal(result, expected) # not all levels present in each echelon unstacked = ymd.unstack(2).loc[:, ::3] - stacked = unstacked.stack().stack() - ymd_stacked = ymd.stack() + stacked = unstacked.stack(future_stack=future_stack).stack( + future_stack=future_stack + ) + ymd_stacked = ymd.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + stacked = stacked.dropna(how="all") + ymd_stacked = ymd_stacked.dropna(how="all") tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) # stack with negative number - result = ymd.unstack(0).stack(-2) - expected = ymd.unstack(0).stack(0) + result = ymd.unstack(0).stack(-2, future_stack=future_stack) + expected = ymd.unstack(0).stack(0, future_stack=future_stack) tm.assert_equal(result, expected) @pytest.mark.parametrize( "idx, columns, exp_idx", [ - [ - list("abab"), - ["1st", "2nd", "3rd"], - MultiIndex( - levels=[["a", "b"], ["1st", "2nd", "3rd"]], - codes=[ - np.tile(np.arange(2).repeat(3), 2), - np.tile(np.arange(3), 4), - ], - ), - ], [ list("abab"), ["1st", "2nd", "1st"], @@ -1607,21 +1699,26 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data): ], ], ) - def test_stack_duplicate_index(self, idx, columns, exp_idx): + def test_stack_duplicate_index(self, idx, columns, exp_idx, 
future_stack): # GH10417 df = DataFrame( np.arange(12).reshape(4, 3), index=idx, columns=columns, ) - result = df.stack() - expected = Series(np.arange(12), index=exp_idx) - tm.assert_series_equal(result, expected) - assert result.index.is_unique is False - li, ri = result.index, expected.index - tm.assert_index_equal(li, ri) + if future_stack: + msg = "Columns with duplicate values are not supported in stack" + with pytest.raises(ValueError, match=msg): + df.stack(future_stack=future_stack) + else: + result = df.stack(future_stack=future_stack) + expected = Series(np.arange(12), index=exp_idx) + tm.assert_series_equal(result, expected) + assert result.index.is_unique is False + li, ri = result.index, expected.index + tm.assert_index_equal(li, ri) - def test_unstack_odd_failure(self): + def test_unstack_odd_failure(self, future_stack): data = """day,time,smoker,sum,len Fri,Dinner,No,8.25,3. Fri,Dinner,Yes,27.03,9 @@ -1640,23 +1737,26 @@ def test_unstack_odd_failure(self): # it works, #2100 result = df.unstack(2) - recons = result.stack() + recons = result.stack(future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + recons = recons.dropna(how="all") tm.assert_frame_equal(recons, df) - def test_stack_mixed_dtype(self, multiindex_dataframe_random_data): + def test_stack_mixed_dtype(self, multiindex_dataframe_random_data, future_stack): frame = multiindex_dataframe_random_data df = frame.T df["foo", "four"] = "foo" df = df.sort_index(level=1, axis=1) - stacked = df.stack() - result = df["foo"].stack().sort_index() + stacked = df.stack(future_stack=future_stack) + result = df["foo"].stack(future_stack=future_stack).sort_index() tm.assert_series_equal(stacked["foo"], result, check_names=False) assert result.name is None assert stacked["bar"].dtype == np.float_ - def test_unstack_bug(self): + def test_unstack_bug(self, future_stack): df = DataFrame( { "state": ["naive", "naive", "naive", "active", "active", "active"], 
@@ -1670,22 +1770,24 @@ def test_unstack_bug(self): result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) - def test_stack_unstack_preserve_names(self, multiindex_dataframe_random_data): + def test_stack_unstack_preserve_names( + self, multiindex_dataframe_random_data, future_stack + ): frame = multiindex_dataframe_random_data unstacked = frame.unstack() assert unstacked.index.name == "first" assert unstacked.columns.names == ["exp", "second"] - restacked = unstacked.stack() + restacked = unstacked.stack(future_stack=future_stack) assert restacked.index.names == frame.index.names @pytest.mark.parametrize("method", ["stack", "unstack"]) def test_stack_unstack_wrong_level_name( - self, method, multiindex_dataframe_random_data + self, method, multiindex_dataframe_random_data, future_stack ): # GH 18303 - wrong level name should raise frame = multiindex_dataframe_random_data @@ -1693,14 +1795,15 @@ def test_stack_unstack_wrong_level_name( # A DataFrame with flat axes: df = frame.loc["foo"] + kwargs = {"future_stack": future_stack} if method == "stack" else {} with pytest.raises(KeyError, match="does not match index name"): - getattr(df, method)("mistake") + getattr(df, method)("mistake", **kwargs) if method == "unstack": # Same on a Series: s = df.iloc[:, 0] with pytest.raises(KeyError, match="does not match index name"): - getattr(s, method)("mistake") + getattr(s, method)("mistake", **kwargs) def test_unstack_level_name(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data @@ -1709,20 +1812,20 @@ def test_unstack_level_name(self, multiindex_dataframe_random_data): expected = frame.unstack(level=1) tm.assert_frame_equal(result, expected) - def test_stack_level_name(self, multiindex_dataframe_random_data): + def 
test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): frame = multiindex_dataframe_random_data unstacked = frame.unstack("second") - result = unstacked.stack("exp") - expected = frame.unstack().stack(0) + result = unstacked.stack("exp", future_stack=future_stack) + expected = frame.unstack().stack(0, future_stack=future_stack) tm.assert_frame_equal(result, expected) - result = frame.stack("exp") - expected = frame.stack() + result = frame.stack("exp", future_stack=future_stack) + expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) def test_stack_unstack_multiple( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, future_stack ): ymd = multiindex_year_month_day_dataframe_random_data @@ -1736,7 +1839,10 @@ def test_stack_unstack_multiple( s_unstacked = s.unstack(["year", "month"]) tm.assert_frame_equal(s_unstacked, expected["A"]) - restacked = unstacked.stack(["year", "month"]) + restacked = unstacked.stack(["year", "month"], future_stack=future_stack) + if future_stack: + # NA values in unstacked persist to restacked in version 3 + restacked = restacked.dropna(how="all") restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) restacked = restacked.sort_index(level=0) @@ -1753,7 +1859,7 @@ def test_stack_unstack_multiple( tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns]) def test_stack_names_and_numbers( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, future_stack ): ymd = multiindex_year_month_day_dataframe_random_data @@ -1761,10 +1867,10 @@ def test_stack_names_and_numbers( # Can't use mixture of names and numbers to stack with pytest.raises(ValueError, match="level should contain"): - unstacked.stack([0, "month"]) + unstacked.stack([0, "month"], future_stack=future_stack) def test_stack_multiple_out_of_bounds( - self, 
multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, future_stack ): # nlevels == 3 ymd = multiindex_year_month_day_dataframe_random_data @@ -1772,9 +1878,9 @@ def test_stack_multiple_out_of_bounds( unstacked = ymd.unstack(["year", "month"]) with pytest.raises(IndexError, match="Too many levels"): - unstacked.stack([2, 3]) + unstacked.stack([2, 3], future_stack=future_stack) with pytest.raises(IndexError, match="not a valid level number"): - unstacked.stack([-4, -3]) + unstacked.stack([-4, -3], future_stack=future_stack) def test_unstack_period_series(self): # GH4342 @@ -1892,7 +1998,7 @@ def test_unstack_period_frame(self): tm.assert_frame_equal(result3, expected) - def test_stack_multiple_bug(self): + def test_stack_multiple_bug(self, future_stack): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) name = (["a"] * 3) + (["b"] * 3) @@ -1907,23 +2013,33 @@ def test_stack_multiple_bug(self): with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) - rs = down.stack("ID") - xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID") + rs = down.stack("ID", future_stack=future_stack) + xp = ( + unst.loc[:, ["VAR1"]] + .resample("W-THU") + .mean() + .stack("ID", future_stack=future_stack) + ) xp.columns.name = "Params" tm.assert_frame_equal(rs, xp) - def test_stack_dropna(self): + def test_stack_dropna(self, future_stack): # GH#3997 df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]}) df = df.set_index(["A", "B"]) - stacked = df.unstack().stack(dropna=False) + dropna = False if not future_stack else lib.no_default + stacked = df.unstack().stack(dropna=dropna, future_stack=future_stack) assert len(stacked) > len(stacked.dropna()) - stacked = df.unstack().stack(dropna=True) - tm.assert_frame_equal(stacked, stacked.dropna()) + if future_stack: + with pytest.raises(ValueError, match="dropna must 
be unspecified"): + df.unstack().stack(dropna=True, future_stack=future_stack) + else: + stacked = df.unstack().stack(dropna=True, future_stack=future_stack) + tm.assert_frame_equal(stacked, stacked.dropna()) - def test_unstack_multiple_hierarchical(self): + def test_unstack_multiple_hierarchical(self, future_stack): df = DataFrame( index=[ [0, 0, 0, 0, 1, 1, 1, 1], @@ -1960,7 +2076,7 @@ def test_unstack_sparse_keyspace(self): # it works! is sufficient idf.unstack("E") - def test_unstack_unobserved_keys(self): + def test_unstack_unobserved_keys(self, future_stack): # related to GH#2278 refactoring levels = [[0, 1], [0, 1, 2, 3]] codes = [[0, 0, 1, 1], [0, 2, 0, 2]] @@ -1972,7 +2088,7 @@ def test_unstack_unobserved_keys(self): result = df.unstack() assert len(result.columns) == 4 - recons = result.stack() + recons = result.stack(future_stack=future_stack) tm.assert_frame_equal(recons, df) @pytest.mark.slow @@ -2006,12 +2122,15 @@ def __init__(self, *args, **kwargs) -> None: ) @pytest.mark.parametrize("stack_lev", range(2)) @pytest.mark.parametrize("sort", [True, False]) - def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort): + def test_stack_order_with_unsorted_levels( + self, levels, stack_lev, sort, future_stack + ): # GH#16323 # deep check for 1-row case columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)]) - df_stacked = df.stack(stack_lev, sort=sort) + kwargs = {} if future_stack else {"sort": sort} + df_stacked = df.stack(stack_lev, future_stack=future_stack, **kwargs) for row in df.index: for col in df.columns: expected = df.loc[row, col] @@ -2020,7 +2139,7 @@ def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort): result = df_stacked.loc[result_row, result_col] assert result == expected - def test_stack_order_with_unsorted_levels_multi_row(self): + def test_stack_order_with_unsorted_levels_multi_row(self, future_stack): # GH#16323 # check multi-row case 
@@ -2032,18 +2151,20 @@ def test_stack_order_with_unsorted_levels_multi_row(self): columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1) ) assert all( - df.loc[row, col] == df.stack(0).loc[(row, col[0]), col[1]] + df.loc[row, col] + == df.stack(0, future_stack=future_stack).loc[(row, col[0]), col[1]] for row in df.index for col in df.columns ) - def test_stack_order_with_unsorted_levels_multi_row_2(self): + def test_stack_order_with_unsorted_levels_multi_row_2(self, future_stack): # GH#53636 levels = ((0, 1), (1, 0)) stack_lev = 1 columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3]) - result = df.stack(stack_lev, sort=True) + kwargs = {} if future_stack else {"sort": True} + result = df.stack(stack_lev, future_stack=future_stack, **kwargs) expected_index = MultiIndex( levels=[[0, 1, 2, 3], [0, 1]], codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]], @@ -2057,7 +2178,7 @@ def test_stack_order_with_unsorted_levels_multi_row_2(self): ) tm.assert_frame_equal(result, expected) - def test_stack_unstack_unordered_multiindex(self): + def test_stack_unstack_unordered_multiindex(self, future_stack): # GH# 18265 values = np.arange(5) data = np.vstack( @@ -2072,7 +2193,9 @@ def test_stack_unstack_unordered_multiindex(self): multi_level_df = pd.concat(second_level_dict, axis=1) multi_level_df.columns.names = ["second", "first"] df = multi_level_df.reindex(sorted(multi_level_df.columns), axis=1) - result = df.stack(["first", "second"]).unstack(["first", "second"]) + result = df.stack(["first", "second"], future_stack=future_stack).unstack( + ["first", "second"] + ) expected = DataFrame( [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]], index=[0, 1, 2, 3, 4], @@ -2095,7 +2218,7 @@ def test_unstack_preserve_types( assert unstacked["E", 1].dtype == np.object_ assert unstacked["F", 1].dtype == np.float64 - def test_unstack_group_index_overflow(self): 
+ def test_unstack_group_index_overflow(self, future_stack): codes = np.tile(np.arange(500), 2) level = np.arange(500) @@ -2109,7 +2232,7 @@ def test_unstack_group_index_overflow(self): assert result.shape == (500, 2) # test roundtrip - stacked = result.stack() + stacked = result.stack(future_stack=future_stack) tm.assert_series_equal(s, stacked.reindex(s.index)) # put it at beginning @@ -2188,7 +2311,7 @@ def test_unstack_with_level_has_nan(self): tm.assert_index_equal(result, expected) - def test_stack_nan_in_multiindex_columns(self): + def test_stack_nan_in_multiindex_columns(self, future_stack): # GH#39481 df = DataFrame( np.zeros([1, 5]), @@ -2202,15 +2325,21 @@ def test_stack_nan_in_multiindex_columns(self): ], ), ) - result = df.stack(2) + result = df.stack(2, future_stack=future_stack) + if future_stack: + index = MultiIndex(levels=[[0], [0.0, 1.0]], codes=[[0, 0, 0], [-1, 0, 1]]) + columns = MultiIndex(levels=[[0], [2, 3]], codes=[[0, 0, 0], [-1, 0, 1]]) + else: + index = Index([(0, None), (0, 0), (0, 1)]) + columns = Index([(0, None), (0, 2), (0, 3)]) expected = DataFrame( [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]], - index=Index([(0, None), (0, 0), (0, 1)]), - columns=Index([(0, None), (0, 2), (0, 3)]), + index=index, + columns=columns, ) tm.assert_frame_equal(result, expected) - def test_multi_level_stack_categorical(self): + def test_multi_level_stack_categorical(self, future_stack): # GH 15239 midx = MultiIndex.from_arrays( [ @@ -2220,30 +2349,52 @@ def test_multi_level_stack_categorical(self): ] ) df = DataFrame(np.arange(8).reshape(2, 4), columns=midx) - result = df.stack([1, 2]) - expected = DataFrame( - [ - [0, np.nan], - [np.nan, 2], - [1, np.nan], - [np.nan, 3], - [4, np.nan], - [np.nan, 6], - [5, np.nan], - [np.nan, 7], - ], - columns=["A", "B"], - index=MultiIndex.from_arrays( + result = df.stack([1, 2], future_stack=future_stack) + if future_stack: + expected = DataFrame( [ - [0] * 4 + [1] * 4, - 
pd.Categorical(list("aabbaabb")), - pd.Categorical(list("cdcdcdcd")), - ] - ), - ) + [0, np.nan], + [1, np.nan], + [np.nan, 2], + [np.nan, 3], + [4, np.nan], + [5, np.nan], + [np.nan, 6], + [np.nan, 7], + ], + columns=["A", "B"], + index=MultiIndex.from_arrays( + [ + [0] * 4 + [1] * 4, + pd.Categorical(list("abababab")), + pd.Categorical(list("ccddccdd")), + ] + ), + ) + else: + expected = DataFrame( + [ + [0, np.nan], + [np.nan, 2], + [1, np.nan], + [np.nan, 3], + [4, np.nan], + [np.nan, 6], + [5, np.nan], + [np.nan, 7], + ], + columns=["A", "B"], + index=MultiIndex.from_arrays( + [ + [0] * 4 + [1] * 4, + pd.Categorical(list("aabbaabb")), + pd.Categorical(list("cdcdcdcd")), + ] + ), + ) tm.assert_frame_equal(result, expected) - def test_stack_nan_level(self): + def test_stack_nan_level(self, future_stack): # GH 9406 df_nan = DataFrame( np.arange(4).reshape(2, 2), @@ -2253,13 +2404,21 @@ def test_stack_nan_level(self): index=Index([0, 1], name="Num"), dtype=np.float64, ) - result = df_nan.stack() + result = df_nan.stack(future_stack=future_stack) + if future_stack: + index = MultiIndex( + levels=[[0, 1], [np.nan, "b"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["Num", "Lower"], + ) + else: + index = MultiIndex.from_tuples( + [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"] + ) expected = DataFrame( [[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]], columns=Index(["A", "B"], name="Upper"), - index=MultiIndex.from_tuples( - [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"] - ), + index=index, ) tm.assert_frame_equal(result, expected) @@ -2278,7 +2437,7 @@ def test_unstack_categorical_columns(self): expected.columns = MultiIndex.from_tuples([("cat", 0), ("cat", 1)]) tm.assert_frame_equal(result, expected) - def test_stack_unsorted(self): + def test_stack_unsorted(self, future_stack): # GH 16925 PAE = ["ITA", "FRA"] VAR = ["A1", "A2"] @@ -2292,11 +2451,15 @@ def test_stack_unsorted(self): DF.columns = 
DF.columns.droplevel(0) DF.loc[:, ("A0", "NET")] = 9999 - result = DF.stack(["VAR", "TYP"]).sort_index() - expected = DF.sort_index(axis=1).stack(["VAR", "TYP"]).sort_index() + result = DF.stack(["VAR", "TYP"], future_stack=future_stack).sort_index() + expected = ( + DF.sort_index(axis=1) + .stack(["VAR", "TYP"], future_stack=future_stack) + .sort_index() + ) tm.assert_series_equal(result, expected) - def test_stack_nullable_dtype(self): + def test_stack_nullable_dtype(self, future_stack): # GH#43561 columns = MultiIndex.from_product( [["54511", "54515"], ["r", "t_mean"]], names=["station", "element"] @@ -2306,14 +2469,18 @@ def test_stack_nullable_dtype(self): arr = np.array([[50, 226, 10, 215], [10, 215, 9, 220], [305, 232, 111, 220]]) df = DataFrame(arr, columns=columns, index=index, dtype=pd.Int64Dtype()) - result = df.stack("station") + result = df.stack("station", future_stack=future_stack) - expected = df.astype(np.int64).stack("station").astype(pd.Int64Dtype()) + expected = ( + df.astype(np.int64) + .stack("station", future_stack=future_stack) + .astype(pd.Int64Dtype()) + ) tm.assert_frame_equal(result, expected) # non-homogeneous case df[df.columns[0]] = df[df.columns[0]].astype(pd.Float64Dtype()) - result = df.stack("station") + result = df.stack("station", future_stack=future_stack) expected = DataFrame( { diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 9bc790cbed8e8..3ef012183ef26 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -216,7 +216,7 @@ def test_subclass_stack(self): columns=["X", "Y", "Z"], ) - res = df.stack() + res = df.stack(future_stack=True) exp = tm.SubclassedSeries( [1, 2, 3, 4, 5, 6, 7, 8, 9], index=[list("aaabbbccc"), list("XYZXYZXYZ")] ) @@ -253,10 +253,10 @@ def test_subclass_stack_multi(self): columns=Index(["W", "X"], name="www"), ) - res = df.stack() + res = df.stack(future_stack=True) tm.assert_frame_equal(res, exp) - res = df.stack("yyy") 
+ res = df.stack("yyy", future_stack=True) tm.assert_frame_equal(res, exp) exp = tm.SubclassedDataFrame( @@ -277,7 +277,7 @@ def test_subclass_stack_multi(self): columns=Index(["y", "z"], name="yyy"), ) - res = df.stack("www") + res = df.stack("www", future_stack=True) tm.assert_frame_equal(res, exp) def test_subclass_stack_multi_mixed(self): @@ -315,10 +315,10 @@ def test_subclass_stack_multi_mixed(self): columns=Index(["W", "X"], name="www"), ) - res = df.stack() + res = df.stack(future_stack=True) tm.assert_frame_equal(res, exp) - res = df.stack("yyy") + res = df.stack("yyy", future_stack=True) tm.assert_frame_equal(res, exp) exp = tm.SubclassedDataFrame( @@ -339,7 +339,7 @@ def test_subclass_stack_multi_mixed(self): columns=Index(["y", "z"], name="yyy"), ) - res = df.stack("www") + res = df.stack("www", future_stack=True) tm.assert_frame_equal(res, exp) def test_subclass_unstack(self): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index f5fe4d7d9831a..f917f567e1ce3 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -68,7 +68,7 @@ def test_cythonized_aggers(op_name): expd = {} for (cat1, cat2), group in grouped: expd.setdefault(cat1, {})[cat2] = op(group["C"]) - exp = DataFrame(expd).T.stack(dropna=False) + exp = DataFrame(expd).T.stack(future_stack=True) exp.index.names = ["A", "B"] exp.name = "C" diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 11ce290896073..d0ae9eeed394f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -237,9 +237,13 @@ def f(x): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + tm.assert_index_equal( + (desc_result.stack(future_stack=True).index.get_level_values(0)), 
exp + ) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) + tm.assert_index_equal( + (desc_result.stack(future_stack=True).index.get_level_values(1)), exp + ) def test_level_get_group(observed): @@ -673,9 +677,13 @@ def test_datetime(): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + tm.assert_index_equal( + (desc_result.stack(future_stack=True).index.get_level_values(0)), exp + ) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) + tm.assert_index_equal( + (desc_result.stack(future_stack=True).index.get_level_values(1)), exp + ) def test_categorical_index(): @@ -713,8 +721,10 @@ def test_describe_categorical_columns(): df = DataFrame(np.random.default_rng(2).standard_normal((20, 4)), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() - tm.assert_index_equal(result.stack().columns, cats) - tm.assert_categorical_equal(result.stack().columns.values, cats.values) + tm.assert_index_equal(result.stack(future_stack=True).columns, cats) + tm.assert_categorical_equal( + result.stack(future_stack=True).columns.values, cats.values + ) def test_unstack_categorical(): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index ffedafa91ce50..78e9f6111a230 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1118,7 +1118,7 @@ def test_series_describe_single(): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack() + expected = grouped.describe().stack(future_stack=True) tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 33d7570a07d73..7978e596e6ee5 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -363,7 +363,7 @@ def test_partial_slicing_with_multiindex_series(self): ser = DataFrame( np.random.default_rng(2).random((1000, 1000)), index=date_range("2000-1-1", periods=1000), - ).stack() + ).stack(future_stack=True) s2 = ser[:-1].copy() expected = s2["2000-1-4"] diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 2b75efd130aa2..78b2c493ec116 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -37,12 +37,12 @@ def test_slice_locs_partial(self, idx): def test_slice_locs(self): df = tm.makeTimeDataFrame() - stacked = df.stack() + stacked = df.stack(future_stack=True) idx = stacked.index slob = slice(*idx.slice_locs(df.index[5], df.index[15])) sliced = stacked[slob] - expected = df[5:16].stack() + expected = df[5:16].stack(future_stack=True) tm.assert_almost_equal(sliced.values, expected.values) slob = slice( @@ -52,19 +52,19 @@ def test_slice_locs(self): ) ) sliced = stacked[slob] - expected = df[6:15].stack() + expected = df[6:15].stack(future_stack=True) tm.assert_almost_equal(sliced.values, expected.values) def test_slice_locs_with_type_mismatch(self): df = tm.makeTimeDataFrame() - stacked = df.stack() + stacked = df.stack(future_stack=True) idx = stacked.index with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs((1, 3)) with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2)) df = tm.makeCustomDataframe(5, 5) - stacked = df.stack() + stacked = df.stack(future_stack=True) idx = stacked.index with pytest.raises(TypeError, match="^Level type mismatch"): 
idx.slice_locs(timedelta(seconds=30)) diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index 72b6754542fa6..45dd484eff4c6 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -235,7 +235,10 @@ def test_rangeindex_fallback_coercion_bug(): # GH 12893 df1 = pd.DataFrame(np.arange(100).reshape((10, 10))) df2 = pd.DataFrame(np.arange(100).reshape((10, 10))) - df = pd.concat({"df1": df1.stack(), "df2": df2.stack()}, axis=1) + df = pd.concat( + {"df1": df1.stack(future_stack=True), "df2": df2.stack(future_stack=True)}, + axis=1, + ) df.index.names = ["fizz", "buzz"] str(df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4b5fe5ff13c14..f49c8b6d53723 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1875,7 +1875,7 @@ def test_frame_int_overflow(self): ], ) def test_json_multiindex(self, dataframe, expected): - series = dataframe.stack() + series = dataframe.stack(future_stack=True) result = series.to_json(orient="index") assert result == expected @@ -1914,7 +1914,7 @@ def test_to_json_multiindex_escape(self): True, index=pd.date_range("2017-01-20", "2017-01-23"), columns=["foo", "bar"], - ).stack() + ).stack(future_stack=True) result = df.to_json() expected = ( "{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true," diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 1fc2877e70c65..a447601f3d8c4 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -134,7 +134,7 @@ def test_append_series(setup_path): mi["C"] = "foo" mi.loc[3:5, "C"] = "bar" mi.set_index(["C", "B"], inplace=True) - s = mi.stack() + s = mi.stack(future_stack=True) s.index = s.index.droplevel(2) store.append("mi", s) tm.assert_series_equal(store["mi"], s, check_index_type=True) diff --git 
a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index 39578212d4af0..db36221d8f510 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -34,7 +34,7 @@ def test_reset_index_dti_round_trip(self): def test_reset_index(self): df = tm.makeDataFrame()[:5] - ser = df.stack() + ser = df.stack(future_stack=True) ser.index.names = ["hash", "category"] ser.name = "value" diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index e37f955a91cd3..b294e2fcce9d8 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -133,7 +133,9 @@ def test_unstack_mixed_type_name_in_multiindex( def test_unstack_multi_index_categorical_values(): - mi = tm.makeTimeDataFrame().stack().index.rename(["major", "minor"]) + mi = ( + tm.makeTimeDataFrame().stack(future_stack=True).index.rename(["major", "minor"]) + ) ser = Series(["foo"] * len(mi), index=mi, name="category", dtype="category") result = ser.unstack()