Commit
Merge branch 'main' into ref-tst-check_op
jbrockmendel committed Aug 2, 2023
2 parents 2966521 + 263828c commit bbfc580
Showing 48 changed files with 938 additions and 514 deletions.
4 changes: 2 additions & 2 deletions .circleci/setup_env.sh
@@ -48,10 +48,10 @@ source activate pandas-dev
# downstream CI jobs that may also build pandas from source.
export PANDAS_CI=1

if pip list | grep -q ^pandas; then
if pip show pandas 1>/dev/null; then
echo
echo "remove any installed pandas package w/o removing anything else"
pip uninstall -y pandas || true
pip uninstall -y pandas
fi

echo "Install pandas"
2 changes: 0 additions & 2 deletions .github/workflows/unit-tests.yml
@@ -333,7 +333,6 @@ jobs:
PYTEST_WORKERS: "auto"
PANDAS_CI: 1
PATTERN: "not slow and not network and not clipboard and not single_cpu"
COVERAGE: true
PYTEST_TARGET: pandas

steps:
@@ -351,7 +350,6 @@
python --version
python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.0.1 meson-python==0.13.1
python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
python -m pip install git+https://github.com/nedbat/coveragepy.git
python -m pip install versioneer[toml]
python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17
python -m pip list
1 change: 0 additions & 1 deletion ci/code_checks.sh
@@ -63,7 +63,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then

MSG='Partially validate docstrings (EX01)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01 --ignore_functions \
pandas.io.stata.StataWriter.write_file \
pandas.api.extensions.ExtensionArray \
RET=$(($RET + $?)) ; echo $MSG "DONE"

@@ -438,7 +438,7 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent:
)
pd.melt(cheese, id_vars=["first", "last"])
cheese.set_index(["first", "last"]).stack() # alternative way
cheese.set_index(["first", "last"]).stack(future_stack=True) # alternative way
For more details and examples see :ref:`the reshaping documentation
<reshaping.melt>`.
2 changes: 1 addition & 1 deletion doc/source/user_guide/10min.rst
@@ -579,7 +579,7 @@ columns:

.. ipython:: python
stacked = df2.stack()
stacked = df2.stack(future_stack=True)
stacked
With a "stacked" DataFrame or Series (having a :class:`MultiIndex` as the
4 changes: 2 additions & 2 deletions doc/source/user_guide/cookbook.rst
@@ -311,7 +311,7 @@ The :ref:`multindexing <advanced.hierarchical>` docs.
df.columns = pd.MultiIndex.from_tuples([tuple(c.split("_")) for c in df.columns])
df
# Now stack & Reset
df = df.stack(0).reset_index(1)
df = df.stack(0, future_stack=True).reset_index(1)
df
# And fix the labels (Notice the label 'level_1' got added automatically)
df.columns = ["Sample", "All_X", "All_Y"]
@@ -688,7 +688,7 @@ The :ref:`Pivot <reshaping.pivot>` docs.
aggfunc="sum",
margins=True,
)
table.stack("City")
table.stack("City", future_stack=True)
`Frequency table like plyr in R
<https://stackoverflow.com/questions/15589354/frequency-tables-in-pandas-like-plyr-in-r>`__
2 changes: 1 addition & 1 deletion doc/source/user_guide/groupby.rst
@@ -1713,4 +1713,4 @@ column index name will be used as the name of the inserted column:
result
result.stack()
result.stack(future_stack=True)
20 changes: 10 additions & 10 deletions doc/source/user_guide/reshaping.rst
@@ -127,7 +127,7 @@ stacked level becomes the new lowest level in a :class:`MultiIndex` on the colum

.. ipython:: python
stacked = df2.stack()
stacked = df2.stack(future_stack=True)
stacked
With a "stacked" :class:`DataFrame` or :class:`Series` (having a :class:`MultiIndex` as the
@@ -163,7 +163,7 @@ will result in a **sorted** copy of the original :class:`DataFrame` or :class:`S
index = pd.MultiIndex.from_product([[2, 1], ["a", "b"]])
df = pd.DataFrame(np.random.randn(4), index=index, columns=["A"])
df
all(df.unstack().stack() == df.sort_index())
all(df.unstack().stack(future_stack=True) == df.sort_index())
The above code will raise a ``TypeError`` if the call to :meth:`~DataFrame.sort_index` is
removed.
@@ -191,16 +191,16 @@ processed individually.
df = pd.DataFrame(np.random.randn(4, 4), columns=columns)
df
df.stack(level=["animal", "hair_length"])
df.stack(level=["animal", "hair_length"], future_stack=True)
The list of levels can contain either level names or level numbers (but
not a mixture of the two).

.. ipython:: python
# df.stack(level=['animal', 'hair_length'])
# df.stack(level=['animal', 'hair_length'], future_stack=True)
# from above is equivalent to:
df.stack(level=[1, 2])
df.stack(level=[1, 2], future_stack=True)
Missing data
~~~~~~~~~~~~
@@ -233,8 +233,8 @@ which level in the columns to stack:

.. ipython:: python
df2.stack("exp")
df2.stack("animal")
df2.stack("exp", future_stack=True)
df2.stack("animal", future_stack=True)
Unstacking can result in missing values if subgroups do not have the same
set of labels. By default, missing values will be replaced with the default
@@ -345,12 +345,12 @@ some very expressive and fast data manipulations.
.. ipython:: python
df
df.stack().mean(1).unstack()
df.stack(future_stack=True).mean(1).unstack()
# same result, another way
df.T.groupby(level=1).mean()
df.stack().groupby(level=1).mean()
df.stack(future_stack=True).groupby(level=1).mean()
df.mean().unstack(0)
@@ -460,7 +460,7 @@ as having a multi-level index:

.. ipython:: python
table.stack()
table.stack(future_stack=True)
.. _reshaping.crosstabulations:

44 changes: 42 additions & 2 deletions doc/source/whatsnew/v2.1.0.rst
@@ -78,7 +78,7 @@ Copy-on-Write improvements
- DataFrame.fillna / Series.fillna
- DataFrame.replace / Series.replace

.. _whatsnew_210.enhancements.enhancement2:
.. _whatsnew_210.enhancements.map_na_action:

``map(func, na_action="ignore")`` now works for all array types
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -128,6 +128,45 @@ Also, note that :meth:`Categorical.map` implicitly has had its ``na_action`` set
This has been deprecated and will :meth:`Categorical.map` in the future change the default
to ``na_action=None``, like for all the other array types.

.. _whatsnew_210.enhancements.new_stack:

New implementation of :meth:`DataFrame.stack`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

pandas has reimplemented :meth:`DataFrame.stack`. To use the new implementation, pass the argument ``future_stack=True``. This will become the only option in pandas 3.0.

The previous implementation had two main behavioral downsides.

1. The previous implementation would unnecessarily introduce NA values into the result. The user could have NA values automatically removed by passing ``dropna=True`` (the default), but doing this could also remove NA values from the result that existed in the input. See the examples below.
2. The previous implementation with ``sort=True`` (the default) would sometimes sort part of the resulting index, and sometimes not. If the input's columns are *not* a :class:`MultiIndex`, then the resulting index would never be sorted. If the columns are a :class:`MultiIndex`, then in most cases the level(s) in the resulting index that come from stacking the column level(s) would be sorted. In rare cases such level(s) would be sorted in a non-standard order, depending on how the columns were created.

The new implementation (``future_stack=True``) will no longer unnecessarily introduce NA values when stacking multiple levels and will never sort. As such, the arguments ``dropna`` and ``sort`` are not utilized and must remain unspecified when using ``future_stack=True``. These arguments will be removed in the next major release.

.. ipython:: python
columns = pd.MultiIndex.from_tuples([("B", "d"), ("A", "c")])
df = pd.DataFrame([[0, 2], [1, 3]], index=["z", "y"], columns=columns)
df
In the previous version (``future_stack=False``), the default of ``dropna=True`` would remove unnecessarily introduced NA values but still coerce the dtype to ``float64`` in the process. In the new version, no NAs are introduced and so there is no coercion of the dtype.

.. ipython:: python
:okwarning:
df.stack([0, 1], future_stack=False, dropna=True)
df.stack([0, 1], future_stack=True)
If the input contains NA values, the previous version would drop those as well with ``dropna=True`` or introduce new NA values with ``dropna=False``. The new version persists all values from the input.

.. ipython:: python
:okwarning:
df = pd.DataFrame([[0, 2], [np.nan, np.nan]], columns=columns)
df
df.stack([0, 1], future_stack=False, dropna=True)
df.stack([0, 1], future_stack=False, dropna=False)
df.stack([0, 1], future_stack=True)
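Because ``future_stack=True`` never sorts the stacked levels, ordering the result is left as an explicit follow-up step via :meth:`~DataFrame.sort_index`; a minimal illustrative sketch::

    import pandas as pd

    columns = pd.MultiIndex.from_tuples([("B", "d"), ("A", "c")])
    df = pd.DataFrame([[0, 2], [1, 3]], index=["z", "y"], columns=columns)

    # future_stack=True keeps the ordering implied by the input ...
    stacked = df.stack([0, 1], future_stack=True)

    # ... so any desired ordering is applied explicitly afterwards.
    stacked_sorted = stacked.sort_index()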
.. _whatsnew_210.enhancements.other:

Other enhancements
@@ -669,6 +708,7 @@ I/O
^^^
- :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
- :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`)
- Added ``filters`` parameter to :func:`read_parquet` to filter out data, compatible with both ``engines`` (:issue:`53212`)
- Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`)
- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
- Bug in :func:`read_csv`, with ``engine="pyarrow"`` erroring when specifying a ``dtype`` with ``index_col`` (:issue:`53229`)
@@ -787,6 +827,7 @@ Other
- Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`)
- Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`)
- Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`)
- Bug in :meth:`DataFrame.pivot_table` with casting the mean of ints back to an int (:issue:`16676`)
- Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`)
- Bug in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`DataFrameGroupBy.shift` when passing both "freq" and "fill_value" silently ignoring "fill_value" instead of raising ``ValueError`` (:issue:`53832`)
- Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`)
@@ -798,7 +839,6 @@ Other
- Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`)
- Bug in :meth:`period_range` the default behavior when freq was not passed as an argument was incorrect(:issue:`53687`)
- Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`)
-

.. ***DO NOT USE THIS SECTION***
2 changes: 1 addition & 1 deletion pandas/_libs/sparse.pyx
@@ -726,7 +726,7 @@ def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value):

for i in range(new_length):
value = arr[i]
if value == fill_value and type(value) == type(fill_value):
if value == fill_value and type(value) is type(fill_value):
mask[i] = 0

return mask.view(dtype=bool)
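The change from ``==`` to ``is`` compares the two type objects by identity. For plain classes the outcome is the same, but ``is`` states the exact-same-class intent directly and cannot be affected by a custom ``__eq__``; a small illustration::

    # type() returns the class object itself, so ``is`` performs an exact-class check.
    print(type(1) is int)       # True
    print(type(True) is int)    # False: bool is a subclass of int, not the same class
    print(type(True) is bool)   # True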
4 changes: 2 additions & 2 deletions pandas/_libs/tslibs/offsets.pyx
@@ -498,7 +498,7 @@ cdef class BaseOffset:
def __sub__(self, other):
if PyDateTime_Check(other):
raise TypeError("Cannot subtract datetime from offset.")
elif type(other) == type(self):
elif type(other) is type(self):
return type(self)(self.n - other.n, normalize=self.normalize,
**self.kwds)
elif not isinstance(self, BaseOffset):
@@ -1047,7 +1047,7 @@ cdef class Tick(SingleConstructorOffset):
return other.__add__(self)

if isinstance(other, Tick):
if type(self) == type(other):
if type(self) is type(other):
return type(self)(self.n + other.n)
else:
return delta_to_tick(self.delta + other.delta)
90 changes: 69 additions & 21 deletions pandas/core/frame.py
@@ -9166,7 +9166,13 @@ def pivot_table(
sort=sort,
)

def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
def stack(
self,
level: IndexLabel = -1,
dropna: bool | lib.NoDefault = lib.no_default,
sort: bool | lib.NoDefault = lib.no_default,
future_stack: bool = False,
):
"""
Stack the prescribed level(s) from columns to index.
@@ -9194,6 +9200,11 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
section.
sort : bool, default True
Whether to sort the levels of the resulting MultiIndex.
future_stack : bool, default False
Whether to use the new implementation that will replace the current
implementation in pandas 3.0. When True, dropna and sort have no impact
on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release
notes <whatsnew_210.enhancements.new_stack>` for more details.
Returns
-------
@@ -9233,7 +9244,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
weight height
cat 0 1
dog 2 3
>>> df_single_level_cols.stack()
>>> df_single_level_cols.stack(future_stack=True)
cat weight 0
height 1
dog weight 2
@@ -9255,7 +9266,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
kg pounds
cat 1 2
dog 2 4
>>> df_multi_level_cols1.stack()
>>> df_multi_level_cols1.stack(future_stack=True)
weight
cat kg 1
pounds 2
@@ -9280,7 +9291,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
kg m
cat 1.0 2.0
dog 3.0 4.0
>>> df_multi_level_cols2.stack()
>>> df_multi_level_cols2.stack(future_stack=True)
weight height
cat kg 1.0 NaN
m NaN 2.0
@@ -9291,17 +9302,17 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
The first parameter controls which level or levels are stacked:
>>> df_multi_level_cols2.stack(0)
>>> df_multi_level_cols2.stack(0, future_stack=True)
kg m
cat height NaN 2.0
weight 1.0 NaN
dog height NaN 4.0
weight 3.0 NaN
>>> df_multi_level_cols2.stack([0, 1])
cat height m 2.0
weight kg 1.0
dog height m 4.0
weight kg 3.0
cat weight 1.0 NaN
height NaN 2.0
dog weight 3.0 NaN
height NaN 4.0
>>> df_multi_level_cols2.stack([0, 1], future_stack=True)
cat weight kg 1.0
height m 2.0
dog weight kg 3.0
height m 4.0
dtype: float64
**Dropping missing values**
@@ -9331,15 +9342,52 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
dog kg 2.0 NaN
m NaN 3.0
"""
from pandas.core.reshape.reshape import (
stack,
stack_multiple,
)
if not future_stack:
from pandas.core.reshape.reshape import (
stack,
stack_multiple,
)

if dropna is lib.no_default:
dropna = True
if sort is lib.no_default:
sort = True

if isinstance(level, (tuple, list)):
result = stack_multiple(self, level, dropna=dropna, sort=sort)
if isinstance(level, (tuple, list)):
result = stack_multiple(self, level, dropna=dropna, sort=sort)
else:
result = stack(self, level, dropna=dropna, sort=sort)
else:
result = stack(self, level, dropna=dropna, sort=sort)
from pandas.core.reshape.reshape import stack_v3

if dropna is not lib.no_default:
raise ValueError(
"dropna must be unspecified with future_stack=True as the new "
"implementation does not introduce rows of NA values. This "
"argument will be removed in a future version of pandas."
)

if sort is not lib.no_default:
raise ValueError(
"Cannot specify sort with future_stack=True, this argument will be "
"removed in a future version of pandas. Sort the result using "
".sort_index instead."
)

if (
isinstance(level, (tuple, list))
and not all(lev in self.columns.names for lev in level)
and not all(isinstance(lev, int) for lev in level)
):
raise ValueError(
"level should contain all level names or all level "
"numbers, not a mixture of the two."
)

if not isinstance(level, (tuple, list)):
level = [level]
level = [self.columns._get_level_number(lev) for lev in level]
result = stack_v3(self, level)

return result.__finalize__(self, method="stack")
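As the checks above show, ``dropna`` and ``sort`` may not be combined with ``future_stack=True``; a short illustrative sketch of the resulting behavior (the example frame is hypothetical)::

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    # dropna is rejected: the new implementation never introduces rows of NA values.
    try:
        df.stack(dropna=False, future_stack=True)
    except ValueError as err:
        print(err)

    # sort is rejected as well; sort the result with .sort_index() instead.
    try:
        df.stack(sort=True, future_stack=True)
    except ValueError as err:
        print(err)

    # Without those arguments the new implementation runs normally.
    result = df.stack(future_stack=True).sort_index()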
