Merge branch 'docs' of https://github.com/andrewgsavage/pint-pandas into docs
hgrecco · Sep 5, 2023 · 19322e5 · 19322e5
2 parents 6f94a94 + 64017fc
commit 19322e5
Show file tree

Hide file tree

Showing 5 changed files with 304 additions and 62 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -8,7 +8,7 @@ jobs:
       matrix:
         python-version: [3.9, "3.10", "3.11"]
         numpy: ["numpy>=1.20.3,<2.0.0"]
-        pandas: ["pandas==2.0.2", ]
+        pandas: ["pandas==2.0.2", "pandas==2.1.0rc0" ]
         pint: ["pint>=0.21.1", "pint==0.22"]
 
     runs-on: ubuntu-latest

diff --git a/CHANGES b/CHANGES
@@ -5,6 +5,10 @@ pint-pandas Changelog
 ----------------
 
 - ReadTheDocs Documentation created.
+- Support for Pandas version 2.1.0. #196
+- Support for dtype-preserving `PintArray.map` for both Pandas 2.0.2 and Pandas 2.1. #196
+- Support for <NA> values in columns with integer magnitudes
+- Support for magnitudes of any type, such as complex128 or tuples #146
 - Support for pandas 2.0, allowing `.cumsum, .cummax, .cummin` methods for `Series` and `DataFrame`. #186
 - Minimum Pint version is 0.21
 - Minimum Pandas version is 2.0

diff --git a/pint_pandas/pint_array.py b/pint_pandas/pint_array.py
@@ -2,11 +2,12 @@
 import re
 import warnings
 from collections import OrderedDict
+from importlib.metadata import version
 
 import numpy as np
 import pandas as pd
 import pint
-from pandas import DataFrame, Series
+from pandas import DataFrame, Series, Index
 from pandas.api.extensions import (
     ExtensionArray,
     ExtensionDtype,
@@ -27,6 +28,11 @@
 # quantify/dequantify
 NO_UNIT = "No Unit"
 
+pandas_version = version("pandas")
+pandas_version_info = tuple(
+    int(x) if x.isdigit() else x for x in pandas_version.split(".")
+)
+
 
 class PintType(ExtensionDtype):
     """
@@ -65,7 +71,7 @@ def __new__(cls, units=None):
         if not isinstance(units, _Unit):
             units = cls._parse_dtype_strict(units)
             # ureg.unit returns a quantity with a magnitude of 1
-            # eg 1 mm. Initialising a quantity and taking it's unit
+            # eg 1 mm. Initialising a quantity and taking its unit
             # TODO: Separate units from quantities in pint
             # to simplify this bit
             units = cls.ureg.Quantity(1, units).units
@@ -185,6 +191,12 @@ def __repr__(self):
         return self.name
 
 
+_NumpyEADtype = (
+    pd.core.dtypes.dtypes.PandasDtype
+    if pandas_version_info < (2, 1)
+    else pd.core.dtypes.dtypes.NumpyEADtype
+)
+
 dtypemap = {
     int: pd.Int64Dtype(),
     np.int64: pd.Int64Dtype(),
@@ -195,8 +207,8 @@ def __repr__(self):
     float: pd.Float64Dtype(),
     np.float64: pd.Float64Dtype(),
     np.float32: pd.Float32Dtype(),
-    np.complex128: pd.core.dtypes.dtypes.PandasDtype("complex128"),
-    np.complex64: pd.core.dtypes.dtypes.PandasDtype("complex64"),
+    np.complex128: _NumpyEADtype("complex128"),
+    np.complex64: _NumpyEADtype("complex64"),
     # np.float16: pd.Float16Dtype(),
 }
 dtypeunmap = {v: k for k, v in dtypemap.items()}
@@ -250,7 +262,6 @@ def __init__(self, values, dtype=None, copy=False):
             copy = False
         elif not isinstance(values, pd.core.arrays.numeric.NumericArray):
             values = pd.array(values, copy=copy)
-            copy = False
         if copy:
             values = values.copy()
         self._data = values
@@ -311,10 +322,14 @@ def __setitem__(self, key, value):
 
         if isinstance(value, _Quantity):
             value = value.to(self.units).magnitude
-        elif is_list_like(value) and len(value) > 0 and isinstance(value[0], _Quantity):
-            value = [item.to(self.units).magnitude for item in value]
+        elif is_list_like(value) and len(value) > 0:
+            if isinstance(value[0], _Quantity):
+                value = [item.to(self.units).magnitude for item in value]
+            if len(value) == 1:
+                value = value[0]
 
         key = check_array_indexer(self, key)
+        # Filter out invalid values for our array type(s)
         try:
             self._data[key] = value
         except IndexError as e:
@@ -458,7 +473,8 @@ def take(self, indices, allow_fill=False, fill_value=None):
         Examples
         --------
         """
-        from pandas.core.algorithms import take, is_scalar
+        from pandas.core.algorithms import take
+        from pandas.api.types import is_scalar
 
         data = self._data
         if allow_fill and fill_value is None:
@@ -470,7 +486,10 @@ def take(self, indices, allow_fill=False, fill_value=None):
                 # magnitude is in fact an array scalar, which will get rejected by pandas.
                 fill_value = fill_value[()]
 
-        result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            # Turn off warning that PandasArray is deprecated for ``take``
+            result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
 
         return PintArray(result, dtype=self.dtype)
 
@@ -512,18 +531,12 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
                 raise ValueError(
                     "Cannot infer dtype. No dtype specified and empty array"
                 )
-        if dtype is None and not isinstance(master_scalar, _Quantity):
-            raise ValueError("No dtype specified and not a sequence of quantities")
-        if dtype is None and isinstance(master_scalar, _Quantity):
+        if dtype is None:
+            if not isinstance(master_scalar, _Quantity):
+                raise ValueError("No dtype specified and not a sequence of quantities")
             dtype = PintType(master_scalar.units)
 
-        def quantify_nan(item):
-            if type(item) is float:
-                return item * dtype.units
-            return item
-
         if isinstance(master_scalar, _Quantity):
-            scalars = [quantify_nan(item) for item in scalars]
             scalars = [
                 (item.to(dtype.units).magnitude if hasattr(item, "to") else item)
                 for item in scalars
@@ -538,10 +551,21 @@ def _from_sequence_of_strings(cls, scalars, dtype=None, copy=False):
 
     @classmethod
     def _from_factorized(cls, values, original):
+        from pandas.api.types import infer_dtype
+
+        if infer_dtype(values) != "object":
+            values = pd.array(values, copy=False)
         return cls(values, dtype=original.dtype)
 
     def _values_for_factorize(self):
-        return self._data._values_for_factorize()
+        # factorize can now handle differentiating various types of null values.
+        # These can only occur when the array has object dtype.
+        # However, for backwards compatibility we only use the null for the
+        # provided dtype. This may be revisited in the future, see GH#48476.
+        arr = self._data
+        if arr.dtype.kind == "O":
+            return np.array(arr, copy=False), self.dtype.na_value
+        return arr._values_for_factorize()
 
     def value_counts(self, dropna=True):
         """
@@ -567,18 +591,19 @@ def value_counts(self, dropna=True):
 
         # compute counts on the data with no nans
         data = self._data
-        nafilt = np.isnan(data)
+        nafilt = pd.isna(data)
+        na_value = pd.NA  # NA value for index, not data, so not quantified
         data = data[~nafilt]
+        index = list(set(data))
 
         data_list = data.tolist()
-        index = list(set(data))
         array = [data_list.count(item) for item in index]
 
         if not dropna:
-            index.append(np.nan)
+            index.append(na_value)
             array.append(nafilt.sum())
 
-        return Series(array, index=index)
+        return Series(np.asarray(array), index=index)
 
     def unique(self):
         """Compute the PintArray of unique values.
@@ -589,7 +614,8 @@ def unique(self):
         """
         from pandas import unique
 
-        return self._from_sequence(unique(self._data), dtype=self.dtype)
+        data = self._data
+        return self._from_sequence(unique(data), dtype=self.dtype)
 
     def __contains__(self, item) -> bool:
         if not isinstance(item, _Quantity):
@@ -691,7 +717,7 @@ def convert_values(param):
                 else:
                     return param
 
-            if isinstance(other, (Series, DataFrame)):
+            if isinstance(other, (Series, DataFrame, Index)):
                 return NotImplemented
             lvalues = self.quantity
             validate_length(lvalues, other)
@@ -740,7 +766,9 @@ def __array__(self, dtype=None, copy=False):
 
     def _to_array_of_quantity(self, copy=False):
         qtys = [
-            self._Q(item, self._dtype.units) if not pd.isna(item) else item
+            self._Q(item, self._dtype.units)
+            if not pd.isna(item)
+            else self.dtype.na_value
             for item in self._data
         ]
         with warnings.catch_warnings(record=True):
@@ -798,7 +826,42 @@ def searchsorted(self, value, side="left", sorter=None):
             value = [item.to(self.units).magnitude for item in value]
         return arr.searchsorted(value, side=side, sorter=sorter)
 
-    def _reduce(self, name, **kwds):
+    def map(self, mapper, na_action=None):
+        """
+        Map values using an input mapping or function.
+
+        Parameters
+        ----------
+        mapper : function, dict, or Series
+            Mapping correspondence.
+        na_action : {None, 'ignore'}, default None
+            If 'ignore', propagate NA values, without passing them to the
+            mapping correspondence. If 'ignore' is not supported, a
+            ``NotImplementedError`` should be raised.
+
+        Returns
+        -------
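+        PintArray or array-like
+            A ``PintArray`` whose units are taken from the first mapped
+            element that has a ``units`` attribute. If no mapped element
+            has units, the array of mapped values is returned as-is.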
+
+        """
+        if pandas_version_info < (2, 1):
+            ser = pd.Series(self._to_array_of_quantity())
+            arr = ser.map(mapper, na_action).values
+        else:
+            from pandas.core.algorithms import map_array
+
+            arr = map_array(self, mapper, na_action)
+
+        master_scalar = None
+        try:
+            master_scalar = next(i for i in arr if hasattr(i, "units"))
+        except StopIteration:
+            # A mapper that yields no quantities (e.g. a JSON mapper that
+            # formats quantities as str) does not produce a PintArray, and
+            # that's OK: the caller gets the plain array of mapped values.
+            return arr
+        return PintArray._from_sequence(arr, PintType(master_scalar.units))
+
+    def _reduce(self, name, *, skipna: bool = True, keepdims: bool = False, **kwds):
         """
         Return a scalar result of performing the reduction operation.
 
@@ -842,14 +905,20 @@ def _reduce(self, name, **kwds):
 
         if isinstance(self._data, ExtensionArray):
             try:
-                result = self._data._reduce(name, **kwds)
+                result = self._data._reduce(
+                    name, skipna=skipna, keepdims=keepdims, **kwds
+                )
             except NotImplementedError:
                 result = functions[name](self.numpy_data, **kwds)
 
         if name in {"all", "any", "kurt", "skew"}:
             return result
         if name == "var":
+            if keepdims:
+                return PintArray(result, f"pint[({self.units})**2]")
             return self._Q(result, self.units**2)
+        if keepdims:
+            return PintArray(result, self.dtype)
         return self._Q(result, self.units)
 
     def _accumulate(self, name: str, *, skipna: bool = True, **kwds):
@@ -866,7 +935,6 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwds):
                 result = self._data._accumulate(name, **kwds)
             except NotImplementedError:
                 result = functions[name](self.numpy_data, **kwds)
-        print(result)
 
         return self._from_sequence(result, self.units)
 

diff --git a/pint_pandas/testsuite/test_issues.py b/pint_pandas/testsuite/test_issues.py
@@ -3,12 +3,14 @@
 
 import numpy as np
 import pandas as pd
+import pandas._testing as tm
 import pytest
 import pint
 from pandas.tests.extension.base.base import BaseExtensionTests
 from pint.testsuite import helpers
 
 from pint_pandas import PintArray, PintType
+from pint_pandas.pint_array import pandas_version_info
 
 ureg = PintType.ureg
 
@@ -41,7 +43,7 @@ def test_force_ndarray_like(self):
             expected = pd.DataFrame(
                 {0: PintArray(q_a_), 1: PintArray(q_b)}, dtype="pint[degC]"
             )
-            self.assert_equal(result, expected)
+            tm.assert_equal(result, expected)
 
         finally:
             # restore registry
@@ -64,7 +66,7 @@ def test_offset_concat(self):
         expected = pd.DataFrame(
             {0: PintArray(q_a_), 1: PintArray(q_b)}, dtype="pint[degC]"
         )
-        self.assert_equal(result, expected)
+        tm.assert_equal(result, expected)
 
         # issue #141
         print(PintArray(q_a))
@@ -80,7 +82,7 @@ def test_assignment_add_empty(self):
         result = pd.Series(data)
         result[[]] += data[0]
         expected = pd.Series(data)
-        self.assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
 
 class TestIssue80:
@@ -167,3 +169,19 @@ def test_issue_127():
     a = PintType.construct_from_string("pint[dimensionless]")
     b = PintType.construct_from_string("pint[]")
     assert a == b
+
+
+class TestIssue174(BaseExtensionTests):
+    def test_sum(self):
+        if pandas_version_info < (2, 1):
+            pytest.skip("Pandas reduce functions strip units prior to version 2.1.0")
+        a = pd.DataFrame([[0, 1, 2], [3, 4, 5]]).astype("pint[m]")
+        row_sum = a.sum(axis=0)
+        expected_1 = pd.Series([3, 5, 7], dtype="pint[m]")
+
+        tm.assert_series_equal(row_sum, expected_1)
+
+        col_sum = a.sum(axis=1)
+        expected_2 = pd.Series([3, 12], dtype="pint[m]")
+
+        tm.assert_series_equal(col_sum, expected_2)