From fc5fb04c6ea55afe967e214da03a12d9ccc7da53 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Wed, 6 Sep 2023 14:59:22 -0400 Subject: [PATCH 1/4] Fix typo (and test readthedocs.io build) Fix a simple typo, and see whether my account permissions allow the docs to build correctly on readthedocs.io Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- docs/getting/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting/index.rst b/docs/getting/index.rst index da8a44a..6e000fc 100644 --- a/docs/getting/index.rst +++ b/docs/getting/index.rst @@ -7,7 +7,7 @@ The getting started guide aims to get you using pint-pandas productively as quic What is Pint-pandas? -------------------- -The Pandas package provides powerful DataFrame and Series abstractions for dealing with numerical, temporal, categorical, string-based, and even user-defined data (using its ExtensionArray feature). The Pint package provides a rich and extensible vocabulary of units for constructing Quantities and an equally rich and extensible range of unit conversions to make it easy to perform unit-safe calculations using Quantities. Pint-pandas provides PintArray, aPandas ExtensionArray that efficiently implements Pandas DataFrame and Series functionality as unit-aware operations where appropriate. +The Pandas package provides powerful DataFrame and Series abstractions for dealing with numerical, temporal, categorical, string-based, and even user-defined data (using its ExtensionArray feature). The Pint package provides a rich and extensible vocabulary of units for constructing Quantities and an equally rich and extensible range of unit conversions to make it easy to perform unit-safe calculations using Quantities. Pint-pandas provides PintArray, a Pandas ExtensionArray that efficiently implements Pandas DataFrame and Series functionality as unit-aware operations where appropriate. Those who have used Pint know well that good units discipline often catches not only simple mistakes, but sometimes more fundamental errors as well. Pint-pandas can reveal similar errors when it comes to slicing and dicing Pandas data. From 3840f88f65aa98c9212d3a1f6832319ea11ba33b Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Fri, 20 Oct 2023 04:08:05 -0400 Subject: [PATCH 2/4] Don't fail on duplicate column names Use enumeration to wade through duplicate column names. This does not preserve default column names when dequantifying, but it doesn't break. Should `pint.dequantify()` also preserve duplicated column names? Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pint_pandas/pint_array.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pint_pandas/pint_array.py b/pint_pandas/pint_array.py index fcbfe61..03ac04a 100644 --- a/pint_pandas/pint_array.py +++ b/pint_pandas/pint_array.py @@ -981,18 +981,18 @@ def formatter_func(dtype): df_columns = df.columns.to_frame() df_columns["units"] = [ - formatter_func(df[col].dtype) - if isinstance(df[col].dtype, PintType) + formatter_func(df.dtypes.iloc[i]) + if isinstance(df.dtypes.iloc[i], PintType) else NO_UNIT - for col in df.columns + for i, col in enumerate(df.columns) ] data_for_df = OrderedDict() for i, col in enumerate(df.columns): - if isinstance(df[col].dtype, PintType): - data_for_df[tuple(df_columns.iloc[i])] = df[col].values.data + if isinstance(df.dtypes.iloc[i], PintType): + data_for_df[tuple(df_columns.iloc[i])] = df.iloc[:, i].values.data else: - data_for_df[tuple(df_columns.iloc[i])] = df[col].values + data_for_df[tuple(df_columns.iloc[i])] = df.iloc[:, i].values df_new = DataFrame(data_for_df, columns=data_for_df.keys()) From aa3c9f1929f92981122e0417289608e2e882b456 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Fri, 20 Oct 2023 04:20:16 -0400 Subject: [PATCH 3/4] Update CHANGES Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- CHANGES | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES b/CHANGES index f896b8e..7bfcbf9 100644 --- a/CHANGES +++ b/CHANGES @@ -4,6 +4,7 @@ pint-pandas Changelog 0.6 (unreleased) ---------------- +- Fix dequantify duplicate column failure #202 - Fix astype issue #196 From 2808bb8dfec09a26db87f55316beaf299ef91b8e Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Sat, 21 Oct 2023 13:30:21 -0400 Subject: [PATCH 4/4] Support duplicate columns and add test case Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pint_pandas/pint_array.py | 25 ++++++++++++++------ pint_pandas/testsuite/test_issues.py | 35 ++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/pint_pandas/pint_array.py b/pint_pandas/pint_array.py index 03ac04a..6fb96ee 100644 --- a/pint_pandas/pint_array.py +++ b/pint_pandas/pint_array.py @@ -1,7 +1,6 @@ import copy import re import warnings -from collections import OrderedDict from importlib.metadata import version import numpy as np @@ -987,17 +986,29 @@ def formatter_func(dtype): for i, col in enumerate(df.columns) ] - data_for_df = OrderedDict() + data_for_df = [] for i, col in enumerate(df.columns): if isinstance(df.dtypes.iloc[i], PintType): - data_for_df[tuple(df_columns.iloc[i])] = df.iloc[:, i].values.data + data_for_df.append( + pd.Series( + data=df.iloc[:, i].values.data, + name=tuple(df_columns.iloc[i]), + index=df.index, + copy=False, + ) + ) else: - data_for_df[tuple(df_columns.iloc[i])] = df.iloc[:, i].values - - df_new = DataFrame(data_for_df, columns=data_for_df.keys()) + data_for_df.append( + pd.Series( + data=df.iloc[:, i].values, + name=tuple(df_columns.iloc[i]), + index=df.index, + copy=False, + ) + ) + df_new = pd.concat(data_for_df, axis=1, copy=False) df_new.columns.names = df.columns.names + ["unit"] - df_new.index = df.index return df_new diff --git a/pint_pandas/testsuite/test_issues.py b/pint_pandas/testsuite/test_issues.py index d8d6ce0..fbcd0c6 100644 --- a/pint_pandas/testsuite/test_issues.py +++ b/pint_pandas/testsuite/test_issues.py @@ -194,3 +194,38 @@ def test_issue_194(dtype): s2 = s1.astype(dtype) tm.assert_series_equal(s0, s2) + + +class TestIssue202(BaseExtensionTests): + def test_dequantify(self): + df = pd.DataFrame() + df["test"] = pd.Series([1, 2, 3], dtype="pint[kN]") + df.insert(0, "test", df["test"], allow_duplicates=True) + + expected = pd.DataFrame.from_dict( + data={ + "index": [0, 1, 2], + "columns": [("test", "kilonewton")], + "data": [[1], [2], [3]], + "index_names": [None], + "column_names": [None, "unit"], + }, + orient="tight", + dtype="Int64", + ) + result = df.iloc[:, 1:].pint.dequantify() + tm.assert_frame_equal(expected, result) + + expected = pd.DataFrame.from_dict( + data={ + "index": [0, 1, 2], + "columns": [("test", "kilonewton"), ("test", "kilonewton")], + "data": [[1, 1], [2, 2], [3, 3]], + "index_names": [None], + "column_names": [None, "unit"], + }, + orient="tight", + dtype="Int64", + ) + result = df.pint.dequantify() + tm.assert_frame_equal(expected, result)