From f00e92220a337ad3af8c01c8c9e96f3c80e4f47e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 28 Jul 2023 12:05:35 -0500 Subject: [PATCH] Preserve names of column object in various APIs (#13772) This PR preserves column names in various APIs by retaining `self._data._level_names` and also calculating when to preserve the column names. Fixes: #13741, #13740 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13772 --- python/cudf/cudf/core/dataframe.py | 37 ++++++++++++++++++--- python/cudf/cudf/core/indexed_frame.py | 25 ++++++++++++--- python/cudf/cudf/core/series.py | 5 ++- python/cudf/cudf/tests/test_dataframe.py | 41 ++++++++++++++++++++++-- 4 files changed, 96 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0fe89490905..fc6c669256f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -723,6 +723,10 @@ def __init__( if dtype: self._data = self.astype(dtype)._data + self._data.multiindex = self._data.multiindex or isinstance( + columns, pd.MultiIndex + ) + @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): if index is None: @@ -1820,19 +1824,29 @@ def _make_operands_and_index_for_binop( NotImplementedType, ], Optional[BaseIndex], + bool, ]: lhs, rhs = self._data, other index = self._index fill_requires_key = False left_default: Any = False + equal_columns = False + can_use_self_column_name = True if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} + equal_columns = True elif isinstance(other, Series): rhs = dict(zip(other.index.values_host, other.values_host)) # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). left_default = as_column(np.nan, length=len(self)) + equal_columns = other.index.to_pandas().equals( + self._data.to_pandas_index() + ) + can_use_self_column_name = equal_columns or ( + list(other._index._data.names) == self._data._level_names + ) elif isinstance(other, DataFrame): if ( not can_reindex @@ -1854,13 +1868,18 @@ def _make_operands_and_index_for_binop( # For DataFrame-DataFrame ops, always default to operating against # the fill value. left_default = fill_value + equal_columns = self._column_names == other._column_names + can_use_self_column_name = ( + equal_columns + or self._data._level_names == other._data._level_names + ) elif isinstance(other, (dict, abc.Mapping)): # Need to fail early on host mapping types because we ultimately # convert everything to a dict. - return NotImplemented, None + return NotImplemented, None, True if not isinstance(rhs, (dict, abc.Mapping)): - return NotImplemented, None + return NotImplemented, None, True operands = { k: ( @@ -1876,7 +1895,8 @@ def _make_operands_and_index_for_binop( for k, v in rhs.items(): if k not in lhs: operands[k] = (left_default, v, reflect, None) - return operands, index + + return operands, index, can_use_self_column_name @classmethod @_cudf_nvtx_annotate @@ -5042,6 +5062,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) df = cls._from_data(data, index) + df._data._level_names = list(dataframe.columns.names) # Set columns only if it is a MultiIndex if isinstance(dataframe.columns, pd.MultiIndex): @@ -5085,13 +5106,19 @@ def from_arrow(cls, table): 2 3 6 """ index_col = None + col_index_names = None if isinstance(table, pa.Table) and isinstance( table.schema.pandas_metadata, dict ): index_col = table.schema.pandas_metadata["index_columns"] + if "column_indexes" in table.schema.pandas_metadata: + col_index_names = [] + for col_meta in table.schema.pandas_metadata["column_indexes"]: + col_index_names.append(col_meta["name"]) out = super().from_arrow(table) - + if col_index_names is not None: + out._data._level_names = col_index_names if index_col: if isinstance(index_col[0], dict): idx = cudf.RangeIndex( @@ -5337,6 +5364,8 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._data[names[0]] = column.as_column( data, nan_as_null=nan_as_null ) + if isinstance(columns, pd.Index): + df._data._level_names = list(columns.names) if index is None: df._index = RangeIndex(start=0, stop=len(data)) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index aa0f060c8da..0ffc3948e67 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -293,7 +293,9 @@ def _from_data( @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping): - return self._from_data(data, self._index) + out = self._from_data(data, self._index) + out._data._level_names = self._data._level_names + return out @classmethod @_cudf_nvtx_annotate @@ -3128,7 +3130,9 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): # inserted to the left of existing data columns. return ( ColumnAccessor( - {**new_column_data, **self._data}, self._data.multiindex + {**new_column_data, **self._data}, + self._data.multiindex, + self._data._level_names, ), index, ) @@ -3465,14 +3469,24 @@ def _binaryop( **kwargs, ): reflect, op = self._check_reflected_op(op) - operands, out_index = self._make_operands_and_index_for_binop( + ( + operands, + out_index, + can_use_self_column_name, + ) = self._make_operands_and_index_for_binop( other, op, fill_value, reflect, can_reindex ) if operands is NotImplemented: return NotImplemented + level_names = ( + None if not can_use_self_column_name else self._data._level_names + ) return self._from_data( - ColumnAccessor(type(self)._colwise_binop(operands, op)), + ColumnAccessor( + type(self)._colwise_binop(operands, op), + level_names=level_names, + ), index=out_index, ) @@ -3491,6 +3505,7 @@ def _make_operands_and_index_for_binop( NotImplementedType, ], Optional[cudf.BaseIndex], + bool, ]: raise NotImplementedError( f"Binary operations are not supported for {self.__class__}" @@ -3516,7 +3531,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if cupy_func: if ufunc.nin == 2: other = inputs[self is inputs[0]] - inputs, index = self._make_operands_and_index_for_binop( + inputs, index, _ = self._make_operands_and_index_for_binop( other, fname ) else: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index aaac91e927a..02de3b8282a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1472,6 +1472,7 @@ def _make_operands_and_index_for_binop( NotImplementedType, ], Optional[BaseIndex], + bool, ]: # Specialize binops to align indices. if isinstance(other, Series): @@ -1484,11 +1485,13 @@ def _make_operands_and_index_for_binop( "Can only compare identically-labeled Series objects" ) lhs, other = _align_indices([self, other], allow_non_unique=True) + can_use_self_column_name = self.name == other.name else: lhs = self + can_use_self_column_name = False operands = lhs._make_operands_for_binop(other, fill_value, reflect) - return operands, lhs._index + return operands, lhs._index, can_use_self_column_name @copy_docstring(CategoricalAccessor) # type: ignore @property diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 25a17697538..d443cd92968 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1708,6 +1708,7 @@ def test_nonmatching_index_setitem(nrows): ) def test_from_pandas(dtype): df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0], dtype=dtype) + df.columns.name = "custom_column_name" gdf = cudf.DataFrame.from_pandas(df) assert isinstance(gdf, cudf.DataFrame) @@ -2483,8 +2484,15 @@ def test_bitwise_binops_series(pdf, gdf, binop): @pytest.mark.parametrize("unaryop", [operator.neg, operator.inv, operator.abs]) -def test_unaryops_df(pdf, gdf, unaryop): - d = unaryop(pdf - 5) +@pytest.mark.parametrize( + "col_name,assign_col_name", [(None, False), (None, True), ("abc", True)] +) +def test_unaryops_df(pdf, unaryop, col_name, assign_col_name): + pd_df = pdf.copy() + if assign_col_name: + pd_df.columns.name = col_name + gdf = cudf.from_pandas(pd_df) + d = unaryop(pd_df - 5) g = unaryop(gdf - 5) assert_eq(d, g) @@ -2626,6 +2634,12 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index): pdf2 = pdf_arrow_table.to_pandas() assert_eq(pdf2, gdf2) + pdf.columns.name = "abc" + pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index) + + gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) + pdf2 = pdf_arrow_table.to_pandas() + assert_eq(pdf2, gdf2) @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) @@ -2912,6 +2926,7 @@ def test_tail_for_string(): ["v0", "v1"], ["v0", "index"], pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]), + pd.MultiIndex.from_tuples([(1, 2), (10, 11)], names=["ABC", "DEF"]), ], ) @pytest.mark.parametrize("inplace", [True, False]) @@ -10147,3 +10162,25 @@ def test_dataframe_init_length_error(data, index): {"data": data, "index": index}, ), ) + + +def test_dataframe_init_columns_named_multiindex(): + np.random.seed(0) + data = np.random.randn(2, 2) + columns = cudf.MultiIndex.from_tuples( + [("A", "one"), ("A", "two")], names=["y", "z"] + ) + gdf = cudf.DataFrame(data, columns=columns) + pdf = pd.DataFrame(data, columns=columns.to_pandas()) + + assert_eq(gdf, pdf) + + +def test_dataframe_init_columns_named_index(): + np.random.seed(0) + data = np.random.randn(2, 2) + columns = pd.Index(["a", "b"], name="custom_name") + gdf = cudf.DataFrame(data, columns=columns) + pdf = pd.DataFrame(data, columns=columns) + + assert_eq(gdf, pdf)