Skip to content

Commit

Permalink
Preserve names of column object in various APIs (#13772)
Browse files Browse the repository at this point in the history
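This PR preserves the names of the columns object (`DataFrame.columns.name` / `.names`) across various APIs by propagating `self._data._level_names` to the result and, for binary operations, by determining when the column names of `self` can be carried over to the output.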
Fixes: #13741, #13740

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #13772
  • Loading branch information
galipremsagar authored Jul 28, 2023
1 parent 3dba6ea commit f00e922
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 12 deletions.
37 changes: 33 additions & 4 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,10 @@ def __init__(
if dtype:
self._data = self.astype(dtype)._data

self._data.multiindex = self._data.multiindex or isinstance(
columns, pd.MultiIndex
)

@_cudf_nvtx_annotate
def _init_from_series_list(self, data, columns, index):
if index is None:
Expand Down Expand Up @@ -1820,19 +1824,29 @@ def _make_operands_and_index_for_binop(
NotImplementedType,
],
Optional[BaseIndex],
bool,
]:
lhs, rhs = self._data, other
index = self._index
fill_requires_key = False
left_default: Any = False
equal_columns = False
can_use_self_column_name = True

if _is_scalar_or_zero_d_array(other):
rhs = {name: other for name in self._data}
equal_columns = True
elif isinstance(other, Series):
rhs = dict(zip(other.index.values_host, other.values_host))
# For keys in right but not left, perform binops between NaN (not
# NULL!) and the right value (result is NaN).
left_default = as_column(np.nan, length=len(self))
equal_columns = other.index.to_pandas().equals(
self._data.to_pandas_index()
)
can_use_self_column_name = equal_columns or (
list(other._index._data.names) == self._data._level_names
)
elif isinstance(other, DataFrame):
if (
not can_reindex
Expand All @@ -1854,13 +1868,18 @@ def _make_operands_and_index_for_binop(
# For DataFrame-DataFrame ops, always default to operating against
# the fill value.
left_default = fill_value
equal_columns = self._column_names == other._column_names
can_use_self_column_name = (
equal_columns
or self._data._level_names == other._data._level_names
)
elif isinstance(other, (dict, abc.Mapping)):
# Need to fail early on host mapping types because we ultimately
# convert everything to a dict.
return NotImplemented, None
return NotImplemented, None, True

if not isinstance(rhs, (dict, abc.Mapping)):
return NotImplemented, None
return NotImplemented, None, True

operands = {
k: (
Expand All @@ -1876,7 +1895,8 @@ def _make_operands_and_index_for_binop(
for k, v in rhs.items():
if k not in lhs:
operands[k] = (left_default, v, reflect, None)
return operands, index

return operands, index, can_use_self_column_name

@classmethod
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -5042,6 +5062,7 @@ def from_pandas(cls, dataframe, nan_as_null=None):

index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
df = cls._from_data(data, index)
df._data._level_names = list(dataframe.columns.names)

# Set columns only if it is a MultiIndex
if isinstance(dataframe.columns, pd.MultiIndex):
Expand Down Expand Up @@ -5085,13 +5106,19 @@ def from_arrow(cls, table):
2 3 6
"""
index_col = None
col_index_names = None
if isinstance(table, pa.Table) and isinstance(
table.schema.pandas_metadata, dict
):
index_col = table.schema.pandas_metadata["index_columns"]
if "column_indexes" in table.schema.pandas_metadata:
col_index_names = []
for col_meta in table.schema.pandas_metadata["column_indexes"]:
col_index_names.append(col_meta["name"])

out = super().from_arrow(table)

if col_index_names is not None:
out._data._level_names = col_index_names
if index_col:
if isinstance(index_col[0], dict):
idx = cudf.RangeIndex(
Expand Down Expand Up @@ -5337,6 +5364,8 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
df._data[names[0]] = column.as_column(
data, nan_as_null=nan_as_null
)
if isinstance(columns, pd.Index):
df._data._level_names = list(columns.names)

if index is None:
df._index = RangeIndex(start=0, stop=len(data))
Expand Down
25 changes: 20 additions & 5 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,9 @@ def _from_data(

@_cudf_nvtx_annotate
def _from_data_like_self(self, data: MutableMapping):
    """Build a new frame from ``data`` that reuses this frame's index.

    The column-level names stored on ``self._data`` are copied onto the
    result so that the names of the column object are preserved.

    Parameters
    ----------
    data : MutableMapping
        Column data for the new object.

    Returns
    -------
    The object produced by ``self._from_data(data, self._index)`` with
    ``_data._level_names`` set to ``self._data._level_names``.
    """
    out = self._from_data(data, self._index)
    # Only ``data`` and the index are handed to ``_from_data`` above, so
    # the level names are carried over explicitly.
    out._data._level_names = self._data._level_names
    return out

@classmethod
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -3128,7 +3130,9 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""):
# inserted to the left of existing data columns.
return (
ColumnAccessor(
{**new_column_data, **self._data}, self._data.multiindex
{**new_column_data, **self._data},
self._data.multiindex,
self._data._level_names,
),
index,
)
Expand Down Expand Up @@ -3465,14 +3469,24 @@ def _binaryop(
**kwargs,
):
reflect, op = self._check_reflected_op(op)
operands, out_index = self._make_operands_and_index_for_binop(
(
operands,
out_index,
can_use_self_column_name,
) = self._make_operands_and_index_for_binop(
other, op, fill_value, reflect, can_reindex
)
if operands is NotImplemented:
return NotImplemented

level_names = (
None if not can_use_self_column_name else self._data._level_names
)
return self._from_data(
ColumnAccessor(type(self)._colwise_binop(operands, op)),
ColumnAccessor(
type(self)._colwise_binop(operands, op),
level_names=level_names,
),
index=out_index,
)

Expand All @@ -3491,6 +3505,7 @@ def _make_operands_and_index_for_binop(
NotImplementedType,
],
Optional[cudf.BaseIndex],
bool,
]:
raise NotImplementedError(
f"Binary operations are not supported for {self.__class__}"
Expand All @@ -3516,7 +3531,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
if cupy_func:
if ufunc.nin == 2:
other = inputs[self is inputs[0]]
inputs, index = self._make_operands_and_index_for_binop(
inputs, index, _ = self._make_operands_and_index_for_binop(
other, fname
)
else:
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1472,6 +1472,7 @@ def _make_operands_and_index_for_binop(
NotImplementedType,
],
Optional[BaseIndex],
bool,
]:
# Specialize binops to align indices.
if isinstance(other, Series):
Expand All @@ -1484,11 +1485,13 @@ def _make_operands_and_index_for_binop(
"Can only compare identically-labeled Series objects"
)
lhs, other = _align_indices([self, other], allow_non_unique=True)
can_use_self_column_name = self.name == other.name
else:
lhs = self
can_use_self_column_name = False

operands = lhs._make_operands_for_binop(other, fill_value, reflect)
return operands, lhs._index
return operands, lhs._index, can_use_self_column_name

@copy_docstring(CategoricalAccessor) # type: ignore
@property
Expand Down
41 changes: 39 additions & 2 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1708,6 +1708,7 @@ def test_nonmatching_index_setitem(nrows):
)
def test_from_pandas(dtype):
df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0], dtype=dtype)
df.columns.name = "custom_column_name"
gdf = cudf.DataFrame.from_pandas(df)
assert isinstance(gdf, cudf.DataFrame)

Expand Down Expand Up @@ -2483,8 +2484,15 @@ def test_bitwise_binops_series(pdf, gdf, binop):


@pytest.mark.parametrize("unaryop", [operator.neg, operator.inv, operator.abs])
@pytest.mark.parametrize(
    "col_name,assign_col_name", [(None, False), (None, True), ("abc", True)]
)
def test_unaryops_df(pdf, unaryop, col_name, assign_col_name):
    """Unary ops on a cudf DataFrame must match pandas.

    When ``assign_col_name`` is true, ``columns.name`` is set to
    ``col_name`` on the pandas frame before converting it to cudf, so the
    comparison also covers a frame whose columns object carries a name.
    """
    pd_df = pdf.copy()
    if assign_col_name:
        pd_df.columns.name = col_name
    # Build the cudf frame from the (possibly renamed) pandas copy so both
    # sides start from identical column metadata.
    gdf = cudf.from_pandas(pd_df)
    d = unaryop(pd_df - 5)
    g = unaryop(gdf - 5)
    assert_eq(d, g)

Expand Down Expand Up @@ -2626,6 +2634,12 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index):
pdf2 = pdf_arrow_table.to_pandas()

assert_eq(pdf2, gdf2)
pdf.columns.name = "abc"
pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index)

gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table)
pdf2 = pdf_arrow_table.to_pandas()
assert_eq(pdf2, gdf2)


@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"])
Expand Down Expand Up @@ -2912,6 +2926,7 @@ def test_tail_for_string():
["v0", "v1"],
["v0", "index"],
pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]),
pd.MultiIndex.from_tuples([(1, 2), (10, 11)], names=["ABC", "DEF"]),
],
)
@pytest.mark.parametrize("inplace", [True, False])
Expand Down Expand Up @@ -10147,3 +10162,25 @@ def test_dataframe_init_length_error(data, index):
{"data": data, "index": index},
),
)


def test_dataframe_init_columns_named_multiindex():
    """A named cudf MultiIndex passed as ``columns`` must match pandas."""
    np.random.seed(0)
    values = np.random.randn(2, 2)
    column_tuples = [("A", "one"), ("A", "two")]
    named_columns = cudf.MultiIndex.from_tuples(
        column_tuples, names=["y", "z"]
    )

    actual = cudf.DataFrame(values, columns=named_columns)
    # pandas gets the same column index, converted to its pandas form.
    expected = pd.DataFrame(values, columns=named_columns.to_pandas())

    assert_eq(actual, expected)


def test_dataframe_init_columns_named_index():
    """A named pandas Index passed as ``columns`` must match pandas."""
    np.random.seed(0)
    values = np.random.randn(2, 2)
    named_columns = pd.Index(["a", "b"], name="custom_name")

    # Both constructors receive the very same named Index object.
    actual = cudf.DataFrame(values, columns=named_columns)
    expected = pd.DataFrame(values, columns=named_columns)

    assert_eq(actual, expected)

0 comments on commit f00e922

Please sign in to comment.