Skip to content

Commit

Permalink
Preserve name of the column while initializing a DataFrame (#14110)
Browse files Browse the repository at this point in the history
Fixes: #14088 

This PR preserves the `names` of the `columns` object (when it is a `pd.Index`) while constructing a `DataFrame` through the various constructor flows.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)

URL: #14110
  • Loading branch information
galipremsagar authored Sep 27, 2023
1 parent 66ac962 commit b789d4c
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 10 deletions.
2 changes: 0 additions & 2 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,6 @@ def nlevels(self) -> int:

@property
def name(self) -> Any:
if len(self._data) == 0:
return None
return self.level_names[-1]

@property
Expand Down
26 changes: 23 additions & 3 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,10 @@ def __init__(
len(self), dtype="object", masked=True
)
for k in columns
}
},
level_names=tuple(columns.names)
if isinstance(columns, pd.Index)
else None,
)
elif isinstance(data, ColumnAccessor):
raise TypeError(
Expand Down Expand Up @@ -712,6 +715,11 @@ def __init__(

self._data = new_df._data
self._index = new_df._index
self._data._level_names = (
tuple(columns.names)
if isinstance(columns, pd.Index)
else self._data._level_names
)
elif len(data) > 0 and isinstance(data[0], Series):
self._init_from_series_list(
data=data, columns=columns, index=index
Expand Down Expand Up @@ -834,6 +842,11 @@ def _init_from_series_list(self, data, columns, index):
self._data[col_name] = column.column_empty(
row_count=len(self), dtype=None, masked=True
)
self._data._level_names = (
tuple(columns.names)
if isinstance(columns, pd.Index)
else self._data._level_names
)
self._data = self._data.select_by_label(columns)

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -957,6 +970,11 @@ def _init_from_dict_like(
data[col_name],
nan_as_null=nan_as_null,
)
self._data._level_names = (
tuple(columns.names)
if isinstance(columns, pd.Index)
else self._data._level_names
)

@classmethod
def _from_data(
Expand Down Expand Up @@ -5131,7 +5149,7 @@ def from_pandas(cls, dataframe, nan_as_null=None):

index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
df = cls._from_data(data, index)
df._data._level_names = list(dataframe.columns.names)
df._data._level_names = tuple(dataframe.columns.names)

# Set columns only if it is a MultiIndex
if isinstance(dataframe.columns, pd.MultiIndex):
Expand Down Expand Up @@ -5377,6 +5395,8 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
df = df.set_index(index)
else:
df._index = as_index(index)
if isinstance(columns, pd.Index):
df._data._level_names = tuple(columns.names)
return df

@classmethod
Expand Down Expand Up @@ -5434,7 +5454,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
data, nan_as_null=nan_as_null
)
if isinstance(columns, pd.Index):
df._data._level_names = list(columns.names)
df._data._level_names = tuple(columns.names)

if index is None:
df._index = RangeIndex(start=0, stop=len(data))
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2661,7 +2661,9 @@ def _reindex(
data=cudf.core.column_accessor.ColumnAccessor(
cols,
multiindex=self._data.multiindex,
level_names=self._data.level_names,
level_names=tuple(column_names.names)
if isinstance(column_names, pd.Index)
else None,
),
index=index,
)
Expand Down
29 changes: 25 additions & 4 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6394,6 +6394,7 @@ def test_df_series_dataframe_astype_dtype_dict(copy):
([range(100), range(100)], ["range" + str(i) for i in range(100)]),
(((1, 2, 3), (1, 2, 3)), ["tuple0", "tuple1", "tuple2"]),
([[1, 2, 3]], ["list col1", "list col2", "list col3"]),
([[1, 2, 3]], pd.Index(["col1", "col2", "col3"], name="rapids")),
([range(100)], ["range" + str(i) for i in range(100)]),
(((1, 2, 3),), ["k1", "k2", "k3"]),
],
Expand Down Expand Up @@ -7969,6 +7970,7 @@ def test_series_empty(ps):
@pytest.mark.parametrize(
"data",
[
None,
[],
[1],
{"a": [10, 11, 12]},
Expand All @@ -7979,7 +7981,10 @@ def test_series_empty(ps):
},
],
)
@pytest.mark.parametrize("columns", [["a"], ["another column name"], None])
@pytest.mark.parametrize(
"columns",
[["a"], ["another column name"], None, pd.Index(["a"], name="index name")],
)
def test_dataframe_init_with_columns(data, columns):
pdf = pd.DataFrame(data, columns=columns)
gdf = cudf.DataFrame(data, columns=columns)
Expand Down Expand Up @@ -8047,7 +8052,16 @@ def test_dataframe_init_with_columns(data, columns):
],
)
@pytest.mark.parametrize(
"columns", [None, ["0"], [0], ["abc"], [144, 13], [2, 1, 0]]
"columns",
[
None,
["0"],
[0],
["abc"],
[144, 13],
[2, 1, 0],
pd.Index(["abc"], name="custom_name"),
],
)
def test_dataframe_init_from_series_list(data, ignore_dtype, columns):
gd_data = [cudf.from_pandas(obj) for obj in data]
Expand Down Expand Up @@ -10239,14 +10253,21 @@ def test_dataframe_binop_with_datetime_index():


@pytest.mark.parametrize(
"columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"])
"columns",
(
[],
["c", "a"],
["a", "d", "b", "e", "c"],
["a", "b", "c"],
pd.Index(["b", "a", "c"], name="custom_name"),
),
)
@pytest.mark.parametrize("index", (None, [4, 5, 6]))
def test_dataframe_dict_like_with_columns(columns, index):
data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
expect = pd.DataFrame(data, columns=columns, index=index)
actual = cudf.DataFrame(data, columns=columns, index=index)
if index is None and columns == []:
if index is None and len(columns) == 0:
# We make an empty range index, pandas makes an empty index
expect = expect.reset_index(drop=True)
assert_eq(expect, actual)
Expand Down

0 comments on commit b789d4c

Please sign in to comment.