Skip to content

Commit

Permalink
Fix construction of DataFrames from dict when columns are provided
Browse files Browse the repository at this point in the history
If a columns argument is provided to the dataframe constructor, this
should be used to select columns from the provided data dictionary.
The previous logic selected the correct columns, but did not preserve
their order: the resulting columns should come out in the order given
by the column selection.

- Closes #13738
  • Loading branch information
wence- committed Jul 26, 2023
1 parent 7dcf052 commit 7bdb3d8
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 44 deletions.
49 changes: 22 additions & 27 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,31 +864,29 @@ def _init_from_dict_like(
self, data, index=None, columns=None, nan_as_null=None
):
if columns is not None:
# remove all entries in `data` that are
# not in `columns`
keys = [key for key in data.keys() if key in columns]
extra_cols = [col for col in columns if col not in keys]
if keys:
# if keys is non-empty,
# add null columns for all values
# in `columns` that don't exist in `keys`:
data = {key: data[key] for key in keys}
data.update({key: None for key in extra_cols})
# remove all entries in data that are not in columns,
# inserting new empty columns for entries in columns that
# are not in data
if any(c in data for c in columns):
# Let the downstream logic determine the length of the
# empty columns here
empty_column = lambda: None # noqa: E731
else:
# If keys is empty, none of the data keys match the columns, so
# we need to create an empty DataFrame. To match pandas, the
# size of the dataframe must match the provided index, so we
# need to return a masked array of nulls if an index is given.
row_count = 0 if index is None else len(index)
masked = index is not None
data = {
key: cudf.core.column.column_empty(
row_count=row_count,
dtype=None,
masked=masked,
)
for key in extra_cols
}
# If keys is empty, none of the data keys match the
# columns, so we need to create an empty DataFrame. To
# match pandas, the size of the dataframe must match
# the provided index, so we need to return a masked
# array of nulls if an index is given.
empty_column = functools.partial(
cudf.core.column.column_empty,
row_count=(0 if index is None else len(index)),
dtype=None,
masked=index is not None,
)

data = {
c: data[c] if c in data else empty_column() for c in columns
}

data, index = self._align_input_series_indices(data, index=index)

Expand Down Expand Up @@ -930,9 +928,6 @@ def _init_from_dict_like(
nan_as_null=nan_as_null,
)

if columns is not None:
self.columns = columns

@classmethod
def _from_data(
cls,
Expand Down
31 changes: 14 additions & 17 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ def test_series_from_cupy_scalars():
@pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]])
@pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]])
def test_append_index(a, b):

df = pd.DataFrame()
df["a"] = a
df["b"] = b
Expand Down Expand Up @@ -368,7 +367,6 @@ def test_dataframe_truncate_datetimeindex():


def test_series_init_none():

# test for creating empty series
# 1: without initializing
sr1 = cudf.Series()
Expand Down Expand Up @@ -1503,7 +1501,6 @@ def test_dataframe_concat_different_column_types():
"df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})]
)
def test_concat_empty_dataframe(df_1, df_2):

got = cudf.concat([df_1, df_2])
expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False)

Expand Down Expand Up @@ -2630,7 +2627,6 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index):

@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"])
def test_cuda_array_interface(dtype):

np_data = np.arange(10).astype(dtype)
cupy_data = cupy.array(np_data)
pd_data = pd.Series(np_data)
Expand Down Expand Up @@ -3807,7 +3803,6 @@ def test_diff(dtype, period, data_empty):
@pytest.mark.parametrize("df", _dataframe_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_dataframe_isnull_isna(df, nan_as_null):

gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)

assert_eq(df.isnull(), gdf.isnull())
Expand All @@ -3822,7 +3817,6 @@ def test_dataframe_isnull_isna(df, nan_as_null):
@pytest.mark.parametrize("df", _dataframe_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_dataframe_notna_notnull(df, nan_as_null):

gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)

assert_eq(df.notnull(), gdf.notnull())
Expand Down Expand Up @@ -5202,7 +5196,6 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected):
@pytest.mark.parametrize("op", ["max", "min"])
@pytest.mark.parametrize("skipna", [True, False])
def test_rowwise_ops_datetime_dtypes(data, op, skipna):

gdf = cudf.DataFrame(data)

pdf = gdf.to_pandas()
Expand Down Expand Up @@ -5266,7 +5259,6 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna):
],
)
def test_rowwise_ops_datetime_dtypes_2(data, op, skipna):

gdf = cudf.DataFrame(data)

pdf = gdf.to_pandas()
Expand Down Expand Up @@ -5514,13 +5506,11 @@ def test_memory_usage(deep, index, set_index):
gdf = cudf.from_pandas(df)

if index and set_index is None:

# Special Case: Assume RangeIndex size == 0
with expect_warning_if(deep, UserWarning):
assert gdf.index.memory_usage(deep=deep) == 0

else:

# Check for Series only
assert df["B"].memory_usage(index=index, deep=deep) == gdf[
"B"
Expand Down Expand Up @@ -6234,7 +6224,6 @@ def test_from_pandas_unsupported_types(data, expected_upcast_type, error):
@pytest.mark.parametrize("nan_as_null", [True, False])
@pytest.mark.parametrize("index", [None, "a", ["a", "b"]])
def test_from_pandas_nan_as_null(nan_as_null, index):

data = [np.nan, 2.0, 3.0]

if index is None:
Expand Down Expand Up @@ -6268,7 +6257,6 @@ def test_from_pandas_nan_as_null(nan_as_null, index):

@pytest.mark.parametrize("nan_as_null", [True, False])
def test_from_pandas_for_series_nan_as_null(nan_as_null):

data = [np.nan, 2.0, 3.0]
psr = pd.Series(data)

Expand Down Expand Up @@ -6413,7 +6401,6 @@ def test_dataframe_init_1d_list(data, columns):
],
)
def test_dataframe_init_from_arrays_cols(data, cols, index):

gd_data = data
if isinstance(data, cupy.ndarray):
# pandas can't handle cupy arrays in general
Expand Down Expand Up @@ -6549,7 +6536,6 @@ def test_dataframe_assign_scalar_with_scalar_cols(col_data, assign_val):


def test_dataframe_info_basic():

buffer = io.StringIO()
str_cmp = textwrap.dedent(
"""\
Expand Down Expand Up @@ -7081,7 +7067,6 @@ def test_dataframe_to_dict(orient, into):
],
)
def test_dataframe_from_dict(data, orient, dtype, columns):

expected = pd.DataFrame.from_dict(
data=data, orient=orient, dtype=dtype, columns=columns
)
Expand Down Expand Up @@ -7179,7 +7164,6 @@ def test_dataframe_from_dict_transposed(dtype):
def test_dataframe_from_dict_cp_np_arrays(
pd_data, gd_data, orient, dtype, columns
):

expected = pd.DataFrame.from_dict(
data=pd_data, orient=orient, dtype=dtype, columns=columns
)
Expand Down Expand Up @@ -10004,7 +9988,6 @@ def test_non_string_column_name_to_arrow(data):


def test_complex_types_from_arrow():

expected = pa.Table.from_arrays(
[
pa.array([1, 2, 3]),
Expand Down Expand Up @@ -10147,3 +10130,17 @@ def test_dataframe_init_length_error(data, index):
{"data": data, "index": index},
),
)


@pytest.mark.parametrize(
    "columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"])
)
@pytest.mark.parametrize("index", (None, [4, 5, 6]))
def test_dataframe_dict_like_with_columns(columns, index):
    # Constructing a DataFrame from a dict with an explicit ``columns``
    # selection must reorder existing keys and insert empty columns for
    # missing ones, matching pandas behavior.
    source = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
    actual = cudf.DataFrame(source, columns=columns, index=index)
    expect = pd.DataFrame(source, columns=columns, index=index)
    if columns == [] and index is None:
        # cudf produces an empty RangeIndex here, pandas an empty Index;
        # normalize pandas' result before comparing.
        expect = expect.reset_index(drop=True)
    assert_eq(expect, actual)

0 comments on commit 7bdb3d8

Please sign in to comment.