Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix construction of DataFrames from dict when columns are provided #13766

Merged
merged 2 commits into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 22 additions & 27 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,31 +868,29 @@ def _init_from_dict_like(
self, data, index=None, columns=None, nan_as_null=None
):
if columns is not None:
# remove all entries in `data` that are
# not in `columns`
keys = [key for key in data.keys() if key in columns]
extra_cols = [col for col in columns if col not in keys]
if keys:
# if keys is non-empty,
# add null columns for all values
# in `columns` that don't exist in `keys`:
data = {key: data[key] for key in keys}
data.update({key: None for key in extra_cols})
# remove all entries in data that are not in columns,
# inserting new empty columns for entries in columns that
# are not in data
if any(c in data for c in columns):
# Let the downstream logic determine the length of the
# empty columns here
empty_column = lambda: None # noqa: E731
else:
# If keys is empty, none of the data keys match the columns, so
# we need to create an empty DataFrame. To match pandas, the
# size of the dataframe must match the provided index, so we
# need to return a masked array of nulls if an index is given.
row_count = 0 if index is None else len(index)
masked = index is not None
data = {
key: cudf.core.column.column_empty(
row_count=row_count,
dtype=None,
masked=masked,
)
for key in extra_cols
}
# If keys is empty, none of the data keys match the
# columns, so we need to create an empty DataFrame. To
# match pandas, the size of the dataframe must match
# the provided index, so we need to return a masked
# array of nulls if an index is given.
empty_column = functools.partial(
cudf.core.column.column_empty,
row_count=(0 if index is None else len(index)),
dtype=None,
masked=index is not None,
)

data = {
c: data[c] if c in data else empty_column() for c in columns
}

data, index = self._align_input_series_indices(data, index=index)

Expand Down Expand Up @@ -934,9 +932,6 @@ def _init_from_dict_like(
nan_as_null=nan_as_null,
)

if columns is not None:
self.columns = columns

@classmethod
def _from_data(
cls,
Expand Down
31 changes: 14 additions & 17 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ def test_series_from_cupy_scalars():
@pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]])
@pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]])
def test_append_index(a, b):

df = pd.DataFrame()
df["a"] = a
df["b"] = b
Expand Down Expand Up @@ -368,7 +367,6 @@ def test_dataframe_truncate_datetimeindex():


def test_series_init_none():

# test for creating empty series
# 1: without initializing
sr1 = cudf.Series()
Expand Down Expand Up @@ -1503,7 +1501,6 @@ def test_dataframe_concat_different_column_types():
"df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})]
)
def test_concat_empty_dataframe(df_1, df_2):

got = cudf.concat([df_1, df_2])
expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False)

Expand Down Expand Up @@ -2644,7 +2641,6 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index):

@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"])
def test_cuda_array_interface(dtype):

np_data = np.arange(10).astype(dtype)
cupy_data = cupy.array(np_data)
pd_data = pd.Series(np_data)
Expand Down Expand Up @@ -3822,7 +3818,6 @@ def test_diff(dtype, period, data_empty):
@pytest.mark.parametrize("df", _dataframe_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_dataframe_isnull_isna(df, nan_as_null):

gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)

assert_eq(df.isnull(), gdf.isnull())
Expand All @@ -3837,7 +3832,6 @@ def test_dataframe_isnull_isna(df, nan_as_null):
@pytest.mark.parametrize("df", _dataframe_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_dataframe_notna_notnull(df, nan_as_null):

gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)

assert_eq(df.notnull(), gdf.notnull())
Expand Down Expand Up @@ -5217,7 +5211,6 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected):
@pytest.mark.parametrize("op", ["max", "min"])
@pytest.mark.parametrize("skipna", [True, False])
def test_rowwise_ops_datetime_dtypes(data, op, skipna):

gdf = cudf.DataFrame(data)

pdf = gdf.to_pandas()
Expand Down Expand Up @@ -5281,7 +5274,6 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna):
],
)
def test_rowwise_ops_datetime_dtypes_2(data, op, skipna):

gdf = cudf.DataFrame(data)

pdf = gdf.to_pandas()
Expand Down Expand Up @@ -5529,13 +5521,11 @@ def test_memory_usage(deep, index, set_index):
gdf = cudf.from_pandas(df)

if index and set_index is None:

# Special Case: Assume RangeIndex size == 0
with expect_warning_if(deep, UserWarning):
assert gdf.index.memory_usage(deep=deep) == 0

else:

# Check for Series only
assert df["B"].memory_usage(index=index, deep=deep) == gdf[
"B"
Expand Down Expand Up @@ -6249,7 +6239,6 @@ def test_from_pandas_unsupported_types(data, expected_upcast_type, error):
@pytest.mark.parametrize("nan_as_null", [True, False])
@pytest.mark.parametrize("index", [None, "a", ["a", "b"]])
def test_from_pandas_nan_as_null(nan_as_null, index):

data = [np.nan, 2.0, 3.0]

if index is None:
Expand Down Expand Up @@ -6283,7 +6272,6 @@ def test_from_pandas_nan_as_null(nan_as_null, index):

@pytest.mark.parametrize("nan_as_null", [True, False])
def test_from_pandas_for_series_nan_as_null(nan_as_null):

data = [np.nan, 2.0, 3.0]
psr = pd.Series(data)

Expand Down Expand Up @@ -6428,7 +6416,6 @@ def test_dataframe_init_1d_list(data, columns):
],
)
def test_dataframe_init_from_arrays_cols(data, cols, index):

gd_data = data
if isinstance(data, cupy.ndarray):
# pandas can't handle cupy arrays in general
Expand Down Expand Up @@ -6564,7 +6551,6 @@ def test_dataframe_assign_scalar_with_scalar_cols(col_data, assign_val):


def test_dataframe_info_basic():

buffer = io.StringIO()
str_cmp = textwrap.dedent(
"""\
Expand Down Expand Up @@ -7096,7 +7082,6 @@ def test_dataframe_to_dict(orient, into):
],
)
def test_dataframe_from_dict(data, orient, dtype, columns):

expected = pd.DataFrame.from_dict(
data=data, orient=orient, dtype=dtype, columns=columns
)
Expand Down Expand Up @@ -7194,7 +7179,6 @@ def test_dataframe_from_dict_transposed(dtype):
def test_dataframe_from_dict_cp_np_arrays(
pd_data, gd_data, orient, dtype, columns
):

expected = pd.DataFrame.from_dict(
data=pd_data, orient=orient, dtype=dtype, columns=columns
)
Expand Down Expand Up @@ -10019,7 +10003,6 @@ def test_non_string_column_name_to_arrow(data):


def test_complex_types_from_arrow():

expected = pa.Table.from_arrays(
[
pa.array([1, 2, 3]),
Expand Down Expand Up @@ -10164,6 +10147,20 @@ def test_dataframe_init_length_error(data, index):
)


@pytest.mark.parametrize(
    "columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"])
)
@pytest.mark.parametrize("index", (None, [4, 5, 6]))
def test_dataframe_dict_like_with_columns(columns, index):
    """Dict-based construction with an explicit ``columns`` argument must
    match pandas: keys absent from ``columns`` are dropped and entries of
    ``columns`` absent from the data become empty columns."""
    source = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
    expect = pd.DataFrame(source, columns=columns, index=index)
    actual = cudf.DataFrame(source, columns=columns, index=index)
    if columns == [] and index is None:
        # We make an empty range index, pandas makes an empty index
        expect = expect.reset_index(drop=True)
    assert_eq(expect, actual)


def test_dataframe_init_columns_named_multiindex():
np.random.seed(0)
data = np.random.randn(2, 2)
Expand Down