From 7bdb3d81e4533991d3e474ba4fb2e798cc0b9775 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 26 Jul 2023 12:18:17 +0100 Subject: [PATCH] Fix construction of DataFrames from dict when columns are provided If a columns argument is provided to the dataframe constructor, this should be used to select columns from the provided data dictionary. The previous logic did do this correctly, but didn't preserve the appropriate order of the resulting columns (which should come out in the order that the column selection is in). - Closes #13738 --- python/cudf/cudf/core/dataframe.py | 49 +++++++++++------------- python/cudf/cudf/tests/test_dataframe.py | 31 +++++++-------- 2 files changed, 36 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0fe89490905..093fb7f47a2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -864,31 +864,29 @@ def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): if columns is not None: - # remove all entries in `data` that are - # not in `columns` - keys = [key for key in data.keys() if key in columns] - extra_cols = [col for col in columns if col not in keys] - if keys: - # if keys is non-empty, - # add null columns for all values - # in `columns` that don't exist in `keys`: - data = {key: data[key] for key in keys} - data.update({key: None for key in extra_cols}) + # remove all entries in data that are not in columns, + # inserting new empty columns for entries in columns that + # are not in data + if any(c in data for c in columns): + # Let the downstream logic determine the length of the + # empty columns here + empty_column = lambda: None # noqa: E731 else: - # If keys is empty, none of the data keys match the columns, so - # we need to create an empty DataFrame. To match pandas, the - # size of the dataframe must match the provided index, so we - # need to return a masked array of nulls if an index is given. - row_count = 0 if index is None else len(index) - masked = index is not None - data = { - key: cudf.core.column.column_empty( - row_count=row_count, - dtype=None, - masked=masked, - ) - for key in extra_cols - } + # If keys is empty, none of the data keys match the + # columns, so we need to create an empty DataFrame. To + # match pandas, the size of the dataframe must match + # the provided index, so we need to return a masked + # array of nulls if an index is given. + empty_column = functools.partial( + cudf.core.column.column_empty, + row_count=(0 if index is None else len(index)), + dtype=None, + masked=index is not None, + ) + + data = { + c: data[c] if c in data else empty_column() for c in columns + } data, index = self._align_input_series_indices(data, index=index) @@ -930,9 +928,6 @@ def _init_from_dict_like( nan_as_null=nan_as_null, ) - if columns is not None: - self.columns = columns - @classmethod def _from_data( cls, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 25a17697538..c8f0fa96efc 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -240,7 +240,6 @@ def test_series_from_cupy_scalars(): @pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]]) @pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]]) def test_append_index(a, b): - df = pd.DataFrame() df["a"] = a df["b"] = b @@ -368,7 +367,6 @@ def test_dataframe_truncate_datetimeindex(): def test_series_init_none(): - # test for creating empty series # 1: without initializing sr1 = cudf.Series() @@ -1503,7 +1501,6 @@ def test_dataframe_concat_different_column_types(): "df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})] ) def test_concat_empty_dataframe(df_1, df_2): - got = cudf.concat([df_1, df_2]) expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) @@ -2630,7 +2627,6 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) def test_cuda_array_interface(dtype): - np_data = np.arange(10).astype(dtype) cupy_data = cupy.array(np_data) pd_data = pd.Series(np_data) @@ -3807,7 +3803,6 @@ def test_diff(dtype, period, data_empty): @pytest.mark.parametrize("df", _dataframe_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_isnull_isna(df, nan_as_null): - gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) assert_eq(df.isnull(), gdf.isnull()) @@ -3822,7 +3817,6 @@ def test_dataframe_isnull_isna(df, nan_as_null): @pytest.mark.parametrize("df", _dataframe_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_notna_notnull(df, nan_as_null): - gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) assert_eq(df.notnull(), gdf.notnull()) @@ -5202,7 +5196,6 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): @pytest.mark.parametrize("op", ["max", "min"]) @pytest.mark.parametrize("skipna", [True, False]) def test_rowwise_ops_datetime_dtypes(data, op, skipna): - gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -5266,7 +5259,6 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna): ], ) def test_rowwise_ops_datetime_dtypes_2(data, op, skipna): - gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -5514,13 +5506,11 @@ def test_memory_usage(deep, index, set_index): gdf = cudf.from_pandas(df) if index and set_index is None: - # Special Case: Assume RangeIndex size == 0 with expect_warning_if(deep, UserWarning): assert gdf.index.memory_usage(deep=deep) == 0 else: - # Check for Series only assert df["B"].memory_usage(index=index, deep=deep) == gdf[ "B" @@ -6234,7 +6224,6 @@ def test_from_pandas_unsupported_types(data, expected_upcast_type, error): @pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize("index", [None, "a", ["a", "b"]]) def test_from_pandas_nan_as_null(nan_as_null, index): - data = [np.nan, 2.0, 3.0] if index is None: @@ -6268,7 +6257,6 @@ def test_from_pandas_nan_as_null(nan_as_null, index): @pytest.mark.parametrize("nan_as_null", [True, False]) def test_from_pandas_for_series_nan_as_null(nan_as_null): - data = [np.nan, 2.0, 3.0] psr = pd.Series(data) @@ -6413,7 +6401,6 @@ def test_dataframe_init_1d_list(data, columns): ], ) def test_dataframe_init_from_arrays_cols(data, cols, index): - gd_data = data if isinstance(data, cupy.ndarray): # pandas can't handle cupy arrays in general @@ -6549,7 +6536,6 @@ def test_dataframe_assign_scalar_with_scalar_cols(col_data, assign_val): def test_dataframe_info_basic(): - buffer = io.StringIO() str_cmp = textwrap.dedent( """\ @@ -7081,7 +7067,6 @@ def test_dataframe_to_dict(orient, into): ], ) def test_dataframe_from_dict(data, orient, dtype, columns): - expected = pd.DataFrame.from_dict( data=data, orient=orient, dtype=dtype, columns=columns ) @@ -7179,7 +7164,6 @@ def test_dataframe_from_dict_transposed(dtype): def test_dataframe_from_dict_cp_np_arrays( pd_data, gd_data, orient, dtype, columns ): - expected = pd.DataFrame.from_dict( data=pd_data, orient=orient, dtype=dtype, columns=columns ) @@ -10004,7 +9988,6 @@ def test_non_string_column_name_to_arrow(data): def test_complex_types_from_arrow(): - expected = pa.Table.from_arrays( [ pa.array([1, 2, 3]), @@ -10147,3 +10130,17 @@ def test_dataframe_init_length_error(data, index): {"data": data, "index": index}, ), ) + + +@pytest.mark.parametrize( + "columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"]) +) +@pytest.mark.parametrize("index", (None, [4, 5, 6])) +def test_dataframe_dict_like_with_columns(columns, index): + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + expect = pd.DataFrame(data, columns=columns, index=index) + actual = cudf.DataFrame(data, columns=columns, index=index) + if index is None and columns == []: + # We make an empty range index, pandas makes an empty index + expect = expect.reset_index(drop=True) + assert_eq(expect, actual)