diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0fe89490905..093fb7f47a2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -864,31 +864,29 @@ def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): if columns is not None: - # remove all entries in `data` that are - # not in `columns` - keys = [key for key in data.keys() if key in columns] - extra_cols = [col for col in columns if col not in keys] - if keys: - # if keys is non-empty, - # add null columns for all values - # in `columns` that don't exist in `keys`: - data = {key: data[key] for key in keys} - data.update({key: None for key in extra_cols}) + # remove all entries in data that are not in columns, + # inserting new empty columns for entries in columns that + # are not in data + if any(c in data for c in columns): + # Let the downstream logic determine the length of the + # empty columns here + empty_column = lambda: None # noqa: E731 else: - # If keys is empty, none of the data keys match the columns, so - # we need to create an empty DataFrame. To match pandas, the - # size of the dataframe must match the provided index, so we - # need to return a masked array of nulls if an index is given. - row_count = 0 if index is None else len(index) - masked = index is not None - data = { - key: cudf.core.column.column_empty( - row_count=row_count, - dtype=None, - masked=masked, - ) - for key in extra_cols - } + # No entry of columns is a key of data in this + # branch, so we need to create an empty DataFrame. To + # match pandas, the size of the dataframe must match + # the provided index, so we need to return a masked + # array of nulls if an index is given. 
+ empty_column = functools.partial( + cudf.core.column.column_empty, + row_count=(0 if index is None else len(index)), + dtype=None, + masked=index is not None, + ) + + data = { + c: data[c] if c in data else empty_column() for c in columns + } data, index = self._align_input_series_indices(data, index=index) @@ -930,9 +928,6 @@ def _init_from_dict_like( nan_as_null=nan_as_null, ) - if columns is not None: - self.columns = columns - @classmethod def _from_data( cls, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 25a17697538..c8f0fa96efc 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -240,7 +240,6 @@ def test_series_from_cupy_scalars(): @pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]]) @pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]]) def test_append_index(a, b): - df = pd.DataFrame() df["a"] = a df["b"] = b @@ -368,7 +367,6 @@ def test_dataframe_truncate_datetimeindex(): def test_series_init_none(): - # test for creating empty series # 1: without initializing sr1 = cudf.Series() @@ -1503,7 +1501,6 @@ def test_dataframe_concat_different_column_types(): "df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})] ) def test_concat_empty_dataframe(df_1, df_2): - got = cudf.concat([df_1, df_2]) expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) @@ -2630,7 +2627,6 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) def test_cuda_array_interface(dtype): - np_data = np.arange(10).astype(dtype) cupy_data = cupy.array(np_data) pd_data = pd.Series(np_data) @@ -3807,7 +3803,6 @@ def test_diff(dtype, period, data_empty): @pytest.mark.parametrize("df", _dataframe_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_isnull_isna(df, nan_as_null): - gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) 
assert_eq(df.isnull(), gdf.isnull()) @@ -3822,7 +3817,6 @@ def test_dataframe_isnull_isna(df, nan_as_null): @pytest.mark.parametrize("df", _dataframe_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_notna_notnull(df, nan_as_null): - gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) assert_eq(df.notnull(), gdf.notnull()) @@ -5202,7 +5196,6 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): @pytest.mark.parametrize("op", ["max", "min"]) @pytest.mark.parametrize("skipna", [True, False]) def test_rowwise_ops_datetime_dtypes(data, op, skipna): - gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -5266,7 +5259,6 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna): ], ) def test_rowwise_ops_datetime_dtypes_2(data, op, skipna): - gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -5514,13 +5506,11 @@ def test_memory_usage(deep, index, set_index): gdf = cudf.from_pandas(df) if index and set_index is None: - # Special Case: Assume RangeIndex size == 0 with expect_warning_if(deep, UserWarning): assert gdf.index.memory_usage(deep=deep) == 0 else: - # Check for Series only assert df["B"].memory_usage(index=index, deep=deep) == gdf[ "B" @@ -6234,7 +6224,6 @@ def test_from_pandas_unsupported_types(data, expected_upcast_type, error): @pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize("index", [None, "a", ["a", "b"]]) def test_from_pandas_nan_as_null(nan_as_null, index): - data = [np.nan, 2.0, 3.0] if index is None: @@ -6268,7 +6257,6 @@ def test_from_pandas_nan_as_null(nan_as_null, index): @pytest.mark.parametrize("nan_as_null", [True, False]) def test_from_pandas_for_series_nan_as_null(nan_as_null): - data = [np.nan, 2.0, 3.0] psr = pd.Series(data) @@ -6413,7 +6401,6 @@ def test_dataframe_init_1d_list(data, columns): ], ) def test_dataframe_init_from_arrays_cols(data, cols, index): - gd_data = data if isinstance(data, cupy.ndarray): # pandas can't handle cupy arrays in general @@ 
-6549,7 +6536,6 @@ def test_dataframe_assign_scalar_with_scalar_cols(col_data, assign_val): def test_dataframe_info_basic(): - buffer = io.StringIO() str_cmp = textwrap.dedent( """\ @@ -7081,7 +7067,6 @@ def test_dataframe_to_dict(orient, into): ], ) def test_dataframe_from_dict(data, orient, dtype, columns): - expected = pd.DataFrame.from_dict( data=data, orient=orient, dtype=dtype, columns=columns ) @@ -7179,7 +7164,6 @@ def test_dataframe_from_dict_transposed(dtype): def test_dataframe_from_dict_cp_np_arrays( pd_data, gd_data, orient, dtype, columns ): - expected = pd.DataFrame.from_dict( data=pd_data, orient=orient, dtype=dtype, columns=columns ) @@ -10004,7 +9988,6 @@ def test_non_string_column_name_to_arrow(data): def test_complex_types_from_arrow(): - expected = pa.Table.from_arrays( [ pa.array([1, 2, 3]), @@ -10147,3 +10130,17 @@ def test_dataframe_init_length_error(data, index): {"data": data, "index": index}, ), ) + + +@pytest.mark.parametrize( + "columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"]) +) +@pytest.mark.parametrize("index", (None, [4, 5, 6])) +def test_dataframe_dict_like_with_columns(columns, index): + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + expect = pd.DataFrame(data, columns=columns, index=index) + actual = cudf.DataFrame(data, columns=columns, index=index) + if index is None and columns == []: + # We make an empty range index, pandas makes an empty index + expect = expect.reset_index(drop=True) + assert_eq(expect, actual)