Skip to content

Commit

Permalink
Use as_column instead of full (#14698)
Browse files Browse the repository at this point in the history
Similar to #14689, this change ensures there is a single entry point for creating a column from a scalar.

This builds on #14620

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #14698
  • Loading branch information
mroeschke authored Mar 5, 2024
1 parent f12b8e1 commit 3571291
Show file tree
Hide file tree
Showing 18 changed files with 101 additions and 135 deletions.
1 change: 0 additions & 1 deletion python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
column_empty_like_same_mask,
concat_columns,
deserialize_columns,
full,
serialize_columns,
)
from cudf.core.column.datetime import DatetimeColumn # noqa: F401
Expand Down
12 changes: 5 additions & 7 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,8 +734,8 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn:
)
return other

ary = column.full(
len(self), self._encode(other), dtype=self.codes.dtype
ary = column.as_column(
self._encode(other), length=len(self), dtype=self.codes.dtype
)
return column.build_categorical_column(
categories=self.dtype.categories._values,
Expand Down Expand Up @@ -1444,11 +1444,9 @@ def _create_empty_categorical_column(
return column.build_categorical_column(
categories=column.as_column(dtype.categories),
codes=column.as_column(
column.full(
categorical_column.size,
_DEFAULT_CATEGORICAL_VALUE,
categorical_column.codes.dtype,
)
_DEFAULT_CATEGORICAL_VALUE,
length=categorical_column.size,
dtype=categorical_column.codes.dtype,
),
offset=categorical_column.offset,
size=categorical_column.size,
Expand Down
100 changes: 31 additions & 69 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@
infer_dtype,
is_bool_dtype,
is_datetime64_dtype,
is_decimal_dtype,
is_dtype_equal,
is_integer_dtype,
is_list_dtype,
Expand Down Expand Up @@ -866,7 +865,7 @@ def isin(self, values: Sequence) -> ColumnBase:
except ValueError:
# pandas functionally returns all False when cleansing via
# typecasting fails
return full(len(self), False, dtype="bool")
return as_column(False, length=len(self), dtype="bool")

return lhs._obtain_isin_result(rhs)

Expand All @@ -893,9 +892,9 @@ def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]:
if self.null_count and rhs.null_count:
return self.isnull()
else:
return cudf.core.column.full(len(self), False, dtype="bool")
return as_column(False, length=len(self), dtype="bool")
elif self.null_count == 0 and (rhs.null_count == len(rhs)):
return cudf.core.column.full(len(self), False, dtype="bool")
return as_column(False, length=len(self), dtype="bool")
else:
return None

Expand Down Expand Up @@ -1356,9 +1355,7 @@ def _label_encoding(
na_sentinel = cudf.Scalar(-1)

def _return_sentinel_column():
return cudf.core.column.full(
size=len(self), fill_value=na_sentinel, dtype=dtype
)
return as_column(na_sentinel, dtype=dtype, length=len(self))

if dtype is None:
dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
Expand Down Expand Up @@ -1455,7 +1452,9 @@ def column_empty(
elif isinstance(dtype, ListDtype):
data = None
children = (
full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype),
as_column(
0, length=row_count + 1, dtype=libcudf.types.size_type_dtype
),
column_empty(row_count, dtype=dtype.element_type),
)
elif isinstance(dtype, CategoricalDtype):
Expand All @@ -1474,7 +1473,9 @@ def column_empty(
elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype):
data = as_buffer(rmm.DeviceBuffer(size=0))
children = (
full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype),
as_column(
0, length=row_count + 1, dtype=libcudf.types.size_type_dtype
),
)
else:
data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize))
Expand Down Expand Up @@ -2017,33 +2018,32 @@ def as_column(
if dtype is not None:
data = data.astype(dtype)

elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)):
# This will always treat NaTs as nulls since it's not technically a
# discrete value like NaN
length = length or 1
data = as_column(
pa.array(pd.Series([arbitrary] * length), from_pandas=True)
)
if dtype is not None:
data = data.astype(dtype)

elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
length = length or 1
elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview):
if length is None:
length = 1
elif length < 0:
raise ValueError(f"{length=} must be >=0.")
if isinstance(arbitrary, pd.Interval):
# No cudf.Scalar support yet
return as_column(
pd.Series([arbitrary] * length),
nan_as_null=nan_as_null,
dtype=dtype,
length=length,
)
if (
(nan_as_null is True)
nan_as_null is True
and isinstance(arbitrary, (np.floating, float))
and np.isnan(arbitrary)
):
arbitrary = None
if dtype is None:
dtype = cudf.dtype("float64")

data = as_column(full(length, arbitrary, dtype=dtype))
if not nan_as_null and not is_decimal_dtype(data.dtype):
if np.issubdtype(data.dtype, np.floating):
data = data.fillna(np.nan)
elif np.issubdtype(data.dtype, np.datetime64):
data = data.fillna(np.datetime64("NaT"))
dtype = getattr(arbitrary, "dtype", cudf.dtype("float64"))
arbitrary = None
arbitrary = cudf.Scalar(arbitrary, dtype=dtype)
if length == 0:
return column_empty(length, dtype=arbitrary.dtype)
else:
return ColumnBase.from_scalar(arbitrary, length)

elif hasattr(arbitrary, "__array_interface__"):
# CUDF assumes values are always contiguous
Expand Down Expand Up @@ -2161,8 +2161,6 @@ def as_column(
return as_column(
np.asarray(view), dtype=dtype, nan_as_null=nan_as_null
)
elif isinstance(arbitrary, cudf.Scalar):
data = ColumnBase.from_scalar(arbitrary, length if length else 1)
else:
if dtype is not None:
# Arrow throws a type error if the input is of
Expand Down Expand Up @@ -2505,42 +2503,6 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]:
return columns


def full(
    size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None
) -> ColumnBase:
    """
    Build a column of ``size`` rows in which every row holds ``fill_value``.

    Parameters
    ----------
    size : int
        Number of rows in the resulting column.
    fill_value : scalar
        Scalar value repeated in every row of the new column.
    dtype : default None
        Data type specifier. When omitted, it is inferred from the other
        arguments.

    Returns
    -------
    Column

    Examples
    --------
    >>> import cudf
    >>> col = cudf.core.column.full(size=5, fill_value=7, dtype='int8')
    >>> cudf.Series(col)
    0    7
    1    7
    2    7
    3    7
    4    7
    dtype: int8
    """
    # Wrap the value (and optional dtype) in a cudf.Scalar first, then hand
    # that scalar and the requested row count to ColumnBase.from_scalar.
    scalar = cudf.Scalar(fill_value, dtype)
    return ColumnBase.from_scalar(scalar, size)


def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
"""Concatenate a sequence of columns."""
if len(objs) == 0:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ def as_string_column(
def __pow__(self, other):
if isinstance(other, int):
if other == 0:
res = cudf.core.column.full(
size=len(self), fill_value=1, dtype=self.dtype
res = cudf.core.column.as_column(
1, dtype=self.dtype, length=len(self)
)
if self.nullable:
res = res.set_mask(self.mask)
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
as_column,
build_column,
column,
full,
string,
)
from cudf.core.dtypes import CategoricalDtype
Expand Down Expand Up @@ -513,7 +512,7 @@ def find_and_replace(
)
if len(replacement_col) == 1 and len(to_replace_col) > 1:
replacement_col = column.as_column(
full(len(to_replace_col), replacement[0], self.dtype)
replacement[0], length=len(to_replace_col), dtype=self.dtype
)
elif len(replacement_col) == 1 and len(to_replace_col) == 0:
return self.copy()
Expand Down
12 changes: 8 additions & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5499,7 +5499,9 @@ def __init__(

if len(children) == 0 and size != 0:
# all nulls-column:
offsets = column.full(size + 1, 0, dtype=size_type_dtype)
offsets = column.as_column(
0, length=size + 1, dtype=size_type_dtype
)

children = (offsets,)

Expand Down Expand Up @@ -5930,8 +5932,8 @@ def _binaryop(
"__eq__",
"__ne__",
}:
return column.full(
len(self), op == "__ne__", dtype="bool"
return column.as_column(
op == "__ne__", length=len(self), dtype="bool"
).set_mask(self.mask)
else:
return NotImplemented
Expand All @@ -5940,7 +5942,9 @@ def _binaryop(
if isinstance(other, cudf.Scalar):
other = cast(
StringColumn,
column.full(len(self), other, dtype="object"),
column.as_column(
other, length=len(self), dtype="object"
),
)

# Explicit types are necessary because mypy infers ColumnBase
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ def components(self, index=None) -> "cudf.DataFrame":
break

for name in keys_list:
res_col = cudf.core.column.full(len(self), 0, dtype="int64")
res_col = column.as_column(0, length=len(self), dtype="int64")
if self.nullable:
res_col = res_col.set_mask(self.mask)
data[name] = res_col
Expand Down Expand Up @@ -599,7 +599,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn":
# of nanoseconds.

if self._time_unit != "ns":
res_col = cudf.core.column.full(len(self), 0, dtype="int64")
res_col = column.as_column(0, length=len(self), dtype="int64")
if self.nullable:
res_col = res_col.set_mask(self.mask)
return cast("cudf.core.column.NumericalColumn", res_col)
Expand Down
26 changes: 16 additions & 10 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1407,7 +1407,7 @@ def __setitem__(self, arg, value):
allow_non_unique=True,
)
if is_scalar(value):
self._data[arg] = column.full(len(self), value)
self._data[arg] = as_column(value, length=len(self))
else:
value = as_column(value)
self._data[arg] = value
Expand Down Expand Up @@ -1455,8 +1455,8 @@ def __setitem__(self, arg, value):
else:
for col in arg:
if is_scalar(value):
self._data[col] = column.full(
size=len(self), fill_value=value
self._data[col] = as_column(
value, length=len(self)
)
else:
self._data[col] = column.as_column(value)
Expand Down Expand Up @@ -3205,10 +3205,16 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
)

if _is_scalar_or_zero_d_array(value):
value = column.full(
len(self),
dtype = None
if isinstance(value, (np.ndarray, cupy.ndarray)):
dtype = value.dtype
value = value.item()
if libcudf.scalar._is_null_host_scalar(value):
dtype = "str"
value = as_column(
value,
"str" if libcudf.scalar._is_null_host_scalar(value) else None,
length=len(self),
dtype=dtype,
)

if len(self) == 0:
Expand Down Expand Up @@ -5912,7 +5918,7 @@ def isin(self, values):
fill_value = cudf.Scalar(False)

def make_false_column_like_self():
return column.full(len(self), fill_value, "bool")
return column.as_column(fill_value, length=len(self), dtype="bool")

# Preprocess different input types into a mapping from column names to
# a list of values to check.
Expand Down Expand Up @@ -6031,7 +6037,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
{
name: filtered._data[name]._get_mask_as_column()
if filtered._data[name].nullable
else column.full(len(filtered._data[name]), True)
else as_column(True, length=len(filtered._data[name]))
for name in filtered._data.names
}
)
Expand Down Expand Up @@ -7822,8 +7828,8 @@ def func(left, right, output):
return output

for name in uncommon_columns:
output._data[name] = column.full(
size=len(output), fill_value=value, dtype="bool"
output._data[name] = as_column(
value, length=len(output), dtype="bool"
)
return output

Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,9 +1231,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
)

needle = as_column(target)
result = cudf.core.column.full(
len(needle),
fill_value=-1,
result = as_column(
-1,
length=len(needle),
dtype=libcudf.types.size_type_dtype,
)

Expand Down
Loading

0 comments on commit 3571291

Please sign in to comment.