Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Use as_column instead of full" #15235

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
column_empty_like_same_mask,
concat_columns,
deserialize_columns,
full,
serialize_columns,
)
from cudf.core.column.datetime import DatetimeColumn # noqa: F401
Expand Down
12 changes: 7 additions & 5 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,8 +734,8 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn:
)
return other

ary = column.as_column(
self._encode(other), length=len(self), dtype=self.codes.dtype
ary = column.full(
len(self), self._encode(other), dtype=self.codes.dtype
)
return column.build_categorical_column(
categories=self.dtype.categories._values,
Expand Down Expand Up @@ -1444,9 +1444,11 @@ def _create_empty_categorical_column(
return column.build_categorical_column(
categories=column.as_column(dtype.categories),
codes=column.as_column(
_DEFAULT_CATEGORICAL_VALUE,
length=categorical_column.size,
dtype=categorical_column.codes.dtype,
column.full(
categorical_column.size,
_DEFAULT_CATEGORICAL_VALUE,
categorical_column.codes.dtype,
)
),
offset=categorical_column.offset,
size=categorical_column.size,
Expand Down
100 changes: 69 additions & 31 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
infer_dtype,
is_bool_dtype,
is_datetime64_dtype,
is_decimal_dtype,
is_dtype_equal,
is_integer_dtype,
is_list_dtype,
Expand Down Expand Up @@ -865,7 +866,7 @@ def isin(self, values: Sequence) -> ColumnBase:
except ValueError:
# pandas functionally returns all False when cleansing via
# typecasting fails
return as_column(False, length=len(self), dtype="bool")
return full(len(self), False, dtype="bool")

return lhs._obtain_isin_result(rhs)

Expand All @@ -892,9 +893,9 @@ def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]:
if self.null_count and rhs.null_count:
return self.isnull()
else:
return as_column(False, length=len(self), dtype="bool")
return cudf.core.column.full(len(self), False, dtype="bool")
elif self.null_count == 0 and (rhs.null_count == len(rhs)):
return as_column(False, length=len(self), dtype="bool")
return cudf.core.column.full(len(self), False, dtype="bool")
else:
return None

Expand Down Expand Up @@ -1355,7 +1356,9 @@ def _label_encoding(
na_sentinel = cudf.Scalar(-1)

def _return_sentinel_column():
return as_column(na_sentinel, dtype=dtype, length=len(self))
return cudf.core.column.full(
size=len(self), fill_value=na_sentinel, dtype=dtype
)

if dtype is None:
dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
Expand Down Expand Up @@ -1452,9 +1455,7 @@ def column_empty(
elif isinstance(dtype, ListDtype):
data = None
children = (
as_column(
0, length=row_count + 1, dtype=libcudf.types.size_type_dtype
),
full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype),
column_empty(row_count, dtype=dtype.element_type),
)
elif isinstance(dtype, CategoricalDtype):
Expand All @@ -1473,9 +1474,7 @@ def column_empty(
elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype):
data = as_buffer(rmm.DeviceBuffer(size=0))
children = (
as_column(
0, length=row_count + 1, dtype=libcudf.types.size_type_dtype
),
full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype),
)
else:
data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize))
Expand Down Expand Up @@ -2018,32 +2017,33 @@ def as_column(
if dtype is not None:
data = data.astype(dtype)

elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview):
if length is None:
length = 1
elif length < 0:
raise ValueError(f"{length=} must be >=0.")
if isinstance(arbitrary, pd.Interval):
# No cudf.Scalar support yet
return as_column(
pd.Series([arbitrary] * length),
nan_as_null=nan_as_null,
dtype=dtype,
length=length,
)
elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)):
# This will always treat NaTs as nulls since it's not technically a
# discrete value like NaN
length = length or 1
data = as_column(
pa.array(pd.Series([arbitrary] * length), from_pandas=True)
)
if dtype is not None:
data = data.astype(dtype)

elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
length = length or 1
if (
nan_as_null is True
(nan_as_null is True)
and isinstance(arbitrary, (np.floating, float))
and np.isnan(arbitrary)
):
if dtype is None:
dtype = getattr(arbitrary, "dtype", cudf.dtype("float64"))
arbitrary = None
arbitrary = cudf.Scalar(arbitrary, dtype=dtype)
if length == 0:
return column_empty(length, dtype=arbitrary.dtype)
else:
return ColumnBase.from_scalar(arbitrary, length)
if dtype is None:
dtype = cudf.dtype("float64")

data = as_column(full(length, arbitrary, dtype=dtype))
if not nan_as_null and not is_decimal_dtype(data.dtype):
if np.issubdtype(data.dtype, np.floating):
data = data.fillna(np.nan)
elif np.issubdtype(data.dtype, np.datetime64):
data = data.fillna(np.datetime64("NaT"))

elif hasattr(arbitrary, "__array_interface__"):
# CUDF assumes values are always contiguous
Expand Down Expand Up @@ -2161,6 +2161,8 @@ def as_column(
return as_column(
np.asarray(view), dtype=dtype, nan_as_null=nan_as_null
)
elif isinstance(arbitrary, cudf.Scalar):
data = ColumnBase.from_scalar(arbitrary, length if length else 1)
else:
if dtype is not None:
# Arrow throws a type error if the input is of
Expand Down Expand Up @@ -2503,6 +2505,42 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]:
return columns


def full(
    size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None
) -> ColumnBase:
    """
    Build a column of ``size`` rows in which every row holds ``fill_value``.

    Parameters
    ----------
    size : int
        Number of rows in the resulting column.
    fill_value : scalar
        The scalar value repeated in every row of the new column.
    dtype : default None
        Data type specifier. When omitted, it is inferred from the
        other arguments.

    Returns
    -------
    Column

    Examples
    --------
    >>> import cudf
    >>> col = cudf.core.column.full(size=5, fill_value=7, dtype='int8')
    >>> col
    <cudf.core.column.numerical.NumericalColumn object at 0x7fa0912e8b90>
    >>> cudf.Series(col)
    0    7
    1    7
    2    7
    3    7
    4    7
    dtype: int8
    """
    # Wrap the fill value in a cudf.Scalar (carrying the requested dtype),
    # then let ColumnBase turn that scalar into a column of ``size`` rows.
    fill_scalar = cudf.Scalar(fill_value, dtype)
    return ColumnBase.from_scalar(fill_scalar, size)


def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
"""Concatenate a sequence of columns."""
if len(objs) == 0:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ def as_string_column(
def __pow__(self, other):
if isinstance(other, int):
if other == 0:
res = cudf.core.column.as_column(
1, dtype=self.dtype, length=len(self)
res = cudf.core.column.full(
size=len(self), fill_value=1, dtype=self.dtype
)
if self.nullable:
res = res.set_mask(self.mask)
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
as_column,
build_column,
column,
full,
string,
)
from cudf.core.dtypes import CategoricalDtype
Expand Down Expand Up @@ -512,7 +513,7 @@ def find_and_replace(
)
if len(replacement_col) == 1 and len(to_replace_col) > 1:
replacement_col = column.as_column(
replacement[0], length=len(to_replace_col), dtype=self.dtype
full(len(to_replace_col), replacement[0], self.dtype)
)
elif len(replacement_col) == 1 and len(to_replace_col) == 0:
return self.copy()
Expand Down
12 changes: 4 additions & 8 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5499,9 +5499,7 @@ def __init__(

if len(children) == 0 and size != 0:
# all nulls-column:
offsets = column.as_column(
0, length=size + 1, dtype=size_type_dtype
)
offsets = column.full(size + 1, 0, dtype=size_type_dtype)

children = (offsets,)

Expand Down Expand Up @@ -5932,8 +5930,8 @@ def _binaryop(
"__eq__",
"__ne__",
}:
return column.as_column(
op == "__ne__", length=len(self), dtype="bool"
return column.full(
len(self), op == "__ne__", dtype="bool"
).set_mask(self.mask)
else:
return NotImplemented
Expand All @@ -5942,9 +5940,7 @@ def _binaryop(
if isinstance(other, cudf.Scalar):
other = cast(
StringColumn,
column.as_column(
other, length=len(self), dtype="object"
),
column.full(len(self), other, dtype="object"),
)

# Explicit types are necessary because mypy infers ColumnBase
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ def components(self, index=None) -> "cudf.DataFrame":
break

for name in keys_list:
res_col = column.as_column(0, length=len(self), dtype="int64")
res_col = cudf.core.column.full(len(self), 0, dtype="int64")
if self.nullable:
res_col = res_col.set_mask(self.mask)
data[name] = res_col
Expand Down Expand Up @@ -599,7 +599,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn":
# of nanoseconds.

if self._time_unit != "ns":
res_col = column.as_column(0, length=len(self), dtype="int64")
res_col = cudf.core.column.full(len(self), 0, dtype="int64")
if self.nullable:
res_col = res_col.set_mask(self.mask)
return cast("cudf.core.column.NumericalColumn", res_col)
Expand Down
26 changes: 10 additions & 16 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1407,7 +1407,7 @@ def __setitem__(self, arg, value):
allow_non_unique=True,
)
if is_scalar(value):
self._data[arg] = as_column(value, length=len(self))
self._data[arg] = column.full(len(self), value)
else:
value = as_column(value)
self._data[arg] = value
Expand Down Expand Up @@ -1455,8 +1455,8 @@ def __setitem__(self, arg, value):
else:
for col in arg:
if is_scalar(value):
self._data[col] = as_column(
value, length=len(self)
self._data[col] = column.full(
size=len(self), fill_value=value
)
else:
self._data[col] = column.as_column(value)
Expand Down Expand Up @@ -3205,16 +3205,10 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
)

if _is_scalar_or_zero_d_array(value):
dtype = None
if isinstance(value, (np.ndarray, cupy.ndarray)):
dtype = value.dtype
value = value.item()
if libcudf.scalar._is_null_host_scalar(value):
dtype = "str"
value = as_column(
value = column.full(
len(self),
value,
length=len(self),
dtype=dtype,
"str" if libcudf.scalar._is_null_host_scalar(value) else None,
)

if len(self) == 0:
Expand Down Expand Up @@ -5918,7 +5912,7 @@ def isin(self, values):
fill_value = cudf.Scalar(False)

def make_false_column_like_self():
return column.as_column(fill_value, length=len(self), dtype="bool")
return column.full(len(self), fill_value, "bool")

# Preprocess different input types into a mapping from column names to
# a list of values to check.
Expand Down Expand Up @@ -6037,7 +6031,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
{
name: filtered._data[name]._get_mask_as_column()
if filtered._data[name].nullable
else as_column(True, length=len(filtered._data[name]))
else column.full(len(filtered._data[name]), True)
for name in filtered._data.names
}
)
Expand Down Expand Up @@ -7828,8 +7822,8 @@ def func(left, right, output):
return output

for name in uncommon_columns:
output._data[name] = as_column(
value, length=len(output), dtype="bool"
output._data[name] = column.full(
size=len(output), fill_value=value, dtype="bool"
)
return output

Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,9 +1231,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
)

needle = as_column(target)
result = as_column(
-1,
length=len(needle),
result = cudf.core.column.full(
len(needle),
fill_value=-1,
dtype=libcudf.types.size_type_dtype,
)

Expand Down
Loading
Loading