Skip to content

Commit

Permalink
Disallow cudf.Series to accept column in favor of ._from_column (#1…
Browse files Browse the repository at this point in the history
…6454)

`cudf.Series` is a public constructor that happens to accept a private `ColumnBase` object. Many ops return columns, and it is natural to want to reconstruct a `Series` from them.

This PR adds a `SingleColumnFrame._from_column` classmethod for instances where we need to wrap a new column in an `Index` or `Series`. This constructor also bypasses some unneeded validation in `ColumnAccessor` and `Series`.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16454
  • Loading branch information
mroeschke authored Aug 7, 2024
1 parent e8156d4 commit 6b0bff4
Show file tree
Hide file tree
Showing 32 changed files with 360 additions and 268 deletions.
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/byte_pair_encoding.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -27,7 +27,7 @@ class BytePairEncoder:
def __init__(self, merges_pair: "cudf.Series"):
self.merge_pairs = cpp_merge_pairs(merges_pair._column)

def __call__(self, text, separator: str = " "):
def __call__(self, text, separator: str = " ") -> cudf.Series:
"""
Parameters
Expand Down Expand Up @@ -56,4 +56,4 @@ def __call__(self, text, separator: str = " "):
sep = cudf.Scalar(separator, dtype="str")
result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)

return cudf.Series(result)
return cudf.Series._from_column(result)
16 changes: 6 additions & 10 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def categories(self) -> "cudf.core.index.Index":
return self._column.dtype.categories

@property
def codes(self) -> "cudf.Series":
def codes(self) -> cudf.Series:
"""
Return Series of codes as well as the index.
"""
Expand All @@ -132,7 +132,7 @@ def codes(self) -> "cudf.Series":
if isinstance(self._parent, cudf.Series)
else None
)
return cudf.Series(self._column.codes, index=index)
return cudf.Series._from_column(self._column.codes, index=index)

@property
def ordered(self) -> bool:
Expand Down Expand Up @@ -918,7 +918,7 @@ def find_and_replace(
)
cur_categories = replaced.categories
new_categories = cur_categories.apply_boolean_mask(
~cudf.Series(cur_categories.isin(drop_values))
cur_categories.isin(drop_values).unary_operator("not")
)
replaced = replaced._set_categories(new_categories)
df = df.dropna(subset=["new"])
Expand All @@ -943,7 +943,7 @@ def find_and_replace(
# If a category is being replaced by an existing one, we
# want to map it to None. If it's totally new, we want to
# map it to the new label it is to be replaced by
dtype_replace = cudf.Series._from_data({None: replacement_col})
dtype_replace = cudf.Series._from_column(replacement_col)
dtype_replace[dtype_replace.isin(cats_col)] = None
new_cats_col = cats_col.find_and_replace(
to_replace_col, dtype_replace._column
Expand Down Expand Up @@ -1273,12 +1273,8 @@ def _categories_equal(
return False
# if order doesn't matter, sort before the equals call below
if not ordered:
cur_categories = cudf.Series(cur_categories).sort_values(
ignore_index=True
)
new_categories = cudf.Series(new_categories).sort_values(
ignore_index=True
)
cur_categories = cur_categories.sort_values()
new_categories = new_categories.sort_values()
return cur_categories.equals(new_categories)

def _set_categories(
Expand Down
15 changes: 7 additions & 8 deletions python/cudf/cudf/core/column/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from typing_extensions import Literal

import cudf
import cudf.core.column
import cudf.core.column_accessor
from cudf.utils.utils import NotIterable

ParentType = Union["cudf.Series", "cudf.core.index.Index"]
Expand Down Expand Up @@ -84,14 +86,11 @@ def _return_or_inplace(
data=table, index=self._parent.index
)
elif isinstance(self._parent, cudf.Series):
if retain_index:
return cudf.Series(
new_col,
name=self._parent.name,
index=self._parent.index,
)
else:
return cudf.Series(new_col, name=self._parent.name)
return cudf.Series._from_column(
new_col,
name=self._parent.name,
index=self._parent.index if retain_index else None,
)
elif isinstance(self._parent, cudf.BaseIndex):
return cudf.Index(new_col, name=self._parent.name)
else:
Expand Down
12 changes: 4 additions & 8 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,11 +555,8 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:

if self.dtype.kind == "f":
# Exclude 'np.inf', '-np.inf'
s = cudf.Series(self)
# TODO: replace np.inf with cudf scalar when
# https://github.com/rapidsai/cudf/pull/6297 merges
non_infs = s[~((s == np.inf) | (s == -np.inf))]
col = non_infs._column
not_inf = (self != np.inf) & (self != -np.inf)
col = self.apply_boolean_mask(not_inf)
else:
col = self

Expand Down Expand Up @@ -599,8 +596,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
else:
filled = self.fillna(0)
return (
cudf.Series(filled).astype(to_dtype).astype(filled.dtype)
== cudf.Series(filled)
filled.astype(to_dtype).astype(filled.dtype) == filled
).all()

# want to cast float to int:
Expand All @@ -615,7 +611,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
# NOTE(seberg): it would make sense to limit to the mantissa range.
if (float(self.min()) >= min_) and (float(self.max()) <= max_):
filled = self.fillna(0)
return (cudf.Series(filled) % 1 == 0).all()
return (filled % 1 == 0).all()
else:
return False

Expand Down
23 changes: 10 additions & 13 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def cat(self, others=None, sep=None, na_rep=None):
)

if len(data) == 1 and data.null_count == 1:
data = [""]
data = cudf.core.column.as_column("", length=len(data))
# We only want to keep the index if we are adding something to each
# row, not if we are joining all the rows into a single string.
out = self._return_or_inplace(data, retain_index=others is not None)
Expand Down Expand Up @@ -3623,7 +3623,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex:
data = libstrings.findall(self._column, pat, flags)
return self._return_or_inplace(data)

def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series":
def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series:
"""
Find all first occurrences of patterns in the Series/Index.
Expand Down Expand Up @@ -3679,12 +3679,12 @@ def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series":
f"got: {patterns_column.dtype}"
)

return cudf.Series(
return cudf.Series._from_column(
libstrings.find_multiple(self._column, patterns_column),
name=self._parent.name,
index=self._parent.index
if isinstance(self._parent, cudf.Series)
else self._parent,
name=self._parent.name,
)

def isempty(self) -> SeriesOrIndex:
Expand Down Expand Up @@ -4376,14 +4376,9 @@ def code_points(self) -> SeriesOrIndex:
2 99
dtype: int32
"""

new_col = libstrings.code_points(self._column)
if isinstance(self._parent, cudf.Series):
return cudf.Series(new_col, name=self._parent.name)
elif isinstance(self._parent, cudf.BaseIndex):
return cudf.Index(new_col, name=self._parent.name)
else:
return new_col
return self._return_or_inplace(
libstrings.code_points(self._column), retain_index=False
)

def translate(self, table: dict) -> SeriesOrIndex:
"""
Expand Down Expand Up @@ -4694,7 +4689,9 @@ def character_tokenize(self) -> SeriesOrIndex:
if isinstance(self._parent, cudf.Series):
lengths = self.len().fillna(0)
index = self._parent.index.repeat(lengths)
return cudf.Series(result_col, name=self._parent.name, index=index)
return cudf.Series._from_column(
result_col, name=self._parent.name, index=index
)
elif isinstance(self._parent, cudf.BaseIndex):
return cudf.Index(result_col, name=self._parent.name)
else:
Expand Down
Loading

0 comments on commit 6b0bff4

Please sign in to comment.