Skip to content

Commit

Permalink
API: Check for integer overflows when creating scalar from Python int (
Browse files Browse the repository at this point in the history
…#16140)

This aligns with NumPy, which deprecated this behavior a while ago and now raises an error on NumPy 2, for example for `Scalar(-1, dtype=np.uint8)`.

Since it aligns with NumPy, the DeprecationWarning of earlier NumPy versions is inherited for those.

This (or similar handling) is required to be compatible with NumPy 2/pandas, since the default needs to be to reject the operation when values are out of bounds. For example, in `uint8_series + 1000`, the 1000 should not be silently cast to a `uint8`.

---

Split from gh-15897

xref: rapidsai/build-planning#38

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #16140
  • Loading branch information
seberg authored Jul 15, 2024
1 parent 1889c7c commit 128f0c9
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 7 deletions.
17 changes: 17 additions & 0 deletions python/cudf/cudf/tests/test_scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas as pd
import pyarrow as pa
import pytest
from packaging import version

import rmm

Expand Down Expand Up @@ -253,6 +254,22 @@ def test_generic_null_scalar_construction_fails(value):
cudf.Scalar(value)


@pytest.mark.parametrize(
    "value, dtype", [(1000, "uint8"), (2**30, "int16"), (-1, "uint16")]
)
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_scalar_out_of_bounds_pyint_fails(value, dtype):
    """Check that cudf matches NumPy when a Python int is out of bounds
    for the requested dtype."""
    numpy_is_v2 = version.parse(np.__version__) >= version.parse("2.0")
    if not numpy_is_v2:
        # Pre-2.0 NumPy accepted out-of-bounds Python ints (emitting a
        # DeprecationWarning on later 1.x releases), so cudf mirrors the
        # legacy wrap-around result there.
        expected = np.dtype(dtype).type(value)
        assert cudf.Scalar(value, dtype).value == expected
        return
    # On NumPy >= 2.0 the conversion raises, and cudf must do the same.
    with pytest.raises(OverflowError):
        cudf.Scalar(value, dtype)


@pytest.mark.parametrize(
"dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"]
)
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/tests/test_unaops.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ def generate_valid_scalar_unaop_combos():
@pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos())
def test_scalar_unary_operations(slr, dtype, op):
slr_host = np.array([slr])[0].astype(cudf.dtype(dtype))
slr_device = cudf.Scalar(slr, dtype=dtype)
# The scalar may be out of bounds, so go via array force-cast
# NOTE: This is a change in behavior
slr = np.array(slr).astype(dtype)[()]
slr_device = cudf.Scalar(slr)

expect = op(slr_host)
got = op(slr_device)
Expand Down
14 changes: 8 additions & 6 deletions python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,16 +253,18 @@ def to_cudf_compatible_scalar(val, dtype=None):
elif isinstance(val, datetime.timedelta):
val = np.timedelta64(val)

val = _maybe_convert_to_default_type(
cudf.api.types.pandas_dtype(type(val))
).type(val)

if dtype is not None:
if isinstance(val, str) and np.dtype(dtype).kind == "M":
dtype = np.dtype(dtype)
if isinstance(val, str) and dtype.kind == "M":
# pd.Timestamp can handle str, but not np.str_
val = pd.Timestamp(str(val)).to_datetime64().astype(dtype)
else:
val = val.astype(dtype)
# At least datetimes cannot be converted to scalar via dtype.type:
val = np.array(val, dtype)[()]
else:
val = _maybe_convert_to_default_type(
cudf.api.types.pandas_dtype(type(val))
).type(val)

if val.dtype.type is np.datetime64:
time_unit, _ = np.datetime_data(val.dtype)
Expand Down

0 comments on commit 128f0c9

Please sign in to comment.