From 128f0c917bbc3342f9eca12ca2bf714c88206256 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Mon, 15 Jul 2024 20:34:14 +0200
Subject: [PATCH] API: Check for integer overflows when creating scalar form
 python int (#16140)

This aligns with NumPy, which deprecated this since a while and raises an error now on NumPy 2, for example for `Scalar(-1, dtype=np.uint8)`.

Since it aligns with NumPy, the DeprecationWarning of earlier NumPy versions is inherited for those.

This (or similar handling) is required to be compatible with NumPy 2/pandas, since the default needs to be to reject operation when values are out of bounds for e.g. `uint8_series + 1000`, the 1000 should not be silently cast to a `uint8`.

---

Split from gh-15897

xref: https://github.com/rapidsai/build-planning/issues/38

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16140
---
 python/cudf/cudf/tests/test_scalar.py | 17 +++++++++++++++++
 python/cudf/cudf/tests/test_unaops.py |  5 ++++-
 python/cudf/cudf/utils/dtypes.py      | 14 ++++++++------
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index 05a91a8fea3..195231e9960 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import pyarrow as pa
 import pytest
+from packaging import version
 
 import rmm
 
@@ -253,6 +254,22 @@ def test_generic_null_scalar_construction_fails(value):
         cudf.Scalar(value)
 
 
+@pytest.mark.parametrize(
+    "value, dtype", [(1000, "uint8"), (2**30, "int16"), (-1, "uint16")]
+)
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_scalar_out_of_bounds_pyint_fails(value, dtype):
+    # Test that we align with NumPy on scalar creation behavior from
+    # Python integers.
+    if version.parse(np.__version__) >= version.parse("2.0"):
+        with pytest.raises(OverflowError):
+            cudf.Scalar(value, dtype)
+    else:
+        # NumPy allowed this, but it gives a DeprecationWarning on newer
+        # versions (which cudf did not used to do).
+        assert cudf.Scalar(value, dtype).value == np.dtype(dtype).type(value)
+
+
 @pytest.mark.parametrize(
     "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"]
 )
diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py
index dbbf4fba3a6..5f5d79c1dce 100644
--- a/python/cudf/cudf/tests/test_unaops.py
+++ b/python/cudf/cudf/tests/test_unaops.py
@@ -81,7 +81,10 @@ def generate_valid_scalar_unaop_combos():
 @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos())
 def test_scalar_unary_operations(slr, dtype, op):
     slr_host = np.array([slr])[0].astype(cudf.dtype(dtype))
-    slr_device = cudf.Scalar(slr, dtype=dtype)
+    # The scalar may be out of bounds, so go via array force-cast
+    # NOTE: This is a change in behavior
+    slr = np.array(slr).astype(dtype)[()]
+    slr_device = cudf.Scalar(slr)
 
     expect = op(slr_host)
     got = op(slr_device)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 2aa3129ab30..0dec857ea96 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -253,16 +253,18 @@ def to_cudf_compatible_scalar(val, dtype=None):
     elif isinstance(val, datetime.timedelta):
         val = np.timedelta64(val)
 
-    val = _maybe_convert_to_default_type(
-        cudf.api.types.pandas_dtype(type(val))
-    ).type(val)
-
     if dtype is not None:
-        if isinstance(val, str) and np.dtype(dtype).kind == "M":
+        dtype = np.dtype(dtype)
+        if isinstance(val, str) and dtype.kind == "M":
             # pd.Timestamp can handle str, but not np.str_
             val = pd.Timestamp(str(val)).to_datetime64().astype(dtype)
         else:
-            val = val.astype(dtype)
+            # At least datetimes cannot be converted to scalar via dtype.type:
+            val = np.array(val, dtype)[()]
+    else:
+        val = _maybe_convert_to_default_type(
+            cudf.api.types.pandas_dtype(type(val))
+        ).type(val)
 
     if val.dtype.type is np.datetime64:
         time_unit, _ = np.datetime_data(val.dtype)