Cleanup some timedelta/datetime column logic (rapidsai#14715)

Remove private `_time_unit` attribute in favor of the public one and perform dtype validation earlier in `__init__` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Ashwin Srinath (https://github.com/shwina) - Lawrence Mitchell (https://github.com/wence-) URL: rapidsai#14715
rjzamora · Apr 5, 2024 · a00c3c9 · a00c3c9
1 parent 0ed224d
commit a00c3c9
Show file tree

Hide file tree

Showing 5 changed files with 61 additions and 52 deletions.
diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
@@ -114,7 +114,7 @@ def _find_ambiguous_and_nonexistent(
     tz_data_for_zone = get_tz_data(zone_name)
     transition_times = tz_data_for_zone["transition_times"]
     offsets = tz_data_for_zone["offsets"].astype(
-        f"timedelta64[{data._time_unit}]"
+        f"timedelta64[{data.time_unit}]"
     )
 
     if len(offsets) == 1:  # no transitions
@@ -183,7 +183,7 @@ def localize(
             "Already localized. "
             "Use `tz_convert` to convert between time zones."
         )
-    dtype = pd.DatetimeTZDtype(data._time_unit, zone_name)
+    dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
     ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
     localized = cast(
         DatetimeColumn,
@@ -230,7 +230,7 @@ def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
         DatetimeTZColumn,
         build_column(
             data=utc_time.base_data,
-            dtype=pd.DatetimeTZDtype(data._time_unit, zone_name),
+            dtype=pd.DatetimeTZDtype(data.time_unit, zone_name),
             mask=utc_time.base_mask,
             size=utc_time.size,
             offset=utc_time.offset,

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -537,13 +537,7 @@ def element_indexing(self, index: int):
             idx = len(self) + idx
         if idx > len(self) - 1 or idx < 0:
             raise IndexError("single positional indexer is out-of-bounds")
-        result = libcudf.copying.get_element(self, idx).value
-        if cudf.get_option("mode.pandas_compatible"):
-            if isinstance(result, np.datetime64):
-                return pd.Timestamp(result)
-            elif isinstance(result, np.timedelta64):
-                return pd.Timedelta(result)
-        return result
+        return libcudf.copying.get_element(self, idx).value
 
     def slice(
         self, start: int, stop: int, stride: Optional[int] = None

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import datetime
+import functools
 import locale
 import re
 from locale import nl_langinfo
@@ -241,6 +242,8 @@ def __init__(
         null_count: Optional[int] = None,
     ):
         dtype = cudf.dtype(dtype)
+        if dtype.kind != "M":
+            raise TypeError(f"{self.dtype} is not a supported datetime type")
 
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
@@ -256,26 +259,26 @@ def __init__(
             null_count=null_count,
         )
 
-        if self.dtype.type is not np.datetime64:
-            raise TypeError(f"{self.dtype} is not a supported datetime type")
-
-        self._time_unit, _ = np.datetime_data(self.dtype)
-
     def __contains__(self, item: ScalarLike) -> bool:
         try:
-            item_as_dt64 = np.datetime64(item, self._time_unit)
-        except ValueError:
-            # If item cannot be converted to datetime type
-            # np.datetime64 raises ValueError, hence `item`
-            # cannot exist in `self`.
+            ts = pd.Timestamp(item).as_unit(self.time_unit)
+        except Exception:
+            # pandas can raise a variety of errors
+            # item cannot exist in self.
             return False
-        return item_as_dt64.astype("int64") in self.as_numerical_column(
+        if ts.tzinfo is None and isinstance(self.dtype, pd.DatetimeTZDtype):
+            return False
+        elif ts.tzinfo is not None:
+            ts = ts.tz_convert(None)
+        return ts.to_numpy().astype("int64") in self.as_numerical_column(
             "int64"
         )
 
-    @property
+    @functools.cached_property
     def time_unit(self) -> str:
-        return self._time_unit
+        if isinstance(self.dtype, pd.DatetimeTZDtype):
+            return self.dtype.unit
+        return np.datetime_data(self.dtype)[0]
 
     @property
     def year(self) -> ColumnBase:
@@ -322,6 +325,12 @@ def values(self):
             "DateTime Arrays is not yet implemented in cudf"
         )
 
+    def element_indexing(self, index: int):
+        result = super().element_indexing(index)
+        if cudf.get_option("mode.pandas_compatible"):
+            return pd.Timestamp(result)
+        return result
+
     def get_dt_field(self, field: str) -> ColumnBase:
         return libcudf.datetime.extract_datetime_component(self, field)
 

diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import datetime
+import functools
 from typing import Any, Optional, Sequence, cast
 
 import numpy as np
@@ -19,13 +20,6 @@
 from cudf.utils.dtypes import np_to_pa_dtype
 from cudf.utils.utils import _all_bools_with_nulls
 
-_dtype_to_format_conversion = {
-    "timedelta64[ns]": "%D days %H:%M:%S",
-    "timedelta64[us]": "%D days %H:%M:%S",
-    "timedelta64[ms]": "%D days %H:%M:%S",
-    "timedelta64[s]": "%D days %H:%M:%S",
-}
-
 _unit_to_nanoseconds_conversion = {
     "ns": 1,
     "us": 1_000,
@@ -87,6 +81,8 @@ def __init__(
         null_count: Optional[int] = None,
     ):
         dtype = cudf.dtype(dtype)
+        if dtype.kind != "m":
+            raise TypeError(f"{self.dtype} is not a supported duration type")
 
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
@@ -102,14 +98,9 @@ def __init__(
             null_count=null_count,
         )
 
-        if self.dtype.type is not np.timedelta64:
-            raise TypeError(f"{self.dtype} is not a supported duration type")
-
-        self._time_unit, _ = np.datetime_data(self.dtype)
-
     def __contains__(self, item: DatetimeLikeScalar) -> bool:
         try:
-            item = np.timedelta64(item, self._time_unit)
+            item = np.timedelta64(item, self.time_unit)
         except ValueError:
             # If item cannot be converted to duration type
             # np.timedelta64 raises ValueError, hence `item`
@@ -126,6 +117,12 @@ def values(self):
             "TimeDelta Arrays is not yet implemented in cudf"
         )
 
+    def element_indexing(self, index: int):
+        result = super().element_indexing(index)
+        if cudf.get_option("mode.pandas_compatible"):
+            return pd.Timedelta(result)
+        return result
+
     @acquire_spill_lock()
     def to_arrow(self) -> pa.Array:
         mask = None
@@ -219,16 +216,12 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
             "Cannot perform binary operation on timezone-naive columns"
             " and timezone-aware timestamps."
         )
-        if isinstance(other, pd.Timestamp):
-            if other.tz is not None:
+        if isinstance(other, datetime.datetime):
+            if other.tzinfo is not None:
                 raise NotImplementedError(tz_error_msg)
-            other = other.to_datetime64()
-        elif isinstance(other, pd.Timedelta):
-            other = other.to_timedelta64()
+            other = pd.Timestamp(other).to_datetime64()
         elif isinstance(other, datetime.timedelta):
-            other = np.timedelta64(other)
-        elif isinstance(other, datetime.datetime) and other.tzinfo is not None:
-            raise NotImplementedError(tz_error_msg)
+            other = pd.Timedelta(other).to_timedelta64()
 
         if isinstance(other, np.timedelta64):
             other_time_unit = cudf.utils.dtypes.get_time_unit(other)
@@ -245,13 +238,13 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
             else:
                 common_dtype = determine_out_dtype(self.dtype, other.dtype)
             return cudf.Scalar(other.astype(common_dtype))
-        elif np.isscalar(other):
+        elif is_scalar(other):
             return cudf.Scalar(other)
         return NotImplemented
 
-    @property
+    @functools.cached_property
     def time_unit(self) -> str:
-        return self._time_unit
+        return np.datetime_data(self.dtype)[0]
 
     def fillna(
         self,
@@ -292,9 +285,7 @@ def as_string_column(
         self, dtype: Dtype, format: str | None = None
     ) -> "cudf.core.column.StringColumn":
         if format is None:
-            format = _dtype_to_format_conversion.get(
-                self.dtype.name, "%D days %H:%M:%S"
-            )
+            format = "%D days %H:%M:%S"
         if len(self) > 0:
             return string._timedelta_to_str_typecast_functions[
                 cudf.dtype(self.dtype)
@@ -479,7 +470,7 @@ def components(self, index=None) -> "cudf.DataFrame":
                     _unit_to_nanoseconds_conversion[value[1]], "ns"
                 ).astype(self.dtype)
             )
-            if self._time_unit == value[1]:
+            if self.time_unit == value[1]:
                 break
 
         for name in keys_list:
@@ -571,7 +562,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn":
         # performing division operation to extract the number
         # of nanoseconds.
 
-        if self._time_unit != "ns":
+        if self.time_unit != "ns":
             res_col = column.as_column(0, length=len(self), dtype="int64")
             if self.nullable:
                 res_col = res_col.set_mask(self.mask)

diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -203,3 +203,18 @@ def test_tz_aware_attributes_local():
     result = dti.hour
     expected = cudf.Index([9, 9, 9], dtype="int16")
     assert_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "item, expected",
+    [
+        ["2020-01-01", False],
+        ["2020-01-01T00:00:00+00:00", True],
+        ["2020-01-01T00:00:00-08:00", False],
+        ["2019-12-31T16:00:00-08:00", True],
+    ],
+)
+def test_contains_tz_aware(item, expected):
+    dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC")
+    result = item in dti
+    assert result == expected