From a00c3c916947d16fbf997095a32a02ca510b78e5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 4 Apr 2024 22:20:32 -1000 Subject: [PATCH] Cleanup some timedelta/datetime column logic (#14715) Remove private `_time_unit` attribute in favor of the public one and perform dtype validation earlier in `__init__` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Ashwin Srinath (https://github.com/shwina) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14715 --- python/cudf/cudf/core/_internals/timezones.py | 6 +-- python/cudf/cudf/core/column/column.py | 8 +-- python/cudf/cudf/core/column/datetime.py | 35 ++++++++----- python/cudf/cudf/core/column/timedelta.py | 49 ++++++++----------- .../cudf/tests/series/test_datetimelike.py | 15 ++++++ 5 files changed, 61 insertions(+), 52 deletions(-) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 4e2fad08d56..4888cdd9ac9 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -114,7 +114,7 @@ def _find_ambiguous_and_nonexistent( tz_data_for_zone = get_tz_data(zone_name) transition_times = tz_data_for_zone["transition_times"] offsets = tz_data_for_zone["offsets"].astype( - f"timedelta64[{data._time_unit}]" + f"timedelta64[{data.time_unit}]" ) if len(offsets) == 1: # no transitions @@ -183,7 +183,7 @@ def localize( "Already localized. " "Use `tz_convert` to convert between time zones." ) - dtype = pd.DatetimeTZDtype(data._time_unit, zone_name) + dtype = pd.DatetimeTZDtype(data.time_unit, zone_name) ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name) localized = cast( DatetimeColumn, @@ -230,7 +230,7 @@ def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn: DatetimeTZColumn, build_column( data=utc_time.base_data, - dtype=pd.DatetimeTZDtype(data._time_unit, zone_name), + dtype=pd.DatetimeTZDtype(data.time_unit, zone_name), mask=utc_time.base_mask, size=utc_time.size, offset=utc_time.offset, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2541e076250..835da36fbfd 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -537,13 +537,7 @@ def element_indexing(self, index: int): idx = len(self) + idx if idx > len(self) - 1 or idx < 0: raise IndexError("single positional indexer is out-of-bounds") - result = libcudf.copying.get_element(self, idx).value - if cudf.get_option("mode.pandas_compatible"): - if isinstance(result, np.datetime64): - return pd.Timestamp(result) - elif isinstance(result, np.timedelta64): - return pd.Timedelta(result) - return result + return libcudf.copying.get_element(self, idx).value def slice( self, start: int, stop: int, stride: Optional[int] = None diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 9a5d9dcd47a..b84c1dc7ccd 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -3,6 +3,7 @@ from __future__ import annotations import datetime +import functools import locale import re from locale import nl_langinfo @@ -241,6 +242,8 @@ def __init__( null_count: Optional[int] = None, ): dtype = cudf.dtype(dtype) + if dtype.kind != "M": + raise TypeError(f"{self.dtype} is not a supported datetime type") if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") @@ -256,26 +259,26 @@ def __init__( null_count=null_count, ) - if self.dtype.type is not np.datetime64: - raise TypeError(f"{self.dtype} is not a supported datetime type") - - self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item: ScalarLike) -> bool: try: - item_as_dt64 = np.datetime64(item, self._time_unit) - except ValueError: - # If item cannot be converted to datetime type - # np.datetime64 raises ValueError, hence `item` - # cannot exist in `self`. + ts = pd.Timestamp(item).as_unit(self.time_unit) + except Exception: + # pandas can raise a variety of errors + # item cannot exist in self. return False - return item_as_dt64.astype("int64") in self.as_numerical_column( + if ts.tzinfo is None and isinstance(self.dtype, pd.DatetimeTZDtype): + return False + elif ts.tzinfo is not None: + ts = ts.tz_convert(None) + return ts.to_numpy().astype("int64") in self.as_numerical_column( "int64" ) - @property + @functools.cached_property def time_unit(self) -> str: - return self._time_unit + if isinstance(self.dtype, pd.DatetimeTZDtype): + return self.dtype.unit + return np.datetime_data(self.dtype)[0] @property def year(self) -> ColumnBase: @@ -322,6 +325,12 @@ def values(self): "DateTime Arrays is not yet implemented in cudf" ) + def element_indexing(self, index: int): + result = super().element_indexing(index) + if cudf.get_option("mode.pandas_compatible"): + return pd.Timestamp(result) + return result + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 0d24e8e5120..c5ed889b5dc 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -3,6 +3,7 @@ from __future__ import annotations import datetime +import functools from typing import Any, Optional, Sequence, cast import numpy as np @@ -19,13 +20,6 @@ from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import _all_bools_with_nulls -_dtype_to_format_conversion = { - "timedelta64[ns]": "%D days %H:%M:%S", - "timedelta64[us]": "%D days %H:%M:%S", - "timedelta64[ms]": "%D days %H:%M:%S", - "timedelta64[s]": "%D days %H:%M:%S", -} - _unit_to_nanoseconds_conversion = { "ns": 1, "us": 1_000, @@ -87,6 +81,8 @@ def __init__( null_count: Optional[int] = None, ): dtype = cudf.dtype(dtype) + if dtype.kind != "m": + raise TypeError(f"{self.dtype} is not a supported duration type") if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") @@ -102,14 +98,9 @@ def __init__( null_count=null_count, ) - if self.dtype.type is not np.timedelta64: - raise TypeError(f"{self.dtype} is not a supported duration type") - - self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item: DatetimeLikeScalar) -> bool: try: - item = np.timedelta64(item, self._time_unit) + item = np.timedelta64(item, self.time_unit) except ValueError: # If item cannot be converted to duration type # np.timedelta64 raises ValueError, hence `item` @@ -126,6 +117,12 @@ def values(self): "TimeDelta Arrays is not yet implemented in cudf" ) + def element_indexing(self, index: int): + result = super().element_indexing(index) + if cudf.get_option("mode.pandas_compatible"): + return pd.Timedelta(result) + return result + @acquire_spill_lock() def to_arrow(self) -> pa.Array: mask = None @@ -219,16 +216,12 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: "Cannot perform binary operation on timezone-naive columns" " and timezone-aware timestamps." ) - if isinstance(other, pd.Timestamp): - if other.tz is not None: + if isinstance(other, datetime.datetime): + if other.tzinfo is not None: raise NotImplementedError(tz_error_msg) - other = other.to_datetime64() - elif isinstance(other, pd.Timedelta): - other = other.to_timedelta64() + other = pd.Timestamp(other).to_datetime64() elif isinstance(other, datetime.timedelta): - other = np.timedelta64(other) - elif isinstance(other, datetime.datetime) and other.tzinfo is not None: - raise NotImplementedError(tz_error_msg) + other = pd.Timedelta(other).to_timedelta64() if isinstance(other, np.timedelta64): other_time_unit = cudf.utils.dtypes.get_time_unit(other) @@ -245,13 +238,13 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: else: common_dtype = determine_out_dtype(self.dtype, other.dtype) return cudf.Scalar(other.astype(common_dtype)) - elif np.isscalar(other): + elif is_scalar(other): return cudf.Scalar(other) return NotImplemented - @property + @functools.cached_property def time_unit(self) -> str: - return self._time_unit + return np.datetime_data(self.dtype)[0] def fillna( self, @@ -292,9 +285,7 @@ def as_string_column( self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if format is None: - format = _dtype_to_format_conversion.get( - self.dtype.name, "%D days %H:%M:%S" - ) + format = "%D days %H:%M:%S" if len(self) > 0: return string._timedelta_to_str_typecast_functions[ cudf.dtype(self.dtype) @@ -479,7 +470,7 @@ def components(self, index=None) -> "cudf.DataFrame": _unit_to_nanoseconds_conversion[value[1]], "ns" ).astype(self.dtype) ) - if self._time_unit == value[1]: + if self.time_unit == value[1]: break for name in keys_list: @@ -571,7 +562,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": # performing division operation to extract the number # of nanoseconds. - if self._time_unit != "ns": + if self.time_unit != "ns": res_col = column.as_column(0, length=len(self), dtype="int64") if self.nullable: res_col = res_col.set_mask(self.mask) diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 98be7045923..6ee339ee3ea 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -203,3 +203,18 @@ def test_tz_aware_attributes_local(): result = dti.hour expected = cudf.Index([9, 9, 9], dtype="int16") assert_eq(result, expected) + + +@pytest.mark.parametrize( + "item, expected", + [ + ["2020-01-01", False], + ["2020-01-01T00:00:00+00:00", True], + ["2020-01-01T00:00:00-08:00", False], + ["2019-12-31T16:00:00-08:00", True], + ], +) +def test_contains_tz_aware(item, expected): + dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC") + result = item in dti + assert result == expected