Skip to content

Commit

Permalink
Cleanup some timedelta/datetime column logic (rapidsai#14715)
Browse files Browse the repository at this point in the history
Remove private `_time_unit` attribute in favor of the public one and perform dtype validation earlier in `__init__`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Lawrence Mitchell (https://github.com/wence-)

URL: rapidsai#14715
  • Loading branch information
mroeschke authored Apr 5, 2024
1 parent 0ed224d commit a00c3c9
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 52 deletions.
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def _find_ambiguous_and_nonexistent(
tz_data_for_zone = get_tz_data(zone_name)
transition_times = tz_data_for_zone["transition_times"]
offsets = tz_data_for_zone["offsets"].astype(
f"timedelta64[{data._time_unit}]"
f"timedelta64[{data.time_unit}]"
)

if len(offsets) == 1: # no transitions
Expand Down Expand Up @@ -183,7 +183,7 @@ def localize(
"Already localized. "
"Use `tz_convert` to convert between time zones."
)
dtype = pd.DatetimeTZDtype(data._time_unit, zone_name)
dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
localized = cast(
DatetimeColumn,
Expand Down Expand Up @@ -230,7 +230,7 @@ def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
DatetimeTZColumn,
build_column(
data=utc_time.base_data,
dtype=pd.DatetimeTZDtype(data._time_unit, zone_name),
dtype=pd.DatetimeTZDtype(data.time_unit, zone_name),
mask=utc_time.base_mask,
size=utc_time.size,
offset=utc_time.offset,
Expand Down
8 changes: 1 addition & 7 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,13 +537,7 @@ def element_indexing(self, index: int):
idx = len(self) + idx
if idx > len(self) - 1 or idx < 0:
raise IndexError("single positional indexer is out-of-bounds")
result = libcudf.copying.get_element(self, idx).value
if cudf.get_option("mode.pandas_compatible"):
if isinstance(result, np.datetime64):
return pd.Timestamp(result)
elif isinstance(result, np.timedelta64):
return pd.Timedelta(result)
return result
return libcudf.copying.get_element(self, idx).value

def slice(
self, start: int, stop: int, stride: Optional[int] = None
Expand Down
35 changes: 22 additions & 13 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import datetime
import functools
import locale
import re
from locale import nl_langinfo
Expand Down Expand Up @@ -241,6 +242,8 @@ def __init__(
null_count: Optional[int] = None,
):
dtype = cudf.dtype(dtype)
if dtype.kind != "M":
raise TypeError(f"{self.dtype} is not a supported datetime type")

if data.size % dtype.itemsize:
raise ValueError("Buffer size must be divisible by element size")
Expand All @@ -256,26 +259,26 @@ def __init__(
null_count=null_count,
)

if self.dtype.type is not np.datetime64:
raise TypeError(f"{self.dtype} is not a supported datetime type")

self._time_unit, _ = np.datetime_data(self.dtype)

def __contains__(self, item: ScalarLike) -> bool:
try:
item_as_dt64 = np.datetime64(item, self._time_unit)
except ValueError:
# If item cannot be converted to datetime type
# np.datetime64 raises ValueError, hence `item`
# cannot exist in `self`.
ts = pd.Timestamp(item).as_unit(self.time_unit)
except Exception:
# pandas can raise a variety of errors
# item cannot exist in self.
return False
return item_as_dt64.astype("int64") in self.as_numerical_column(
if ts.tzinfo is None and isinstance(self.dtype, pd.DatetimeTZDtype):
return False
elif ts.tzinfo is not None:
ts = ts.tz_convert(None)
return ts.to_numpy().astype("int64") in self.as_numerical_column(
"int64"
)

@property
@functools.cached_property
def time_unit(self) -> str:
return self._time_unit
if isinstance(self.dtype, pd.DatetimeTZDtype):
return self.dtype.unit
return np.datetime_data(self.dtype)[0]

@property
def year(self) -> ColumnBase:
Expand Down Expand Up @@ -322,6 +325,12 @@ def values(self):
"DateTime Arrays is not yet implemented in cudf"
)

def element_indexing(self, index: int):
result = super().element_indexing(index)
if cudf.get_option("mode.pandas_compatible"):
return pd.Timestamp(result)
return result

def get_dt_field(self, field: str) -> ColumnBase:
return libcudf.datetime.extract_datetime_component(self, field)

Expand Down
49 changes: 20 additions & 29 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import datetime
import functools
from typing import Any, Optional, Sequence, cast

import numpy as np
Expand All @@ -19,13 +20,6 @@
from cudf.utils.dtypes import np_to_pa_dtype
from cudf.utils.utils import _all_bools_with_nulls

_dtype_to_format_conversion = {
"timedelta64[ns]": "%D days %H:%M:%S",
"timedelta64[us]": "%D days %H:%M:%S",
"timedelta64[ms]": "%D days %H:%M:%S",
"timedelta64[s]": "%D days %H:%M:%S",
}

_unit_to_nanoseconds_conversion = {
"ns": 1,
"us": 1_000,
Expand Down Expand Up @@ -87,6 +81,8 @@ def __init__(
null_count: Optional[int] = None,
):
dtype = cudf.dtype(dtype)
if dtype.kind != "m":
raise TypeError(f"{self.dtype} is not a supported duration type")

if data.size % dtype.itemsize:
raise ValueError("Buffer size must be divisible by element size")
Expand All @@ -102,14 +98,9 @@ def __init__(
null_count=null_count,
)

if self.dtype.type is not np.timedelta64:
raise TypeError(f"{self.dtype} is not a supported duration type")

self._time_unit, _ = np.datetime_data(self.dtype)

def __contains__(self, item: DatetimeLikeScalar) -> bool:
try:
item = np.timedelta64(item, self._time_unit)
item = np.timedelta64(item, self.time_unit)
except ValueError:
# If item cannot be converted to duration type
# np.timedelta64 raises ValueError, hence `item`
Expand All @@ -126,6 +117,12 @@ def values(self):
"TimeDelta Arrays is not yet implemented in cudf"
)

def element_indexing(self, index: int):
result = super().element_indexing(index)
if cudf.get_option("mode.pandas_compatible"):
return pd.Timedelta(result)
return result

@acquire_spill_lock()
def to_arrow(self) -> pa.Array:
mask = None
Expand Down Expand Up @@ -219,16 +216,12 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
"Cannot perform binary operation on timezone-naive columns"
" and timezone-aware timestamps."
)
if isinstance(other, pd.Timestamp):
if other.tz is not None:
if isinstance(other, datetime.datetime):
if other.tzinfo is not None:
raise NotImplementedError(tz_error_msg)
other = other.to_datetime64()
elif isinstance(other, pd.Timedelta):
other = other.to_timedelta64()
other = pd.Timestamp(other).to_datetime64()
elif isinstance(other, datetime.timedelta):
other = np.timedelta64(other)
elif isinstance(other, datetime.datetime) and other.tzinfo is not None:
raise NotImplementedError(tz_error_msg)
other = pd.Timedelta(other).to_timedelta64()

if isinstance(other, np.timedelta64):
other_time_unit = cudf.utils.dtypes.get_time_unit(other)
Expand All @@ -245,13 +238,13 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
else:
common_dtype = determine_out_dtype(self.dtype, other.dtype)
return cudf.Scalar(other.astype(common_dtype))
elif np.isscalar(other):
elif is_scalar(other):
return cudf.Scalar(other)
return NotImplemented

@property
@functools.cached_property
def time_unit(self) -> str:
return self._time_unit
return np.datetime_data(self.dtype)[0]

def fillna(
self,
Expand Down Expand Up @@ -292,9 +285,7 @@ def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
if format is None:
format = _dtype_to_format_conversion.get(
self.dtype.name, "%D days %H:%M:%S"
)
format = "%D days %H:%M:%S"
if len(self) > 0:
return string._timedelta_to_str_typecast_functions[
cudf.dtype(self.dtype)
Expand Down Expand Up @@ -479,7 +470,7 @@ def components(self, index=None) -> "cudf.DataFrame":
_unit_to_nanoseconds_conversion[value[1]], "ns"
).astype(self.dtype)
)
if self._time_unit == value[1]:
if self.time_unit == value[1]:
break

for name in keys_list:
Expand Down Expand Up @@ -571,7 +562,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn":
# performing division operation to extract the number
# of nanoseconds.

if self._time_unit != "ns":
if self.time_unit != "ns":
res_col = column.as_column(0, length=len(self), dtype="int64")
if self.nullable:
res_col = res_col.set_mask(self.mask)
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/tests/series/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,18 @@ def test_tz_aware_attributes_local():
result = dti.hour
expected = cudf.Index([9, 9, 9], dtype="int16")
assert_eq(result, expected)


@pytest.mark.parametrize(
"item, expected",
[
["2020-01-01", False],
["2020-01-01T00:00:00+00:00", True],
["2020-01-01T00:00:00-08:00", False],
["2019-12-31T16:00:00-08:00", True],
],
)
def test_contains_tz_aware(item, expected):
dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC")
result = item in dti
assert result == expected

0 comments on commit a00c3c9

Please sign in to comment.