From 3b691f4be744ff1155df3634cd334211e738e37d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Sep 2023 10:03:52 -1000 Subject: [PATCH] Raise NotImplementedError in to_datetime with dayfirst without infer_format (#14058) Raises a `NotImplementedError` to avoid this incorrect behavior (which seems to actually not be implemented) ```python In [6]: cudf.to_datetime(["10-02-2014"], dayfirst=True) Out[6]: DatetimeIndex(['2014-10-02'], dtype='datetime64[ns]') ``` closes https://github.com/rapidsai/cudf/issues/14042 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14058 --- python/cudf/cudf/core/tools/datetimes.py | 11 +++---- python/cudf/cudf/tests/test_datetime.py | 38 +++++++++++++++++++----- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f736e055163..a3f4bacf206 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -353,15 +353,16 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): format=format, ) else: - if infer_datetime_format and format is None: + if format is None: + if not infer_datetime_format and dayfirst: + raise NotImplementedError( + f"{dayfirst=} not implemented " + f"when {format=} and {infer_datetime_format=}." + ) format = column.datetime.infer_format( element=col.element_indexing(0), dayfirst=dayfirst, ) - elif format is None: - format = column.datetime.infer_format( - element=col.element_indexing(0) - ) return col.as_datetime_column( dtype=_unit_dtype_map[unit], format=format, diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 0cc7112454c..164856ed6f5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -617,22 +617,44 @@ def test_datetime_dataframe(): @pytest.mark.parametrize("infer_datetime_format", [True, False]) def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): pd_data = data + is_string_data = False if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) + is_string_data = ( + gd_data.ndim == 1 + and not gd_data.empty + and gd_data.dtype.kind == "O" + ) else: if type(pd_data).__module__ == np.__name__: gd_data = cp.array(pd_data) else: gd_data = pd_data + is_string_data = isinstance(gd_data, list) and isinstance( + next(iter(gd_data), None), str + ) - expected = pd.to_datetime( - pd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - actual = cudf.to_datetime( - gd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - - assert_eq(actual, expected) + if dayfirst and not infer_datetime_format and is_string_data: + # Note: pandas<2.0 also does not respect dayfirst=True correctly + # for object data + with pytest.raises(NotImplementedError): + cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + else: + expected = pd.to_datetime( + pd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + actual = cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + assert_eq(actual, expected) @pytest.mark.parametrize(