From f8e3f5e15a13e60ca8817ac1938ed93ca13fd55a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 20 Aug 2024 13:54:42 -0700 Subject: [PATCH] REF (string): de-duplicate str_endswith, startswith --- pandas/core/arrays/_arrow_string_mixins.py | 69 +++++++++++++++++++++- pandas/core/arrays/arrow/array.py | 33 +---------- pandas/core/arrays/string_arrow.py | 39 +----------- 3 files changed, 69 insertions(+), 72 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 06c74290bd82ed..c60adf878f1285 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -9,20 +9,35 @@ from pandas.compat import pa_version_under10p1 +from pandas.core.dtypes.missing import isna + if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc if TYPE_CHECKING: - from pandas._typing import Self + from collections.abc import Sized + + from pandas._typing import ( + Scalar, + Self, + ) class ArrowStringArrayMixin: - _pa_array = None + # _object_compat specifies whether we should 1) attempt to match behaviors + # of the object-backed StringDtype and 2) fall back to object-based + # computation for cases that pyarrow does not support natively. + _object_compat = False + _pa_array: Sized def __init__(self, *args, **kwargs) -> None: raise NotImplementedError + def _result_converter(self, values, na=None): + # Convert a bool-dtype pyarrow result to appropriate output type. + raise NotImplementedError + def _str_pad( self, width: int, @@ -89,3 +104,53 @@ def _str_removesuffix(self, suffix: str): removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) + + def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + if self._object_compat: + # mimic existing behaviour of string extension array + # and python string method + result = pa.array( + np.zeros(len(self._pa_array), dtype=np.bool_), + mask=isna(self._pa_array), + ) + else: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) + return self._result_converter(result) + + def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + if self._object_compat: + # mimic existing behaviour of string extension array + # and python string method + result = pa.array( + np.zeros(len(self._pa_array), dtype=np.bool_), + mask=isna(self._pa_array), + ) + else: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) + return self._result_converter(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e95fa441e18fbd..376acdd7fbd769 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2305,38 +2305,7 @@ def _str_contains( result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self: - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return type(self)(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self: - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _result_converter(self, result): return type(self)(result) def _str_replace( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 67114815341b6f..43c8abe015434d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -278,6 +278,7 @@ def astype(self, dtype, copy: bool = True): # ------------------------------------------------------------------------ # String methods interface + _object_compat = True _str_map = BaseStringArray._str_map @@ -298,44 +299,6 @@ def _str_contains( result[isna(result)] = bool(na) return result - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._result_converter(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._result_converter(result) - def _str_replace( self, pat: str | re.Pattern,