diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index a39668faf779e..e136b4f92031d 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import lib from pandas.compat import ( pa_version_under10p1, pa_version_under11p0, @@ -17,8 +18,6 @@ pa_version_under17p0, ) -from pandas.core.dtypes.missing import isna - if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc @@ -38,7 +37,7 @@ class ArrowStringArrayMixin: def __init__(self, *args, **kwargs) -> None: raise NotImplementedError - def _convert_bool_result(self, result): + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): # Convert a bool-dtype result to the appropriate result type raise NotImplementedError @@ -212,7 +211,9 @@ def _str_removesuffix(self, suffix: str): result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + def _str_startswith( + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default + ): if isinstance(pat, str): result = pc.starts_with(self._pa_array, pattern=pat) else: @@ -225,11 +226,11 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): for p in pat[1:]: result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): # pyright: ignore [reportGeneralTypeIssues] - result = result.fill_null(na) - return self._convert_bool_result(result) + return self._convert_bool_result(result, na=na, method_name="startswith") - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + def _str_endswith( + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default + ): if isinstance(pat, str): result = pc.ends_with(self._pa_array, pattern=pat) else: @@ -242,9 +243,7 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): for p in pat[1:]: result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): # pyright: ignore [reportGeneralTypeIssues] - result = result.fill_null(na) - return self._convert_bool_result(result) + return self._convert_bool_result(result, na=na, method_name="endswith") def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) @@ -283,7 +282,12 @@ def _str_isupper(self): return self._convert_bool_result(result) def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + regex: bool = True, ): if flags: raise NotImplementedError(f"contains not implemented with {flags=}") @@ -293,19 +297,25 @@ def _str_contains( else: pa_contains = pc.match_substring result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): # pyright: ignore [reportGeneralTypeIssues] - result = result.fill_null(na) - return self._convert_bool_result(result) + return self._convert_bool_result(result, na=na, method_name="contains") def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not pat.startswith("^"): pat = f"^{pat}" return self._str_contains(pat, case, flags, na, regex=True) def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e0ccbd6fdc5fd..f3d7a3cc6d694 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2285,7 +2285,11 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: for chunk in self._pa_array.iterchunks() ] - def _convert_bool_result(self, result): + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna( + na + ): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) return type(self)(result) def _convert_int_result(self, result): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 97004474648b2..366253a923f6c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2675,16 +2675,28 @@ def _replace(self, *, to_replace, value, inplace: bool = False): # ------------------------------------------------------------------------ # String methods interface def _str_map( - self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True + self, f, na_value=lib.no_default, dtype=np.dtype("object"), convert: bool = True ): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. # Returns the same type as the object-dtype implementation though. - from pandas.core.arrays import NumpyExtensionArray - categories = self.categories codes = self.codes - result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) + if categories.dtype == "string": + result = categories.array._str_map(f, na_value, dtype) # type: ignore[attr-defined] + if ( + categories.dtype.na_value is np.nan # type: ignore[union-attr] + and is_bool_dtype(dtype) + and (na_value is lib.no_default or isna(na_value)) + ): + # NaN propagates as False for functions with boolean return type + na_value = False + else: + from pandas.core.arrays import NumpyExtensionArray + + result = NumpyExtensionArray(categories.to_numpy())._str_map( + f, na_value, dtype + ) return take_nd(result, codes, fill_value=na_value) def _str_get_dummies(self, sep: str = "|"): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0b0fffcb928a3..5b69344bac0c8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -377,7 +377,11 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: return cls._from_sequence(scalars, dtype=dtype) def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: Dtype | None = None, + convert: bool = True, ): if self.dtype.na_value is np.nan: return self._str_map_nan_semantics( @@ -388,7 +392,7 @@ def _str_map( if dtype is None: dtype = self.dtype - if na_value is None: + if na_value is lib.no_default: na_value = self.dtype.na_value mask = isna(self) @@ -458,12 +462,20 @@ def _str_map_str_or_object( return lib.map_infer_mask(arr, f, mask.view("uint8")) def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: Dtype | None = None, + convert: bool = True, ): if dtype is None: dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value + if na_value is lib.no_default: + if is_bool_dtype(dtype): + # NaN propagates as False + na_value = False + else: + na_value = self.dtype.na_value mask = isna(self) arr = np.asarray(self) @@ -474,7 +486,8 @@ def _str_map_nan_semantics( if is_integer_dtype(dtype): na_value = 0 else: - na_value = True + # NaN propagates as False + na_value = False result = lib.map_infer_mask( arr, @@ -484,15 +497,13 @@ def _str_map_nan_semantics( na_value=na_value, dtype=np.dtype(cast(type, dtype)), ) - if na_value_is_na and mask.any(): + if na_value_is_na and is_integer_dtype(dtype) and mask.any(): # TODO: we could alternatively do this check before map_infer_mask # and adjust the dtype/na_value we pass there. Which is more # performant? - if is_integer_dtype(dtype): - result = result.astype("float64") - else: - result = result.astype("object") + result = result.astype("float64") result[mask] = np.nan + return result else: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8dcf7643b579e..9389f7cffca9f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -211,9 +211,29 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _convert_bool_result(self, values): + def _convert_bool_result(self, values, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + na = bool(na) + if self.dtype.na_value is np.nan: - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + if na is lib.no_default or isna(na): + # NaN propagates as False + values = values.fill_null(False) + else: + values = values.fill_null(na) + return values.to_numpy() + else: + if na is not lib.no_default and not isna( + na + ): # pyright: ignore [reportGeneralTypeIssues] + values = values.fill_null(na) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -309,22 +329,16 @@ def _data(self): _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if flags: return super()._str_contains(pat, case, flags, na, regex) - if not isna(na): - if not isinstance(na, bool): - # GH#59561 - warnings.warn( - "Allowing a non-bool 'na' in obj.str.contains is deprecated " - "and will raise in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - na = bool(na) - return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index da10a12d02ae4..563dce3008480 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1199,7 +1199,12 @@ def join(self, sep: str): @forbid_nonstring_types(["bytes"]) def contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): r""" Test if pattern or regex is contained within a string of a Series or Index. @@ -1217,8 +1222,9 @@ def contains( Flags to pass through to the re module, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. regex : bool, default True If True, assumes the pat is a regular expression. @@ -1336,7 +1342,7 @@ def contains( return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat: str, case: bool = True, flags: int = 0, na=None): + def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string starts with a match of a regular expression. @@ -1350,8 +1356,9 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=None): Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- @@ -1377,7 +1384,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=None): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): + def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string entirely matches a regular expression. @@ -1391,8 +1398,9 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- @@ -2415,7 +2423,7 @@ def count(self, pat, flags: int = 0): @forbid_nonstring_types(["bytes"]) def startswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the start of each string element matches a pattern. @@ -2427,10 +2435,11 @@ def startswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. + For the ``"str"`` dtype, ``False`` is used. Returns ------- @@ -2485,7 +2494,7 @@ def startswith( @forbid_nonstring_types(["bytes"]) def endswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the end of each string element matches a pattern. @@ -2497,10 +2506,11 @@ def endswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. + For the ``"str"`` dtype, ``False`` is used. Returns ------- diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 96b0352666b41..316c86d152db3 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -7,7 +7,7 @@ Literal, ) -import numpy as np +from pandas._libs import lib if TYPE_CHECKING: from collections.abc import Sequence @@ -85,7 +85,11 @@ def _str_repeat(self, repeats: int | Sequence[int]): @abc.abstractmethod def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): pass @@ -95,7 +99,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar = np.nan, + na: Scalar | lib.NoDefault = lib.no_default, ): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index f376c239a0ce0..e82c6c20e86d9 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -44,7 +44,11 @@ def __len__(self) -> int: raise NotImplementedError def _str_map( - self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: NpDtype | None = None, + convert: bool = True, ): """ Map a callable over valid elements of the array. @@ -65,7 +69,7 @@ def _str_map( """ if dtype is None: dtype = np.dtype("object") - if na_value is None: + if na_value is lib.no_default: na_value = self.dtype.na_value # type: ignore[attr-defined] if not len(self): @@ -127,7 +131,12 @@ def _str_pad( return self._str_map(f) def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if regex: if not case: @@ -142,7 +151,7 @@ def _str_contains( else: upper_pat = pat.upper() f = lambda x: upper_pat in x.upper() - if not isna(na) and not isinstance(na, bool): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): # GH#59561 warnings.warn( "Allowing a non-bool 'na' in obj.str.contains is deprecated " @@ -152,9 +161,9 @@ def _str_contains( ) return self._str_map(f, na, dtype=np.dtype("bool")) - def _str_startswith(self, pat, na=None): + def _str_startswith(self, pat, na=lib.no_default): f = lambda x: x.startswith(pat) - if not isna(na) and not isinstance(na, bool): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): # GH#59561 warnings.warn( "Allowing a non-bool 'na' in obj.str.startswith is deprecated " @@ -164,9 +173,9 @@ def _str_startswith(self, pat, na=None): ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - def _str_endswith(self, pat, na=None): + def _str_endswith(self, pat, na=lib.no_default): f = lambda x: x.endswith(pat) - if not isna(na) and not isinstance(na, bool): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): # GH#59561 warnings.warn( "Allowing a non-bool 'na' in obj.str.endswith is deprecated " @@ -235,7 +244,11 @@ def rep(x, r): return result def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE @@ -250,7 +263,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar | None = None, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 31e005466af7b..8987fc36656c5 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -111,6 +111,7 @@ def test_api_per_method( any_allowed_skipna_inferred_dtype, any_string_method, request, + using_infer_string, ): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, @@ -149,6 +150,10 @@ def test_api_per_method( t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) + if using_infer_string and dtype == "category": + string_allowed = method_name not in ["decode"] + else: + string_allowed = True bytes_allowed = method_name in ["decode", "get", "len", "slice"] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. @@ -157,7 +162,8 @@ def test_api_per_method( mixed_allowed = method_name not in ["cat"] allowed_types = ( - ["string", "unicode", "empty"] + ["empty"] + + ["string", "unicode"] * string_allowed + ["bytes"] * bytes_allowed + ["mixed", "mixed-integer"] * mixed_allowed ) @@ -171,6 +177,7 @@ def test_api_per_method( msg = ( f"Cannot use .str.{method_name} with values of " f"inferred dtype {repr(inferred_dtype)}." + "|a bytes-like object is required, not 'str'" ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 2742c5b67e57e..48159c07de6ab 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -29,20 +29,28 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series( - np.array([False, np.nan, True, True, False], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, True, True, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True, False], dtype=np.object_), + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) result = values.str.contains(pat, regex=False) - expected = Series( - np.array([False, np.nan, False, False, True], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + expected = Series([False, False, False, False, True], dtype=bool) + else: + expected = Series( + np.array([False, np.nan, False, False, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) values = Series( @@ -79,12 +87,16 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series( - np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype - ) + if any_string_dtype == "str": + expected = Series([False, False, True, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) @@ -184,39 +196,45 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan expected = Series( - [False, False, False, True, True, False, np.nan, False, False, True], + [False, False, False, True, True, False, na_value, False, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("a", case=False) expected = Series( - [True, False, False, True, True, False, np.nan, True, False, True], + [True, False, False, True, True, False, na_value, True, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("Aa") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba", case=False) expected = Series( - [False, False, False, True, True, False, np.nan, True, False, False], + [False, False, False, True, True, False, na_value, True, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) @@ -261,10 +279,14 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -287,9 +309,7 @@ def test_startswith_endswith_validate_na(request, any_string_dtype): ) dtype = ser.dtype - if ( - isinstance(dtype, pd.StringDtype) and dtype.storage == "python" - ) or dtype == np.dtype("object"): + if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"): msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): ser.str.startswith("kapow", na="baz") @@ -307,11 +327,12 @@ def test_startswith_endswith_validate_na(request, any_string_dtype): ser.str.endswith("kapow", na="baz") +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_startswith(pat, dtype, null_value, na): +def test_startswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ -325,6 +346,8 @@ def test_startswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.startswith(pat, na=na) @@ -342,20 +365,31 @@ def test_startswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_startswith_nullable_string_dtype(nullable_string_dtype, na): +def test_startswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.startswith("foo", na=na) + + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, True, False, False, na, True, False, False], dtype="boolean" + [False, na, True, False, False, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.startswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) tm.assert_series_equal(result, exp) @@ -365,11 +399,12 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): # -------------------------------------------------------------------------------------- +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_endswith(pat, dtype, null_value, na): +def test_endswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ -383,6 +418,8 @@ def test_endswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.endswith(pat, na=na) @@ -400,20 +437,30 @@ def test_endswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_endswith_nullable_string_dtype(nullable_string_dtype, na): +def test_endswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.endswith("foo", na=na) + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, False, False, True, na, True, False, False], dtype="boolean" + [False, na, False, False, True, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.endswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) tm.assert_series_equal(result, exp) @@ -692,36 +739,41 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): - # New match behavior introduced in 0.13 - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") - expected = Series([True, np.nan, False], dtype=expected_dtype) + expected = Series([True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match(".*BAD[_]+.*BAD") - expected = Series([True, True, np.nan, False], dtype=expected_dtype) + expected = Series([True, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match("^BAD[_]+.*BAD") - expected = Series([False, False, np.nan, False], dtype=expected_dtype) + expected = Series([False, False, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("\\^BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -757,10 +809,17 @@ def test_match_na_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([True, False, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan + + expected = Series([True, False, na_value], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -785,10 +844,14 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([True, False, np.nan, False], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([True, False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -796,10 +859,14 @@ def test_fullmatch_dollar_literal(any_string_dtype): # GH 56652 ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) result = ser.str.fullmatch("foo\\$") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([False, False, np.nan, True], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 517ddb164985c..cd3c512328139 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -38,7 +38,7 @@ def test_string_array(nullable_string_dtype, any_string_method): expected.values, skipna=True ): assert result.dtype == "boolean" - result = result.astype(object) + expected = expected.astype("boolean") elif expected.dtype == "bool": assert result.dtype == "boolean" diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 40b6c69dc8025..7c396e65b6120 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -217,8 +217,21 @@ def test_ismethods(method, expected, any_string_dtype): tm.assert_series_equal(result, expected) # compare with standard library - expected = [getattr(item, method)() for item in ser] - assert list(result) == expected + expected_stdlib = [getattr(item, method)() for item in ser] + assert list(result) == expected_stdlib + + # with missing value + ser.iloc[[1, 2, 3, 4]] = np.nan + result = getattr(ser.str, method)() + if ser.dtype == "object": + expected = expected.astype(object) + expected.iloc[[1, 2, 3, 4]] = np.nan + elif ser.dtype == "str": + # NaN propagates as False + expected.iloc[[1, 2, 3, 4]] = False + else: + # nullable dtypes propagate NaN + expected.iloc[[1, 2, 3, 4]] = np.nan @pytest.mark.parametrize( @@ -248,6 +261,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): assert list(result) == expected +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "method, expected", [ @@ -258,10 +272,14 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series(expected, dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series(expected, dtype=object).fillna(False).astype(bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected)