From 8560001dd43430a44957175bf8e3d61d5dece33c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Nov 2024 23:06:20 +0100 Subject: [PATCH] Backport PR #60222: ENH (string dtype): accept string_view in addition to string/large_string for ArrowStringArray input --- pandas/core/arrays/string_arrow.py | 7 +++++++ pandas/tests/arrays/string_/test_string_arrow.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e7dd4f9dc5718..b6e98d8fdc7e5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -18,6 +18,7 @@ from pandas.compat import ( pa_version_under10p1, pa_version_under13p0, + pa_version_under16p0, ) from pandas.util._exceptions import find_stack_level @@ -65,6 +66,10 @@ def _chk_pyarrow_available() -> None: raise ImportError(msg) +def _is_string_view(typ): + return not pa_version_under16p0 and pa.types.is_string_view(typ) + + # TODO: Inherit directly from BaseStringArrayMethods. 
Currently we inherit from # ObjectStringArrayMixin because we want to have the object-dtype based methods as # fallback for the ones that pyarrow doesn't yet support @@ -122,11 +127,13 @@ def __init__(self, values) -> None: _chk_pyarrow_available() if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( pa.types.is_string(values.type) + or _is_string_view(values.type) or ( pa.types.is_dictionary(values.type) and ( pa.types.is_string(values.type.value_type) or pa.types.is_large_string(values.type.value_type) + or _is_string_view(values.type.value_type) ) ) ): diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 2f3840e92b62a..aa87f5fc0f49a 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -99,6 +99,20 @@ def test_constructor_valid_string_type_value_dictionary(string_type, chunked): assert pa.types.is_large_string(arr._pa_array.type) +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_valid_string_view(chunked): + # requires pyarrow>=18 for casting string_view to string + pa = pytest.importorskip("pyarrow", minversion="18") + + arr = pa.array(["1", "2", "3"], pa.string_view()) + if chunked: + arr = pa.chunked_array(arr) + + arr = ArrowStringArray(arr) + # string_view input gets converted to a large_string array + assert pa.types.is_large_string(arr._pa_array.type) + + def test_constructor_from_list(): # GH#27673 pytest.importorskip("pyarrow")