From f499f275c437333e15fc0d2fb3321e8534bd50d8 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 16:22:23 -0500 Subject: [PATCH 01/73] First function is working: is_alnum. --- pyproject.toml | 3 +- src/awkward/operations/__init__.py | 2 +- src/awkward/operations/str/__init__.py | 21 +++++++++ src/awkward/operations/str/ak_is_alnum.py | 54 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 32 +++++++++++++ 5 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 src/awkward/operations/str/__init__.py create mode 100644 src/awkward/operations/str/ak_is_alnum.py create mode 100644 tests/test_2616_use_pyarrow_for_strings.py diff --git a/pyproject.toml b/pyproject.toml index 87a1b29507..d4b6ce3d3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -309,7 +309,8 @@ mccabe.max-complexity = 100 "src/awkward/_connect/*" = ["TID251"] "src/awkward/__init__.py" = ["E402", "F401", "F403", "I001"] "src/awkward/_ext.py" = ["F401"] -"src/awkward/operations/__init__.py" = ["F403"] +"src/awkward/operations/__init__.py" = ["F401", "F403"] +"src/awkward/operations/str/__init__.py" = ["F401", "F403"] "src/awkward/_nplikes/*" = ["TID251"] "src/awkward/_operators.py" = ["TID251"] "tests*/*" = ["T20", "TID251"] diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py index 450e4679de..f378a12dc7 100644 --- a/src/awkward/operations/__init__.py +++ b/src/awkward/operations/__init__.py @@ -1,6 +1,6 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE -# ruff: noqa: F401 +import awkward.operations.str from awkward.operations.ak_all import * from awkward.operations.ak_almost_equal import * from awkward.operations.ak_any import * diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py new file mode 100644 index 0000000000..1529f7411f --- /dev/null +++ b/src/awkward/operations/str/__init__.py @@ -0,0 +1,21 @@ +# BSD 3-Clause License; see 
https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +from awkward.operations.str.ak_is_alnum import * + + +def get_action(utf8_function, ascii_function): + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + def action(layout, **kwargs): + if layout.is_list and layout.parameter("__array__") == "string": + return from_arrow( + utf8_function(to_arrow(layout, extensionarray=False)), highlevel=False + ) + + elif layout.is_list and layout.parameter("__array__") == "bytestring": + return from_arrow( + ascii_function(to_arrow(layout, extensionarray=False)), highlevel=False + ) + + return action diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/ak_is_alnum.py new file mode 100644 index 0000000000..3c1f667ed6 --- /dev/null +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -0,0 +1,54 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_alnum",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_alnum(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with booleans indicating whether they are alphanumeric. + + Note: this function does not raise an error if the `array` does + not contain any string data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alnum.html) + or + [pyarrow.compute.ascii_isalnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_alnum.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str.get_action( + pyarrow.compute.utf8_is_alnum, pyarrow.compute.ascii_is_alnum + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py new file mode 100644 index 0000000000..0be962bc7f --- /dev/null +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -0,0 +1,32 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +import awkward as ak + +string = ak.Array( + [ + ["\u03b1\u03b2\u03b3", ""], + [], + ["\u2192\u03b4\u03b5\u2190", "\u03b6z z\u03b6", "abc"], + ] +) +bytestring = ak.Array( + [ + ["\u03b1\u03b2\u03b3".encode(), b""], + [], + ["\u2192\u03b4\u03b5\u2190".encode(), "\u03b6z z\u03b6".encode(), b"abc"], + ] +) + + +def test_is_alnum(): + assert ak.str.is_alnum(string).tolist() == [ + [True, False], + [], + [False, False, True], + ] + # ArrowNotImplementedError + # assert ak.str.is_alnum(bytestring).tolist() == [ + # [False, False], + # [], + # [False, False, True], + # ] From 018b8e3351fc90f842588cd4ae2c3c3cc50c2cb2 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 16:51:23 -0500 Subject: [PATCH 02/73] is_alpha --- src/awkward/operations/str/__init__.py | 26 +++++++++-- src/awkward/operations/str/ak_is_alnum.py | 6 
+-- src/awkward/operations/str/ak_is_alpha.py | 54 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 28 ++++++++--- 4 files changed, 101 insertions(+), 13 deletions(-) create mode 100644 src/awkward/operations/str/ak_is_alpha.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 1529f7411f..0bf724411c 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -1,9 +1,10 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE from awkward.operations.str.ak_is_alnum import * +from awkward.operations.str.ak_is_alpha import * -def get_action(utf8_function, ascii_function): +def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): from awkward.operations.ak_from_arrow import from_arrow from awkward.operations.ak_to_arrow import to_arrow @@ -14,8 +15,25 @@ def action(layout, **kwargs): ) elif layout.is_list and layout.parameter("__array__") == "bytestring": - return from_arrow( - ascii_function(to_arrow(layout, extensionarray=False)), highlevel=False - ) + if bytestring_to_string: + return from_arrow( + ascii_function( + to_arrow( + layout.copy( + content=layout.content.copy( + parameters={"__array__": "char"} + ), + parameters={"__array__": "string"}, + ), + extensionarray=False, + ) + ), + highlevel=False, + ) + else: + return from_arrow( + ascii_function(to_arrow(layout, extensionarray=False)), + highlevel=False, + ) return action diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/ak_is_alnum.py index 3c1f667ed6..880894df3f 100644 --- a/src/awkward/operations/str/ak_is_alnum.py +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -39,14 +39,14 @@ def is_alnum(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): import awkward._connect.pyarrow # noqa: F401, I001 - import pyarrow.compute + import pyarrow.compute as pc behavior = behavior_of(array, 
behavior=behavior) out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str.get_action( - pyarrow.compute.utf8_is_alnum, pyarrow.compute.ascii_is_alnum + ak.operations.str._get_action( + pc.utf8_is_alnum, pc.ascii_is_alnum, bytestring_to_string=True ), behavior, ) diff --git a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/ak_is_alpha.py new file mode 100644 index 0000000000..94e26aa142 --- /dev/null +++ b/src/awkward/operations/str/ak_is_alpha.py @@ -0,0 +1,54 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_alpha",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_alpha(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with booleans indicating whether they are alphanumeric. + + Note: this function does not raise an error if the `array` does + not contain any string data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alpha.html) + or + [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_alpha.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_is_alpha, pc.ascii_is_alpha, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 0be962bc7f..dd533d03ce 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -1,7 +1,11 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE +import pytest + import awkward as ak +pytest.importorskip("pyarrow") + string = ak.Array( [ ["\u03b1\u03b2\u03b3", ""], @@ -24,9 +28,21 @@ def test_is_alnum(): [], [False, False, True], ] - # ArrowNotImplementedError - # assert ak.str.is_alnum(bytestring).tolist() == [ - # [False, False], - # [], - # [False, False, True], - # ] + assert ak.str.is_alnum(bytestring).tolist() == [ + [False, False], + [], + [False, False, True], + ] + + +def test_is_alpha(): + assert ak.str.is_alpha(string).tolist() == [ + [True, False], + [], + [False, False, True], + ] + assert ak.str.is_alpha(bytestring).tolist() == [ + [False, False], + [], + [False, False, True], + ] From 1d97c326c3d1dda330a5a96dbfaa646bbffedb6b Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 16:53:06 -0500 Subject: [PATCH 03/73] is_decimal --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_is_decimal.py | 54 +++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 68 insertions(+) create mode 100644 src/awkward/operations/str/ak_is_decimal.py diff --git 
a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 0bf724411c..550df65c11 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -2,6 +2,7 @@ from awkward.operations.str.ak_is_alnum import * from awkward.operations.str.ak_is_alpha import * +from awkward.operations.str.ak_is_decimal import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/ak_is_decimal.py new file mode 100644 index 0000000000..15ab147eee --- /dev/null +++ b/src/awkward/operations/str/ak_is_decimal.py @@ -0,0 +1,54 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_decimal",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_decimal(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with booleans indicating whether they are alphanumeric. + + Note: this function does not raise an error if the `array` does + not contain any string data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_decimal.html) + or + [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_decimal.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_is_decimal, pc.ascii_is_decimal, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index dd533d03ce..b04b299679 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -46,3 +46,16 @@ def test_is_alpha(): [], [False, False, True], ] + + +def test_is_decimal(): + assert ak.str.is_decimal(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_decimal(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] From f3d20750021e31197695e0da01717b4270db9355 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 17:03:29 -0500 Subject: [PATCH 04/73] is_lower --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_is_alnum.py | 6 ++- src/awkward/operations/str/ak_is_alpha.py | 6 ++- src/awkward/operations/str/ak_is_decimal.py | 6 ++- src/awkward/operations/str/ak_is_lower.py | 56 +++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 6 files changed, 82 insertions(+), 6 deletions(-) create mode 100644 src/awkward/operations/str/ak_is_lower.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 550df65c11..572ff911ac 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -3,6 +3,7 @@ from awkward.operations.str.ak_is_alnum import * from awkward.operations.str.ak_is_alpha import * from 
awkward.operations.str.ak_is_decimal import * +from awkward.operations.str.ak_is_lower import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/ak_is_alnum.py index 880894df3f..6ced0234c6 100644 --- a/src/awkward/operations/str/ak_is_alnum.py +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -18,10 +18,12 @@ def is_alnum(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with booleans indicating whether they are alphanumeric. + Replaces any string-valued data with True iff the string is non-empty and consists only of alphanumeric Unicode characters. + + Replaces any bytestring-valued data with True iff the string is non-empty and consists only of alphanumeric ASCII characters. Note: this function does not raise an error if the `array` does - not contain any string data. + not contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_isalnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alnum.html) diff --git a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/ak_is_alpha.py index 94e26aa142..1910a51e90 100644 --- a/src/awkward/operations/str/ak_is_alpha.py +++ b/src/awkward/operations/str/ak_is_alpha.py @@ -18,10 +18,12 @@ def is_alpha(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with booleans indicating whether they are alphanumeric. + Replaces any string-valued data with True iff the string is non-empty and consists only of alphabetic Unicode characters. + + Replaces any bytestring-valued data with True iff the string is non-empty and consists only of alphabetic ASCII characters. 
Note: this function does not raise an error if the `array` does - not contain any string data. + not contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alpha.html) diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/ak_is_decimal.py index 15ab147eee..0ea853d3e1 100644 --- a/src/awkward/operations/str/ak_is_decimal.py +++ b/src/awkward/operations/str/ak_is_decimal.py @@ -18,10 +18,12 @@ def is_decimal(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with booleans indicating whether they are alphanumeric. + Replaces any string-valued data True iff the string is non-empty and consists only of decimal Unicode characters. + + Replaces any bytestring-valued data True iff the string is non-empty and consists only of decimal ASCII characters. Note: this function does not raise an error if the `array` does - not contain any string data. + not contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_decimal.html) diff --git a/src/awkward/operations/str/ak_is_lower.py b/src/awkward/operations/str/ak_is_lower.py new file mode 100644 index 0000000000..30e1b206d0 --- /dev/null +++ b/src/awkward/operations/str/ak_is_lower.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_lower",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_lower(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). 
+ highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data True iff the string is non-empty and consists only of lowercase Unicode characters. + + Replaces any bytestring-valued data True iff the string is non-empty and consists only of lowercase ASCII characters. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_lower.html) + or + [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_lower.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_is_lower, pc.ascii_is_lower, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index b04b299679..2023ed0481 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -59,3 +59,16 @@ def test_is_decimal(): [], [False, False, False], ] + + +def test_is_lower(): + assert ak.str.is_lower(string).tolist() == [ + [True, False], + [], + [True, True, True], + ] + assert ak.str.is_lower(bytestring).tolist() == [ + [False, False], + [], + [False, True, True], + ] From 784dc689cfc37c78834c606d4f6501938d294a4d Mon Sep 17 00:00:00 
2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 17:07:22 -0500 Subject: [PATCH 05/73] is_digit --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_is_digit.py | 58 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 72 insertions(+) create mode 100644 src/awkward/operations/str/ak_is_digit.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 572ff911ac..97378a31fb 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -3,6 +3,7 @@ from awkward.operations.str.ak_is_alnum import * from awkward.operations.str.ak_is_alpha import * from awkward.operations.str.ak_is_decimal import * +from awkward.operations.str.ak_is_digit import * from awkward.operations.str.ak_is_lower import * diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/ak_is_digit.py new file mode 100644 index 0000000000..91d0113155 --- /dev/null +++ b/src/awkward/operations/str/ak_is_digit.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_digit",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_digit(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data True iff the string is non-empty and consists only of Unicode digits. + + Replaces any bytestring-valued data True iff the string is non-empty and consists only of Unicode digits. 
+ + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_digit.html) + or + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_digit.html) + on strings and bytestrings, respectively. + + (Arrow's compute module does not have an `ascii_is_digit`.) + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_is_digit, pc.utf8_is_digit, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 2023ed0481..e5ab8f98a0 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -61,6 +61,19 @@ def test_is_decimal(): ] +def test_is_digit(): + assert ak.str.is_digit(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_digit(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] + + def test_is_lower(): assert ak.str.is_lower(string).tolist() == [ [True, False], From 73b346dbf1cd24ff7bc1d657f6ae87db8e02d98c Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 17:10:43 -0500 Subject: [PATCH 06/73] is_numeric --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_is_numeric.py | 58 +++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 72 insertions(+) create mode 100644 
src/awkward/operations/str/ak_is_numeric.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 97378a31fb..579ac11230 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -5,6 +5,7 @@ from awkward.operations.str.ak_is_decimal import * from awkward.operations.str.ak_is_digit import * from awkward.operations.str.ak_is_lower import * +from awkward.operations.str.ak_is_numeric import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_is_numeric.py b/src/awkward/operations/str/ak_is_numeric.py new file mode 100644 index 0000000000..cdb250411b --- /dev/null +++ b/src/awkward/operations/str/ak_is_numeric.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_numeric",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_numeric(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data true iff the string is non-empty and consists only of numeric Unicode characters. + + Replaces any bytestring-valued data true iff the string is non-empty and consists only of numeric Unicode characters. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_numeric.html) + or + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_numeric.html) + on strings and bytestrings, respectively. + + (Arrow's compute module does not have an `ascii_is_numeric`.) + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_is_numeric, pc.utf8_is_numeric, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index e5ab8f98a0..0889d43146 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -85,3 +85,16 @@ def test_is_lower(): [], [False, True, True], ] + + +def test_is_numeric(): + assert ak.str.is_numeric(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_numeric(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] From eff2dfebc4b99432fc42d8aea61fdfd53d0f8703 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 17:13:14 -0500 Subject: [PATCH 07/73] is_printable --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_is_printable.py | 56 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 70 insertions(+) create mode 100644 src/awkward/operations/str/ak_is_printable.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 
579ac11230..a7c6257368 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -6,6 +6,7 @@ from awkward.operations.str.ak_is_digit import * from awkward.operations.str.ak_is_lower import * from awkward.operations.str.ak_is_numeric import * +from awkward.operations.str.ak_is_printable import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/ak_is_printable.py new file mode 100644 index 0000000000..108f8d6fc7 --- /dev/null +++ b/src/awkward/operations/str/ak_is_printable.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_printable",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_printable(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data True iff the string is non-empty and consists only of printable Unicode characters. + + Replaces any bytestring-valued data True iff the string is non-empty and consists only of printable ASCII characters. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_printable.html) + or + [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_printable.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_is_printable, pc.ascii_is_printable, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 0889d43146..8dcee8c4c6 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -98,3 +98,16 @@ def test_is_numeric(): [], [False, False, False], ] + + +def test_is_printable(): + assert ak.str.is_printable(string).tolist() == [ + [True, True], + [], + [True, True, True], + ] + assert ak.str.is_printable(bytestring).tolist() == [ + [False, True], + [], + [False, False, True], + ] From 82b5a7bc9c8f8b3ec52f4d529c46e2cfec787e83 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 17:16:18 -0500 Subject: [PATCH 08/73] is_space --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_is_space.py | 56 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 70 insertions(+) create mode 100644 src/awkward/operations/str/ak_is_space.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index a7c6257368..ca5cea673c 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -7,6 +7,7 @@ from awkward.operations.str.ak_is_lower import * from awkward.operations.str.ak_is_numeric import * from awkward.operations.str.ak_is_printable import * +from awkward.operations.str.ak_is_space import * def _get_action(utf8_function, ascii_function, *, 
bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/ak_is_space.py new file mode 100644 index 0000000000..fbcd54ce74 --- /dev/null +++ b/src/awkward/operations/str/ak_is_space.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_space",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_space(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data True iff the string is non-empty and consists only of whitespace Unicode characters. + + Replaces any bytestring-valued data True iff the string is non-empty and consists only of whitespace ASCII characters. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_space.html) + or + [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_space.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_is_space, pc.ascii_is_space, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 8dcee8c4c6..d2dba86457 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -111,3 +111,16 @@ def test_is_printable(): [], [False, False, True], ] + + +def test_is_space(): + assert ak.str.is_space(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_space(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] From c8c669c81f777affcaa1b4d53fc9fbc2dbc5c81c Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 17:25:38 -0500 Subject: [PATCH 09/73] is_upper --- src/awkward/operations/str/__init__.py | 4 ++ src/awkward/operations/str/ak_is_alnum.py | 4 +- src/awkward/operations/str/ak_is_alpha.py | 4 +- src/awkward/operations/str/ak_is_decimal.py | 4 +- src/awkward/operations/str/ak_is_digit.py | 4 +- src/awkward/operations/str/ak_is_lower.py | 4 +- src/awkward/operations/str/ak_is_numeric.py | 4 +- src/awkward/operations/str/ak_is_printable.py | 4 +- src/awkward/operations/str/ak_is_space.py | 4 +- src/awkward/operations/str/ak_is_upper.py | 59 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 ++++ 11 files changed, 92 insertions(+), 16 deletions(-) create mode 100644 src/awkward/operations/str/ak_is_upper.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py 
index ca5cea673c..fe4e7c65f7 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -1,5 +1,8 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE +# https://arrow.apache.org/docs/python/api/compute.html#string-predicates + +# string predicates from awkward.operations.str.ak_is_alnum import * from awkward.operations.str.ak_is_alpha import * from awkward.operations.str.ak_is_decimal import * @@ -8,6 +11,7 @@ from awkward.operations.str.ak_is_numeric import * from awkward.operations.str.ak_is_printable import * from awkward.operations.str.ak_is_space import * +from awkward.operations.str.ak_is_upper import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/ak_is_alnum.py index 6ced0234c6..fd21786918 100644 --- a/src/awkward/operations/str/ak_is_alnum.py +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -18,9 +18,9 @@ def is_alnum(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True iff the string is non-empty and consists only of alphanumeric Unicode characters. + Replaces any string-valued data with True if the string is non-empty and consists only of alphanumeric Unicode characters, False otherwise. - Replaces any bytestring-valued data with True iff the string is non-empty and consists only of alphanumeric ASCII characters. + Replaces any bytestring-valued data with True if the string is non-empty and consists only of alphanumeric ASCII characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. 
diff --git a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/ak_is_alpha.py index 1910a51e90..1ac2860624 100644 --- a/src/awkward/operations/str/ak_is_alpha.py +++ b/src/awkward/operations/str/ak_is_alpha.py @@ -18,9 +18,9 @@ def is_alpha(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True iff the string is non-empty and consists only of alphabetic Unicode characters. + Replaces any string-valued data with True if the string is non-empty and consists only of alphabetic Unicode characters, False otherwise. - Replaces any bytestring-valued data with True iff the string is non-empty and consists only of alphabetic ASCII characters. + Replaces any bytestring-valued data with True if the string is non-empty and consists only of alphabetic ASCII characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/ak_is_decimal.py index 0ea853d3e1..b1a7a51e32 100644 --- a/src/awkward/operations/str/ak_is_decimal.py +++ b/src/awkward/operations/str/ak_is_decimal.py @@ -18,9 +18,9 @@ def is_decimal(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True iff the string is non-empty and consists only of decimal Unicode characters. + Replaces any string-valued data True if the string is non-empty and consists only of decimal Unicode characters, False otherwise. - Replaces any bytestring-valued data True iff the string is non-empty and consists only of decimal ASCII characters. + Replaces any bytestring-valued data True if the string is non-empty and consists only of decimal ASCII characters, False otherwise. 
Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/ak_is_digit.py index 91d0113155..100ab33b07 100644 --- a/src/awkward/operations/str/ak_is_digit.py +++ b/src/awkward/operations/str/ak_is_digit.py @@ -18,9 +18,9 @@ def is_digit(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True iff the string is non-empty and consists only of Unicode digits. + Replaces any string-valued data True if the string is non-empty and consists only of Unicode digits, False otherwise. - Replaces any bytestring-valued data True iff the string is non-empty and consists only of Unicode digits. + Replaces any bytestring-valued data True if the string is non-empty and consists only of Unicode digits, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_lower.py b/src/awkward/operations/str/ak_is_lower.py index 30e1b206d0..6fd261c822 100644 --- a/src/awkward/operations/str/ak_is_lower.py +++ b/src/awkward/operations/str/ak_is_lower.py @@ -18,9 +18,9 @@ def is_lower(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True iff the string is non-empty and consists only of lowercase Unicode characters. + Replaces any string-valued data True if the string is non-empty and consists only of lowercase Unicode characters, False otherwise. - Replaces any bytestring-valued data True iff the string is non-empty and consists only of lowercase ASCII characters. + Replaces any bytestring-valued data True if the string is non-empty and consists only of lowercase ASCII characters, False otherwise. 
Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_numeric.py b/src/awkward/operations/str/ak_is_numeric.py index cdb250411b..01f9d6fccc 100644 --- a/src/awkward/operations/str/ak_is_numeric.py +++ b/src/awkward/operations/str/ak_is_numeric.py @@ -18,9 +18,9 @@ def is_numeric(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data true iff the string is non-empty and consists only of numeric Unicode characters. + Replaces any string-valued data true if the string is non-empty and consists only of numeric Unicode characters, False otherwise. - Replaces any bytestring-valued data true iff the string is non-empty and consists only of numeric Unicode characters. + Replaces any bytestring-valued data true if the string is non-empty and consists only of numeric Unicode characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/ak_is_printable.py index 108f8d6fc7..c4dbc1d96f 100644 --- a/src/awkward/operations/str/ak_is_printable.py +++ b/src/awkward/operations/str/ak_is_printable.py @@ -18,9 +18,9 @@ def is_printable(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True iff the string is non-empty and consists only of printable Unicode characters. + Replaces any string-valued data True if the string is non-empty and consists only of printable Unicode characters, False otherwise. - Replaces any bytestring-valued data True iff the string is non-empty and consists only of printable ASCII characters. 
+ Replaces any bytestring-valued data True if the string is non-empty and consists only of printable ASCII characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/ak_is_space.py index fbcd54ce74..13217936cd 100644 --- a/src/awkward/operations/str/ak_is_space.py +++ b/src/awkward/operations/str/ak_is_space.py @@ -18,9 +18,9 @@ def is_space(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True iff the string is non-empty and consists only of whitespace Unicode characters. + Replaces any string-valued data True if the string is non-empty and consists only of whitespace Unicode characters, False otherwise. - Replaces any bytestring-valued data True iff the string is non-empty and consists only of whitespace ASCII characters. + Replaces any bytestring-valued data True if the string is non-empty and consists only of whitespace ASCII characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_upper.py b/src/awkward/operations/str/ak_is_upper.py new file mode 100644 index 0000000000..bb35d2e6d1 --- /dev/null +++ b/src/awkward/operations/str/ak_is_upper.py @@ -0,0 +1,59 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_upper",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_upper(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). 
+ highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data True if the string is non-empty and consists only of uppercase Unicode characters, False otherwise. + + Replaces any bytestring-valued data True if the string is non-empty and consists only of uppercase ASCII characters, False otherwise. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_upper.html) + or + [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_upper.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_is_upper, + pc.ascii_is_upper, + # pc.ascii_is_upper is defined on binary, but for consistency with lower... 
+ bytestring_to_string=True, + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index d2dba86457..b0928e1c18 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -124,3 +124,16 @@ def test_is_space(): [], [False, False, False], ] + + +def test_is_upper(): + assert ak.str.is_space(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_space(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] From b9f986844b2309508b0721866fb06bf4f2dc77d9 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 17:36:45 -0500 Subject: [PATCH 10/73] is_title --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_is_decimal.py | 4 +- src/awkward/operations/str/ak_is_digit.py | 4 +- src/awkward/operations/str/ak_is_lower.py | 4 +- src/awkward/operations/str/ak_is_numeric.py | 4 +- src/awkward/operations/str/ak_is_printable.py | 4 +- src/awkward/operations/str/ak_is_space.py | 4 +- src/awkward/operations/str/ak_is_title.py | 56 +++++++++++++++++++ src/awkward/operations/str/ak_is_upper.py | 6 +- tests/test_2616_use_pyarrow_for_strings.py | 17 +++++- 10 files changed, 87 insertions(+), 17 deletions(-) create mode 100644 src/awkward/operations/str/ak_is_title.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index fe4e7c65f7..73793a8626 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -11,6 +11,7 @@ from awkward.operations.str.ak_is_numeric import * from awkward.operations.str.ak_is_printable import * from awkward.operations.str.ak_is_space import * +from awkward.operations.str.ak_is_title import * from awkward.operations.str.ak_is_upper import * diff --git a/src/awkward/operations/str/ak_is_decimal.py 
b/src/awkward/operations/str/ak_is_decimal.py index b1a7a51e32..c367086875 100644 --- a/src/awkward/operations/str/ak_is_decimal.py +++ b/src/awkward/operations/str/ak_is_decimal.py @@ -18,9 +18,9 @@ def is_decimal(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True if the string is non-empty and consists only of decimal Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and consists only of decimal Unicode characters, False otherwise. - Replaces any bytestring-valued data True if the string is non-empty and consists only of decimal ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty and consists only of decimal ASCII characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/ak_is_digit.py index 100ab33b07..8797cb9f51 100644 --- a/src/awkward/operations/str/ak_is_digit.py +++ b/src/awkward/operations/str/ak_is_digit.py @@ -18,9 +18,9 @@ def is_digit(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True if the string is non-empty and consists only of Unicode digits, False otherwise. + Replaces any string-valued data with True if the string is non-empty and consists only of Unicode digits, False otherwise. - Replaces any bytestring-valued data True if the string is non-empty and consists only of Unicode digits, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty and consists only of Unicode digits, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. 
diff --git a/src/awkward/operations/str/ak_is_lower.py b/src/awkward/operations/str/ak_is_lower.py index 6fd261c822..f244d0740b 100644 --- a/src/awkward/operations/str/ak_is_lower.py +++ b/src/awkward/operations/str/ak_is_lower.py @@ -18,9 +18,9 @@ def is_lower(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True if the string is non-empty and consists only of lowercase Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and consists only of lowercase Unicode characters, False otherwise. - Replaces any bytestring-valued data True if the string is non-empty and consists only of lowercase ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty and consists only of lowercase ASCII characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_numeric.py b/src/awkward/operations/str/ak_is_numeric.py index 01f9d6fccc..a6ac673580 100644 --- a/src/awkward/operations/str/ak_is_numeric.py +++ b/src/awkward/operations/str/ak_is_numeric.py @@ -18,9 +18,9 @@ def is_numeric(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data true if the string is non-empty and consists only of numeric Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and consists only of numeric Unicode characters, False otherwise. - Replaces any bytestring-valued data true if the string is non-empty and consists only of numeric Unicode characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty and consists only of numeric Unicode characters, False otherwise. 
Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/ak_is_printable.py index c4dbc1d96f..a6d78f98d8 100644 --- a/src/awkward/operations/str/ak_is_printable.py +++ b/src/awkward/operations/str/ak_is_printable.py @@ -18,9 +18,9 @@ def is_printable(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True if the string is non-empty and consists only of printable Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and consists only of printable Unicode characters, False otherwise. - Replaces any bytestring-valued data True if the string is non-empty and consists only of printable ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty and consists only of printable ASCII characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/ak_is_space.py index 13217936cd..8db50c6151 100644 --- a/src/awkward/operations/str/ak_is_space.py +++ b/src/awkward/operations/str/ak_is_space.py @@ -18,9 +18,9 @@ def is_space(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True if the string is non-empty and consists only of whitespace Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and consists only of whitespace Unicode characters, False otherwise. - Replaces any bytestring-valued data True if the string is non-empty and consists only of whitespace ASCII characters, False otherwise. 
+ Replaces any bytestring-valued data with True if the string is non-empty and consists only of whitespace ASCII characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_is_title.py b/src/awkward/operations/str/ak_is_title.py new file mode 100644 index 0000000000..5d318dff2f --- /dev/null +++ b/src/awkward/operations/str/ak_is_title.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_title",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_title(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with True if the string is title-cased, i.e. it has at least one cased character, each uppercase character follows an uncased character, and each lowercase character follows an uppercase character, otherwise False. + + Replaces any bytestring-valued data with True if the string is title-cased, i.e. it has at least one cased character, each uppercase character follows an uncased character, and each lowercase character follows an uppercase character, otherwise False. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_title.html) + or + [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_title.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_is_title, pc.ascii_is_title, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/ak_is_upper.py b/src/awkward/operations/str/ak_is_upper.py index bb35d2e6d1..a3ae3a9082 100644 --- a/src/awkward/operations/str/ak_is_upper.py +++ b/src/awkward/operations/str/ak_is_upper.py @@ -18,9 +18,9 @@ def is_upper(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data True if the string is non-empty and consists only of uppercase Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and consists only of uppercase Unicode characters, False otherwise. - Replaces any bytestring-valued data True if the string is non-empty and consists only of uppercase ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty and consists only of uppercase ASCII characters, False otherwise. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. 
@@ -50,7 +50,7 @@ def _impl(array, highlevel, behavior): ak.operations.str._get_action( pc.utf8_is_upper, pc.ascii_is_upper, - # pc.ascii_is_upper is defined on binary, but for consistency with lower... + # pc.ascii_is_upper is defined on binary, but for consistency with is_lower and is_title... bytestring_to_string=True, ), behavior, diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index b0928e1c18..43029fde77 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -127,12 +127,25 @@ def test_is_space(): def test_is_upper(): - assert ak.str.is_space(string).tolist() == [ + assert ak.str.is_upper(string).tolist() == [ [False, False], [], [False, False, False], ] - assert ak.str.is_space(bytestring).tolist() == [ + assert ak.str.is_upper(bytestring).tolist() == [ + [False, False], + [], + [False, False, False], + ] + + +def test_is_title(): + assert ak.str.is_title(string).tolist() == [ + [False, False], + [], + [False, False, False], + ] + assert ak.str.is_title(bytestring).tolist() == [ [False, False], [], [False, False, False], From 88709b24bf7a0d870c8fd1776fc2fc6e25c37c96 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 17:45:22 -0500 Subject: [PATCH 11/73] is_ascii; done with string predicates --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_is_alnum.py | 4 +- src/awkward/operations/str/ak_is_alpha.py | 4 +- src/awkward/operations/str/ak_is_ascii.py | 56 +++++++++++++++++++ src/awkward/operations/str/ak_is_decimal.py | 4 +- src/awkward/operations/str/ak_is_digit.py | 4 +- src/awkward/operations/str/ak_is_lower.py | 4 +- src/awkward/operations/str/ak_is_numeric.py | 4 +- src/awkward/operations/str/ak_is_printable.py | 4 +- src/awkward/operations/str/ak_is_space.py | 4 +- src/awkward/operations/str/ak_is_title.py | 4 +- src/awkward/operations/str/ak_is_upper.py | 4 +- tests/test_2616_use_pyarrow_for_strings.py | 13 
+++++ 13 files changed, 90 insertions(+), 20 deletions(-) create mode 100644 src/awkward/operations/str/ak_is_ascii.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 73793a8626..96a0b36e7b 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -5,6 +5,7 @@ # string predicates from awkward.operations.str.ak_is_alnum import * from awkward.operations.str.ak_is_alpha import * +from awkward.operations.str.ak_is_ascii import * from awkward.operations.str.ak_is_decimal import * from awkward.operations.str.ak_is_digit import * from awkward.operations.str.ak_is_lower import * diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/ak_is_alnum.py index fd21786918..ac28e085b8 100644 --- a/src/awkward/operations/str/ak_is_alnum.py +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -26,9 +26,9 @@ def is_alnum(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alnum.html) + [pyarrow.compute.utf8_is_alnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alnum.html) or - [pyarrow.compute.ascii_isalnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_alnum.html) + [pyarrow.compute.ascii_is_alnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_alnum.html) on strings and bytestrings, respectively. """ # Dispatch diff --git a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/ak_is_alpha.py index 1ac2860624..283ad5a4c6 100644 --- a/src/awkward/operations/str/ak_is_alpha.py +++ b/src/awkward/operations/str/ak_is_alpha.py @@ -26,9 +26,9 @@ def is_alpha(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. 
Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alpha.html) + [pyarrow.compute.utf8_is_alpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alpha.html) or - [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_alpha.html) + [pyarrow.compute.ascii_is_alpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_alpha.html) on strings and bytestrings, respectively. """ # Dispatch diff --git a/src/awkward/operations/str/ak_is_ascii.py b/src/awkward/operations/str/ak_is_ascii.py new file mode 100644 index 0000000000..c00d349048 --- /dev/null +++ b/src/awkward/operations/str/ak_is_ascii.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_ascii",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_ascii(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with True iff the string consists only of ASCII characters, False otherwise. + + Replaces any bytestring-valued data with True iff the string consists only of ASCII characters, False otherwise. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.string_is_ascii](https://arrow.apache.org/docs/python/generated/pyarrow.compute.string_is_ascii.html) + or + [pyarrow.compute.string_is_ascii](https://arrow.apache.org/docs/python/generated/pyarrow.compute.string_is_ascii.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.string_is_ascii, pc.string_is_ascii, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/ak_is_decimal.py index c367086875..8a2f4b0fe7 100644 --- a/src/awkward/operations/str/ak_is_decimal.py +++ b/src/awkward/operations/str/ak_is_decimal.py @@ -26,9 +26,9 @@ def is_decimal(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_decimal.html) + [pyarrow.compute.utf8_is_decimal](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_decimal.html) or - [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_decimal.html) + [pyarrow.compute.ascii_is_decimal](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_decimal.html) on strings and bytestrings, respectively. 
""" # Dispatch diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/ak_is_digit.py index 8797cb9f51..3cd5f343ae 100644 --- a/src/awkward/operations/str/ak_is_digit.py +++ b/src/awkward/operations/str/ak_is_digit.py @@ -26,9 +26,9 @@ def is_digit(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_digit.html) + [pyarrow.compute.utf8_is_digit](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_digit.html) or - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_digit.html) + [pyarrow.compute.utf8_is_digit](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_digit.html) on strings and bytestrings, respectively. (Arrow's compute module does not have an `ascii_is_digit`.) diff --git a/src/awkward/operations/str/ak_is_lower.py b/src/awkward/operations/str/ak_is_lower.py index f244d0740b..74c832ba77 100644 --- a/src/awkward/operations/str/ak_is_lower.py +++ b/src/awkward/operations/str/ak_is_lower.py @@ -26,9 +26,9 @@ def is_lower(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_lower.html) + [pyarrow.compute.utf8_is_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_lower.html) or - [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_lower.html) + [pyarrow.compute.ascii_is_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_lower.html) on strings and bytestrings, respectively. 
""" # Dispatch diff --git a/src/awkward/operations/str/ak_is_numeric.py b/src/awkward/operations/str/ak_is_numeric.py index a6ac673580..9bf89c814a 100644 --- a/src/awkward/operations/str/ak_is_numeric.py +++ b/src/awkward/operations/str/ak_is_numeric.py @@ -26,9 +26,9 @@ def is_numeric(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_numeric.html) + [pyarrow.compute.utf8_is_numeric](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_numeric.html) or - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_numeric.html) + [pyarrow.compute.utf8_is_numeric](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_numeric.html) on strings and bytestrings, respectively. (Arrow's compute module does not have an `ascii_is_numeric`.) diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/ak_is_printable.py index a6d78f98d8..cf42bfcc97 100644 --- a/src/awkward/operations/str/ak_is_printable.py +++ b/src/awkward/operations/str/ak_is_printable.py @@ -26,9 +26,9 @@ def is_printable(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_printable.html) + [pyarrow.compute.utf8_is_printable](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_printable.html) or - [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_printable.html) + [pyarrow.compute.ascii_is_printable](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_printable.html) on strings and bytestrings, respectively. 
""" # Dispatch diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/ak_is_space.py index 8db50c6151..00ace2eb51 100644 --- a/src/awkward/operations/str/ak_is_space.py +++ b/src/awkward/operations/str/ak_is_space.py @@ -26,9 +26,9 @@ def is_space(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_space.html) + [pyarrow.compute.utf8_is_space](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_space.html) or - [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_space.html) + [pyarrow.compute.ascii_is_space](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_space.html) on strings and bytestrings, respectively. """ # Dispatch diff --git a/src/awkward/operations/str/ak_is_title.py b/src/awkward/operations/str/ak_is_title.py index 5d318dff2f..e463d00685 100644 --- a/src/awkward/operations/str/ak_is_title.py +++ b/src/awkward/operations/str/ak_is_title.py @@ -26,9 +26,9 @@ def is_title(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_title.html) + [pyarrow.compute.utf8_is_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_title.html) or - [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_title.html) + [pyarrow.compute.ascii_is_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_title.html) on strings and bytestrings, respectively. 
""" # Dispatch diff --git a/src/awkward/operations/str/ak_is_upper.py b/src/awkward/operations/str/ak_is_upper.py index a3ae3a9082..8cff3a78bc 100644 --- a/src/awkward/operations/str/ak_is_upper.py +++ b/src/awkward/operations/str/ak_is_upper.py @@ -26,9 +26,9 @@ def is_upper(array, *, highlevel=True, behavior=None): not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.utf8_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_upper.html) + [pyarrow.compute.utf8_is_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_upper.html) or - [pyarrow.compute.ascii_isalpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_upper.html) + [pyarrow.compute.ascii_is_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_is_upper.html) on strings and bytestrings, respectively. """ # Dispatch diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 43029fde77..7cbfc0a6ed 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -150,3 +150,16 @@ def test_is_title(): [], [False, False, False], ] + + +def test_is_ascii(): + assert ak.str.is_ascii(string).tolist() == [ + [False, True], + [], + [False, False, True], + ] + assert ak.str.is_ascii(bytestring).tolist() == [ + [False, True], + [], + [False, False, True], + ] From 7a5463a107fe05fc7f5c0352626a3f1b6fb7fc26 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 17:59:39 -0500 Subject: [PATCH 12/73] capitalize --- pyproject.toml | 2 +- src/awkward/operations/str/__init__.py | 7 ++- src/awkward/operations/str/ak_capitalize.py | 56 +++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 15 ++++++ 4 files changed, 77 insertions(+), 3 deletions(-) create mode 100644 src/awkward/operations/str/ak_capitalize.py diff --git a/pyproject.toml b/pyproject.toml index 
d4b6ce3d3e..24a6fed90a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -310,7 +310,7 @@ mccabe.max-complexity = 100 "src/awkward/__init__.py" = ["E402", "F401", "F403", "I001"] "src/awkward/_ext.py" = ["F401"] "src/awkward/operations/__init__.py" = ["F401", "F403"] -"src/awkward/operations/str/__init__.py" = ["F401", "F403"] +"src/awkward/operations/str/__init__.py" = ["F401", "F403", "I001"] "src/awkward/_nplikes/*" = ["TID251"] "src/awkward/_operators.py" = ["TID251"] "tests*/*" = ["T20", "TID251"] diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 96a0b36e7b..11044a3ac3 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -5,15 +5,18 @@ # string predicates from awkward.operations.str.ak_is_alnum import * from awkward.operations.str.ak_is_alpha import * -from awkward.operations.str.ak_is_ascii import * from awkward.operations.str.ak_is_decimal import * from awkward.operations.str.ak_is_digit import * from awkward.operations.str.ak_is_lower import * from awkward.operations.str.ak_is_numeric import * from awkward.operations.str.ak_is_printable import * from awkward.operations.str.ak_is_space import * -from awkward.operations.str.ak_is_title import * from awkward.operations.str.ak_is_upper import * +from awkward.operations.str.ak_is_title import * +from awkward.operations.str.ak_is_ascii import * + +# string transforms +from awkward.operations.str.ak_capitalize import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_capitalize.py b/src/awkward/operations/str/ak_capitalize.py new file mode 100644 index 0000000000..f86009622e --- /dev/null +++ b/src/awkward/operations/str/ak_capitalize.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("capitalize",) + +import awkward as ak +from awkward._behavior import behavior_of +from 
awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def capitalize(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with a capitalized version, with the first character uppercased and the others lowercased. + + Replaces any bytestring-valued data with a capitalized version (of all ASCII characters). + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_capitalize](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_capitalize.html) + or + [pyarrow.compute.ascii_capitalize](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_capitalize.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_capitalize, pc.ascii_capitalize, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 7cbfc0a6ed..1c4f172240 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -163,3 +163,18 @@ def test_is_ascii(): [], [False, False, True], ] + + +def test_capitalize(): + print(ak.str.capitalize(string)) + + assert ak.str.capitalize(string).tolist() == [ + ["Αβγ", ""], + [], + ["→δε←", "Ζz zζ", "Abc"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) + ] + assert ak.str.capitalize(bytestring).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "Abc"], + ] From 56cb0b19612c559b8b3ad26efe45209b5dfde096 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 18:08:12 -0500 Subject: [PATCH 13/73] lower --- src/awkward/operations/str/__init__.py | 2 + src/awkward/operations/str/ak_capitalize.py | 4 +- src/awkward/operations/str/ak_length.py | 56 +++++++++++++++++++++ src/awkward/operations/str/ak_lower.py | 56 +++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 28 ++++++++++- 5 files changed, 142 insertions(+), 4 deletions(-) create mode 100644 src/awkward/operations/str/ak_length.py create mode 100644 src/awkward/operations/str/ak_lower.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 11044a3ac3..e7b32bb2e5 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -17,6 +17,8 
@@ # string transforms from awkward.operations.str.ak_capitalize import * +from awkward.operations.str.ak_length import * +from awkward.operations.str.ak_lower import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_capitalize.py b/src/awkward/operations/str/ak_capitalize.py index f86009622e..84e2843e00 100644 --- a/src/awkward/operations/str/ak_capitalize.py +++ b/src/awkward/operations/str/ak_capitalize.py @@ -18,9 +18,9 @@ def capitalize(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with a capitalized version, with the first character uppercased and the others lowercased. + Replaces any string-valued data with a capitalized version (correctly transforming Unicode characters), with the first character uppercased and the others lowercased. - Replaces any bytestring-valued data with a capitalized version (of all ASCII characters). + Replaces any bytestring-valued data with a capitalized version (transforming ASCII characters only). Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_length.py b/src/awkward/operations/str/ak_length.py new file mode 100644 index 0000000000..f77ce22f76 --- /dev/null +++ b/src/awkward/operations/str/ak_length.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("length",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def length(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). 
+ highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with its length in Unicode characters (not its length in bytes). + + Replaces any bytestring-valued data with its length of bytes. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_length](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_length.html) + or + [pyarrow.compute.binary_length](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_length.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_length, pc.binary_length, bytestring_to_string=False + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/ak_lower.py b/src/awkward/operations/str/ak_lower.py new file mode 100644 index 0000000000..92766b8f48 --- /dev/null +++ b/src/awkward/operations/str/ak_lower.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("lower",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def lower(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). 
+ highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with a lowercase version (correctly transforming Unicode characters). + + Replaces any bytestring-valued data with a lowercase version (transforming ASCII characters only). + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_lower.html) + or + [pyarrow.compute.ascii_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_lower.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_lower, pc.ascii_lower, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 1c4f172240..40fbb2859a 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -166,8 +166,6 @@ def test_is_ascii(): def test_capitalize(): - print(ak.str.capitalize(string)) - assert ak.str.capitalize(string).tolist() == [ ["Αβγ", ""], [], @@ -178,3 +176,29 @@ def test_capitalize(): [], ["→δε←", "ζz zζ", "Abc"], ] + + +def test_length(): + assert ak.str.length(string).tolist() == [ + [3, 0], + [], + [4, 5, 3], + ] + assert ak.str.length(bytestring).tolist() == [ + [6, 0], + [], + 
[10, 7, 3], + ] + + +def test_lower(): + assert ak.str.lower(string).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.lower(bytestring).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] From 2c1fe11b284a30e1a8cc41283c79d233ef0f86ae Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 18:11:39 -0500 Subject: [PATCH 14/73] upper --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_upper.py | 56 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 70 insertions(+) create mode 100644 src/awkward/operations/str/ak_upper.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index e7b32bb2e5..cd3b1420a2 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -19,6 +19,7 @@ from awkward.operations.str.ak_capitalize import * from awkward.operations.str.ak_length import * from awkward.operations.str.ak_lower import * +from awkward.operations.str.ak_upper import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_upper.py b/src/awkward/operations/str/ak_upper.py new file mode 100644 index 0000000000..f4ae131af7 --- /dev/null +++ b/src/awkward/operations/str/ak_upper.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("upper",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def upper(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. 
+ behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with a uppercase version (correctly transforming Unicode characters). + + Replaces any bytestring-valued data with a uppercase version (transforming ASCII characters only). + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_upper.html) + or + [pyarrow.compute.ascii_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_upper.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_upper, pc.ascii_upper, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 40fbb2859a..969b95ac8b 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -202,3 +202,16 @@ def test_lower(): [], ["→δε←", "ζz zζ", "abc"], ] + + +def test_upper(): + assert ak.str.upper(string).tolist() == [ + ["ΑΒΓ", ""], + [], + ["→ΔΕ←", "ΖZ ZΖ", "ABC"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) + ] + assert ak.str.upper(bytestring).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζZ Zζ", "ABC"], + ] From d7db0423e449acea7a5144cf755e3b8454eb566b Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 18:13:08 -0500 Subject: [PATCH 15/73] swapcase ---
src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_swapcase.py | 56 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 70 insertions(+) create mode 100644 src/awkward/operations/str/ak_swapcase.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index cd3b1420a2..c7b21ad1c6 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -19,6 +19,7 @@ from awkward.operations.str.ak_capitalize import * from awkward.operations.str.ak_length import * from awkward.operations.str.ak_lower import * +from awkward.operations.str.ak_swapcase import * from awkward.operations.str.ak_upper import * diff --git a/src/awkward/operations/str/ak_swapcase.py b/src/awkward/operations/str/ak_swapcase.py new file mode 100644 index 0000000000..cc984af7c1 --- /dev/null +++ b/src/awkward/operations/str/ak_swapcase.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("swapcase",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def swapcase(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with a swapcase version (correctly transforming Unicode characters). + + Replaces any bytestring-valued data with a swapcase version (transforming ASCII characters only). + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_swapcase](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_swapcase.html) + or + [pyarrow.compute.ascii_swapcase](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_swapcase.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_swapcase, pc.ascii_swapcase, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 969b95ac8b..1bfd483864 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -204,6 +204,19 @@ def test_lower(): ] +def test_swapcase(): + assert ak.str.swapcase(string).tolist() == [ + ["ΑΒΓ", ""], + [], + ["→ΔΕ←", "ΖZ ZΖ", "ABC"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) + ] + assert ak.str.swapcase(bytestring).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζZ Zζ", "ABC"], + ] + + def test_upper(): assert ak.str.upper(string).tolist() == [ ["ΑΒΓ", ""], From 951f9b9dab320ddc9697ef3cc8f35e4e14c313dd Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 18:18:31 -0500 Subject: [PATCH 16/73] title --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_swapcase.py | 4 +- src/awkward/operations/str/ak_title.py | 56 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 4 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 src/awkward/operations/str/ak_title.py diff --git 
a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index c7b21ad1c6..4c6c54b1b0 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -20,6 +20,7 @@ from awkward.operations.str.ak_length import * from awkward.operations.str.ak_lower import * from awkward.operations.str.ak_swapcase import * +from awkward.operations.str.ak_title import * from awkward.operations.str.ak_upper import * diff --git a/src/awkward/operations/str/ak_swapcase.py b/src/awkward/operations/str/ak_swapcase.py index cc984af7c1..1ff02dabad 100644 --- a/src/awkward/operations/str/ak_swapcase.py +++ b/src/awkward/operations/str/ak_swapcase.py @@ -18,9 +18,9 @@ def swapcase(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with a swapcase version (correctly transforming Unicode characters). + Replaces any string-valued data with uppercase characters transformed to lowercase and vice-versa (correctly transforming Unicode characters). - Replaces any bytestring-valued data with a swapcase version (transforming ASCII characters only). + Replaces any bytestring-valued data with uppercase characters transformed to lowercase and vice-versa (transforming ASCII characters only). Note: this function does not raise an error if the `array` does not contain any string or bytestring data. 
diff --git a/src/awkward/operations/str/ak_title.py b/src/awkward/operations/str/ak_title.py new file mode 100644 index 0000000000..8314002311 --- /dev/null +++ b/src/awkward/operations/str/ak_title.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("title",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def title(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with a titlecase version (correctly transforming Unicode characters). Each word in the output will start with an uppercase character and its remaining characters will be lowercase. + + Replaces any bytestring-valued data with a titlecase version (transforming ASCII characters only). Each word in the output will start with an uppercase character and its remaining characters will be lowercase. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_title.html) + or + [pyarrow.compute.ascii_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_title.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_title, pc.ascii_title, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 1bfd483864..01b241b442 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -217,6 +217,19 @@ def test_swapcase(): ] +def test_title(): + assert ak.str.title(string).tolist() == [ + ["Αβγ", ""], + [], + ["→Δε←", "Ζz Zζ", "Abc"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) + ] + assert ak.str.title(bytestring).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζZ Zζ", "Abc"], + ] + + def test_upper(): assert ak.str.upper(string).tolist() == [ ["ΑΒΓ", ""], From adab5998505c3d94c09efb69c92443f51d97ef68 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 18:32:51 -0500 Subject: [PATCH 17/73] T -> T operations on bytestrings should return bytestrings. 
--- src/awkward/operations/str/__init__.py | 9 ++++++++- tests/test_2616_use_pyarrow_for_strings.py | 20 ++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 4c6c54b1b0..1d0e380ece 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -36,7 +36,7 @@ def action(layout, **kwargs): elif layout.is_list and layout.parameter("__array__") == "bytestring": if bytestring_to_string: - return from_arrow( + out = from_arrow( ascii_function( to_arrow( layout.copy( @@ -50,6 +50,13 @@ def action(layout, **kwargs): ), highlevel=False, ) + if out.is_list and out.parameter("__array__") == "string": + out = out.copy( + content=out.content.copy(parameters={"__array__": "byte"}), + parameters={"__array__": "bytestring"}, + ) + return out + else: return from_arrow( ascii_function(to_arrow(layout, extensionarray=False)), diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 01b241b442..f034657503 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -172,9 +172,9 @@ def test_capitalize(): ["→δε←", "Ζz zζ", "Abc"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) ] assert ak.str.capitalize(bytestring).tolist() == [ - ["αβγ", ""], + ["αβγ".encode(), b""], [], - ["→δε←", "ζz zζ", "Abc"], + ["→δε←".encode(), "ζz zζ".encode(), b"Abc"], ] @@ -198,9 +198,9 @@ def test_lower(): ["→δε←", "ζz zζ", "abc"], ] assert ak.str.lower(bytestring).tolist() == [ - ["αβγ", ""], + ["αβγ".encode(), b""], [], - ["→δε←", "ζz zζ", "abc"], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], ] @@ -211,9 +211,9 @@ def test_swapcase(): ["→ΔΕ←", "ΖZ ZΖ", "ABC"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) ] assert ak.str.swapcase(bytestring).tolist() == [ - ["αβγ", ""], + ["αβγ".encode(), b""], [], - ["→δε←", "ζZ Zζ", "ABC"], + ["→δε←".encode(), "ζZ 
Zζ".encode(), b"ABC"], ] @@ -224,9 +224,9 @@ def test_title(): ["→Δε←", "Ζz Zζ", "Abc"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) ] assert ak.str.title(bytestring).tolist() == [ - ["αβγ", ""], + ["αβγ".encode(), b""], [], - ["→δε←", "ζZ Zζ", "Abc"], + ["→δε←".encode(), "ζZ Zζ".encode(), b"Abc"], ] @@ -237,7 +237,7 @@ def test_upper(): ["→ΔΕ←", "ΖZ ZΖ", "ABC"], # noqa: RUF001, RUF003 (we care about Ζ vs Z) ] assert ak.str.upper(bytestring).tolist() == [ - ["αβγ", ""], + ["αβγ".encode(), b""], [], - ["→δε←", "ζZ Zζ", "ABC"], + ["→δε←".encode(), "ζZ Zζ".encode(), b"ABC"], ] From 8279fded3627cb0c32950e0c1ee4764e531ffbb5 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 19:14:38 -0500 Subject: [PATCH 18/73] repeat (the first that needs a broadcastable argument) --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_repeat.py | 107 +++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 24 +++++ 3 files changed, 132 insertions(+) create mode 100644 src/awkward/operations/str/ak_repeat.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 1d0e380ece..2ed7b8d7eb 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -22,6 +22,7 @@ from awkward.operations.str.ak_swapcase import * from awkward.operations.str.ak_title import * from awkward.operations.str.ak_upper import * +from awkward.operations.str.ak_repeat import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_repeat.py b/src/awkward/operations/str/ak_repeat.py new file mode 100644 index 0000000000..0bd722acc9 --- /dev/null +++ b/src/awkward/operations/str/ak_repeat.py @@ -0,0 +1,107 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("repeat",) + +import numbers + +import awkward as ak +from awkward._behavior import behavior_of +from 
awkward._dispatch import high_level_function +from awkward._layout import wrap_layout +from awkward._nplikes.numpylike import NumpyMetadata + +np = NumpyMetadata.instance() + + +@high_level_function +def repeat(array, num_repeats, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + num_repeats: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued or bytestring-valued data with the same value repeated `num_repeats` times, which can be a scalar integer or a (broadcasted) array of integers. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.binary_repeat](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_repeat.html) + or + [pyarrow.compute.binary_repeat](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_repeat.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, num_repeats, highlevel, behavior) + + +def _impl(array, num_repeats, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + import pyarrow.compute as pc + + layout = ak.operations.to_layout(array) + behavior = behavior_of(array, behavior=behavior) + + num_repeats_layout = ak.operations.to_layout(num_repeats, allow_other=True) + + if not isinstance(num_repeats_layout, ak.contents.Content): + if not isinstance(num_repeats, numbers.Integral): + raise TypeError( + "num_repeats must be an integer or broadcastable to integers" + ) + num_repeats = int(num_repeats) + + def action(layout, **kwargs): + if layout.is_list and layout.parameter("__array__") in ( + "string", + "bytestring", + ): + return from_arrow( + pc.binary_repeat( + to_arrow(layout, extensionarray=False), num_repeats + ), + highlevel=False, + ) + + out = ak._do.recursively_apply(layout, action, behavior) + + else: + + def action(inputs, **kwargs): + if inputs[0].is_list and inputs[0].parameter("__array__") in ( + "string", + "bytestring", + ): + if not inputs[1].is_numpy or not issubclass( + inputs[1].dtype.type, np.integer + ): + raise TypeError( + "num_repeats must be an integer or broadcastable to integers" + ) + return ( + from_arrow( + pc.binary_repeat( + to_arrow(inputs[0], extensionarray=False), + to_arrow(inputs[1], extensionarray=False), + ), + highlevel=False, + ), + ) + + out = ak._broadcasting.broadcast_and_apply( + (layout, num_repeats_layout), action, behavior + ) + assert isinstance(out, tuple) and len(out) == 1 + out = out[0] + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index f034657503..a97902715e 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ 
b/tests/test_2616_use_pyarrow_for_strings.py @@ -241,3 +241,27 @@ def test_upper(): [], ["→δε←".encode(), "ζZ Zζ".encode(), b"ABC"], ] + + +def test_repeat(): + assert ak.str.repeat(string, 3).tolist() == [ + ["αβγαβγαβγ", ""], + [], + ["→δε←→δε←→δε←", "ζz zζζz zζζz zζ", "abcabcabc"], + ] + assert ak.str.repeat(bytestring, 3).tolist() == [ + ["αβγαβγαβγ".encode(), b""], + [], + ["→δε←→δε←→δε←".encode(), "ζz zζζz zζζz zζ".encode(), b"abcabcabc"], + ] + + assert ak.str.repeat(string, [[3, 3], [], [2, 0, 1]]).tolist() == [ + ["αβγαβγαβγ", ""], + [], + ["→δε←→δε←", "", "abc"], + ] + assert ak.str.repeat(bytestring, [[3, 3], [], [2, 0, 1]]).tolist() == [ + ["αβγαβγαβγ".encode(), b""], + [], + ["→δε←→δε←".encode(), b"", b"abc"], + ] From 4c41240eaed3bf235e083f4cea7957720fba51b4 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 19:24:45 -0500 Subject: [PATCH 19/73] reverse (because it's easy) --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_reverse.py | 56 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 70 insertions(+) create mode 100644 src/awkward/operations/str/ak_reverse.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 2ed7b8d7eb..dc99b99689 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -23,6 +23,7 @@ from awkward.operations.str.ak_title import * from awkward.operations.str.ak_upper import * from awkward.operations.str.ak_repeat import * +from awkward.operations.str.ak_reverse import * def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): diff --git a/src/awkward/operations/str/ak_reverse.py b/src/awkward/operations/str/ak_reverse.py new file mode 100644 index 0000000000..627f8a95cf --- /dev/null +++ b/src/awkward/operations/str/ak_reverse.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE 
+ +__all__ = ("reverse",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def reverse(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Reverses the order of Unicode characters in any string-valued data. (This function operates on Unicode codepoints, not grapheme clusters. Hence, it will not correctly reverse grapheme clusters composed of multiple codepoints.) + + Reverses the order of bytes in any bytestring-valued data. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_reverse](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_reverse.html) + or + [pyarrow.compute.binary_reverse](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_reverse.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_reverse, pc.binary_reverse, bytestring_to_string=False + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index a97902715e..e20cb7205a 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -265,3 +265,16 @@ def test_repeat(): [], ["→δε←→δε←".encode(), b"", b"abc"], ] + + +def test_reverse(): + assert ak.str.reverse(string).tolist() == [ + ["αβγ"[::-1], ""], + [], + ["→δε←"[::-1], "ζz zζ"[::-1], "abc"[::-1]], + ] + assert ak.str.reverse(bytestring).tolist() == [ + ["αβγ".encode()[::-1], b""], + [], + ["→δε←".encode()[::-1], "ζz zζ".encode()[::-1], b"abc"[::-1]], + ] From 42604f0d74b2196767e05382550dda2c6d96f648 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 19:46:09 -0500 Subject: [PATCH 20/73] replace_slice --- src/awkward/operations/str/__init__.py | 11 ++-- src/awkward/operations/str/ak_repeat.py | 1 - .../operations/str/ak_replace_slice.py | 59 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 ++++ 4 files changed, 79 insertions(+), 5 deletions(-) create mode 100644 src/awkward/operations/str/ak_replace_slice.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index dc99b99689..9ad840747a 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -23,17 +23,19 @@ from awkward.operations.str.ak_title import * from awkward.operations.str.ak_upper import * from 
awkward.operations.str.ak_repeat import * +from awkward.operations.str.ak_replace_slice import * from awkward.operations.str.ak_reverse import * -def _get_action(utf8_function, ascii_function, *, bytestring_to_string=False): +def _get_action(utf8_function, ascii_function, *args, bytestring_to_string=False): from awkward.operations.ak_from_arrow import from_arrow from awkward.operations.ak_to_arrow import to_arrow def action(layout, **kwargs): if layout.is_list and layout.parameter("__array__") == "string": return from_arrow( - utf8_function(to_arrow(layout, extensionarray=False)), highlevel=False + utf8_function(to_arrow(layout, extensionarray=False), *args), + highlevel=False, ) elif layout.is_list and layout.parameter("__array__") == "bytestring": @@ -48,7 +50,8 @@ def action(layout, **kwargs): parameters={"__array__": "string"}, ), extensionarray=False, - ) + ), + *args, ), highlevel=False, ) @@ -61,7 +64,7 @@ def action(layout, **kwargs): else: return from_arrow( - ascii_function(to_arrow(layout, extensionarray=False)), + ascii_function(to_arrow(layout, extensionarray=False), *args), highlevel=False, ) diff --git a/src/awkward/operations/str/ak_repeat.py b/src/awkward/operations/str/ak_repeat.py index 0bd722acc9..4419eed4c2 100644 --- a/src/awkward/operations/str/ak_repeat.py +++ b/src/awkward/operations/str/ak_repeat.py @@ -59,7 +59,6 @@ def _impl(array, num_repeats, highlevel, behavior): raise TypeError( "num_repeats must be an integer or broadcastable to integers" ) - num_repeats = int(num_repeats) def action(layout, **kwargs): if layout.is_list and layout.parameter("__array__") in ( diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py new file mode 100644 index 0000000000..8bde0e2d0c --- /dev/null +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -0,0 +1,59 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("replace_slice",) + + +import 
awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces slices of any string or bytestring-valued data with `replacement` between `start` and `stop` indexes; `start` is inclusive and `stop` is exclusive and both are 0-indexed. + + For strings, `start` and `stop` are measured in Unicode characters; for bytestrings, `start` and `stop` are measured in bytes. + + The `start`, `stop`, and `replacement` are scalars; they cannot be different for each string/bytestring in the sample. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_replace_slice](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_replace_slice.html) + or + [pyarrow.compute.binary_replace_slice](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_replace_slice.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, start, stop, replacement, highlevel, behavior) + + +def _impl(array, start, stop, replacement, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_replace_slice, pc.binary_replace_slice, start, stop, replacement + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index e20cb7205a..bffb042efb 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -267,6 +267,19 @@ def test_repeat(): ] +def test_replace_slice(): + assert ak.str.replace_slice(string[:, :1], 1, 2, "qj").tolist() == [ + ["αqjγ"], # noqa: RUF001 + [], + ["→qjε←"], + ] + assert ak.str.replace_slice(bytestring[:, :1], 1, 2, b"qj").tolist() == [ + [b"\xceqj\xce\xb2\xce\xb3"], + [], + [b"\xe2qj\x92\xce\xb4\xce\xb5\xe2\x86\x90"], + ] + + def test_reverse(): assert ak.str.reverse(string).tolist() == [ ["αβγ"[::-1], ""], From b69d7a24dcee32af4124d1c15fd7ae2c29ce7222 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 19:59:29 -0500 Subject: [PATCH 21/73] replace_substring --- src/awkward/operations/str/__init__.py | 14 ++-- .../operations/str/ak_replace_slice.py | 3 + .../operations/str/ak_replace_substring.py | 68 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 ++++ 4 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 src/awkward/operations/str/ak_replace_substring.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 9ad840747a..6aec5c824c 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -25,16 
+25,19 @@ from awkward.operations.str.ak_repeat import * from awkward.operations.str.ak_replace_slice import * from awkward.operations.str.ak_reverse import * +from awkward.operations.str.ak_replace_substring import * -def _get_action(utf8_function, ascii_function, *args, bytestring_to_string=False): +def _get_action( + utf8_function, ascii_function, *args, bytestring_to_string=False, **kwargs +): from awkward.operations.ak_from_arrow import from_arrow from awkward.operations.ak_to_arrow import to_arrow - def action(layout, **kwargs): + def action(layout, **absorb): if layout.is_list and layout.parameter("__array__") == "string": return from_arrow( - utf8_function(to_arrow(layout, extensionarray=False), *args), + utf8_function(to_arrow(layout, extensionarray=False), *args, **kwargs), highlevel=False, ) @@ -52,6 +55,7 @@ def action(layout, **kwargs): extensionarray=False, ), *args, + **kwargs, ), highlevel=False, ) @@ -64,7 +68,9 @@ def action(layout, **kwargs): else: return from_arrow( - ascii_function(to_arrow(layout, extensionarray=False), *args), + ascii_function( + to_arrow(layout, extensionarray=False), *args, **kwargs + ), highlevel=False, ) diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py index 8bde0e2d0c..72653128bd 100644 --- a/src/awkward/operations/str/ak_replace_slice.py +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -14,6 +14,9 @@ def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=N """ Args: array: Array-like data (anything #ak.to_layout recognizes). + start (int): Index to start slicing at (inclusive). + stop (int): Index to stop slicing at (exclusive). + replacement (str): What to replace the slice with. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. 
behavior (None or dict): Custom #ak.behavior for the output array, if diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py new file mode 100644 index 0000000000..380750da2f --- /dev/null +++ b/src/awkward/operations/str/ak_replace_substring.py @@ -0,0 +1,68 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("replace_substring",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def replace_substring( + array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str): Substring pattern to look for inside input values. + replacement (str): What to replace the pattern with. + max_replacements (None or int): If not None and not -1, limits the + maximum number of replacements per string/bytestring, counting from + the left. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces non-overlapping subsequences of any string or bytestring-valued data that match a literal `pattern` with `replacement`. + + The `pattern` and `replacement` are scalars; they cannot be different for each string/bytestring in the sample. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.replace_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring.html) + or + [pyarrow.compute.replace_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, replacement, max_replacements, highlevel, behavior) + + +def _impl(array, pattern, replacement, max_replacements, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.replace_substring, + pc.replace_substring, + pattern, + replacement, + max_replacements=max_replacements, + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index bffb042efb..c22e0b83a7 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -291,3 +291,16 @@ def test_reverse(): [], ["→δε←".encode()[::-1], "ζz zζ".encode()[::-1], b"abc"[::-1]], ] + + +def test_replace_substring(): + assert ak.str.replace_substring(string, "βγ", "HELLO").tolist() == [ + ["αHELLO", ""], # noqa: RUF001 + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.replace_substring(bytestring, "βγ".encode(), b"HELLO").tolist() == [ + ["αHELLO".encode(), b""], # noqa: RUF001 + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] From 3d825aa1007975cf5a9067a23c7cddb20a9886e4 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:01:39 -0500 Subject: [PATCH 22/73] Also test 'max_replacements' in replace_substring. 
--- tests/test_2616_use_pyarrow_for_strings.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index c22e0b83a7..6efead56a7 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -304,3 +304,18 @@ def test_replace_substring(): [], ["→δε←".encode(), "ζz zζ".encode(), b"abc"], ] + + assert ak.str.replace_substring( + string, "βγ", "HELLO", max_replacements=0 + ).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.replace_substring( + bytestring, "βγ".encode(), b"HELLO", max_replacements=0 + ).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] From 983c3ba798a0e10a44898f840589d7dbc3111ea6 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:05:09 -0500 Subject: [PATCH 23/73] replace_substring_regex: done with string transforms --- src/awkward/operations/str/__init__.py | 1 + .../str/ak_replace_substring_regex.py | 68 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 30 ++++++++ 3 files changed, 99 insertions(+) create mode 100644 src/awkward/operations/str/ak_replace_substring_regex.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 6aec5c824c..00d3d8e0de 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -26,6 +26,7 @@ from awkward.operations.str.ak_replace_slice import * from awkward.operations.str.ak_reverse import * from awkward.operations.str.ak_replace_substring import * +from awkward.operations.str.ak_replace_substring_regex import * def _get_action( diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py new file mode 100644 index 0000000000..02a77c2b0a --- /dev/null +++ b/src/awkward/operations/str/ak_replace_substring_regex.py 
@@ -0,0 +1,68 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("replace_substring_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def replace_substring_regex( + array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str): Regular expression pattern to look for inside input values. + replacement (str): What to replace the pattern with. + max_replacements (None or int): If not None and not -1, limits the + maximum number of replacements per string/bytestring, counting from + the left. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces non-overlapping subsequences of any string or bytestring-valued data that match a regular expression `pattern` with `replacement`. + + The `pattern` and `replacement` are scalars; they cannot be different for each string/bytestring in the sample. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.replace_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring_regex.html) + or + [pyarrow.compute.replace_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring_regex.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, replacement, max_replacements, highlevel, behavior) + + +def _impl(array, pattern, replacement, max_replacements, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.replace_substring_regex, + pc.replace_substring_regex, + pattern, + replacement, + max_replacements=max_replacements, + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 6efead56a7..30ea6d6ae8 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -319,3 +319,33 @@ def test_replace_substring(): [], ["→δε←".encode(), "ζz zζ".encode(), b"abc"], ] + + +def test_replace_substring_regex(): + assert ak.str.replace_substring_regex(string, "βγ", "HELLO").tolist() == [ + ["αHELLO", ""], # noqa: RUF001 + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.replace_substring_regex( + bytestring, "βγ".encode(), b"HELLO" + ).tolist() == [ + ["αHELLO".encode(), b""], # noqa: RUF001 + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] + + assert ak.str.replace_substring_regex( + string, "βγ", "HELLO", max_replacements=0 + ).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.replace_substring_regex( + bytestring, "βγ".encode(), b"HELLO", max_replacements=0 + ).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] From bb8e8d74e407dd6d55d426aeb685df2da13151a6 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:16:21 -0500 Subject: [PATCH 24/73] center --- src/awkward/operations/str/__init__.py | 3 + src/awkward/operations/str/ak_center.py | 61 
+++++++++++++++++++ .../operations/str/ak_replace_slice.py | 2 +- .../operations/str/ak_replace_substring.py | 2 +- .../str/ak_replace_substring_regex.py | 2 +- tests/test_2616_use_pyarrow_for_strings.py | 20 ++++++ 6 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 src/awkward/operations/str/ak_center.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 00d3d8e0de..772f320cd9 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -28,6 +28,9 @@ from awkward.operations.str.ak_replace_substring import * from awkward.operations.str.ak_replace_substring_regex import * +# string padding +from awkward.operations.str.ak_center import * + def _get_action( utf8_function, ascii_function, *args, bytestring_to_string=False, **kwargs diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/ak_center.py new file mode 100644 index 0000000000..0b8d16e3be --- /dev/null +++ b/src/awkward/operations/str/ak_center.py @@ -0,0 +1,61 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("center",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def center(array, width, padding=" ", *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + width (int): Desired string length. + padding (str or bytes): What to pad the string with. Should be one codepoint or byte. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string or bytestring-valued data with strings/bytestrings of a given `width`, padding both sides with the given `padding` codepoint or byte. 
+ + If the data are strings, `width` is measured in codepoints and `padding` must be one codepoint. + + If the data are bytestrings, `width` is measured in bytes and `padding` must be one byte. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_center](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_center.html) + or + [pyarrow.compute.ascii_center](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_center.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, width, padding, highlevel, behavior) + + +def _impl(array, width, padding, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_center, pc.ascii_center, width, padding, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py index 72653128bd..e569458b66 100644 --- a/src/awkward/operations/str/ak_replace_slice.py +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -16,7 +16,7 @@ def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=N array: Array-like data (anything #ak.to_layout recognizes). start (int): Index to start slicing at (inclusive). stop (int): Index to stop slicing at (exclusive). - replacement (str): What to replace the slice with. + replacement (str or bytes): What to replace the slice with. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. 
behavior (None or dict): Custom #ak.behavior for the output array, if diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py index 380750da2f..a589afe136 100644 --- a/src/awkward/operations/str/ak_replace_substring.py +++ b/src/awkward/operations/str/ak_replace_substring.py @@ -17,7 +17,7 @@ def replace_substring( Args: array: Array-like data (anything #ak.to_layout recognizes). pattern (str): Substring pattern to look for inside input values. - replacement (str): What to replace the pattern with. + replacement (str or bytes): What to replace the pattern with. max_replacements (None or int): If not None and not -1, limits the maximum number of replacements per string/bytestring, counting from the left. diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py index 02a77c2b0a..be63772e61 100644 --- a/src/awkward/operations/str/ak_replace_substring_regex.py +++ b/src/awkward/operations/str/ak_replace_substring_regex.py @@ -17,7 +17,7 @@ def replace_substring_regex( Args: array: Array-like data (anything #ak.to_layout recognizes). pattern (str): Regular expression pattern to look for inside input values. - replacement (str): What to replace the pattern with. + replacement (str or bytes): What to replace the pattern with. max_replacements (None or int): If not None and not -1, limits the maximum number of replacements per string/bytestring, counting from the left. 
diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 30ea6d6ae8..88e8063d5a 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -349,3 +349,23 @@ def test_replace_substring_regex(): [], ["→δε←".encode(), "ζz zζ".encode(), b"abc"], ] + + +def test_center(): + assert ak.str.center(string, 15, " ").tolist() == [ + [" αβγ ", " "], + [], + [" →δε← ", " ζz zζ ", " abc "], + ] + + print(ak.str.center(bytestring, 15, " ").tolist()) + + assert ak.str.center(bytestring, 15, b" ").tolist() == [ + [b" \xce\xb1\xce\xb2\xce\xb3 ", b" "], + [], + [ + b" \xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90 ", + b" \xce\xb6z z\xce\xb6 ", + b" abc ", + ], + ] From fa5d0bc58ba42c896a273f973e73c2e7a5ec5955 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:21:12 -0500 Subject: [PATCH 25/73] lpad and rpad --- src/awkward/operations/str/__init__.py | 2 + src/awkward/operations/str/ak_center.py | 2 +- src/awkward/operations/str/ak_lpad.py | 61 ++++++++++++++++++++++ src/awkward/operations/str/ak_rpad.py | 61 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 40 ++++++++++++++ 5 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 src/awkward/operations/str/ak_lpad.py create mode 100644 src/awkward/operations/str/ak_rpad.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 772f320cd9..b936b0de8a 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -30,6 +30,8 @@ # string padding from awkward.operations.str.ak_center import * +from awkward.operations.str.ak_lpad import * +from awkward.operations.str.ak_rpad import * def _get_action( diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/ak_center.py index 0b8d16e3be..284e6595c3 100644 --- a/src/awkward/operations/str/ak_center.py +++ b/src/awkward/operations/str/ak_center.py 
@@ -21,7 +21,7 @@ def center(array, width, padding=" ", *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string or bytestring-valued data with strings/bytestrings of a given `width`, padding both sides with the given `padding` codepoint or byte. + Replaces any string or bytestring-valued data with centered strings/bytestrings of a given `width`, padding both sides with the given `padding` codepoint or byte. If the data are strings, `width` is measured in codepoints and `padding` must be one codepoint. diff --git a/src/awkward/operations/str/ak_lpad.py b/src/awkward/operations/str/ak_lpad.py new file mode 100644 index 0000000000..2aad079ea7 --- /dev/null +++ b/src/awkward/operations/str/ak_lpad.py @@ -0,0 +1,61 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("lpad",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def lpad(array, width, padding=" ", *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + width (int): Desired string length. + padding (str or bytes): What to pad the string with. Should be one codepoint or byte. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string or bytestring-valued data with right-aligned strings/bytestrings of a given `width`, padding the left side with the given `padding` codepoint or byte. + + If the data are strings, `width` is measured in codepoints and `padding` must be one codepoint. + + If the data are bytestrings, `width` is measured in bytes and `padding` must be one byte. 
+ + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_lpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_lpad.html) + or + [pyarrow.compute.ascii_lpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_lpad.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, width, padding, highlevel, behavior) + + +def _impl(array, width, padding, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_lpad, pc.ascii_lpad, width, padding, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/src/awkward/operations/str/ak_rpad.py b/src/awkward/operations/str/ak_rpad.py new file mode 100644 index 0000000000..5146abb6bb --- /dev/null +++ b/src/awkward/operations/str/ak_rpad.py @@ -0,0 +1,61 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("rpad",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def rpad(array, width, padding=" ", *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + width (int): Desired string length. + padding (str or bytes): What to pad the string with. Should be one codepoint or byte. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. 
+ + Replaces any string or bytestring-valued data with left-aligned strings/bytestrings of a given `width`, padding the right side with the given `padding` codepoint or byte. + + If the data are strings, `width` is measured in codepoints and `padding` must be one codepoint. + + If the data are bytestrings, `width` is measured in bytes and `padding` must be one byte. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_rpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_rpad.html) + or + [pyarrow.compute.ascii_rpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_rpad.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, width, padding, highlevel, behavior) + + +def _impl(array, width, padding, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_rpad, pc.ascii_rpad, width, padding, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 88e8063d5a..36aabb7d4b 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -369,3 +369,43 @@ def test_center(): b" abc ", ], ] + + +def test_lpad(): + assert ak.str.lpad(string, 15, " ").tolist() == [ + [" αβγ", " "], + [], + [" →δε←", " ζz zζ", " abc"], + ] + + print(ak.str.lpad(bytestring, 15, " ").tolist()) + + assert ak.str.lpad(bytestring, 15, b" ").tolist() == [ + [b" \xce\xb1\xce\xb2\xce\xb3", b" "], + [], + [ + b" \xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90", + b" 
\xce\xb6z z\xce\xb6", + b" abc", + ], + ] + + +def test_rpad(): + assert ak.str.rpad(string, 15, " ").tolist() == [ + ["αβγ ", " "], + [], + ["→δε← ", "ζz zζ ", "abc "], + ] + + print(ak.str.rpad(bytestring, 15, " ").tolist()) + + assert ak.str.rpad(bytestring, 15, b" ").tolist() == [ + [b"\xce\xb1\xce\xb2\xce\xb3 ", b" "], + [], + [ + b"\xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90 ", + b"\xce\xb6z z\xce\xb6 ", + b"abc ", + ], + ] From 99c4ce0bbb4354958360398bf15175b8503f7be8 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:30:50 -0500 Subject: [PATCH 26/73] trim --- src/awkward/operations/str/__init__.py | 3 ++ src/awkward/operations/str/ak_trim.py | 60 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 32 ++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 src/awkward/operations/str/ak_trim.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index b936b0de8a..8f0a419840 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -33,6 +33,9 @@ from awkward.operations.str.ak_lpad import * from awkward.operations.str.ak_rpad import * +# string trimming +from awkward.operations.str.ak_trim import * + def _get_action( utf8_function, ascii_function, *args, bytestring_to_string=False, **kwargs diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py new file mode 100644 index 0000000000..e51b638666 --- /dev/null +++ b/src/awkward/operations/str/ak_trim.py @@ -0,0 +1,60 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("trim",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def trim(array, characters, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). 
+ characters (str or bytes): Individual characters to be trimmed from the string. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any leading or trailing characters of `characters` of any string or bytestring-valued data. + + If the data are strings, `characters` are interpreted as unordered, individual codepoints. + + If the data are bytestrings, `characters` are interpreted as unordered, individual bytes. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_trim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_trim.html) + or + [pyarrow.compute.ascii_trim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_trim.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, characters, highlevel, behavior) + + +def _impl(array, characters, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_trim, pc.ascii_trim, characters, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 36aabb7d4b..69579d584f 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -21,6 +21,25 @@ ] ) +string_padded = ak.Array( + [ + [" αβγ ", " "], + [], + [" →δε← ", " ζz zζ ", " abc "], + ] +) +bytestring_padded = ak.Array( + [ + [b" \xce\xb1\xce\xb2\xce\xb3 ", b" "], + [], + [ + b" 
\xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90 ", + b" \xce\xb6z z\xce\xb6 ", + b" abc ", + ], + ] +) + def test_is_alnum(): assert ak.str.is_alnum(string).tolist() == [ @@ -409,3 +428,16 @@ def test_rpad(): b"abc ", ], ] + + +def test_trim(): + assert ak.str.trim(string_padded, " ").tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.trim(bytestring_padded, b" ").tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] From d71367048262566b3e234d7beeced0840854f9d5 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:33:19 -0500 Subject: [PATCH 27/73] trim_whitespace --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_trim.py | 2 +- .../operations/str/ak_trim_whitespace.py | 55 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 4 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 src/awkward/operations/str/ak_trim_whitespace.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 8f0a419840..75d9ca92d5 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -35,6 +35,7 @@ # string trimming from awkward.operations.str.ak_trim import * +from awkward.operations.str.ak_trim_whitespace import * def _get_action( diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py index e51b638666..d932016b3f 100644 --- a/src/awkward/operations/str/ak_trim.py +++ b/src/awkward/operations/str/ak_trim.py @@ -20,7 +20,7 @@ def trim(array, characters, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Removes any leading or trailing characters of `characters` of any string or bytestring-valued data. + Removes any leading or trailing characters of `characters` from any string or bytestring-valued data. 
If the data are strings, `characters` are interpreted as unordered, individual codepoints. diff --git a/src/awkward/operations/str/ak_trim_whitespace.py b/src/awkward/operations/str/ak_trim_whitespace.py new file mode 100644 index 0000000000..891c6d706e --- /dev/null +++ b/src/awkward/operations/str/ak_trim_whitespace.py @@ -0,0 +1,55 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("trim_whitespace",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def trim_whitespace(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any leading or trailing whitespace from any string or bytestring-valued data. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_trim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_trim_whitespace.html) + or + [pyarrow.compute.ascii_trim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_trim_whitespace.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_trim_whitespace, pc.ascii_trim_whitespace, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 69579d584f..38bc25b11d 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -441,3 +441,16 @@ def test_trim(): [], ["→δε←".encode(), "ζz zζ".encode(), b"abc"], ] + + +def test_trim_whitespace(): + assert ak.str.trim_whitespace(string_padded).tolist() == [ + ["αβγ", ""], + [], + ["→δε←", "ζz zζ", "abc"], + ] + assert ak.str.trim_whitespace(bytestring_padded).tolist() == [ + ["αβγ".encode(), b""], + [], + ["→δε←".encode(), "ζz zζ".encode(), b"abc"], + ] From e63bd3eb9efbd96a712325ad04555cac7be04430 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:36:08 -0500 Subject: [PATCH 28/73] ltrim --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_ltrim.py | 60 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 33 ++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 src/awkward/operations/str/ak_ltrim.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 75d9ca92d5..dc236b8ba5 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -34,6 +34,7 @@ from awkward.operations.str.ak_rpad import * # string trimming +from awkward.operations.str.ak_ltrim import * from awkward.operations.str.ak_trim import * from 
awkward.operations.str.ak_trim_whitespace import * diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/ak_ltrim.py new file mode 100644 index 0000000000..1a1959bdb6 --- /dev/null +++ b/src/awkward/operations/str/ak_ltrim.py @@ -0,0 +1,60 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("ltrim",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def ltrim(array, characters, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + characters (str or bytes): Individual characters to be trimmed from the string. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any leading characters of `characters` from any string or bytestring-valued data. + + If the data are strings, `characters` are interpreted as unordered, individual codepoints. + + If the data are bytestrings, `characters` are interpreted as unordered, individual bytes. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_ltrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_ltrim.html) + or + [pyarrow.compute.ascii_ltrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_ltrim.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, characters, highlevel, behavior) + + +def _impl(array, characters, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_ltrim, pc.ascii_ltrim, characters, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 38bc25b11d..3d256e8e9f 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -430,6 +430,39 @@ def test_rpad(): ] +# string_padded = ak.Array( +# [ +# [" αβγ ", " "], +# [], +# [" →δε← ", " ζz zζ ", " abc "], +# ] +# ) +# bytestring_padded = ak.Array( +# [ +# [b" \xce\xb1\xce\xb2\xce\xb3 ", b" "], +# [], +# [ +# b" \xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90 ", +# b" \xce\xb6z z\xce\xb6 ", +# b" abc ", +# ], +# ] +# ) + + +def test_ltrim(): + assert ak.str.ltrim(string_padded, " ").tolist() == [ + ["αβγ ", ""], + [], + ["→δε← ", "ζz zζ ", "abc "], + ] + assert ak.str.ltrim(bytestring_padded, b" ").tolist() == [ + ["αβγ ".encode(), b""], + [], + ["→δε← ".encode(), "ζz zζ ".encode(), b"abc "], + ] + + def test_trim(): assert ak.str.trim(string_padded, " ").tolist() == [ ["αβγ", ""], From 3040c4eea9838a17bac77eae1c975bec5c3f6011 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:39:19 -0500 Subject: [PATCH 29/73] rtrim --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_rtrim.py | 60 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 33 +++++------- 3 files changed, 74 insertions(+), 20 deletions(-) create mode 100644 src/awkward/operations/str/ak_rtrim.py diff --git a/src/awkward/operations/str/__init__.py 
b/src/awkward/operations/str/__init__.py index dc236b8ba5..f4e3a8b15a 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -35,6 +35,7 @@ # string trimming from awkward.operations.str.ak_ltrim import * +from awkward.operations.str.ak_rtrim import * from awkward.operations.str.ak_trim import * from awkward.operations.str.ak_trim_whitespace import * diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/ak_rtrim.py new file mode 100644 index 0000000000..db5f8f7344 --- /dev/null +++ b/src/awkward/operations/str/ak_rtrim.py @@ -0,0 +1,60 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("rtrim",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def rtrim(array, characters, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + characters (str or bytes): Individual characters to be trimmed from the string. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any trailing characters of `characters` from any string or bytestring-valued data. + + If the data are strings, `characters` are interpreted as unordered, individual codepoints. + + If the data are bytestrings, `characters` are interpreted as unordered, individual bytes. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_rtrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_rtrim.html) + or + [pyarrow.compute.ascii_rtrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_rtrim.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, characters, highlevel, behavior) + + +def _impl(array, characters, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_rtrim, pc.ascii_rtrim, characters, bytestring_to_string=True + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 3d256e8e9f..b170ffc9c4 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -430,26 +430,6 @@ def test_rpad(): ] -# string_padded = ak.Array( -# [ -# [" αβγ ", " "], -# [], -# [" →δε← ", " ζz zζ ", " abc "], -# ] -# ) -# bytestring_padded = ak.Array( -# [ -# [b" \xce\xb1\xce\xb2\xce\xb3 ", b" "], -# [], -# [ -# b" \xe2\x86\x92\xce\xb4\xce\xb5\xe2\x86\x90 ", -# b" \xce\xb6z z\xce\xb6 ", -# b" abc ", -# ], -# ] -# ) - - def test_ltrim(): assert ak.str.ltrim(string_padded, " ").tolist() == [ ["αβγ ", ""], @@ -463,6 +443,19 @@ def test_ltrim(): ] +def test_rtrim(): + assert ak.str.rtrim(string_padded, " ").tolist() == [ + [" αβγ", ""], + [], + [" →δε←", " ζz zζ", " abc"], + ] + assert ak.str.rtrim(bytestring_padded, b" ").tolist() == [ + [" αβγ".encode(), b""], + [], + [" →δε←".encode(), " ζz zζ".encode(), b" abc"], + ] + + def test_trim(): assert ak.str.trim(string_padded, " ").tolist() == [ ["αβγ", ""], From 6320f2e7c3e51f0af829129aa158776e6d0fd6b5 Mon Sep 
17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:40:56 -0500 Subject: [PATCH 30/73] rtrim_whitespace --- src/awkward/operations/str/__init__.py | 1 + .../operations/str/ak_rtrim_whitespace.py | 57 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 71 insertions(+) create mode 100644 src/awkward/operations/str/ak_rtrim_whitespace.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index f4e3a8b15a..ecf688a55b 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -36,6 +36,7 @@ # string trimming from awkward.operations.str.ak_ltrim import * from awkward.operations.str.ak_rtrim import * +from awkward.operations.str.ak_rtrim_whitespace import * from awkward.operations.str.ak_trim import * from awkward.operations.str.ak_trim_whitespace import * diff --git a/src/awkward/operations/str/ak_rtrim_whitespace.py b/src/awkward/operations/str/ak_rtrim_whitespace.py new file mode 100644 index 0000000000..17df969275 --- /dev/null +++ b/src/awkward/operations/str/ak_rtrim_whitespace.py @@ -0,0 +1,57 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("rtrim_whitespace",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def rtrim_whitespace(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any trailing whitespace from any string or bytestring-valued data. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_rtrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_rtrim_whitespace.html) + or + [pyarrow.compute.ascii_rtrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_rtrim_whitespace.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_rtrim_whitespace, + pc.ascii_rtrim_whitespace, + bytestring_to_string=True, + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index b170ffc9c4..f96891eac8 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -456,6 +456,19 @@ def test_rtrim(): ] +def test_rtrim_whitespace(): + assert ak.str.rtrim_whitespace(string_padded).tolist() == [ + [" αβγ", ""], + [], + [" →δε←", " ζz zζ", " abc"], + ] + assert ak.str.rtrim_whitespace(bytestring_padded).tolist() == [ + [" αβγ".encode(), b""], + [], + [" →δε←".encode(), " ζz zζ".encode(), b" abc"], + ] + + def test_trim(): assert ak.str.trim(string_padded, " ").tolist() == [ ["αβγ", ""], From 3d0998b087ee0b66df1eaac6ae0798233ed01f8a Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 20:42:48 -0500 Subject: [PATCH 31/73] ltrim_whitespace --- src/awkward/operations/str/__init__.py | 1 + .../operations/str/ak_ltrim_whitespace.py | 57 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 13 +++++ 3 files changed, 71 insertions(+) create mode 100644 
src/awkward/operations/str/ak_ltrim_whitespace.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index ecf688a55b..6a67aee697 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -35,6 +35,7 @@ # string trimming from awkward.operations.str.ak_ltrim import * +from awkward.operations.str.ak_ltrim_whitespace import * from awkward.operations.str.ak_rtrim import * from awkward.operations.str.ak_rtrim_whitespace import * from awkward.operations.str.ak_trim import * diff --git a/src/awkward/operations/str/ak_ltrim_whitespace.py b/src/awkward/operations/str/ak_ltrim_whitespace.py new file mode 100644 index 0000000000..f465f81e13 --- /dev/null +++ b/src/awkward/operations/str/ak_ltrim_whitespace.py @@ -0,0 +1,57 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("ltrim_whitespace",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def ltrim_whitespace(array, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Removes any leading whitespace from any string or bytestring-valued data. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. 
+ + Requires the pyarrow library and calls + [pyarrow.compute.utf8_ltrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_ltrim_whitespace.html) + or + [pyarrow.compute.ascii_ltrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_ltrim_whitespace.html) + on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_action( + pc.utf8_ltrim_whitespace, + pc.ascii_ltrim_whitespace, + bytestring_to_string=True, + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index f96891eac8..d5604b37a8 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -443,6 +443,19 @@ def test_ltrim(): ] +def test_ltrim_whitespace(): + assert ak.str.ltrim_whitespace(string_padded).tolist() == [ + ["αβγ ", ""], + [], + ["→δε← ", "ζz zζ ", "abc "], + ] + assert ak.str.ltrim_whitespace(bytestring_padded).tolist() == [ + ["αβγ ".encode(), b""], + [], + ["→δε← ".encode(), "ζz zζ ".encode(), b"abc "], + ] + + def test_rtrim(): assert ak.str.rtrim(string_padded, " ").tolist() == [ [" αβγ", ""], From e624ee3bd05982f069239156079089b80b532208 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 4 Aug 2023 21:09:41 -0500 Subject: [PATCH 32/73] slice --- src/awkward/operations/str/__init__.py | 12 ++++ src/awkward/operations/str/ak_slice.py | 69 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 25 ++++++++ 3 files changed, 106 insertions(+) create mode 100644 src/awkward/operations/str/ak_slice.py diff 
--git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 6a67aee697..7bb5fce6db 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -41,6 +41,18 @@ from awkward.operations.str.ak_trim import * from awkward.operations.str.ak_trim_whitespace import * +# string splitting + +# string component extraction + +# string joining + +# string slicing + +from awkward.operations.str.ak_slice import * + +# containment tests + def _get_action( utf8_function, ascii_function, *args, bytestring_to_string=False, **kwargs diff --git a/src/awkward/operations/str/ak_slice.py b/src/awkward/operations/str/ak_slice.py new file mode 100644 index 0000000000..7afaab7d93 --- /dev/null +++ b/src/awkward/operations/str/ak_slice.py @@ -0,0 +1,69 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("slice",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def slice(array, start, stop=None, step=1, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + start (int): Index to start slicing at (inclusive). + stop (None or int): Index to stop slicing at (exclusive). If not given, + slicing will stop at the end. + step (int): Slice step. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string or bytestring-valued data with a slice between `start` and `stop` indexes; `start` is inclusive and `stop` is exclusive and both are 0-indexed. + + For strings, `start` and `stop` are measured in Unicode characters; for bytestrings, `start` and `stop` are measured in bytes. 
+ + The `start`, `stop`, and `step` are scalars; they cannot be different for each string/bytestring in the sample. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_slice_codeunits](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_slice_codeunits.html) + or performs a literal slice on strings and bytestrings, respectively. + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, start, stop, step, highlevel, behavior) + + +def _impl(array, start, stop, step, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + def action(layout, **absorb): + if layout.is_list and layout.parameter("__array__") == "string": + return from_arrow( + pc.utf8_slice_codeunits( + to_arrow(layout, extensionarray=False), start, stop, step + ), + highlevel=False, + ) + + elif layout.is_list and layout.parameter("__array__") == "bytestring": + return layout[:, start:stop:step] + + out = ak._do.recursively_apply(ak.operations.to_layout(array), action, behavior) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index d5604b37a8..51f822d92c 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -506,3 +506,28 @@ def test_trim_whitespace(): [], ["→δε←".encode(), "ζz zζ".encode(), b"abc"], ] + + +def test_slice(): + assert ak.str.slice(string, 1, 3).tolist() == [ + ["αβγ"[1:3], ""[1:3]], + [], + 
["→δε←".encode()[1:3], "ζz zζ".encode()[1:3], b"abc"[1:3]], + ] + + # ArrowInvalid: Negative buffer resize: -40 (looks like an Arrow bug) + # assert ak.str.slice(string, 1).tolist() == [ + # ["αβγ"[1:], ""[1:]], + # [], + # ["→δε←"[1:], "ζz zζ"[1:], "abc"[1:]], + # ] + assert ak.str.slice(bytestring, 1).tolist() == [ + ["αβγ".encode()[1:], b""[1:]], + [], + ["→δε←".encode()[1:], "ζz zζ".encode()[1:], b"abc"[1:]], + ] From 766c9df23b8092c3e02340a47e7250953bc082b9 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 10:13:16 +0100 Subject: [PATCH 33/73] feat: add `split_whitespace` --- src/awkward/operations/str/__init__.py | 1 + .../operations/str/ak_split_whitespace.py | 65 +++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 src/awkward/operations/str/ak_split_whitespace.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 7bb5fce6db..5224995c87 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -42,6 +42,7 @@ from awkward.operations.str.ak_trim_whitespace import * # string splitting +from awkward.operations.str.ak_split_whitespace import * # string component extraction diff --git a/src/awkward/operations/str/ak_split_whitespace.py b/src/awkward/operations/str/ak_split_whitespace.py new file mode 100644 index 0000000000..198aa09ac6 --- /dev/null +++ b/src/awkward/operations/str/ak_split_whitespace.py @@ -0,0 +1,65 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("split_whitespace",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def split_whitespace( + array, *, max_splits=None, reverse=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). 
+ max_splits (None or int): Maximum number of splits for each input value. If None, unlimited. + reverse (bool): If True, start splitting from the end of each input value; otherwise, start splitting + from the beginning of each value. This flag only has an effect if `max_splits` is not None. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Splits any string or bytestring-valued data into a list of substrings according to any non-zero length sequence of + whitespace characters. + + For strings, a split is performed for every sequence of Unicode whitespace characters; for bytestrings, splitting + is performed for sequences of ASCII whitespace characters. + + The `max_splits` and `reverse` arguments are scalars; they cannot be different for each string/bytestring in the + sample. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.utf8_split_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_split_whitespace.html) + or [pyarrow.compute.ascii_split_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_split_whitespace.html) + on strings and bytestrings, respectively. 
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, max_splits, reverse, highlevel, behavior) + + +def _impl(array, max_splits, reverse, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + action = ak.operations.str._get_action( + pc.utf8_split_whitespace, + pc.ascii_split_whitespace, + max_splits=max_splits, + reverse=reverse, + bytestring_to_string=True, + ) + out = ak._do.recursively_apply(ak.operations.to_layout(array), action, behavior) + + return wrap_layout(out, behavior, highlevel) From c25a5584a33a2faeea81d6aef2d3a82c94b3d292 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 10:17:57 +0100 Subject: [PATCH 34/73] test: add test for `split_whitespace` --- tests/test_2616_use_pyarrow_for_strings.py | 39 ++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 51f822d92c..d56b0413dc 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -531,3 +531,42 @@ def test_slice(): [], ["→δε←".encode()[1:], "ζz zζ".encode()[1:], b"abc"[1:]], ] + + +def test_split_whitespace(): + assert ak.str.split_whitespace(string_padded, max_splits=1).tolist() == [ + [["", "αβγ "], ["", " "]], + [], + [["", "→δε← "], ["", "ζz zζ "], ["", "abc "]], + ] + assert ak.str.split_whitespace( + string_padded, max_splits=1, reverse=True + ).tolist() == [ + [[" αβγ", ""], [" ", ""]], + [], + [[" →δε←", ""], [" ζz zζ", ""], [" abc", ""]], + ] + assert ak.str.split_whitespace(string_padded, max_splits=None).tolist() == [ + [["", "αβγ", "", ""], ["", "", ""]], + [], + [["", "→δε←", "", ""], ["", "ζz", "zζ", "", ""], ["", "abc", "", ""]], + ] + + # Bytestrings + assert ak.str.split_whitespace(bytestring_padded, max_splits=1).tolist() == [ + [["", "αβγ "], ["", ""]], + [], + [["", "→δε← 
"], ["", "ζz zζ "], ["", "abc "]], + ] + assert ak.str.split_whitespace( + bytestring_padded, max_splits=1, reverse=True + ).tolist() == [ + [[" αβγ", ""], ["", ""]], + [], + [[" →δε←", ""], [" ζz zζ", ""], [" abc", ""]], + ] + assert ak.str.split_whitespace(bytestring_padded, max_splits=None).tolist() == [ + [["", "αβγ", ""], ["", ""]], + [], + [["", "→δε←", ""], ["", "ζz", "zζ", ""], ["", "abc", ""]], + ] From ddc9bc724bf1331c4d125cf8a3a2a76c1c76f2a1 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 11:20:22 +0100 Subject: [PATCH 35/73] test: correct test --- tests/test_2616_use_pyarrow_for_strings.py | 24 ++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index d56b0413dc..f4eb716d18 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -554,19 +554,31 @@ def test_split_whitespace(): # Bytestrings assert ak.str.split_whitespace(bytestring_padded, max_splits=1).tolist() == [ - [["", "αβγ "], ["", ""]], + [[b"", "αβγ ".encode()], [b"", b""]], [], - [["", "→δε← "], ["", "ζz zζ "], ["", "abc "]], + [ + [b"", "→δε← ".encode()], + [b"", "ζz zζ ".encode()], + [b"", b"abc "], + ], ] assert ak.str.split_whitespace( bytestring_padded, max_splits=1, reverse=True ).tolist() == [ - [[" αβγ", ""], ["", ""]], + [[" αβγ".encode(), b""], [b"", b""]], [], - [[" →δε←", ""], [" ζz zζ", ""], [" abc", ""]], + [ + [" →δε←".encode(), b""], + [" ζz zζ".encode(), b""], + [b" abc", b""], + ], ] assert ak.str.split_whitespace(bytestring_padded, max_splits=None).tolist() == [ - [["", "αβγ", ""], ["", ""]], + [[b"", "αβγ".encode(), b""], [b"", b""]], [], - [["", "→δε←", ""], ["", "ζz", "zζ", ""], ["", "abc", ""]], + [ + [b"", "→δε←".encode(), b""], + [b"", "ζz".encode(), "zζ".encode(), b""], + [b"", b"abc", b""], + ], ] From 5638a79b12b18d4d28f6fdf1af16c14c9695c4ee Mon Sep 17 00:00:00 2001 From: Angus 
Hollands Date: Mon, 7 Aug 2023 11:20:31 +0100 Subject: [PATCH 36/73] feat: add `split_pattern` --- src/awkward/operations/str/__init__.py | 1 + .../operations/str/ak_split_pattern.py | 58 +++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 src/awkward/operations/str/ak_split_pattern.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 5224995c87..cb70649438 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -43,6 +43,7 @@ # string splitting from awkward.operations.str.ak_split_whitespace import * +from awkward.operations.str.ak_split_pattern import * # string component extraction diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/ak_split_pattern.py new file mode 100644 index 0000000000..d8d952db25 --- /dev/null +++ b/src/awkward/operations/str/ak_split_pattern.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("split_pattern",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def split_pattern( + array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Individual characters to be trimmed from the string. + max_splits (None or int): Maximum number of splits for each input value. If None, unlimited. + reverse (bool): If True, start splitting from the end of each input value; otherwise, start splitting + from the beginning of each value. This flag only has an effect if `max_splits` is not None. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. 
+ behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Splits any string or bytestring-valued data into a list of substrings according to the given separator. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, max_splits, reverse, highlevel, behavior) + + +def _impl(array, pattern, max_splits, reverse, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + action = ak.operations.str._get_action( + pc.split_pattern, + pc.split_pattern, + pattern=pattern, + max_splits=max_splits, + reverse=reverse, + bytestring_to_string=True, + ) + out = ak._do.recursively_apply(ak.operations.to_layout(array), action, behavior) + + return wrap_layout(out, behavior, highlevel) From 3ef7ded2ea94a9e822667f143ebf5a83f1299ed1 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 12:43:02 +0100 Subject: [PATCH 37/73] refactor: rename `_get_action` --- src/awkward/contents/unmaskedarray.py | 2 +- src/awkward/operations/str/__init__.py | 87 ++++++++++++++++++- src/awkward/operations/str/ak_capitalize.py | 2 +- src/awkward/operations/str/ak_center.py | 2 +- src/awkward/operations/str/ak_is_alnum.py | 2 +- src/awkward/operations/str/ak_is_alpha.py | 2 +- src/awkward/operations/str/ak_is_ascii.py | 2 +- src/awkward/operations/str/ak_is_decimal.py | 2 +- src/awkward/operations/str/ak_is_digit.py | 2 +- src/awkward/operations/str/ak_is_lower.py | 2 +- src/awkward/operations/str/ak_is_numeric.py | 2 +- src/awkward/operations/str/ak_is_printable.py | 2 +- src/awkward/operations/str/ak_is_space.py | 2 +- 
src/awkward/operations/str/ak_is_title.py | 2 +- src/awkward/operations/str/ak_is_upper.py | 2 +- src/awkward/operations/str/ak_length.py | 2 +- src/awkward/operations/str/ak_lower.py | 2 +- src/awkward/operations/str/ak_lpad.py | 2 +- src/awkward/operations/str/ak_ltrim.py | 2 +- .../operations/str/ak_ltrim_whitespace.py | 2 +- .../operations/str/ak_replace_slice.py | 2 +- .../operations/str/ak_replace_substring.py | 2 +- .../str/ak_replace_substring_regex.py | 2 +- src/awkward/operations/str/ak_reverse.py | 2 +- src/awkward/operations/str/ak_rpad.py | 2 +- src/awkward/operations/str/ak_rtrim.py | 2 +- .../operations/str/ak_rtrim_whitespace.py | 2 +- .../operations/str/ak_split_pattern.py | 6 +- .../operations/str/ak_split_whitespace.py | 3 +- src/awkward/operations/str/ak_swapcase.py | 2 +- src/awkward/operations/str/ak_title.py | 2 +- src/awkward/operations/str/ak_trim.py | 2 +- .../operations/str/ak_trim_whitespace.py | 2 +- src/awkward/operations/str/ak_upper.py | 2 +- 34 files changed, 120 insertions(+), 38 deletions(-) diff --git a/src/awkward/contents/unmaskedarray.py b/src/awkward/contents/unmaskedarray.py index 804bf02c7b..12c31e4bc0 100644 --- a/src/awkward/contents/unmaskedarray.py +++ b/src/awkward/contents/unmaskedarray.py @@ -491,7 +491,7 @@ def _remove_structure(self, backend, options): return [self] def _drop_none(self) -> Content: - return self.to_ByteMaskedArray(True)._drop_none() + return self.content[:0] def _recursively_apply( self, action, behavior, depth, depth_context, lateral_context, options diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index cb70649438..260cb9c1b3 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -56,8 +56,13 @@ # containment tests -def _get_action( - utf8_function, ascii_function, *args, bytestring_to_string=False, **kwargs +def _get_ufunc_action( + utf8_function, + ascii_function, + *args, + bytestring_to_string=False, + 
drop_unmasked_option=False, + **kwargs, ): from awkward.operations.ak_from_arrow import from_arrow from awkward.operations.ak_to_arrow import to_arrow @@ -103,3 +108,81 @@ def action(layout, **absorb): ) return action + + +def _erase_list_option(layout): + from awkward.contents.unmaskedarray import UnmaskedArray + + assert layout.is_list + if layout.content.is_option: + assert isinstance(layout.content, UnmaskedArray) + return layout.copy(content=layout.content.content) + else: + return layout + + +def _get_split_action( + utf8_function, ascii_function, *args, bytestring_to_string=False, **kwargs +): + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + def action(layout, **absorb): + if layout.is_list and layout.parameter("__array__") == "string": + return _erase_list_option( + from_arrow( + utf8_function( + to_arrow(layout, extensionarray=False), + *args, + **kwargs, + ), + highlevel=False, + ) + ) + + elif layout.is_list and layout.parameter("__array__") == "bytestring": + if bytestring_to_string: + out = _erase_list_option( + from_arrow( + ascii_function( + to_arrow( + layout.copy( + content=layout.content.copy( + parameters={"__array__": "char"} + ), + parameters={"__array__": "string"}, + ), + extensionarray=False, + ), + *args, + **kwargs, + ), + highlevel=False, + ) + ) + assert out.is_list + + assert ( + out.content.is_list + and out.content.parameter("__array__") == "string" + ) + return out.copy( + content=out.content.copy( + content=out.content.content.copy( + parameters={"__array__": "byte"} + ), + parameters={"__array__": "bytestring"}, + ), + ) + + else: + return _erase_list_option( + from_arrow( + ascii_function( + to_arrow(layout, extensionarray=False), *args, **kwargs + ), + highlevel=False, + ) + ) + + return action diff --git a/src/awkward/operations/str/ak_capitalize.py b/src/awkward/operations/str/ak_capitalize.py index 84e2843e00..9400c21c9e 100644 --- 
a/src/awkward/operations/str/ak_capitalize.py +++ b/src/awkward/operations/str/ak_capitalize.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_capitalize, pc.ascii_capitalize, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/ak_center.py index 284e6595c3..9bd2246673 100644 --- a/src/awkward/operations/str/ak_center.py +++ b/src/awkward/operations/str/ak_center.py @@ -52,7 +52,7 @@ def _impl(array, width, padding, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_center, pc.ascii_center, width, padding, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/ak_is_alnum.py index ac28e085b8..2f93d87982 100644 --- a/src/awkward/operations/str/ak_is_alnum.py +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_alnum, pc.ascii_is_alnum, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/ak_is_alpha.py index 283ad5a4c6..c40f612e75 100644 --- a/src/awkward/operations/str/ak_is_alpha.py +++ b/src/awkward/operations/str/ak_is_alpha.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_alpha, pc.ascii_is_alpha, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_ascii.py b/src/awkward/operations/str/ak_is_ascii.py index 
c00d349048..bc588f2888 100644 --- a/src/awkward/operations/str/ak_is_ascii.py +++ b/src/awkward/operations/str/ak_is_ascii.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.string_is_ascii, pc.string_is_ascii, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/ak_is_decimal.py index 8a2f4b0fe7..26ff606bd0 100644 --- a/src/awkward/operations/str/ak_is_decimal.py +++ b/src/awkward/operations/str/ak_is_decimal.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_decimal, pc.ascii_is_decimal, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/ak_is_digit.py index 3cd5f343ae..338b86d30a 100644 --- a/src/awkward/operations/str/ak_is_digit.py +++ b/src/awkward/operations/str/ak_is_digit.py @@ -49,7 +49,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_digit, pc.utf8_is_digit, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_lower.py b/src/awkward/operations/str/ak_is_lower.py index 74c832ba77..87dd3462a6 100644 --- a/src/awkward/operations/str/ak_is_lower.py +++ b/src/awkward/operations/str/ak_is_lower.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_lower, pc.ascii_is_lower, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_numeric.py 
b/src/awkward/operations/str/ak_is_numeric.py index 9bf89c814a..437ff31b47 100644 --- a/src/awkward/operations/str/ak_is_numeric.py +++ b/src/awkward/operations/str/ak_is_numeric.py @@ -49,7 +49,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_numeric, pc.utf8_is_numeric, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/ak_is_printable.py index cf42bfcc97..24c5184fde 100644 --- a/src/awkward/operations/str/ak_is_printable.py +++ b/src/awkward/operations/str/ak_is_printable.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_printable, pc.ascii_is_printable, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/ak_is_space.py index 00ace2eb51..5b69031d1f 100644 --- a/src/awkward/operations/str/ak_is_space.py +++ b/src/awkward/operations/str/ak_is_space.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_space, pc.ascii_is_space, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_is_title.py b/src/awkward/operations/str/ak_is_title.py index e463d00685..5275a1df0e 100644 --- a/src/awkward/operations/str/ak_is_title.py +++ b/src/awkward/operations/str/ak_is_title.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_title, pc.ascii_is_title, bytestring_to_string=True ), behavior, diff --git 
a/src/awkward/operations/str/ak_is_upper.py b/src/awkward/operations/str/ak_is_upper.py index 8cff3a78bc..fa20f04fe6 100644 --- a/src/awkward/operations/str/ak_is_upper.py +++ b/src/awkward/operations/str/ak_is_upper.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_is_upper, pc.ascii_is_upper, # pc.ascii_is_upper is defined on binary, but for consistency with is_lower and is_title... diff --git a/src/awkward/operations/str/ak_length.py b/src/awkward/operations/str/ak_length.py index f77ce22f76..e5ef1c7b84 100644 --- a/src/awkward/operations/str/ak_length.py +++ b/src/awkward/operations/str/ak_length.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_length, pc.binary_length, bytestring_to_string=False ), behavior, diff --git a/src/awkward/operations/str/ak_lower.py b/src/awkward/operations/str/ak_lower.py index 92766b8f48..971ffe043e 100644 --- a/src/awkward/operations/str/ak_lower.py +++ b/src/awkward/operations/str/ak_lower.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_lower, pc.ascii_lower, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_lpad.py b/src/awkward/operations/str/ak_lpad.py index 2aad079ea7..909f1663d9 100644 --- a/src/awkward/operations/str/ak_lpad.py +++ b/src/awkward/operations/str/ak_lpad.py @@ -52,7 +52,7 @@ def _impl(array, width, padding, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_lpad, pc.ascii_lpad, width, padding, 
bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/ak_ltrim.py index 1a1959bdb6..0180270067 100644 --- a/src/awkward/operations/str/ak_ltrim.py +++ b/src/awkward/operations/str/ak_ltrim.py @@ -51,7 +51,7 @@ def _impl(array, characters, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_ltrim, pc.ascii_ltrim, characters, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_ltrim_whitespace.py b/src/awkward/operations/str/ak_ltrim_whitespace.py index f465f81e13..e415a1400f 100644 --- a/src/awkward/operations/str/ak_ltrim_whitespace.py +++ b/src/awkward/operations/str/ak_ltrim_whitespace.py @@ -46,7 +46,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_ltrim_whitespace, pc.ascii_ltrim_whitespace, bytestring_to_string=True, diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py index e569458b66..cd80f111aa 100644 --- a/src/awkward/operations/str/ak_replace_slice.py +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -53,7 +53,7 @@ def _impl(array, start, stop, replacement, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_replace_slice, pc.binary_replace_slice, start, stop, replacement ), behavior, diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py index a589afe136..691e9fd3e7 100644 --- a/src/awkward/operations/str/ak_replace_substring.py +++ b/src/awkward/operations/str/ak_replace_substring.py @@ -55,7 +55,7 @@ def _impl(array, pattern, replacement, max_replacements, highlevel, 
behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.replace_substring, pc.replace_substring, pattern, diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py index be63772e61..77dc2c12b2 100644 --- a/src/awkward/operations/str/ak_replace_substring_regex.py +++ b/src/awkward/operations/str/ak_replace_substring_regex.py @@ -55,7 +55,7 @@ def _impl(array, pattern, replacement, max_replacements, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.replace_substring_regex, pc.replace_substring_regex, pattern, diff --git a/src/awkward/operations/str/ak_reverse.py b/src/awkward/operations/str/ak_reverse.py index 627f8a95cf..6f15db9df8 100644 --- a/src/awkward/operations/str/ak_reverse.py +++ b/src/awkward/operations/str/ak_reverse.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_reverse, pc.binary_reverse, bytestring_to_string=False ), behavior, diff --git a/src/awkward/operations/str/ak_rpad.py b/src/awkward/operations/str/ak_rpad.py index 5146abb6bb..da0cf61fb6 100644 --- a/src/awkward/operations/str/ak_rpad.py +++ b/src/awkward/operations/str/ak_rpad.py @@ -52,7 +52,7 @@ def _impl(array, width, padding, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_rpad, pc.ascii_rpad, width, padding, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/ak_rtrim.py index db5f8f7344..3d1d518754 100644 --- a/src/awkward/operations/str/ak_rtrim.py +++ 
b/src/awkward/operations/str/ak_rtrim.py @@ -51,7 +51,7 @@ def _impl(array, characters, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_rtrim, pc.ascii_rtrim, characters, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_rtrim_whitespace.py b/src/awkward/operations/str/ak_rtrim_whitespace.py index 17df969275..e2064bc412 100644 --- a/src/awkward/operations/str/ak_rtrim_whitespace.py +++ b/src/awkward/operations/str/ak_rtrim_whitespace.py @@ -46,7 +46,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_rtrim_whitespace, pc.ascii_rtrim_whitespace, bytestring_to_string=True, diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/ak_split_pattern.py index d8d952db25..b94187d9fe 100644 --- a/src/awkward/operations/str/ak_split_pattern.py +++ b/src/awkward/operations/str/ak_split_pattern.py @@ -16,7 +16,7 @@ def split_pattern( """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str or bytes): Individual characters to be trimmed from the string. + pattern (str or bytes): Pattern of characters/bytes to split on. max_splits (None or int): Maximum number of splits for each input value. If None, unlimited. reverse (bool): If True, start splitting from the end of each input value; otherwise, start splitting from the beginning of each value. This flag only has an effect if `max_splits` is not None. 
@@ -45,13 +45,13 @@ def _impl(array, pattern, max_splits, reverse, highlevel, behavior): import pyarrow.compute as pc behavior = behavior_of(array, behavior=behavior) - action = ak.operations.str._get_action( + action = ak.operations.str._get_split_action( pc.split_pattern, pc.split_pattern, pattern=pattern, max_splits=max_splits, reverse=reverse, - bytestring_to_string=True, + bytestring_to_string=False, ) out = ak._do.recursively_apply(ak.operations.to_layout(array), action, behavior) diff --git a/src/awkward/operations/str/ak_split_whitespace.py b/src/awkward/operations/str/ak_split_whitespace.py index 198aa09ac6..07be7a0e5c 100644 --- a/src/awkward/operations/str/ak_split_whitespace.py +++ b/src/awkward/operations/str/ak_split_whitespace.py @@ -49,11 +49,10 @@ def split_whitespace( def _impl(array, max_splits, reverse, highlevel, behavior): import awkward._connect.pyarrow # noqa: F401, I001 - import pyarrow.compute as pc behavior = behavior_of(array, behavior=behavior) - action = ak.operations.str._get_action( + action = ak.operations.str._get_split_action( pc.utf8_split_whitespace, pc.ascii_split_whitespace, max_splits=max_splits, diff --git a/src/awkward/operations/str/ak_swapcase.py b/src/awkward/operations/str/ak_swapcase.py index 1ff02dabad..36d6d53e11 100644 --- a/src/awkward/operations/str/ak_swapcase.py +++ b/src/awkward/operations/str/ak_swapcase.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_swapcase, pc.ascii_swapcase, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_title.py b/src/awkward/operations/str/ak_title.py index 8314002311..cdd147c012 100644 --- a/src/awkward/operations/str/ak_title.py +++ b/src/awkward/operations/str/ak_title.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - 
ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_title, pc.ascii_title, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py index d932016b3f..c43df209be 100644 --- a/src/awkward/operations/str/ak_trim.py +++ b/src/awkward/operations/str/ak_trim.py @@ -51,7 +51,7 @@ def _impl(array, characters, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_trim, pc.ascii_trim, characters, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_trim_whitespace.py b/src/awkward/operations/str/ak_trim_whitespace.py index 891c6d706e..197aa777cd 100644 --- a/src/awkward/operations/str/ak_trim_whitespace.py +++ b/src/awkward/operations/str/ak_trim_whitespace.py @@ -46,7 +46,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_trim_whitespace, pc.ascii_trim_whitespace, bytestring_to_string=True ), behavior, diff --git a/src/awkward/operations/str/ak_upper.py b/src/awkward/operations/str/ak_upper.py index f4ae131af7..776b0526c0 100644 --- a/src/awkward/operations/str/ak_upper.py +++ b/src/awkward/operations/str/ak_upper.py @@ -47,7 +47,7 @@ def _impl(array, highlevel, behavior): out = ak._do.recursively_apply( ak.operations.to_layout(array), - ak.operations.str._get_action( + ak.operations.str._get_ufunc_action( pc.utf8_upper, pc.ascii_upper, bytestring_to_string=True ), behavior, From 65d216696eae1157bd2dac0a4ab58c6b6db435c1 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 12:43:15 +0100 Subject: [PATCH 38/73] feat: add `ak_split_pattern_regex` --- src/awkward/operations/str/__init__.py | 1 + .../operations/str/ak_split_pattern_regex.py | 58 +++++++++++++++++++ 2 files changed, 59 
insertions(+) create mode 100644 src/awkward/operations/str/ak_split_pattern_regex.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 260cb9c1b3..434d9150da 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -44,6 +44,7 @@ # string splitting from awkward.operations.str.ak_split_whitespace import * from awkward.operations.str.ak_split_pattern import * +from awkward.operations.str.ak_split_pattern_regex import * # string component extraction diff --git a/src/awkward/operations/str/ak_split_pattern_regex.py b/src/awkward/operations/str/ak_split_pattern_regex.py new file mode 100644 index 0000000000..56a7876efd --- /dev/null +++ b/src/awkward/operations/str/ak_split_pattern_regex.py @@ -0,0 +1,58 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("split_pattern_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def split_pattern_regex( + array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Regular expression of characters/bytes to split on. + max_splits (None or int): Maximum number of splits for each input value. If None, unlimited. + reverse (bool): If True, start splitting from the end of each input value; otherwise, start splitting + from the beginning of each value. This flag only has an effect if `max_splits` is not None. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Splits any string or bytestring-valued data into a list of substrings according to the given regular expression. 
+ + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, max_splits, reverse, highlevel, behavior) + + +def _impl(array, pattern, max_splits, reverse, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + action = ak.operations.str._get_split_action( + pc.split_pattern_regex, + pc.split_pattern_regex, + pattern=pattern, + max_splits=max_splits, + reverse=reverse, + bytestring_to_string=False, + ) + out = ak._do.recursively_apply(ak.operations.to_layout(array), action, behavior) + + return wrap_layout(out, behavior, highlevel) From 0e267980f789d857d06d5e9228ba98a92dc76508 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 13:09:38 +0100 Subject: [PATCH 39/73] test: update tests for new features --- tests/test_2616_use_pyarrow_for_strings.py | 104 ++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index f4eb716d18..8af0023a0a 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -4,7 +4,7 @@ import awkward as ak -pytest.importorskip("pyarrow") +pyarrow = pytest.importorskip("pyarrow") string = ak.Array( [ @@ -40,6 +40,14 @@ ] ) +string_repeats = ak.Array( + [["foo123bar123baz", "foo", "bar"], ["123foo", "456bar", "foo123456bar"], []] +) + +bytestring_repeats = ak.Array( + [[b"foo123bar123baz", b"foo", b"bar"], [b"123foo", b"456bar", b"foo123456bar"], []] +) + def test_is_alnum(): assert ak.str.is_alnum(string).tolist() == [ @@ -582,3 +590,97 @@ def test_split_whitespace(): 
[b"", b"abc", b""], ], ] + + +def test_split_pattern(): + assert ak.str.split_pattern(string_repeats, "123", max_splits=1).tolist() == [ + [["foo", "bar123baz"], ["foo"], ["bar"]], + [["", "foo"], ["456bar"], ["foo", "456bar"]], + [], + ] + assert ak.str.split_pattern( + string_repeats, "123", max_splits=1, reverse=True + ).tolist() == [ + [["foo123bar", "baz"], ["foo"], ["bar"]], + [["", "foo"], ["456bar"], ["foo", "456bar"]], + [], + ] + assert ak.str.split_pattern(string_repeats, "123", max_splits=None).tolist() == [ + [["foo", "bar", "baz"], ["foo"], ["bar"]], + [["", "foo"], ["456bar"], ["foo", "456bar"]], + [], + ] + + # Bytestrings + assert ak.str.split_pattern(bytestring_repeats, b"123", max_splits=1).tolist() == [ + [[b"foo", b"bar123baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"456bar"], [b"foo", b"456bar"]], + [], + ] + assert ak.str.split_pattern( + bytestring_repeats, b"123", max_splits=1, reverse=True + ).tolist() == [ + [[b"foo123bar", b"baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"456bar"], [b"foo", b"456bar"]], + [], + ] + assert ak.str.split_pattern( + bytestring_repeats, b"123", max_splits=None + ).tolist() == [ + [[b"foo", b"bar", b"baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"456bar"], [b"foo", b"456bar"]], + [], + ] + + +def test_split_pattern_regex(): + assert ak.str.split_pattern_regex( + string_repeats, r"\d{3}", max_splits=1 + ).tolist() == [ + [["foo", "bar123baz"], ["foo"], ["bar"]], + [["", "foo"], ["", "bar"], ["foo", "456bar"]], + [], + ] + with pytest.raises( + pyarrow.ArrowNotImplementedError, match=r"split in reverse with regex" + ): + assert ak.str.split_pattern_regex( + string_repeats, r"\d{3}", max_splits=1, reverse=True + ).tolist() == [ + [["foo123bar", "baz"], ["foo"], ["bar"]], + [["", "foo"], ["", "bar"], ["foo", "456bar"]], + [], + ] + assert ak.str.split_pattern_regex( + string_repeats, r"\d{3}", max_splits=None + ).tolist() == [ + [["foo", "bar", "baz"], ["foo"], ["bar"]], + [["", "foo"], ["", "bar"], ["foo", 
"", "bar"]], + [], + ] + + # Bytestrings + assert ak.str.split_pattern_regex( + bytestring_repeats, rb"\d{3}", max_splits=1 + ).tolist() == [ + [[b"foo", b"bar123baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"", b"bar"], [b"foo", b"456bar"]], + [], + ] + with pytest.raises( + pyarrow.ArrowNotImplementedError, match=r"split in reverse with regex" + ): + assert ak.str.split_pattern_regex( + bytestring_repeats, rb"\d{3}", max_splits=1, reverse=True + ).tolist() == [ + [[b"foo123bar", b"baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"", b"bar"], [b"foo", b"456bar"]], + [], + ] + assert ak.str.split_pattern_regex( + bytestring_repeats, rb"\d{3}", max_splits=None + ).tolist() == [ + [[b"foo", b"bar", b"baz"], [b"foo"], [b"bar"]], + [[b"", b"foo"], [b"", b"bar"], [b"foo", b"", b"bar"]], + [], + ] From 5ec706cabadb8d91151b183d0f5b69f436f6d27e Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Mon, 7 Aug 2023 13:04:30 -0500 Subject: [PATCH 40/73] Fixed UnmaskedArray._drop_none. --- src/awkward/contents/unmaskedarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/contents/unmaskedarray.py b/src/awkward/contents/unmaskedarray.py index 12c31e4bc0..4431eb6cb6 100644 --- a/src/awkward/contents/unmaskedarray.py +++ b/src/awkward/contents/unmaskedarray.py @@ -491,7 +491,7 @@ def _remove_structure(self, backend, options): return [self] def _drop_none(self) -> Content: - return self.content[:0] + return self.content def _recursively_apply( self, action, behavior, depth, depth_context, lateral_context, options From bd8e2e6fca5000fa0643baba9ded846fc6ce220f Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Mon, 7 Aug 2023 13:42:26 -0500 Subject: [PATCH 41/73] fix: adjust for numexpr 2.8.5, which hid getContext's frame_depth argument (#2617) From 73c81217c92e021d1f9c78ca8fd1d29650c6cf91 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Mon, 7 Aug 2023 15:07:25 -0500 Subject: [PATCH 42/73] extract_regex. 
--- src/awkward/operations/str/__init__.py | 2 + .../operations/str/ak_extract_regex.py | 78 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 28 +++++++ 3 files changed, 108 insertions(+) create mode 100644 src/awkward/operations/str/ak_extract_regex.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 434d9150da..76eca3363a 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -48,6 +48,8 @@ # string component extraction +from awkward.operations.str.ak_extract_regex import * + # string joining # string slicing diff --git a/src/awkward/operations/str/ak_extract_regex.py b/src/awkward/operations/str/ak_extract_regex.py new file mode 100644 index 0000000000..9a4aecd038 --- /dev/null +++ b/src/awkward/operations/str/ak_extract_regex.py @@ -0,0 +1,78 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("extract_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def extract_regex(array, pattern, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str or bytes): Regular expression with named capture fields. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Replaces any string-valued data with None if the `pattern` does not match or records whose fields are named capture groups and the substrings they've captured if `pattern` does match. + + Uses [Google RE2](https://github.com/google/re2/wiki/Syntax), and `pattern` must + contain named groups. 
The syntax for a named group is `(?P<...>...)` in which + the first `...` is a name and the last `...` is a regular expression. + + For example, + + >>> array = ak.Array([["one1", "two2", "three3"], [], ["four4", "five5"]]) + >>> result = ak.str.extract_regex(array, "(?P<vowel>[aeiou])(?P<number>[0-9]+)") + >>> result.show(type=True) + type: 3 * var * ?{ + vowel: ?string, + number: ?string + } + [[{vowel: 'e', number: '1'}, {vowel: 'o', number: '2'}, {vowel: 'e', number: '3'}], + [], + [None, {vowel: 'e', number: '5'}]] + + (The string `"four4"` does not match because the vowel is not immediately before + the number.) + + Regular expressions with unnamed groups or features not implemented by RE2 raise an error. + + Note: this function does not raise an error if the `array` does + not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.extract_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.extract_regex.html) + or + [pyarrow.compute.extract_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.extract_regex.html) + on strings and bytestrings, respectively.
+ """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, highlevel, behavior) + + +def _impl(array, pattern, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + behavior = behavior_of(array, behavior=behavior) + + out = ak._do.recursively_apply( + ak.operations.to_layout(array), + ak.operations.str._get_ufunc_action( + pc.extract_regex, pc.extract_regex, pattern, bytestring_to_string=False + ), + behavior, + ) + + return wrap_layout(out, behavior, highlevel) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 8af0023a0a..d276bbd579 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -684,3 +684,31 @@ def test_split_pattern_regex(): [[b"", b"foo"], [b"", b"bar"], [b"foo", b"", b"bar"]], [], ] + + +def test_extract_regex(): + assert ak.str.extract_regex( + ak.Array([["one1", "two2", "three3"], [], ["four4", "five5"]]), + "(?P<vowel>[aeiou])(?P<number>[0-9]+)", + ).tolist() == [ + [ + {"vowel": "e", "number": "1"}, + {"vowel": "o", "number": "2"}, + {"vowel": "e", "number": "3"}, + ], + [], + [None, {"vowel": "e", "number": "5"}], + ] + + assert ak.str.extract_regex( + ak.Array([[b"one1", b"two2", b"three3"], [], [b"four4", b"five5"]]), + b"(?P<vowel>[aeiou])(?P<number>[0-9]+)", + ).tolist() == [ + [ + {"vowel": b"e", "number": b"1"}, + {"vowel": b"o", "number": b"2"}, + {"vowel": b"e", "number": b"3"}, + ], + [], + [None, {"vowel": b"e", "number": b"5"}], + ] From dc0746ca747282b30f717f6b2de4b0992c0cd932 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Mon, 7 Aug 2023 15:23:25 -0500 Subject: [PATCH 43/73] join (almost entirely from https://gist.github.com/agoose77/28e5bb0250678e454356a85861a16368) --- src/awkward/operations/str/__init__.py | 2 + src/awkward/operations/str/ak_join.py | 122 +++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 30 +++++ 3 files changed, 154 insertions(+)
create mode 100644 src/awkward/operations/str/ak_join.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 76eca3363a..762a90a0be 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -52,6 +52,8 @@ # string joining +from awkward.operations.str.ak_join import * + # string slicing from awkward.operations.str.ak_slice import * diff --git a/src/awkward/operations/str/ak_join.py b/src/awkward/operations/str/ak_join.py new file mode 100644 index 0000000000..622c483c22 --- /dev/null +++ b/src/awkward/operations/str/ak_join.py @@ -0,0 +1,122 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("join",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def join(array, separator, *, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + separator (str, bytes, or array of them to broadcast): separator to insert + between strings. If array-like, `separator` is broadcast against `array` + which permits a unique separator for each list of strings. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Concatenate the strings in `array`. The separator is inserted between each string. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.binary_join](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_join.html). 
+ """ + # Dispatch + yield (array, separator) + + # Implementation + return _impl(array, separator, highlevel, behavior) + + +def _is_maybe_optional_list_of_string(layout): + if layout.is_list and layout.parameter("__array__") in {"string", "bytestring"}: + return True + elif layout.is_option or layout.is_indexed: + return _is_maybe_optional_list_of_string(layout.content) + else: + return False + + +def _impl(array, separator, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + import pyarrow.compute as pc + + def apply_unary(layout, **kwargs): + if not (layout.is_list and layout.purelist_depth == 2): + return + + if not _is_maybe_optional_list_of_string(layout.content): + return + + # We have (maybe option/indexed type wrapping) strings + + arrow_array = to_arrow( + # Arrow needs an option type here + layout.copy(content=ak.contents.UnmaskedArray.simplified(layout.content)), + extensionarray=False, + # This kernel requires non-large string/bytestrings + string_to32=True, + bytestring_to32=True, + ) + return from_arrow( + pc.binary_join(arrow_array, separator), + highlevel=False, + ) + + def apply_binary(layouts, **kwargs): + layout, separator_layout = layouts + if not (layout.is_list and layout.purelist_depth == 2): + return + + if not _is_maybe_optional_list_of_string(layout.content): + return + + if not _is_maybe_optional_list_of_string(separator_layout): + raise TypeError( + f"separator must be a list of strings, not {type(separator_layout)}" + ) + + # We have (maybe option/indexed type wrapping) strings + layout_arrow = to_arrow( + # Arrow needs an option type here + layout.copy(content=ak.contents.UnmaskedArray.simplified(layout.content)), + extensionarray=False, + # This kernel requires non-large string/bytestrings + string_to32=True, + bytestring_to32=True, + ) + separator_arrow = to_arrow( + separator_layout, + 
extensionarray=False, + # This kernel requires non-large string/bytestrings + string_to32=True, + bytestring_to32=True, + ) + return ( + from_arrow( + pc.binary_join(layout_arrow, separator_arrow), + highlevel=False, + ), + ) + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, separator, behavior=behavior) + if isinstance(separator, (bytes, str)): + out = ak._do.recursively_apply(layout, apply_unary, behavior=behavior) + else: + separator_layout = ak.to_layout(separator, allow_record=False, allow_other=True) + (out,) = ak._broadcasting.broadcast_and_apply( + (layout, separator_layout), apply_binary, behavior + ) + + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index d276bbd579..cd5437cdb3 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -712,3 +712,33 @@ def test_extract_regex(): [], [None, {"vowel": b"e", "number": b"5"}], ] + + +def test_join(): + array1 = ak.Array( + [ + ["this", "that"], + [], + ["foo", "bar", "baz"], + ] + ) + assert ak.str.join(array1, "-").tolist() == ["this-that", "", "foo-bar-baz"] + + separator = ak.Array(["→", "↑", "←"]) + assert ak.str.join(array1, separator).tolist() == ["this→that", "", "foo←bar←baz"] + + array2 = ak.Array( + [ + [b"this", b"that"], + [], + [b"foo", b"bar", b"baz"], + ] + ) + assert ak.str.join(array2, b"-").tolist() == [b"this-that", b"", b"foo-bar-baz"] + + separator = ak.Array(["→".encode(), "↑".encode(), "←".encode()]) + assert ak.str.join(array2, separator).tolist() == [ + "this→that".encode(), + b"", + "foo←bar←baz".encode(), + ] From 43aa272e21e683aff4547720644f067f78f35059 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Mon, 7 Aug 2023 15:29:49 -0500 Subject: [PATCH 44/73] use dispatch correctly --- src/awkward/operations/str/ak_center.py | 2 +- 
src/awkward/operations/str/ak_extract_regex.py | 2 +- src/awkward/operations/str/ak_lpad.py | 2 +- src/awkward/operations/str/ak_ltrim.py | 2 +- src/awkward/operations/str/ak_repeat.py | 2 +- src/awkward/operations/str/ak_replace_slice.py | 2 +- src/awkward/operations/str/ak_replace_substring.py | 4 ++-- src/awkward/operations/str/ak_replace_substring_regex.py | 4 ++-- src/awkward/operations/str/ak_rpad.py | 2 +- src/awkward/operations/str/ak_rtrim.py | 2 +- src/awkward/operations/str/ak_slice.py | 2 +- src/awkward/operations/str/ak_split_pattern.py | 4 ++-- src/awkward/operations/str/ak_split_pattern_regex.py | 4 ++-- src/awkward/operations/str/ak_split_whitespace.py | 4 ++-- src/awkward/operations/str/ak_trim.py | 2 +- 15 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/ak_center.py index 9bd2246673..3d0da1893b 100644 --- a/src/awkward/operations/str/ak_center.py +++ b/src/awkward/operations/str/ak_center.py @@ -37,7 +37,7 @@ def center(array, width, padding=" ", *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array,) + yield (array, width, padding) # Implementation return _impl(array, width, padding, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_extract_regex.py b/src/awkward/operations/str/ak_extract_regex.py index 9a4aecd038..c3bcdc1d49 100644 --- a/src/awkward/operations/str/ak_extract_regex.py +++ b/src/awkward/operations/str/ak_extract_regex.py @@ -54,7 +54,7 @@ def extract_regex(array, pattern, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. 
""" # Dispatch - yield (array,) + yield (array, pattern) # Implementation return _impl(array, pattern, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_lpad.py b/src/awkward/operations/str/ak_lpad.py index 909f1663d9..5a869f2a92 100644 --- a/src/awkward/operations/str/ak_lpad.py +++ b/src/awkward/operations/str/ak_lpad.py @@ -37,7 +37,7 @@ def lpad(array, width, padding=" ", *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array,) + yield (array, width, padding) # Implementation return _impl(array, width, padding, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/ak_ltrim.py index 0180270067..2399d7ed9a 100644 --- a/src/awkward/operations/str/ak_ltrim.py +++ b/src/awkward/operations/str/ak_ltrim.py @@ -36,7 +36,7 @@ def ltrim(array, characters, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array,) + yield (array, characters) # Implementation return _impl(array, characters, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_repeat.py b/src/awkward/operations/str/ak_repeat.py index 4419eed4c2..75324de63a 100644 --- a/src/awkward/operations/str/ak_repeat.py +++ b/src/awkward/operations/str/ak_repeat.py @@ -36,7 +36,7 @@ def repeat(array, num_repeats, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array,) + yield (array, num_repeats) # Implementation return _impl(array, num_repeats, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py index cd80f111aa..ce6ab2fbab 100644 --- a/src/awkward/operations/str/ak_replace_slice.py +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -38,7 +38,7 @@ def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=N on strings and bytestrings, respectively. 
""" # Dispatch - yield (array,) + yield (array, start, stop, replacement) # Implementation return _impl(array, start, stop, replacement, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py index 691e9fd3e7..bc5dae8b96 100644 --- a/src/awkward/operations/str/ak_replace_substring.py +++ b/src/awkward/operations/str/ak_replace_substring.py @@ -11,7 +11,7 @@ @high_level_function def replace_substring( - array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None + array, pattern, replacement, max_replacements=None, *, highlevel=True, behavior=None ): """ Args: @@ -40,7 +40,7 @@ def replace_substring( on strings and bytestrings, respectively. """ # Dispatch - yield (array,) + yield (array, pattern, replacement, max_replacements) # Implementation return _impl(array, pattern, replacement, max_replacements, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py index 77dc2c12b2..fc737da163 100644 --- a/src/awkward/operations/str/ak_replace_substring_regex.py +++ b/src/awkward/operations/str/ak_replace_substring_regex.py @@ -11,7 +11,7 @@ @high_level_function def replace_substring_regex( - array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None + array, pattern, replacement, max_replacements=None, *, highlevel=True, behavior=None ): """ Args: @@ -40,7 +40,7 @@ def replace_substring_regex( on strings and bytestrings, respectively. 
""" # Dispatch - yield (array,) + yield (array, pattern, replacement, max_replacements) # Implementation return _impl(array, pattern, replacement, max_replacements, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_rpad.py b/src/awkward/operations/str/ak_rpad.py index da0cf61fb6..02fb2f4fcc 100644 --- a/src/awkward/operations/str/ak_rpad.py +++ b/src/awkward/operations/str/ak_rpad.py @@ -37,7 +37,7 @@ def rpad(array, width, padding=" ", *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array,) + yield (array, width, padding) # Implementation return _impl(array, width, padding, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/ak_rtrim.py index 3d1d518754..00be21f2ba 100644 --- a/src/awkward/operations/str/ak_rtrim.py +++ b/src/awkward/operations/str/ak_rtrim.py @@ -36,7 +36,7 @@ def rtrim(array, characters, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array,) + yield (array, characters) # Implementation return _impl(array, characters, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_slice.py b/src/awkward/operations/str/ak_slice.py index 7afaab7d93..cba2775a37 100644 --- a/src/awkward/operations/str/ak_slice.py +++ b/src/awkward/operations/str/ak_slice.py @@ -37,7 +37,7 @@ def slice(array, start, stop=None, step=1, *, highlevel=True, behavior=None): or performs a literal slice on strings and bytestrings, respectively. 
""" # Dispatch - yield (array,) + yield (array, start, stop, step) # Implementation return _impl(array, start, stop, step, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/ak_split_pattern.py index b94187d9fe..680eef3134 100644 --- a/src/awkward/operations/str/ak_split_pattern.py +++ b/src/awkward/operations/str/ak_split_pattern.py @@ -11,7 +11,7 @@ @high_level_function def split_pattern( - array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None + array, pattern, max_splits=None, reverse=False, *, highlevel=True, behavior=None ): """ Args: @@ -33,7 +33,7 @@ def split_pattern( [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). """ # Dispatch - yield (array,) + yield (array, pattern, max_splits, reverse) # Implementation return _impl(array, pattern, max_splits, reverse, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_split_pattern_regex.py b/src/awkward/operations/str/ak_split_pattern_regex.py index 56a7876efd..f34ffa817d 100644 --- a/src/awkward/operations/str/ak_split_pattern_regex.py +++ b/src/awkward/operations/str/ak_split_pattern_regex.py @@ -11,7 +11,7 @@ @high_level_function def split_pattern_regex( - array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None + array, pattern, max_splits=None, reverse=False, *, highlevel=True, behavior=None ): """ Args: @@ -33,7 +33,7 @@ def split_pattern_regex( [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). 
""" # Dispatch - yield (array,) + yield (array, pattern, max_splits, reverse) # Implementation return _impl(array, pattern, max_splits, reverse, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_split_whitespace.py b/src/awkward/operations/str/ak_split_whitespace.py index 07be7a0e5c..95d951f725 100644 --- a/src/awkward/operations/str/ak_split_whitespace.py +++ b/src/awkward/operations/str/ak_split_whitespace.py @@ -11,7 +11,7 @@ @high_level_function def split_whitespace( - array, *, max_splits=None, reverse=False, highlevel=True, behavior=None + array, max_splits=None, reverse=False, *, highlevel=True, behavior=None ): """ Args: @@ -41,7 +41,7 @@ def split_whitespace( on strings and bytestrings, respectively. """ # Dispatch - yield (array,) + yield (array, max_splits, reverse) # Implementation return _impl(array, max_splits, reverse, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py index c43df209be..192aad730a 100644 --- a/src/awkward/operations/str/ak_trim.py +++ b/src/awkward/operations/str/ak_trim.py @@ -36,7 +36,7 @@ def trim(array, characters, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. 
""" # Dispatch - yield (array,) + yield (array, characters) # Implementation return _impl(array, characters, highlevel, behavior) From cbf15776f3a17590cb56a8e16d9988d884f689e9 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 22:21:13 +0100 Subject: [PATCH 45/73] fix: drop unused arg --- src/awkward/operations/str/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 762a90a0be..4d87449799 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -66,7 +66,6 @@ def _get_ufunc_action( ascii_function, *args, bytestring_to_string=False, - drop_unmasked_option=False, **kwargs, ): from awkward.operations.ak_from_arrow import from_arrow From 068b6af7c279daed23b101d2cf4decbfc312ffa0 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Mon, 7 Aug 2023 16:13:58 -0500 Subject: [PATCH 46/73] join_element_wise --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_join.py | 2 + .../operations/str/ak_join_element_wise.py | 71 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 22 ++++++ 4 files changed, 96 insertions(+) create mode 100644 src/awkward/operations/str/ak_join_element_wise.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 4d87449799..fd3c651f19 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -53,6 +53,7 @@ # string joining from awkward.operations.str.ak_join import * +from awkward.operations.str.ak_join_element_wise import * # string slicing diff --git a/src/awkward/operations/str/ak_join.py b/src/awkward/operations/str/ak_join.py index 622c483c22..40289bc4a6 100644 --- a/src/awkward/operations/str/ak_join.py +++ b/src/awkward/operations/str/ak_join.py @@ -27,6 +27,8 @@ def join(array, separator, *, highlevel=True, behavior=None): Requires the pyarrow library and calls 
[pyarrow.compute.binary_join](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_join.html). + + See also: #ak.str.join_element_wise. """ # Dispatch yield (array, separator) diff --git a/src/awkward/operations/str/ak_join_element_wise.py b/src/awkward/operations/str/ak_join_element_wise.py new file mode 100644 index 0000000000..ad3639adb6 --- /dev/null +++ b/src/awkward/operations/str/ak_join_element_wise.py @@ -0,0 +1,71 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("join_element_wise",) + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def join_element_wise(*arrays, highlevel=True, behavior=None): + """ + Args: + arrays: Array-like data (anything #ak.to_layout recognizes). + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Broadcasts and concatenates all but the last array of strings in `arrays`; the last is used as a separator. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.binary_join_element_wise](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_join_element_wise.html). + + Unlike Arrow's `binary_join_element_wise`, this function has no `null_handling` + and `null_replacement` arguments. This function's behavior is like + `null_handling="emit_null"` (Arrow's default). The other cases can be implemented + with Awkward slices, #ak.drop_none, and #ak.fill_none. + + See also: #ak.str.join. 
+ """ + # Dispatch + yield arrays + + # Implementation + return _impl(arrays, highlevel, behavior) + + +def _impl(arrays, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + from awkward.operations.ak_from_arrow import from_arrow + from awkward.operations.ak_to_arrow import to_arrow + + import pyarrow.compute as pc + + layouts = [ak.to_layout(x) for x in arrays] + behavior = behavior_of(*arrays, behavior=behavior) + + if len(arrays) < 1: + raise TypeError("at least one array is required") + + def action(layouts, **kwargs): + if all( + x.is_list and x.parameter("__array__") in ("string", "bytestring") + for x in layouts + ): + return ( + from_arrow( + pc.binary_join_element_wise( + *[to_arrow(x, extensionarray=False) for x in layouts] + ), + highlevel=False, + ), + ) + + (out,) = ak._broadcasting.broadcast_and_apply(layouts, action, behavior) + + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index cd5437cdb3..4c640e90a2 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -742,3 +742,25 @@ def test_join(): b"", "foo←bar←baz".encode(), ] + + +def test_join_element_wise(): + array1 = ak.Array([["one", "two", "three"], [], ["four", "five"]]) + array2 = ak.Array([["111", "222", "333"], [], ["444", "555"]]) + separator = ak.Array(["→", "↑", "←"]) + + assert ak.str.join_element_wise(array1, array2, separator).tolist() == [ + ["one→111", "two→222", "three→333"], + [], + ["four←444", "five←555"], + ] + + array1 = ak.Array([[b"one", b"two", b"three"], [], [b"four", b"five"]]) + array2 = ak.Array([[b"111", b"222", b"333"], [], [b"444", b"555"]]) + separator = ak.Array(["→".encode(), "↑".encode(), "←".encode()]) + + assert ak.str.join_element_wise(array1, array2, separator).tolist() == [ + ["one→111".encode(), "two→222".encode(), "three→333".encode()], + [], + ["four←444".encode(), 
"five←555".encode()], + ] From ffeef7b9674d0dcf3af08baa4b812c39d97b801e Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 22:23:12 +0100 Subject: [PATCH 47/73] Revert "use dispatch correctly" This reverts commit 559073b1b047e099b5fce0dcd0930bd2154feedd. --- src/awkward/operations/str/ak_center.py | 2 +- src/awkward/operations/str/ak_extract_regex.py | 2 +- src/awkward/operations/str/ak_lpad.py | 2 +- src/awkward/operations/str/ak_ltrim.py | 2 +- src/awkward/operations/str/ak_repeat.py | 2 +- src/awkward/operations/str/ak_replace_slice.py | 2 +- src/awkward/operations/str/ak_replace_substring.py | 4 ++-- src/awkward/operations/str/ak_replace_substring_regex.py | 4 ++-- src/awkward/operations/str/ak_rpad.py | 2 +- src/awkward/operations/str/ak_rtrim.py | 2 +- src/awkward/operations/str/ak_slice.py | 2 +- src/awkward/operations/str/ak_split_pattern.py | 4 ++-- src/awkward/operations/str/ak_split_pattern_regex.py | 4 ++-- src/awkward/operations/str/ak_split_whitespace.py | 4 ++-- src/awkward/operations/str/ak_trim.py | 2 +- 15 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/ak_center.py index 3d0da1893b..9bd2246673 100644 --- a/src/awkward/operations/str/ak_center.py +++ b/src/awkward/operations/str/ak_center.py @@ -37,7 +37,7 @@ def center(array, width, padding=" ", *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array, width, padding) + yield (array,) # Implementation return _impl(array, width, padding, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_extract_regex.py b/src/awkward/operations/str/ak_extract_regex.py index c3bcdc1d49..9a4aecd038 100644 --- a/src/awkward/operations/str/ak_extract_regex.py +++ b/src/awkward/operations/str/ak_extract_regex.py @@ -54,7 +54,7 @@ def extract_regex(array, pattern, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. 
""" # Dispatch - yield (array, pattern) + yield (array,) # Implementation return _impl(array, pattern, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_lpad.py b/src/awkward/operations/str/ak_lpad.py index 5a869f2a92..909f1663d9 100644 --- a/src/awkward/operations/str/ak_lpad.py +++ b/src/awkward/operations/str/ak_lpad.py @@ -37,7 +37,7 @@ def lpad(array, width, padding=" ", *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array, width, padding) + yield (array,) # Implementation return _impl(array, width, padding, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/ak_ltrim.py index 2399d7ed9a..0180270067 100644 --- a/src/awkward/operations/str/ak_ltrim.py +++ b/src/awkward/operations/str/ak_ltrim.py @@ -36,7 +36,7 @@ def ltrim(array, characters, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array, characters) + yield (array,) # Implementation return _impl(array, characters, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_repeat.py b/src/awkward/operations/str/ak_repeat.py index 75324de63a..4419eed4c2 100644 --- a/src/awkward/operations/str/ak_repeat.py +++ b/src/awkward/operations/str/ak_repeat.py @@ -36,7 +36,7 @@ def repeat(array, num_repeats, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array, num_repeats) + yield (array,) # Implementation return _impl(array, num_repeats, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py index ce6ab2fbab..cd80f111aa 100644 --- a/src/awkward/operations/str/ak_replace_slice.py +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -38,7 +38,7 @@ def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=N on strings and bytestrings, respectively. 
""" # Dispatch - yield (array, start, stop, replacement) + yield (array,) # Implementation return _impl(array, start, stop, replacement, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py index bc5dae8b96..691e9fd3e7 100644 --- a/src/awkward/operations/str/ak_replace_substring.py +++ b/src/awkward/operations/str/ak_replace_substring.py @@ -11,7 +11,7 @@ @high_level_function def replace_substring( - array, pattern, replacement, max_replacements=None, *, highlevel=True, behavior=None + array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None ): """ Args: @@ -40,7 +40,7 @@ def replace_substring( on strings and bytestrings, respectively. """ # Dispatch - yield (array, pattern, replacement, max_replacements) + yield (array,) # Implementation return _impl(array, pattern, replacement, max_replacements, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py index fc737da163..77dc2c12b2 100644 --- a/src/awkward/operations/str/ak_replace_substring_regex.py +++ b/src/awkward/operations/str/ak_replace_substring_regex.py @@ -11,7 +11,7 @@ @high_level_function def replace_substring_regex( - array, pattern, replacement, max_replacements=None, *, highlevel=True, behavior=None + array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None ): """ Args: @@ -40,7 +40,7 @@ def replace_substring_regex( on strings and bytestrings, respectively. 
""" # Dispatch - yield (array, pattern, replacement, max_replacements) + yield (array,) # Implementation return _impl(array, pattern, replacement, max_replacements, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_rpad.py b/src/awkward/operations/str/ak_rpad.py index 02fb2f4fcc..da0cf61fb6 100644 --- a/src/awkward/operations/str/ak_rpad.py +++ b/src/awkward/operations/str/ak_rpad.py @@ -37,7 +37,7 @@ def rpad(array, width, padding=" ", *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array, width, padding) + yield (array,) # Implementation return _impl(array, width, padding, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/ak_rtrim.py index 00be21f2ba..3d1d518754 100644 --- a/src/awkward/operations/str/ak_rtrim.py +++ b/src/awkward/operations/str/ak_rtrim.py @@ -36,7 +36,7 @@ def rtrim(array, characters, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array, characters) + yield (array,) # Implementation return _impl(array, characters, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_slice.py b/src/awkward/operations/str/ak_slice.py index cba2775a37..7afaab7d93 100644 --- a/src/awkward/operations/str/ak_slice.py +++ b/src/awkward/operations/str/ak_slice.py @@ -37,7 +37,7 @@ def slice(array, start, stop=None, step=1, *, highlevel=True, behavior=None): or performs a literal slice on strings and bytestrings, respectively. 
""" # Dispatch - yield (array, start, stop, step) + yield (array,) # Implementation return _impl(array, start, stop, step, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/ak_split_pattern.py index 680eef3134..b94187d9fe 100644 --- a/src/awkward/operations/str/ak_split_pattern.py +++ b/src/awkward/operations/str/ak_split_pattern.py @@ -11,7 +11,7 @@ @high_level_function def split_pattern( - array, pattern, max_splits=None, reverse=False, *, highlevel=True, behavior=None + array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None ): """ Args: @@ -33,7 +33,7 @@ def split_pattern( [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). """ # Dispatch - yield (array, pattern, max_splits, reverse) + yield (array,) # Implementation return _impl(array, pattern, max_splits, reverse, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_split_pattern_regex.py b/src/awkward/operations/str/ak_split_pattern_regex.py index f34ffa817d..56a7876efd 100644 --- a/src/awkward/operations/str/ak_split_pattern_regex.py +++ b/src/awkward/operations/str/ak_split_pattern_regex.py @@ -11,7 +11,7 @@ @high_level_function def split_pattern_regex( - array, pattern, max_splits=None, reverse=False, *, highlevel=True, behavior=None + array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None ): """ Args: @@ -33,7 +33,7 @@ def split_pattern_regex( [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). 
""" # Dispatch - yield (array, pattern, max_splits, reverse) + yield (array,) # Implementation return _impl(array, pattern, max_splits, reverse, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_split_whitespace.py b/src/awkward/operations/str/ak_split_whitespace.py index 95d951f725..07be7a0e5c 100644 --- a/src/awkward/operations/str/ak_split_whitespace.py +++ b/src/awkward/operations/str/ak_split_whitespace.py @@ -11,7 +11,7 @@ @high_level_function def split_whitespace( - array, max_splits=None, reverse=False, *, highlevel=True, behavior=None + array, *, max_splits=None, reverse=False, highlevel=True, behavior=None ): """ Args: @@ -41,7 +41,7 @@ def split_whitespace( on strings and bytestrings, respectively. """ # Dispatch - yield (array, max_splits, reverse) + yield (array,) # Implementation return _impl(array, max_splits, reverse, highlevel, behavior) diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py index 192aad730a..c43df209be 100644 --- a/src/awkward/operations/str/ak_trim.py +++ b/src/awkward/operations/str/ak_trim.py @@ -36,7 +36,7 @@ def trim(array, characters, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. """ # Dispatch - yield (array, characters) + yield (array,) # Implementation return _impl(array, characters, highlevel, behavior) From 19c719730e9c85408056099c62cc3e110bd78468 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 22:24:21 +0100 Subject: [PATCH 48/73] fix: broadcast `num_repeats` --- src/awkward/operations/str/ak_repeat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/operations/str/ak_repeat.py b/src/awkward/operations/str/ak_repeat.py index 4419eed4c2..75324de63a 100644 --- a/src/awkward/operations/str/ak_repeat.py +++ b/src/awkward/operations/str/ak_repeat.py @@ -36,7 +36,7 @@ def repeat(array, num_repeats, *, highlevel=True, behavior=None): on strings and bytestrings, respectively. 
""" # Dispatch - yield (array,) + yield (array, num_repeats) # Implementation return _impl(array, num_repeats, highlevel, behavior) From 21973bdf2885089dca86af83878064b0c9d6c019 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 22:50:15 +0100 Subject: [PATCH 49/73] feat: add `count_substring[_pattern]` --- src/awkward/operations/str/__init__.py | 3 + .../operations/str/ak_count_substring.py | 56 ++++++++++++++++ .../str/ak_count_substring_regex.py | 56 ++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 66 +++++++++++++++++++ 4 files changed, 181 insertions(+) create mode 100644 src/awkward/operations/str/ak_count_substring.py create mode 100644 src/awkward/operations/str/ak_count_substring_regex.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index fd3c651f19..504ff93b3a 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -61,6 +61,9 @@ # containment tests +from awkward.operations.str.ak_count_substring import * +from awkward.operations.str.ak_count_substring_regex import * + def _get_ufunc_action( utf8_function, diff --git a/src/awkward/operations/str/ak_count_substring.py b/src/awkward/operations/str/ak_count_substring.py new file mode 100644 index 0000000000..52575999c8 --- /dev/null +++ b/src/awkward/operations/str/ak_count_substring.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("count_substring",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def count_substring( + array, pattern, *, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str, or bytes): Substring pattern to look for inside the given array. 
+ ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the count the number of occurrences of the given literal pattern. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.count_substring, + pc.count_substring, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/ak_count_substring_regex.py b/src/awkward/operations/str/ak_count_substring_regex.py new file mode 100644 index 0000000000..413c46bca0 --- /dev/null +++ b/src/awkward/operations/str/ak_count_substring_regex.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("count_substring_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def count_substring_regex( + array, pattern, 
*, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str, or bytes): Substring pattern to look for inside the given array. + ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the count the number of occurrences of the given regular expression pattern. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.count_substring_regex, + pc.count_substring_regex, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 4c640e90a2..cc090a6b1d 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -764,3 +764,69 @@ def test_join_element_wise(): [], ["four←444".encode(), "five←555".encode()], ] + + +def test_count_substring(): + assert 
ak.str.count_substring(string_repeats, "BA").tolist() == [ + [0, 0, 0], + [0, 0, 0], + [], + ] + assert ak.str.count_substring(string_repeats, "BA", ignore_case=True).tolist() == [ + [2, 0, 1], + [0, 1, 1], + [], + ] + + # Bytestrings + assert ak.str.count_substring(bytestring_repeats, b"BA").tolist() == [ + [0, 0, 0], + [0, 0, 0], + [], + ] + assert ak.str.count_substring( + bytestring_repeats, b"BA", ignore_case=True + ).tolist() == [ + [2, 0, 1], + [0, 1, 1], + [], + ] + + +def test_count_substring_regex(): + assert ak.str.count_substring_regex(string_repeats, r"BA\d*").tolist() == [ + [0, 0, 0], + [0, 0, 0], + [], + ] + assert ak.str.count_substring_regex( + string_repeats, r"BA\d*", ignore_case=True + ).tolist() == [ + [2, 0, 1], + [0, 1, 1], + [], + ] + assert ak.str.count_substring_regex(string_repeats, r"\d{1,}").tolist() == [ + [2, 0, 0], + [1, 1, 1], + [], + ] + + # Bytestrings + assert ak.str.count_substring_regex(bytestring_repeats, rb"BA\d*").tolist() == [ + [0, 0, 0], + [0, 0, 0], + [], + ] + assert ak.str.count_substring_regex( + bytestring_repeats, rb"BA\d*", ignore_case=True + ).tolist() == [ + [2, 0, 1], + [0, 1, 1], + [], + ] + assert ak.str.count_substring_regex(bytestring_repeats, rb"\d{1,}").tolist() == [ + [2, 0, 0], + [1, 1, 1], + [], + ] From d385e615a750eb0683f56c8adc48e98a6e7f6f90 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 22:51:37 +0100 Subject: [PATCH 50/73] docs: fixup docstring --- src/awkward/operations/str/ak_count_substring.py | 2 +- src/awkward/operations/str/ak_count_substring_regex.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/awkward/operations/str/ak_count_substring.py b/src/awkward/operations/str/ak_count_substring.py index 52575999c8..cbd8cdc550 100644 --- a/src/awkward/operations/str/ak_count_substring.py +++ b/src/awkward/operations/str/ak_count_substring.py @@ -23,7 +23,7 @@ def count_substring( behavior (None or dict): Custom #ak.behavior for the output array, if 
high-level. - For each string in the count the number of occurrences of the given literal pattern. + For each string in the array, count the number of occurrences of the given literal pattern. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. diff --git a/src/awkward/operations/str/ak_count_substring_regex.py b/src/awkward/operations/str/ak_count_substring_regex.py index 413c46bca0..f3041c32e0 100644 --- a/src/awkward/operations/str/ak_count_substring_regex.py +++ b/src/awkward/operations/str/ak_count_substring_regex.py @@ -23,12 +23,12 @@ def count_substring_regex( behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - For each string in the count the number of occurrences of the given regular expression pattern. + For each string in the array, count the number of occurrences of the given regular expression pattern. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). + [pyarrow.compute.count_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring_regex.html). 
""" # Dispatch yield (array,) From c9164d5da544459ed7eddb7896544914954aaab1 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 22:55:33 +0100 Subject: [PATCH 51/73] feat: add `ends_with` --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_ends_with.py | 53 ++++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 25 ++++++++++ 3 files changed, 79 insertions(+) create mode 100644 src/awkward/operations/str/ak_ends_with.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 504ff93b3a..ec8e79e078 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -63,6 +63,7 @@ from awkward.operations.str.ak_count_substring import * from awkward.operations.str.ak_count_substring_regex import * +from awkward.operations.str.ak_ends_with import * def _get_ufunc_action( diff --git a/src/awkward/operations/str/ak_ends_with.py b/src/awkward/operations/str/ak_ends_with.py new file mode 100644 index 0000000000..7a7b2f40a0 --- /dev/null +++ b/src/awkward/operations/str/ak_ends_with.py @@ -0,0 +1,53 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("ends_with",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def ends_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str, or bytes): Substring pattern to look for inside the given array. + ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. 
+ + For each string in the array, determine whether it ends with the given literal suffix. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.ends_with, + pc.ends_with, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index cc090a6b1d..db759b7d78 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -830,3 +830,28 @@ def test_count_substring_regex(): [1, 1, 1], [], ] + + +def test_ends_with(): + assert ak.str.ends_with(string_repeats, "BAR").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.ends_with(string_repeats, "BAR", ignore_case=True).tolist() == [ + [False, False, True], + [False, True, True], + [], + ] + + # Bytestrings + assert ak.str.ends_with(bytestring_repeats, b"BAR").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.ends_with(bytestring_repeats, b"BAR", ignore_case=True).tolist() == [ + [False, False, True], + [False, True, True], + [], + ] From aac5e8a54491e920dd49e3b332bfaf99983d9abf Mon Sep 17 
00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 22:57:18 +0100 Subject: [PATCH 52/73] feat: add `starts_with` --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_starts_with.py | 53 ++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 27 ++++++++++ 3 files changed, 81 insertions(+) create mode 100644 src/awkward/operations/str/ak_starts_with.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index ec8e79e078..2bd84756a1 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -64,6 +64,7 @@ from awkward.operations.str.ak_count_substring import * from awkward.operations.str.ak_count_substring_regex import * from awkward.operations.str.ak_ends_with import * +from awkward.operations.str.ak_starts_with import * def _get_ufunc_action( diff --git a/src/awkward/operations/str/ak_starts_with.py b/src/awkward/operations/str/ak_starts_with.py new file mode 100644 index 0000000000..6452cb8e3b --- /dev/null +++ b/src/awkward/operations/str/ak_starts_with.py @@ -0,0 +1,53 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("starts_with",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def starts_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str, or bytes): Substring pattern to look for inside the given array. + ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. 
+ + For each string in the array, determine whether it starts with the given literal prefix. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.starts_with, + pc.starts_with, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index db759b7d78..37fac3b5a9 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -855,3 +855,30 @@ def test_ends_with(): [False, True, True], [], ] + + +def test_starts_with(): + assert ak.str.starts_with(string_repeats, "FOO").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.starts_with(string_repeats, "FOO", ignore_case=True).tolist() == [ + [True, True, False], + [False, False, True], + [], + ] + + # Bytestrings + assert ak.str.starts_with(bytestring_repeats, b"FOO").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.starts_with( + bytestring_repeats, b"FOO", ignore_case=True + ).tolist() == [ + [True, True, False], + [False, False, True], + [], + ] From
17a6a0e470f6fb82ced119e54605c7ab8c4c193a Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 22:59:07 +0100 Subject: [PATCH 53/73] docs: fix link --- src/awkward/operations/str/ak_ends_with.py | 2 +- src/awkward/operations/str/ak_starts_with.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/awkward/operations/str/ak_ends_with.py b/src/awkward/operations/str/ak_ends_with.py index 7a7b2f40a0..89f82c8fa0 100644 --- a/src/awkward/operations/str/ak_ends_with.py +++ b/src/awkward/operations/str/ak_ends_with.py @@ -26,7 +26,7 @@ def ends_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=Non Note: this function does not raise an error if the `array` does not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). + [pyarrow.compute.ends_with](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ends_with.html). """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_starts_with.py b/src/awkward/operations/str/ak_starts_with.py index 6452cb8e3b..a203c5a318 100644 --- a/src/awkward/operations/str/ak_starts_with.py +++ b/src/awkward/operations/str/ak_starts_with.py @@ -26,7 +26,7 @@ def starts_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=N Note: this function does not raise an error if the `array` does not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). + [pyarrow.compute.starts_with](https://arrow.apache.org/docs/python/generated/pyarrow.compute.starts_with.html). 
""" # Dispatch yield (array,) From 83f1597b92b3b7f91f00d35304f78388c7bce104 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 23:02:05 +0100 Subject: [PATCH 54/73] feat: add `find_substring` --- src/awkward/operations/str/__init__.py | 1 + .../operations/str/ak_find_substring.py | 54 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 27 ++++++++++ 3 files changed, 82 insertions(+) create mode 100644 src/awkward/operations/str/ak_find_substring.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 2bd84756a1..15bc71f85d 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -64,6 +64,7 @@ from awkward.operations.str.ak_count_substring import * from awkward.operations.str.ak_count_substring_regex import * from awkward.operations.str.ak_ends_with import * +from awkward.operations.str.ak_find_substring import * from awkward.operations.str.ak_starts_with import * diff --git a/src/awkward/operations/str/ak_find_substring.py b/src/awkward/operations/str/ak_find_substring.py new file mode 100644 index 0000000000..ed75d90911 --- /dev/null +++ b/src/awkward/operations/str/ak_find_substring.py @@ -0,0 +1,54 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("find_substring",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def find_substring(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str, or bytes): Substring pattern to look for inside the given array. + ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. 
+ highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the array, determine the index at which the first occurrence of the given literal pattern is + found. If the literay pattern is not found inside the string, the index is taken to be -1. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.find_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.find_substring, + pc.find_substring, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 37fac3b5a9..619926b0b9 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -882,3 +882,30 @@ def test_starts_with(): [False, False, True], [], ] + + +def test_find_substring(): + assert ak.str.find_substring(string_repeats, "FOO").tolist() == [ + [-1, -1, -1], + [-1, -1, -1], + [], + ] + assert ak.str.find_substring(string_repeats, "FOO", ignore_case=True).tolist() == [ + [0, 0, -1], + [3, -1, 0], + [], + ] + + # Bytestrings + assert 
ak.str.find_substring(bytestring_repeats, b"FOO").tolist() == [ + [-1, -1, -1], + [-1, -1, -1], + [], + ] + assert ak.str.find_substring( + bytestring_repeats, b"FOO", ignore_case=True + ).tolist() == [ + [0, 0, -1], + [3, -1, 0], + [], + ] From 6ad578fda3e6f162ef0374b2e8add42b3fbdf07f Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 23:04:11 +0100 Subject: [PATCH 55/73] docs: fix typo --- src/awkward/operations/str/ak_find_substring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/operations/str/ak_find_substring.py b/src/awkward/operations/str/ak_find_substring.py index ed75d90911..e7a9acaaa5 100644 --- a/src/awkward/operations/str/ak_find_substring.py +++ b/src/awkward/operations/str/ak_find_substring.py @@ -22,7 +22,7 @@ def find_substring(array, pattern, *, ignore_case=False, highlevel=True, behavio high-level. For each string in the array, determine the index at which the first occurrence of the given literal pattern is - found. If the literay pattern is not found inside the string, the index is taken to be -1. + found. If the literal pattern is not found inside the string, the index is taken to be -1. Note: this function does not raise an error if the `array` does not contain any string or bytestring data. 
From 3141ebb753e66a59dd97b6d7d0af3963798f45a9 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 23:06:58 +0100 Subject: [PATCH 56/73] feat: add `find_substring_regex` --- src/awkward/operations/str/__init__.py | 1 + .../operations/str/ak_find_substring_regex.py | 56 +++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 29 ++++++++++ 3 files changed, 86 insertions(+) create mode 100644 src/awkward/operations/str/ak_find_substring_regex.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 15bc71f85d..2475e43e03 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -65,6 +65,7 @@ from awkward.operations.str.ak_count_substring_regex import * from awkward.operations.str.ak_ends_with import * from awkward.operations.str.ak_find_substring import * +from awkward.operations.str.ak_find_substring_regex import * from awkward.operations.str.ak_starts_with import * diff --git a/src/awkward/operations/str/ak_find_substring_regex.py b/src/awkward/operations/str/ak_find_substring_regex.py new file mode 100644 index 0000000000..6a553163a6 --- /dev/null +++ b/src/awkward/operations/str/ak_find_substring_regex.py @@ -0,0 +1,56 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("find_substring_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def find_substring_regex( + array, pattern, *, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str, or bytes): Substring pattern to look for inside the given array. + ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. 
+ highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the array, determine the index at which the first occurrence of the given regular expression + pattern is found. If the regular expression pattern is not found inside the string, the index is taken to be -1. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.find_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.find_substring_regex, + pc.find_substring_regex, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 619926b0b9..607e7d39df 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -909,3 +909,32 @@ def test_find_substring(): [3, -1, 0], [], ] + + +def test_find_substring_regex(): + assert ak.str.find_substring_regex(string_repeats, r"FOO\d+").tolist() == [ + [-1, -1, -1], + [-1, -1, -1], + [], + ] + assert ak.str.find_substring_regex( + string_repeats, r"FOO\d+", ignore_case=True + ).tolist() == [ + [0, -1, -1], + [-1, 
-1, 0], + [], + ] + + # Bytestrings + assert ak.str.find_substring_regex(bytestring_repeats, rb"FOO\d+").tolist() == [ + [-1, -1, -1], + [-1, -1, -1], + [], + ] + assert ak.str.find_substring_regex( + bytestring_repeats, rb"FOO\d+", ignore_case=True + ).tolist() == [ + [0, -1, -1], + [-1, -1, 0], + [], + ] From 4c69e86e7cbed3a2d97069aa773e642f0082c0d1 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 23:09:29 +0100 Subject: [PATCH 57/73] docs: fix link --- src/awkward/operations/str/ak_find_substring_regex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/operations/str/ak_find_substring_regex.py b/src/awkward/operations/str/ak_find_substring_regex.py index 6a553163a6..a0dc5b2ce0 100644 --- a/src/awkward/operations/str/ak_find_substring_regex.py +++ b/src/awkward/operations/str/ak_find_substring_regex.py @@ -29,7 +29,7 @@ def find_substring_regex( Note: this function does not raise an error if the `array` does not contain any string or bytestring data. Requires the pyarrow library and calls - [pyarrow.compute.find_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring.html). + [pyarrow.compute.find_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring_regex.html). 
""" # Dispatch yield (array,) From 8e230f4fc0a455c7d6483b1f3b60839f6896cb42 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 23:14:22 +0100 Subject: [PATCH 58/73] feat: add `match_like` --- src/awkward/operations/str/__init__.py | 1 + src/awkward/operations/str/ak_match_like.py | 55 +++++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 27 ++++++++++ 3 files changed, 83 insertions(+) create mode 100644 src/awkward/operations/str/ak_match_like.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 2475e43e03..a2c452ce29 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -66,6 +66,7 @@ from awkward.operations.str.ak_ends_with import * from awkward.operations.str.ak_find_substring import * from awkward.operations.str.ak_find_substring_regex import * +from awkward.operations.str.ak_match_like import * from awkward.operations.str.ak_starts_with import * diff --git a/src/awkward/operations/str/ak_match_like.py b/src/awkward/operations/str/ak_match_like.py new file mode 100644 index 0000000000..ef8462d513 --- /dev/null +++ b/src/awkward/operations/str/ak_match_like.py @@ -0,0 +1,55 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("match_like",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def match_like(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str, or bytes): Substring pattern to look for inside the given array. + ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. 
+ behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the array, determine whether it matches the given SQL-style LIKE pattern. + '%' matches any number of characters, '_' matches exactly one character, and any other character matches itself. + To match a literal '%', '_', or "'", the character must be preceded with a backslash. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.match_like](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_like.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.match_like, + pc.match_like, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 607e7d39df..3acf98a84a 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -938,3 +938,30 @@ def test_find_substring_regex(): [-1, -1, 0], [], ] + + +def test_match_like(): + assert ak.str.match_like(string_repeats, "FOO%").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_like(string_repeats, "FOO%", ignore_case=True).tolist() == [ + [True, True, False], + [False, False, True], + [], + ] + + # Bytestrings + assert 
ak.str.match_like(bytestring_repeats, b"FOO%").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_like( + bytestring_repeats, b"FOO%", ignore_case=True + ).tolist() == [ + [True, True, False], + [False, False, True], + [], + ] From c676fbdfe64b3b326a0016067dc7d24aa47fc136 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 23:15:57 +0100 Subject: [PATCH 59/73] test: improve test --- tests/test_2616_use_pyarrow_for_strings.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 3acf98a84a..ae55309763 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -941,27 +941,27 @@ def test_find_substring_regex(): def test_match_like(): - assert ak.str.match_like(string_repeats, "FOO%").tolist() == [ + assert ak.str.match_like(string_repeats, "FOO%BA%").tolist() == [ [False, False, False], [False, False, False], [], ] - assert ak.str.match_like(string_repeats, "FOO%", ignore_case=True).tolist() == [ - [True, True, False], + assert ak.str.match_like(string_repeats, "FOO%BA%", ignore_case=True).tolist() == [ + [True, False, False], [False, False, True], [], ] # Bytestrings - assert ak.str.match_like(bytestring_repeats, b"FOO%").tolist() == [ + assert ak.str.match_like(bytestring_repeats, b"FOO%BA%").tolist() == [ [False, False, False], [False, False, False], [], ] assert ak.str.match_like( - bytestring_repeats, b"FOO%", ignore_case=True + bytestring_repeats, b"FOO%BA%", ignore_case=True ).tolist() == [ - [True, True, False], + [True, False, False], [False, False, True], [], ] From 99584ba165eac2498052a9f92358cc0398d16383 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 23:20:51 +0100 Subject: [PATCH 60/73] feat: add `match_substring`, `match_substring_regex` --- src/awkward/operations/str/__init__.py | 2 + 
.../operations/str/ak_match_substring.py | 55 ++++++++++++++++++ .../str/ak_match_substring_regex.py | 55 ++++++++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 56 +++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 src/awkward/operations/str/ak_match_substring.py create mode 100644 src/awkward/operations/str/ak_match_substring_regex.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index a2c452ce29..89cf32dfc7 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -67,6 +67,8 @@ from awkward.operations.str.ak_find_substring import * from awkward.operations.str.ak_find_substring_regex import * from awkward.operations.str.ak_match_like import * +from awkward.operations.str.ak_match_substring import * +from awkward.operations.str.ak_match_substring_regex import * from awkward.operations.str.ak_starts_with import * diff --git a/src/awkward/operations/str/ak_match_substring.py b/src/awkward/operations/str/ak_match_substring.py new file mode 100644 index 0000000000..9530b96388 --- /dev/null +++ b/src/awkward/operations/str/ak_match_substring.py @@ -0,0 +1,55 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("match_substring",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def match_substring( + array, pattern, *, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str, or bytes): Substring pattern to look for inside the given array. + ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. 
+ behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the array, determine whether it contains the given literal pattern. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.match_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_substring.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.match_substring, + pc.match_substring, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/ak_match_substring_regex.py b/src/awkward/operations/str/ak_match_substring_regex.py new file mode 100644 index 0000000000..bf0f659765 --- /dev/null +++ b/src/awkward/operations/str/ak_match_substring_regex.py @@ -0,0 +1,55 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("match_substring_regex",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def match_substring_regex( + array, pattern, *, ignore_case=False, highlevel=True, behavior=None +): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + pattern (str, or bytes): Substring pattern to look for inside the given array. 
+ ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the array, determine whether it contains the given regular expression pattern. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.match_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_substring_regex.html). + """ + # Dispatch + yield (array,) + + # Implementation + return _impl(array, pattern, ignore_case, highlevel, behavior) + + +def _impl(array, pattern, ignore_case, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + behavior = behavior_of(array, behavior=behavior) + apply = ak.operations.str._get_ufunc_action( + pc.match_substring_regex, + pc.match_substring_regex, + bytestring_to_string=False, + ignore_case=ignore_case, + pattern=pattern, + ) + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index ae55309763..b53e6ef908 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -965,3 +965,59 @@ def test_match_like(): [False, False, True], [], ] + + +def test_match_substring(): + assert ak.str.match_substring(string_repeats, "FOO").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_substring(string_repeats, "FOO", ignore_case=True).tolist() == [ + [True, True, False], + [True, 
False, True], + [], + ] + + # Bytestrings + assert ak.str.match_substring(bytestring_repeats, b"FOO").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_substring( + bytestring_repeats, b"FOO", ignore_case=True + ).tolist() == [ + [True, True, False], + [True, False, True], + [], + ] + + +def test_match_substring_regex(): + assert ak.str.match_substring_regex(string_repeats, r"FOO\d+").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_substring_regex( + string_repeats, r"FOO\d+", ignore_case=True + ).tolist() == [ + [True, False, False], + [False, False, True], + [], + ] + + # Bytestrings + assert ak.str.match_substring_regex(bytestring_repeats, rb"FOO\d+").tolist() == [ + [False, False, False], + [False, False, False], + [], + ] + assert ak.str.match_substring_regex( + bytestring_repeats, rb"FOO\d+", ignore_case=True + ).tolist() == [ + [True, False, False], + [False, False, True], + [], + ] From c456b440e671378c5ea9db7b41b8e9c289565b4a Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 23:47:32 +0100 Subject: [PATCH 61/73] feat: add `is_in` and `index_in` --- src/awkward/operations/str/__init__.py | 2 + src/awkward/operations/str/ak_index_in.py | 80 +++++++++++++ src/awkward/operations/str/ak_is_in.py | 79 ++++++++++++ tests/test_2616_use_pyarrow_for_strings.py | 132 +++++++++++++++++++++ 4 files changed, 293 insertions(+) create mode 100644 src/awkward/operations/str/ak_index_in.py create mode 100644 src/awkward/operations/str/ak_is_in.py diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 89cf32dfc7..7d4357d12a 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -66,6 +66,8 @@ from awkward.operations.str.ak_ends_with import * from awkward.operations.str.ak_find_substring import * from awkward.operations.str.ak_find_substring_regex import * +from 
awkward.operations.str.ak_index_in import * +from awkward.operations.str.ak_is_in import * from awkward.operations.str.ak_match_like import * from awkward.operations.str.ak_match_substring import * from awkward.operations.str.ak_match_substring_regex import * diff --git a/src/awkward/operations/str/ak_index_in.py b/src/awkward/operations/str/ak_index_in.py new file mode 100644 index 0000000000..66b31e5aa8 --- /dev/null +++ b/src/awkward/operations/str/ak_index_in.py @@ -0,0 +1,80 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("index_in",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def index_in(array, value_set, *, skip_nones=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + value_set: Array-like data (anything #ak.to_layout recognizes), set of values to search for. + skip_nones (bool): If True, None values in `array` are not matched against `value_set`; otherwise, they are. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the array, determine where it is found within the given set of values. If the string is + not found within the value set, the index is set to None. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.index_in](https://arrow.apache.org/docs/python/generated/pyarrow.compute.index_in.html). 
+ """ + # Dispatch + yield (array, value_set) + + # Implementation + return _impl(array, value_set, skip_nones, highlevel, behavior) + + +def _is_maybe_optional_list_of_string(layout): + if layout.is_list and layout.parameter("__array__") in {"string", "bytestring"}: + return True + elif layout.is_option or layout.index_indexed: + return _is_maybe_optional_list_of_string(layout.content) + else: + return False + + +def _impl(array, value_set, skip_nones, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + value_set_layout = ak.to_layout(value_set, allow_record=False, allow_other=True) + + if not _is_maybe_optional_list_of_string(value_set_layout): + raise TypeError("`value_set` must be 1D array of (maybe missing) strings") + + behavior = behavior_of(array, value_set, behavior=behavior) + + def apply(layout, **kwargs): + if ( + layout.is_list + and layout.purelist_depth == 2 + and _is_maybe_optional_list_of_string(layout.content) + ): + return layout.copy( + content=ak.from_arrow( + pc.index_in( + ak.to_arrow(layout.content, extensionarray=False), + ak.to_arrow(value_set_layout, extensionarray=False), + skip_nulls=skip_nones, + ), + highlevel=False, + ) + ) + + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/src/awkward/operations/str/ak_is_in.py b/src/awkward/operations/str/ak_is_in.py new file mode 100644 index 0000000000..2b5cee9d0c --- /dev/null +++ b/src/awkward/operations/str/ak_is_in.py @@ -0,0 +1,79 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +__all__ = ("is_in",) + + +import awkward as ak +from awkward._behavior import behavior_of +from awkward._dispatch import high_level_function +from awkward._layout import wrap_layout + + +@high_level_function +def is_in(array, value_set, *, 
skip_nones=False, highlevel=True, behavior=None): + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + value_set: Array-like data (anything #ak.to_layout recognizes), set of values to search for. + skip_nones (bool): If True, None values in `array` are not matched against `value_set`; otherwise, they are. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + For each string in the array, determine whether it is found within the given set of values. + + Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + + Requires the pyarrow library and calls + [pyarrow.compute.is_in](https://arrow.apache.org/docs/python/generated/pyarrow.compute.is_in.html). + """ + # Dispatch + yield (array, value_set) + + # Implementation + return _impl(array, value_set, skip_nones, highlevel, behavior) + + +def _is_maybe_optional_list_of_string(layout): + if layout.is_list and layout.parameter("__array__") in {"string", "bytestring"}: + return True + elif layout.is_option or layout.is_indexed: + return _is_maybe_optional_list_of_string(layout.content) + else: + return False + + +def _impl(array, value_set, skip_nones, highlevel, behavior): + import awkward._connect.pyarrow # noqa: F401, I001 + + import pyarrow.compute as pc + + layout = ak.to_layout(array, allow_record=False, allow_other=True) + value_set_layout = ak.to_layout(value_set, allow_record=False, allow_other=True) + + if not _is_maybe_optional_list_of_string(value_set_layout): + raise TypeError("`value_set` must be 1D array of (maybe missing) strings") + + behavior = behavior_of(array, value_set, behavior=behavior) + + def apply(layout, **kwargs): + if ( + layout.is_list + and layout.purelist_depth == 2 + and _is_maybe_optional_list_of_string(layout.content) + ): + return layout.copy( + 
content=ak.from_arrow( + pc.is_in( + ak.to_arrow(layout.content, extensionarray=False), + ak.to_arrow(value_set_layout, extensionarray=False), + skip_nulls=skip_nones, + ), + highlevel=False, + ) + ) + + out = ak._do.recursively_apply(layout, apply, behavior=behavior) + + return wrap_layout(out, highlevel=highlevel, behavior=behavior) diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index b53e6ef908..79ddfb4d82 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -1021,3 +1021,135 @@ def test_match_substring_regex(): [False, False, True], [], ] + + +def test_is_in(): + assert ak.str.is_in(string_repeats, ["123foo", "foo"]).tolist() == [ + [False, True, False], + [True, False, False], + [], + ] + assert ak.str.is_in( + [ + ["foo123bar123baz", "foo", "bar"], + ["123foo", "456bar", "foo123456bar"], + [None], + ], + ["123foo", "foo", None], + ).tolist() == [ + [False, True, False], + [True, False, False], + [True], + ] + assert ak.str.is_in( + [ + ["foo123bar123baz", "foo", "bar"], + ["123foo", "456bar", "foo123456bar"], + [None], + ], + ["123foo", "foo", None], + skip_nones=True, + ).tolist() == [ + [False, True, False], + [True, False, False], + [False], + ] + + # Bytestrings + + assert ak.str.is_in(string_repeats, [b"123foo", b"foo"]).tolist() == [ + [False, True, False], + [True, False, False], + [], + ] + assert ak.str.is_in( + [ + [b"foo123bar123baz", b"foo", b"bar"], + [b"123foo", b"456bar", b"foo123456bar"], + [None], + ], + [b"123foo", b"foo", None], + ).tolist() == [ + [False, True, False], + [True, False, False], + [True], + ] + assert ak.str.is_in( + [ + [b"foo123bar123baz", b"foo", b"bar"], + [b"123foo", b"456bar", b"foo123456bar"], + [None], + ], + [b"123foo", b"foo", None], + skip_nones=True, + ).tolist() == [ + [False, True, False], + [True, False, False], + [False], + ] + + +def test_index_in(): + assert ak.str.index_in(string_repeats, ["123foo", 
"foo"]).tolist() == [ + [None, 1, None], + [0, None, None], + [], + ] + assert ak.str.index_in( + [ + ["foo123bar123baz", "foo", "bar"], + ["123foo", "456bar", "foo123456bar"], + [None], + ], + ["123foo", "foo", None], + ).tolist() == [ + [None, 1, None], + [0, None, None], + [2], + ] + assert ak.str.index_in( + [ + ["foo123bar123baz", "foo", "bar"], + ["123foo", "456bar", "foo123456bar"], + [None], + ], + ["123foo", "foo", None], + skip_nones=True, + ).tolist() == [ + [None, 1, None], + [0, None, None], + [None], + ] + + # Bytestrings + + assert ak.str.index_in(string_repeats, [b"123foo", b"foo"]).tolist() == [ + [None, 1, None], + [0, None, None], + [], + ] + assert ak.str.index_in( + [ + [b"foo123bar123baz", b"foo", b"bar"], + [b"123foo", b"456bar", b"foo123456bar"], + [None], + ], + [b"123foo", b"foo", None], + ).tolist() == [ + [None, 1, None], + [0, None, None], + [2], + ] + assert ak.str.index_in( + [ + [b"foo123bar123baz", b"foo", b"bar"], + [b"123foo", b"456bar", b"foo123456bar"], + [None], + ], + [b"123foo", b"foo", None], + skip_nones=True, + ).tolist() == [ + [None, 1, None], + [0, None, None], + [None], + ] From 88f45cc2ae0a08481e288be0ea8d55ddb35fbca9 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 7 Aug 2023 23:50:59 +0100 Subject: [PATCH 62/73] fix: operate at leaf depth --- src/awkward/operations/str/ak_index_in.py | 24 +++++++++-------------- src/awkward/operations/str/ak_is_in.py | 22 ++++++++------------- 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/src/awkward/operations/str/ak_index_in.py b/src/awkward/operations/str/ak_index_in.py index 66b31e5aa8..2e0b681d7b 100644 --- a/src/awkward/operations/str/ak_index_in.py +++ b/src/awkward/operations/str/ak_index_in.py @@ -39,7 +39,7 @@ def index_in(array, value_set, *, skip_nones=False, highlevel=True, behavior=Non def _is_maybe_optional_list_of_string(layout): if layout.is_list and layout.parameter("__array__") in {"string", "bytestring"}: return True - elif 
layout.is_option or layout.index_indexed: + elif layout.is_option or layout.is_indexed: return _is_maybe_optional_list_of_string(layout.content) else: return False @@ -59,20 +59,14 @@ def _impl(array, value_set, skip_nones, highlevel, behavior): behavior = behavior_of(array, value_set, behavior=behavior) def apply(layout, **kwargs): - if ( - layout.is_list - and layout.purelist_depth == 2 - and _is_maybe_optional_list_of_string(layout.content) - ): - return layout.copy( - content=ak.from_arrow( - pc.index_in( - ak.to_arrow(layout.content, extensionarray=False), - ak.to_arrow(value_set_layout, extensionarray=False), - skip_nulls=skip_nones, - ), - highlevel=False, - ) + if _is_maybe_optional_list_of_string(layout) and layout.purelist_depth == 1: + return ak.from_arrow( + pc.index_in( + ak.to_arrow(layout, extensionarray=False), + ak.to_arrow(value_set_layout, extensionarray=False), + skip_nulls=skip_nones, + ), + highlevel=False, ) out = ak._do.recursively_apply(layout, apply, behavior=behavior) diff --git a/src/awkward/operations/str/ak_is_in.py b/src/awkward/operations/str/ak_is_in.py index 2b5cee9d0c..3240ed5b39 100644 --- a/src/awkward/operations/str/ak_is_in.py +++ b/src/awkward/operations/str/ak_is_in.py @@ -58,20 +58,14 @@ def _impl(array, value_set, skip_nones, highlevel, behavior): behavior = behavior_of(array, value_set, behavior=behavior) def apply(layout, **kwargs): - if ( - layout.is_list - and layout.purelist_depth == 2 - and _is_maybe_optional_list_of_string(layout.content) - ): - return layout.copy( - content=ak.from_arrow( - pc.is_in( - ak.to_arrow(layout.content, extensionarray=False), - ak.to_arrow(value_set_layout, extensionarray=False), - skip_nulls=skip_nones, - ), - highlevel=False, - ) + if _is_maybe_optional_list_of_string(layout) and layout.purelist_depth == 1: + return ak.from_arrow( + pc.is_in( + ak.to_arrow(layout, extensionarray=False), + ak.to_arrow(value_set_layout, extensionarray=False), + skip_nulls=skip_nones, + ), + 
highlevel=False, ) out = ak._do.recursively_apply(layout, apply, behavior=behavior) From 6745ba2ac4fc9bea1c7de76481e4d57de94fb7fb Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 8 Aug 2023 09:36:06 +0100 Subject: [PATCH 63/73] refactor: add internal `pyarrow.compute` helper --- src/awkward/_connect/pyarrow.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/awkward/_connect/pyarrow.py b/src/awkward/_connect/pyarrow.py index 54cae0ca92..b98c17975b 100644 --- a/src/awkward/_connect/pyarrow.py +++ b/src/awkward/_connect/pyarrow.py @@ -1,7 +1,9 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE +from __future__ import annotations import json from collections.abc import Iterable, Sized +from types import ModuleType from packaging.version import parse as parse_version @@ -36,13 +38,13 @@ error_message = "pyarrow 7.0.0 or later required for {0}" -def import_pyarrow(name): +def import_pyarrow(name: str) -> ModuleType: if pyarrow is None: raise ImportError(error_message.format(name)) return pyarrow -def import_pyarrow_parquet(name): +def import_pyarrow_parquet(name: str) -> ModuleType: if pyarrow is None: raise ImportError(error_message.format(name)) @@ -51,7 +53,16 @@ def import_pyarrow_parquet(name): return out -def import_fsspec(name): +def import_pyarrow_compute(name: str) -> ModuleType: + if pyarrow is None: + raise ImportError(error_message.format(name)) + + import pyarrow.compute as out + + return out + + +def import_fsspec(name: str) -> ModuleType: try: import fsspec From 4422ad82ae54aa3e3d6b36af69ad239dce7c6a6d Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 8 Aug 2023 09:37:49 +0100 Subject: [PATCH 64/73] refactor: use pyarrow import helper --- src/awkward/operations/str/ak_capitalize.py | 5 ++--- src/awkward/operations/str/ak_center.py | 5 ++--- src/awkward/operations/str/ak_count_substring.py | 5 ++--- src/awkward/operations/str/ak_count_substring_regex.py | 5 ++--- 
src/awkward/operations/str/ak_ends_with.py | 5 ++--- src/awkward/operations/str/ak_extract_regex.py | 5 ++--- src/awkward/operations/str/ak_find_substring.py | 5 ++--- src/awkward/operations/str/ak_find_substring_regex.py | 5 ++--- src/awkward/operations/str/ak_is_alnum.py | 5 ++--- src/awkward/operations/str/ak_is_alpha.py | 5 ++--- src/awkward/operations/str/ak_is_ascii.py | 5 ++--- src/awkward/operations/str/ak_is_decimal.py | 5 ++--- src/awkward/operations/str/ak_is_digit.py | 5 ++--- src/awkward/operations/str/ak_is_lower.py | 5 ++--- src/awkward/operations/str/ak_is_numeric.py | 5 ++--- src/awkward/operations/str/ak_is_printable.py | 5 ++--- src/awkward/operations/str/ak_is_space.py | 5 ++--- src/awkward/operations/str/ak_is_title.py | 5 ++--- src/awkward/operations/str/ak_is_upper.py | 5 ++--- src/awkward/operations/str/ak_length.py | 5 ++--- src/awkward/operations/str/ak_lower.py | 5 ++--- src/awkward/operations/str/ak_lpad.py | 5 ++--- src/awkward/operations/str/ak_ltrim.py | 5 ++--- src/awkward/operations/str/ak_ltrim_whitespace.py | 5 ++--- src/awkward/operations/str/ak_match_like.py | 5 ++--- src/awkward/operations/str/ak_match_substring.py | 5 ++--- src/awkward/operations/str/ak_match_substring_regex.py | 4 ++-- src/awkward/operations/str/ak_replace_slice.py | 5 ++--- src/awkward/operations/str/ak_replace_substring.py | 5 ++--- src/awkward/operations/str/ak_replace_substring_regex.py | 5 ++--- src/awkward/operations/str/ak_reverse.py | 5 ++--- src/awkward/operations/str/ak_rpad.py | 5 ++--- src/awkward/operations/str/ak_rtrim.py | 5 ++--- src/awkward/operations/str/ak_rtrim_whitespace.py | 5 ++--- src/awkward/operations/str/ak_split_pattern.py | 5 ++--- src/awkward/operations/str/ak_split_pattern_regex.py | 5 ++--- src/awkward/operations/str/ak_split_whitespace.py | 4 ++-- src/awkward/operations/str/ak_starts_with.py | 5 ++--- src/awkward/operations/str/ak_swapcase.py | 5 ++--- src/awkward/operations/str/ak_title.py | 5 ++--- 
src/awkward/operations/str/ak_trim.py | 5 ++--- src/awkward/operations/str/ak_trim_whitespace.py | 5 ++--- src/awkward/operations/str/ak_upper.py | 5 ++--- 43 files changed, 86 insertions(+), 127 deletions(-) diff --git a/src/awkward/operations/str/ak_capitalize.py b/src/awkward/operations/str/ak_capitalize.py index 9400c21c9e..65fd9164db 100644 --- a/src/awkward/operations/str/ak_capitalize.py +++ b/src/awkward/operations/str/ak_capitalize.py @@ -39,10 +39,9 @@ def capitalize(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.capitalize") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/ak_center.py index 9bd2246673..5999372824 100644 --- a/src/awkward/operations/str/ak_center.py +++ b/src/awkward/operations/str/ak_center.py @@ -44,10 +44,9 @@ def center(array, width, padding=" ", *, highlevel=True, behavior=None): def _impl(array, width, padding, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.center") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_count_substring.py b/src/awkward/operations/str/ak_count_substring.py index cbd8cdc550..218e2bc5ce 100644 --- a/src/awkward/operations/str/ak_count_substring.py +++ b/src/awkward/operations/str/ak_count_substring.py @@ -38,10 +38,9 @@ def count_substring( def _impl(array, pattern, ignore_case, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + 
pc = import_pyarrow_compute("ak.str.count_substring") layout = ak.to_layout(array, allow_record=False, allow_other=True) behavior = behavior_of(array, behavior=behavior) apply = ak.operations.str._get_ufunc_action( diff --git a/src/awkward/operations/str/ak_count_substring_regex.py b/src/awkward/operations/str/ak_count_substring_regex.py index f3041c32e0..46b36cfb73 100644 --- a/src/awkward/operations/str/ak_count_substring_regex.py +++ b/src/awkward/operations/str/ak_count_substring_regex.py @@ -38,10 +38,9 @@ def count_substring_regex( def _impl(array, pattern, ignore_case, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.count_substring_regex") layout = ak.to_layout(array, allow_record=False, allow_other=True) behavior = behavior_of(array, behavior=behavior) apply = ak.operations.str._get_ufunc_action( diff --git a/src/awkward/operations/str/ak_ends_with.py b/src/awkward/operations/str/ak_ends_with.py index 89f82c8fa0..189ef7b25a 100644 --- a/src/awkward/operations/str/ak_ends_with.py +++ b/src/awkward/operations/str/ak_ends_with.py @@ -36,10 +36,9 @@ def ends_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=Non def _impl(array, pattern, ignore_case, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.ends_with") layout = ak.to_layout(array, allow_record=False, allow_other=True) behavior = behavior_of(array, behavior=behavior) apply = ak.operations.str._get_ufunc_action( diff --git a/src/awkward/operations/str/ak_extract_regex.py b/src/awkward/operations/str/ak_extract_regex.py index 9a4aecd038..ee0c899471 100644 --- a/src/awkward/operations/str/ak_extract_regex.py +++ b/src/awkward/operations/str/ak_extract_regex.py @@ -61,10 +61,9 
@@ def extract_regex(array, pattern, *, highlevel=True, behavior=None): def _impl(array, pattern, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.extract_regex") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_find_substring.py b/src/awkward/operations/str/ak_find_substring.py index e7a9acaaa5..7955ffbc14 100644 --- a/src/awkward/operations/str/ak_find_substring.py +++ b/src/awkward/operations/str/ak_find_substring.py @@ -37,10 +37,9 @@ def find_substring(array, pattern, *, ignore_case=False, highlevel=True, behavio def _impl(array, pattern, ignore_case, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.find_substring") layout = ak.to_layout(array, allow_record=False, allow_other=True) behavior = behavior_of(array, behavior=behavior) apply = ak.operations.str._get_ufunc_action( diff --git a/src/awkward/operations/str/ak_find_substring_regex.py b/src/awkward/operations/str/ak_find_substring_regex.py index a0dc5b2ce0..58edb06794 100644 --- a/src/awkward/operations/str/ak_find_substring_regex.py +++ b/src/awkward/operations/str/ak_find_substring_regex.py @@ -39,10 +39,9 @@ def find_substring_regex( def _impl(array, pattern, ignore_case, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.find_substring_regex") layout = ak.to_layout(array, allow_record=False, allow_other=True) behavior = behavior_of(array, behavior=behavior) apply = ak.operations.str._get_ufunc_action( diff --git a/src/awkward/operations/str/ak_is_alnum.py 
b/src/awkward/operations/str/ak_is_alnum.py index 2f93d87982..9866039f3d 100644 --- a/src/awkward/operations/str/ak_is_alnum.py +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -39,10 +39,9 @@ def is_alnum(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_alnum") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/ak_is_alpha.py index c40f612e75..76a6b5721c 100644 --- a/src/awkward/operations/str/ak_is_alpha.py +++ b/src/awkward/operations/str/ak_is_alpha.py @@ -39,10 +39,9 @@ def is_alpha(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_alpha") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_ascii.py b/src/awkward/operations/str/ak_is_ascii.py index bc588f2888..77747b3639 100644 --- a/src/awkward/operations/str/ak_is_ascii.py +++ b/src/awkward/operations/str/ak_is_ascii.py @@ -39,10 +39,9 @@ def is_ascii(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_ascii") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/ak_is_decimal.py index 26ff606bd0..fdf1a13942 100644 --- a/src/awkward/operations/str/ak_is_decimal.py 
+++ b/src/awkward/operations/str/ak_is_decimal.py @@ -39,10 +39,9 @@ def is_decimal(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_decimal") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/ak_is_digit.py index 338b86d30a..3e66c21980 100644 --- a/src/awkward/operations/str/ak_is_digit.py +++ b/src/awkward/operations/str/ak_is_digit.py @@ -41,10 +41,9 @@ def is_digit(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_digit") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_lower.py b/src/awkward/operations/str/ak_is_lower.py index 87dd3462a6..c36ab6056a 100644 --- a/src/awkward/operations/str/ak_is_lower.py +++ b/src/awkward/operations/str/ak_is_lower.py @@ -39,10 +39,9 @@ def is_lower(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_lower") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_numeric.py b/src/awkward/operations/str/ak_is_numeric.py index 437ff31b47..6996782f3f 100644 --- a/src/awkward/operations/str/ak_is_numeric.py +++ b/src/awkward/operations/str/ak_is_numeric.py @@ -41,10 +41,9 @@ def is_numeric(array, *, highlevel=True, 
behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_numeric") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/ak_is_printable.py index 24c5184fde..fcfdde24d6 100644 --- a/src/awkward/operations/str/ak_is_printable.py +++ b/src/awkward/operations/str/ak_is_printable.py @@ -39,10 +39,9 @@ def is_printable(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_printable") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/ak_is_space.py index 5b69031d1f..2b264664a0 100644 --- a/src/awkward/operations/str/ak_is_space.py +++ b/src/awkward/operations/str/ak_is_space.py @@ -39,10 +39,9 @@ def is_space(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_space") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_title.py b/src/awkward/operations/str/ak_is_title.py index 5275a1df0e..4e65d60037 100644 --- a/src/awkward/operations/str/ak_is_title.py +++ b/src/awkward/operations/str/ak_is_title.py @@ -39,10 +39,9 @@ def is_title(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 
- - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_title") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_is_upper.py b/src/awkward/operations/str/ak_is_upper.py index fa20f04fe6..cf460e6aa3 100644 --- a/src/awkward/operations/str/ak_is_upper.py +++ b/src/awkward/operations/str/ak_is_upper.py @@ -39,10 +39,9 @@ def is_upper(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.is_upper") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_length.py b/src/awkward/operations/str/ak_length.py index e5ef1c7b84..800afd4287 100644 --- a/src/awkward/operations/str/ak_length.py +++ b/src/awkward/operations/str/ak_length.py @@ -39,10 +39,9 @@ def length(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.length") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_lower.py b/src/awkward/operations/str/ak_lower.py index 971ffe043e..51c391311b 100644 --- a/src/awkward/operations/str/ak_lower.py +++ b/src/awkward/operations/str/ak_lower.py @@ -39,10 +39,9 @@ def lower(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.lower") behavior = 
behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_lpad.py b/src/awkward/operations/str/ak_lpad.py index 909f1663d9..ed9b5f98ac 100644 --- a/src/awkward/operations/str/ak_lpad.py +++ b/src/awkward/operations/str/ak_lpad.py @@ -44,10 +44,9 @@ def lpad(array, width, padding=" ", *, highlevel=True, behavior=None): def _impl(array, width, padding, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.lpad") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/ak_ltrim.py index 0180270067..062c49ba95 100644 --- a/src/awkward/operations/str/ak_ltrim.py +++ b/src/awkward/operations/str/ak_ltrim.py @@ -43,10 +43,9 @@ def ltrim(array, characters, *, highlevel=True, behavior=None): def _impl(array, characters, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.ltrim") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_ltrim_whitespace.py b/src/awkward/operations/str/ak_ltrim_whitespace.py index e415a1400f..350cd52f05 100644 --- a/src/awkward/operations/str/ak_ltrim_whitespace.py +++ b/src/awkward/operations/str/ak_ltrim_whitespace.py @@ -38,10 +38,9 @@ def ltrim_whitespace(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.ltrim_whitespace") behavior = behavior_of(array, behavior=behavior) out = 
ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_match_like.py b/src/awkward/operations/str/ak_match_like.py index ef8462d513..5515a829bd 100644 --- a/src/awkward/operations/str/ak_match_like.py +++ b/src/awkward/operations/str/ak_match_like.py @@ -38,10 +38,9 @@ def match_like(array, pattern, *, ignore_case=False, highlevel=True, behavior=No def _impl(array, pattern, ignore_case, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.match_like") layout = ak.to_layout(array, allow_record=False, allow_other=True) behavior = behavior_of(array, behavior=behavior) apply = ak.operations.str._get_ufunc_action( diff --git a/src/awkward/operations/str/ak_match_substring.py b/src/awkward/operations/str/ak_match_substring.py index 9530b96388..d1bf7626c6 100644 --- a/src/awkward/operations/str/ak_match_substring.py +++ b/src/awkward/operations/str/ak_match_substring.py @@ -38,10 +38,9 @@ def match_substring( def _impl(array, pattern, ignore_case, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.match_substring") layout = ak.to_layout(array, allow_record=False, allow_other=True) behavior = behavior_of(array, behavior=behavior) apply = ak.operations.str._get_ufunc_action( diff --git a/src/awkward/operations/str/ak_match_substring_regex.py b/src/awkward/operations/str/ak_match_substring_regex.py index bf0f659765..5a48f9ba62 100644 --- a/src/awkward/operations/str/ak_match_substring_regex.py +++ b/src/awkward/operations/str/ak_match_substring_regex.py @@ -38,9 +38,9 @@ def match_substring_regex( def _impl(array, pattern, ignore_case, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 + from awkward._connect.pyarrow import 
import_pyarrow_compute - import pyarrow.compute as pc + pc = import_pyarrow_compute("ak.str.match_substring_regex") layout = ak.to_layout(array, allow_record=False, allow_other=True) behavior = behavior_of(array, behavior=behavior) diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py index cd80f111aa..44161cb6c2 100644 --- a/src/awkward/operations/str/ak_replace_slice.py +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -45,10 +45,9 @@ def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=N def _impl(array, start, stop, replacement, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.replace_slice") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py index 691e9fd3e7..4408beb6fd 100644 --- a/src/awkward/operations/str/ak_replace_substring.py +++ b/src/awkward/operations/str/ak_replace_substring.py @@ -47,10 +47,9 @@ def replace_substring( def _impl(array, pattern, replacement, max_replacements, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.replace_substring") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py index 77dc2c12b2..2380ba3e29 100644 --- a/src/awkward/operations/str/ak_replace_substring_regex.py +++ b/src/awkward/operations/str/ak_replace_substring_regex.py @@ -47,10 +47,9 @@ def replace_substring_regex( def _impl(array, pattern, replacement, 
max_replacements, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.replace_substring_regex") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_reverse.py b/src/awkward/operations/str/ak_reverse.py index 6f15db9df8..2a573f0ccc 100644 --- a/src/awkward/operations/str/ak_reverse.py +++ b/src/awkward/operations/str/ak_reverse.py @@ -39,10 +39,9 @@ def reverse(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.reverse") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_rpad.py b/src/awkward/operations/str/ak_rpad.py index da0cf61fb6..34748043b8 100644 --- a/src/awkward/operations/str/ak_rpad.py +++ b/src/awkward/operations/str/ak_rpad.py @@ -44,10 +44,9 @@ def rpad(array, width, padding=" ", *, highlevel=True, behavior=None): def _impl(array, width, padding, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.rpad") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/ak_rtrim.py index 3d1d518754..f83c651631 100644 --- a/src/awkward/operations/str/ak_rtrim.py +++ b/src/awkward/operations/str/ak_rtrim.py @@ -43,10 +43,9 @@ def rtrim(array, characters, *, highlevel=True, behavior=None): def _impl(array, characters, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import 
pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.rtrim") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_rtrim_whitespace.py b/src/awkward/operations/str/ak_rtrim_whitespace.py index e2064bc412..cba8760bfe 100644 --- a/src/awkward/operations/str/ak_rtrim_whitespace.py +++ b/src/awkward/operations/str/ak_rtrim_whitespace.py @@ -38,10 +38,9 @@ def rtrim_whitespace(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.rtrim_whitespace") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/ak_split_pattern.py index b94187d9fe..c7a84b1bf9 100644 --- a/src/awkward/operations/str/ak_split_pattern.py +++ b/src/awkward/operations/str/ak_split_pattern.py @@ -40,10 +40,9 @@ def split_pattern( def _impl(array, pattern, max_splits, reverse, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.split_pattern") behavior = behavior_of(array, behavior=behavior) action = ak.operations.str._get_split_action( pc.split_pattern, diff --git a/src/awkward/operations/str/ak_split_pattern_regex.py b/src/awkward/operations/str/ak_split_pattern_regex.py index 56a7876efd..c870a922b3 100644 --- a/src/awkward/operations/str/ak_split_pattern_regex.py +++ b/src/awkward/operations/str/ak_split_pattern_regex.py @@ -40,10 +40,9 @@ def split_pattern_regex( def _impl(array, pattern, max_splits, reverse, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - 
import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.split_pattern_regex") behavior = behavior_of(array, behavior=behavior) action = ak.operations.str._get_split_action( pc.split_pattern_regex, diff --git a/src/awkward/operations/str/ak_split_whitespace.py b/src/awkward/operations/str/ak_split_whitespace.py index 07be7a0e5c..31bab91854 100644 --- a/src/awkward/operations/str/ak_split_whitespace.py +++ b/src/awkward/operations/str/ak_split_whitespace.py @@ -48,9 +48,9 @@ def split_whitespace( def _impl(array, max_splits, reverse, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.split_whitespace") behavior = behavior_of(array, behavior=behavior) action = ak.operations.str._get_split_action( pc.utf8_split_whitespace, diff --git a/src/awkward/operations/str/ak_starts_with.py b/src/awkward/operations/str/ak_starts_with.py index a203c5a318..3a0799334b 100644 --- a/src/awkward/operations/str/ak_starts_with.py +++ b/src/awkward/operations/str/ak_starts_with.py @@ -36,10 +36,9 @@ def starts_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=N def _impl(array, pattern, ignore_case, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.starts_with") layout = ak.to_layout(array, allow_record=False, allow_other=True) behavior = behavior_of(array, behavior=behavior) apply = ak.operations.str._get_ufunc_action( diff --git a/src/awkward/operations/str/ak_swapcase.py b/src/awkward/operations/str/ak_swapcase.py index 36d6d53e11..6c99413dc5 100644 --- a/src/awkward/operations/str/ak_swapcase.py +++ b/src/awkward/operations/str/ak_swapcase.py @@ -39,10 +39,9 @@ def swapcase(array, *, 
highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.swapcase") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_title.py b/src/awkward/operations/str/ak_title.py index cdd147c012..e8bbd1af92 100644 --- a/src/awkward/operations/str/ak_title.py +++ b/src/awkward/operations/str/ak_title.py @@ -39,10 +39,9 @@ def title(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.title") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py index c43df209be..626e762702 100644 --- a/src/awkward/operations/str/ak_trim.py +++ b/src/awkward/operations/str/ak_trim.py @@ -43,10 +43,9 @@ def trim(array, characters, *, highlevel=True, behavior=None): def _impl(array, characters, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.trim") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_trim_whitespace.py b/src/awkward/operations/str/ak_trim_whitespace.py index 197aa777cd..0453b8a03a 100644 --- a/src/awkward/operations/str/ak_trim_whitespace.py +++ b/src/awkward/operations/str/ak_trim_whitespace.py @@ -38,10 +38,9 @@ def trim_whitespace(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: 
F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.trim_whitespace") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( diff --git a/src/awkward/operations/str/ak_upper.py b/src/awkward/operations/str/ak_upper.py index 776b0526c0..dd25dc5de8 100644 --- a/src/awkward/operations/str/ak_upper.py +++ b/src/awkward/operations/str/ak_upper.py @@ -39,10 +39,9 @@ def upper(array, *, highlevel=True, behavior=None): def _impl(array, highlevel, behavior): - import awkward._connect.pyarrow # noqa: F401, I001 - - import pyarrow.compute as pc + from awkward._connect.pyarrow import import_pyarrow_compute + pc = import_pyarrow_compute("ak.str.upper") behavior = behavior_of(array, behavior=behavior) out = ak._do.recursively_apply( From ec6cefa0450e275978e40567564f47ca4e1b976b Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 8 Aug 2023 09:40:20 +0100 Subject: [PATCH 65/73] refactor: add `module` and `name` arguments to `high_level_function` --- src/awkward/operations/str/ak_capitalize.py | 2 +- src/awkward/operations/str/ak_center.py | 2 +- src/awkward/operations/str/ak_count_substring.py | 2 +- src/awkward/operations/str/ak_count_substring_regex.py | 2 +- src/awkward/operations/str/ak_ends_with.py | 2 +- src/awkward/operations/str/ak_extract_regex.py | 2 +- src/awkward/operations/str/ak_find_substring.py | 2 +- src/awkward/operations/str/ak_find_substring_regex.py | 2 +- src/awkward/operations/str/ak_index_in.py | 2 +- src/awkward/operations/str/ak_is_alnum.py | 2 +- src/awkward/operations/str/ak_is_alpha.py | 2 +- src/awkward/operations/str/ak_is_ascii.py | 2 +- src/awkward/operations/str/ak_is_decimal.py | 2 +- src/awkward/operations/str/ak_is_digit.py | 2 +- src/awkward/operations/str/ak_is_in.py | 2 +- src/awkward/operations/str/ak_is_lower.py | 2 +- src/awkward/operations/str/ak_is_numeric.py | 2 +- src/awkward/operations/str/ak_is_printable.py | 2 
+- src/awkward/operations/str/ak_is_space.py | 2 +- src/awkward/operations/str/ak_is_title.py | 2 +- src/awkward/operations/str/ak_is_upper.py | 2 +- src/awkward/operations/str/ak_join.py | 2 +- src/awkward/operations/str/ak_join_element_wise.py | 2 +- src/awkward/operations/str/ak_length.py | 2 +- src/awkward/operations/str/ak_lower.py | 2 +- src/awkward/operations/str/ak_lpad.py | 2 +- src/awkward/operations/str/ak_ltrim.py | 2 +- src/awkward/operations/str/ak_ltrim_whitespace.py | 2 +- src/awkward/operations/str/ak_match_like.py | 2 +- src/awkward/operations/str/ak_match_substring.py | 2 +- src/awkward/operations/str/ak_match_substring_regex.py | 2 +- src/awkward/operations/str/ak_repeat.py | 2 +- src/awkward/operations/str/ak_replace_slice.py | 2 +- src/awkward/operations/str/ak_replace_substring.py | 2 +- src/awkward/operations/str/ak_replace_substring_regex.py | 2 +- src/awkward/operations/str/ak_reverse.py | 2 +- src/awkward/operations/str/ak_rpad.py | 2 +- src/awkward/operations/str/ak_rtrim.py | 2 +- src/awkward/operations/str/ak_rtrim_whitespace.py | 2 +- src/awkward/operations/str/ak_slice.py | 2 +- src/awkward/operations/str/ak_split_pattern.py | 2 +- src/awkward/operations/str/ak_split_pattern_regex.py | 2 +- src/awkward/operations/str/ak_split_whitespace.py | 2 +- src/awkward/operations/str/ak_starts_with.py | 2 +- src/awkward/operations/str/ak_swapcase.py | 2 +- src/awkward/operations/str/ak_title.py | 2 +- src/awkward/operations/str/ak_trim.py | 2 +- src/awkward/operations/str/ak_trim_whitespace.py | 2 +- src/awkward/operations/str/ak_upper.py | 2 +- 49 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/awkward/operations/str/ak_capitalize.py b/src/awkward/operations/str/ak_capitalize.py index 65fd9164db..d555d23138 100644 --- a/src/awkward/operations/str/ak_capitalize.py +++ b/src/awkward/operations/str/ak_capitalize.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def 
capitalize(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/ak_center.py index 5999372824..3a21d1520f 100644 --- a/src/awkward/operations/str/ak_center.py +++ b/src/awkward/operations/str/ak_center.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def center(array, width, padding=" ", *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_count_substring.py b/src/awkward/operations/str/ak_count_substring.py index 218e2bc5ce..bae8942694 100644 --- a/src/awkward/operations/str/ak_count_substring.py +++ b/src/awkward/operations/str/ak_count_substring.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def count_substring( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_count_substring_regex.py b/src/awkward/operations/str/ak_count_substring_regex.py index 46b36cfb73..63349c9d75 100644 --- a/src/awkward/operations/str/ak_count_substring_regex.py +++ b/src/awkward/operations/str/ak_count_substring_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def count_substring_regex( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_ends_with.py b/src/awkward/operations/str/ak_ends_with.py index 189ef7b25a..898acf5e9b 100644 --- a/src/awkward/operations/str/ak_ends_with.py +++ b/src/awkward/operations/str/ak_ends_with.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def ends_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_extract_regex.py b/src/awkward/operations/str/ak_extract_regex.py index ee0c899471..b4e5f522e6 100644 --- 
a/src/awkward/operations/str/ak_extract_regex.py +++ b/src/awkward/operations/str/ak_extract_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def extract_regex(array, pattern, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_find_substring.py b/src/awkward/operations/str/ak_find_substring.py index 7955ffbc14..875e3c6f25 100644 --- a/src/awkward/operations/str/ak_find_substring.py +++ b/src/awkward/operations/str/ak_find_substring.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def find_substring(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_find_substring_regex.py b/src/awkward/operations/str/ak_find_substring_regex.py index 58edb06794..952c51c41b 100644 --- a/src/awkward/operations/str/ak_find_substring_regex.py +++ b/src/awkward/operations/str/ak_find_substring_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def find_substring_regex( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_index_in.py b/src/awkward/operations/str/ak_index_in.py index 2e0b681d7b..02cd16f997 100644 --- a/src/awkward/operations/str/ak_index_in.py +++ b/src/awkward/operations/str/ak_index_in.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def index_in(array, value_set, *, skip_nones=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/ak_is_alnum.py index 9866039f3d..23bbb4e2be 100644 --- a/src/awkward/operations/str/ak_is_alnum.py +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_alnum(array, *, 
highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/ak_is_alpha.py index 76a6b5721c..31fcb06275 100644 --- a/src/awkward/operations/str/ak_is_alpha.py +++ b/src/awkward/operations/str/ak_is_alpha.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_alpha(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_ascii.py b/src/awkward/operations/str/ak_is_ascii.py index 77747b3639..d7ddb3c103 100644 --- a/src/awkward/operations/str/ak_is_ascii.py +++ b/src/awkward/operations/str/ak_is_ascii.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_ascii(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/ak_is_decimal.py index fdf1a13942..524ea18c7b 100644 --- a/src/awkward/operations/str/ak_is_decimal.py +++ b/src/awkward/operations/str/ak_is_decimal.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_decimal(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/ak_is_digit.py index 3e66c21980..1fc5fafe59 100644 --- a/src/awkward/operations/str/ak_is_digit.py +++ b/src/awkward/operations/str/ak_is_digit.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_digit(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_in.py b/src/awkward/operations/str/ak_is_in.py index 3240ed5b39..528f8f8558 100644 --- a/src/awkward/operations/str/ak_is_in.py +++ b/src/awkward/operations/str/ak_is_in.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_in(array, value_set, *, 
skip_nones=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_lower.py b/src/awkward/operations/str/ak_is_lower.py index c36ab6056a..39afa0e21e 100644 --- a/src/awkward/operations/str/ak_is_lower.py +++ b/src/awkward/operations/str/ak_is_lower.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_lower(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_numeric.py b/src/awkward/operations/str/ak_is_numeric.py index 6996782f3f..438e6fb01a 100644 --- a/src/awkward/operations/str/ak_is_numeric.py +++ b/src/awkward/operations/str/ak_is_numeric.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_numeric(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/ak_is_printable.py index fcfdde24d6..5fdd3b50d4 100644 --- a/src/awkward/operations/str/ak_is_printable.py +++ b/src/awkward/operations/str/ak_is_printable.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_printable(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/ak_is_space.py index 2b264664a0..a3acce9a91 100644 --- a/src/awkward/operations/str/ak_is_space.py +++ b/src/awkward/operations/str/ak_is_space.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_space(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_title.py b/src/awkward/operations/str/ak_is_title.py index 4e65d60037..d8f050725f 100644 --- a/src/awkward/operations/str/ak_is_title.py +++ b/src/awkward/operations/str/ak_is_title.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function 
+@high_level_function() def is_title(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_upper.py b/src/awkward/operations/str/ak_is_upper.py index cf460e6aa3..f814d77d35 100644 --- a/src/awkward/operations/str/ak_is_upper.py +++ b/src/awkward/operations/str/ak_is_upper.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def is_upper(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_join.py b/src/awkward/operations/str/ak_join.py index 40289bc4a6..b04f6a1dd9 100644 --- a/src/awkward/operations/str/ak_join.py +++ b/src/awkward/operations/str/ak_join.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def join(array, separator, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_join_element_wise.py b/src/awkward/operations/str/ak_join_element_wise.py index ad3639adb6..80efa60b18 100644 --- a/src/awkward/operations/str/ak_join_element_wise.py +++ b/src/awkward/operations/str/ak_join_element_wise.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def join_element_wise(*arrays, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_length.py b/src/awkward/operations/str/ak_length.py index 800afd4287..fba1e2f4a6 100644 --- a/src/awkward/operations/str/ak_length.py +++ b/src/awkward/operations/str/ak_length.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def length(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_lower.py b/src/awkward/operations/str/ak_lower.py index 51c391311b..61453bb0a4 100644 --- a/src/awkward/operations/str/ak_lower.py +++ b/src/awkward/operations/str/ak_lower.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout 
-@high_level_function +@high_level_function() def lower(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_lpad.py b/src/awkward/operations/str/ak_lpad.py index ed9b5f98ac..11575a34ea 100644 --- a/src/awkward/operations/str/ak_lpad.py +++ b/src/awkward/operations/str/ak_lpad.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def lpad(array, width, padding=" ", *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/ak_ltrim.py index 062c49ba95..4164700111 100644 --- a/src/awkward/operations/str/ak_ltrim.py +++ b/src/awkward/operations/str/ak_ltrim.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def ltrim(array, characters, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_ltrim_whitespace.py b/src/awkward/operations/str/ak_ltrim_whitespace.py index 350cd52f05..d095f93247 100644 --- a/src/awkward/operations/str/ak_ltrim_whitespace.py +++ b/src/awkward/operations/str/ak_ltrim_whitespace.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def ltrim_whitespace(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_match_like.py b/src/awkward/operations/str/ak_match_like.py index 5515a829bd..7b688f69a2 100644 --- a/src/awkward/operations/str/ak_match_like.py +++ b/src/awkward/operations/str/ak_match_like.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def match_like(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_match_substring.py b/src/awkward/operations/str/ak_match_substring.py index d1bf7626c6..f81f1a7fcd 100644 --- a/src/awkward/operations/str/ak_match_substring.py +++ 
b/src/awkward/operations/str/ak_match_substring.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def match_substring( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_match_substring_regex.py b/src/awkward/operations/str/ak_match_substring_regex.py index 5a48f9ba62..4a8d4b515f 100644 --- a/src/awkward/operations/str/ak_match_substring_regex.py +++ b/src/awkward/operations/str/ak_match_substring_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def match_substring_regex( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_repeat.py b/src/awkward/operations/str/ak_repeat.py index 75324de63a..7110721729 100644 --- a/src/awkward/operations/str/ak_repeat.py +++ b/src/awkward/operations/str/ak_repeat.py @@ -13,7 +13,7 @@ np = NumpyMetadata.instance() -@high_level_function +@high_level_function() def repeat(array, num_repeats, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py index 44161cb6c2..c1c478c435 100644 --- a/src/awkward/operations/str/ak_replace_slice.py +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py index 4408beb6fd..328c8a36ac 100644 --- a/src/awkward/operations/str/ak_replace_substring.py +++ b/src/awkward/operations/str/ak_replace_substring.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def replace_substring( array, pattern, replacement, *, 
max_replacements=None, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py index 2380ba3e29..68ef66ad40 100644 --- a/src/awkward/operations/str/ak_replace_substring_regex.py +++ b/src/awkward/operations/str/ak_replace_substring_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def replace_substring_regex( array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_reverse.py b/src/awkward/operations/str/ak_reverse.py index 2a573f0ccc..a360970404 100644 --- a/src/awkward/operations/str/ak_reverse.py +++ b/src/awkward/operations/str/ak_reverse.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def reverse(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_rpad.py b/src/awkward/operations/str/ak_rpad.py index 34748043b8..69499caa75 100644 --- a/src/awkward/operations/str/ak_rpad.py +++ b/src/awkward/operations/str/ak_rpad.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def rpad(array, width, padding=" ", *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/ak_rtrim.py index f83c651631..4aca4c0e8c 100644 --- a/src/awkward/operations/str/ak_rtrim.py +++ b/src/awkward/operations/str/ak_rtrim.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def rtrim(array, characters, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_rtrim_whitespace.py b/src/awkward/operations/str/ak_rtrim_whitespace.py index cba8760bfe..e61037574d 100644 --- a/src/awkward/operations/str/ak_rtrim_whitespace.py +++ 
b/src/awkward/operations/str/ak_rtrim_whitespace.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def rtrim_whitespace(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_slice.py b/src/awkward/operations/str/ak_slice.py index 7afaab7d93..dc7e9df98c 100644 --- a/src/awkward/operations/str/ak_slice.py +++ b/src/awkward/operations/str/ak_slice.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def slice(array, start, stop=None, step=1, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/ak_split_pattern.py index c7a84b1bf9..e967106c4f 100644 --- a/src/awkward/operations/str/ak_split_pattern.py +++ b/src/awkward/operations/str/ak_split_pattern.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def split_pattern( array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_split_pattern_regex.py b/src/awkward/operations/str/ak_split_pattern_regex.py index c870a922b3..e74e8c05b1 100644 --- a/src/awkward/operations/str/ak_split_pattern_regex.py +++ b/src/awkward/operations/str/ak_split_pattern_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def split_pattern_regex( array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_split_whitespace.py b/src/awkward/operations/str/ak_split_whitespace.py index 31bab91854..aa4ffeaf78 100644 --- a/src/awkward/operations/str/ak_split_whitespace.py +++ b/src/awkward/operations/str/ak_split_whitespace.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def split_whitespace( array, *, max_splits=None, 
reverse=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_starts_with.py b/src/awkward/operations/str/ak_starts_with.py index 3a0799334b..69e9192a65 100644 --- a/src/awkward/operations/str/ak_starts_with.py +++ b/src/awkward/operations/str/ak_starts_with.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def starts_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_swapcase.py b/src/awkward/operations/str/ak_swapcase.py index 6c99413dc5..208c384c51 100644 --- a/src/awkward/operations/str/ak_swapcase.py +++ b/src/awkward/operations/str/ak_swapcase.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def swapcase(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_title.py b/src/awkward/operations/str/ak_title.py index e8bbd1af92..87e8feaca3 100644 --- a/src/awkward/operations/str/ak_title.py +++ b/src/awkward/operations/str/ak_title.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def title(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py index 626e762702..4d05fa8c98 100644 --- a/src/awkward/operations/str/ak_trim.py +++ b/src/awkward/operations/str/ak_trim.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def trim(array, characters, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_trim_whitespace.py b/src/awkward/operations/str/ak_trim_whitespace.py index 0453b8a03a..edb8e22878 100644 --- a/src/awkward/operations/str/ak_trim_whitespace.py +++ b/src/awkward/operations/str/ak_trim_whitespace.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function 
+@high_level_function() def trim_whitespace(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_upper.py b/src/awkward/operations/str/ak_upper.py index dd25dc5de8..8132071295 100644 --- a/src/awkward/operations/str/ak_upper.py +++ b/src/awkward/operations/str/ak_upper.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function +@high_level_function() def upper(array, *, highlevel=True, behavior=None): """ Args: From 307a3ea3dc5117af545bb81c4e238c002113cdac Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 8 Aug 2023 09:47:07 +0100 Subject: [PATCH 66/73] fix: pass `module` to str `high_level_function` --- src/awkward/operations/str/ak_capitalize.py | 2 +- src/awkward/operations/str/ak_center.py | 2 +- src/awkward/operations/str/ak_count_substring.py | 2 +- src/awkward/operations/str/ak_count_substring_regex.py | 2 +- src/awkward/operations/str/ak_ends_with.py | 2 +- src/awkward/operations/str/ak_extract_regex.py | 2 +- src/awkward/operations/str/ak_find_substring.py | 2 +- src/awkward/operations/str/ak_find_substring_regex.py | 2 +- src/awkward/operations/str/ak_index_in.py | 2 +- src/awkward/operations/str/ak_is_alnum.py | 2 +- src/awkward/operations/str/ak_is_alpha.py | 2 +- src/awkward/operations/str/ak_is_ascii.py | 2 +- src/awkward/operations/str/ak_is_decimal.py | 2 +- src/awkward/operations/str/ak_is_digit.py | 2 +- src/awkward/operations/str/ak_is_in.py | 2 +- src/awkward/operations/str/ak_is_lower.py | 2 +- src/awkward/operations/str/ak_is_numeric.py | 2 +- src/awkward/operations/str/ak_is_printable.py | 2 +- src/awkward/operations/str/ak_is_space.py | 2 +- src/awkward/operations/str/ak_is_title.py | 2 +- src/awkward/operations/str/ak_is_upper.py | 2 +- src/awkward/operations/str/ak_join.py | 2 +- src/awkward/operations/str/ak_join_element_wise.py | 2 +- src/awkward/operations/str/ak_length.py | 2 +- src/awkward/operations/str/ak_lower.py | 2 +- src/awkward/operations/str/ak_lpad.py | 2 
+- src/awkward/operations/str/ak_ltrim.py | 2 +- src/awkward/operations/str/ak_ltrim_whitespace.py | 2 +- src/awkward/operations/str/ak_match_like.py | 2 +- src/awkward/operations/str/ak_match_substring.py | 2 +- src/awkward/operations/str/ak_match_substring_regex.py | 2 +- src/awkward/operations/str/ak_repeat.py | 2 +- src/awkward/operations/str/ak_replace_slice.py | 2 +- src/awkward/operations/str/ak_replace_substring.py | 2 +- src/awkward/operations/str/ak_replace_substring_regex.py | 2 +- src/awkward/operations/str/ak_reverse.py | 2 +- src/awkward/operations/str/ak_rpad.py | 2 +- src/awkward/operations/str/ak_rtrim.py | 2 +- src/awkward/operations/str/ak_rtrim_whitespace.py | 2 +- src/awkward/operations/str/ak_slice.py | 2 +- src/awkward/operations/str/ak_split_pattern.py | 2 +- src/awkward/operations/str/ak_split_pattern_regex.py | 2 +- src/awkward/operations/str/ak_split_whitespace.py | 2 +- src/awkward/operations/str/ak_starts_with.py | 2 +- src/awkward/operations/str/ak_swapcase.py | 2 +- src/awkward/operations/str/ak_title.py | 2 +- src/awkward/operations/str/ak_trim.py | 2 +- src/awkward/operations/str/ak_trim_whitespace.py | 2 +- src/awkward/operations/str/ak_upper.py | 2 +- 49 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/awkward/operations/str/ak_capitalize.py b/src/awkward/operations/str/ak_capitalize.py index d555d23138..6592da247a 100644 --- a/src/awkward/operations/str/ak_capitalize.py +++ b/src/awkward/operations/str/ak_capitalize.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def capitalize(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/ak_center.py index 3a21d1520f..8ccc51c3fb 100644 --- a/src/awkward/operations/str/ak_center.py +++ b/src/awkward/operations/str/ak_center.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() 
+@high_level_function(module="ak.str") def center(array, width, padding=" ", *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_count_substring.py b/src/awkward/operations/str/ak_count_substring.py index bae8942694..6ac2608f0d 100644 --- a/src/awkward/operations/str/ak_count_substring.py +++ b/src/awkward/operations/str/ak_count_substring.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def count_substring( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_count_substring_regex.py b/src/awkward/operations/str/ak_count_substring_regex.py index 63349c9d75..f84d898eff 100644 --- a/src/awkward/operations/str/ak_count_substring_regex.py +++ b/src/awkward/operations/str/ak_count_substring_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def count_substring_regex( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_ends_with.py b/src/awkward/operations/str/ak_ends_with.py index 898acf5e9b..76cd93949f 100644 --- a/src/awkward/operations/str/ak_ends_with.py +++ b/src/awkward/operations/str/ak_ends_with.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def ends_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_extract_regex.py b/src/awkward/operations/str/ak_extract_regex.py index b4e5f522e6..29f2a05dce 100644 --- a/src/awkward/operations/str/ak_extract_regex.py +++ b/src/awkward/operations/str/ak_extract_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def extract_regex(array, pattern, *, highlevel=True, behavior=None): """ Args: diff --git 
a/src/awkward/operations/str/ak_find_substring.py b/src/awkward/operations/str/ak_find_substring.py index 875e3c6f25..626e027fb2 100644 --- a/src/awkward/operations/str/ak_find_substring.py +++ b/src/awkward/operations/str/ak_find_substring.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def find_substring(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_find_substring_regex.py b/src/awkward/operations/str/ak_find_substring_regex.py index 952c51c41b..68b206d5ba 100644 --- a/src/awkward/operations/str/ak_find_substring_regex.py +++ b/src/awkward/operations/str/ak_find_substring_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def find_substring_regex( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_index_in.py b/src/awkward/operations/str/ak_index_in.py index 02cd16f997..559856feb8 100644 --- a/src/awkward/operations/str/ak_index_in.py +++ b/src/awkward/operations/str/ak_index_in.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def index_in(array, value_set, *, skip_nones=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/ak_is_alnum.py index 23bbb4e2be..d3a9b8e0a1 100644 --- a/src/awkward/operations/str/ak_is_alnum.py +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_alnum(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/ak_is_alpha.py index 31fcb06275..987538ca95 100644 --- a/src/awkward/operations/str/ak_is_alpha.py 
+++ b/src/awkward/operations/str/ak_is_alpha.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_alpha(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_ascii.py b/src/awkward/operations/str/ak_is_ascii.py index d7ddb3c103..4fcdd9d518 100644 --- a/src/awkward/operations/str/ak_is_ascii.py +++ b/src/awkward/operations/str/ak_is_ascii.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_ascii(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/ak_is_decimal.py index 524ea18c7b..8d5b607791 100644 --- a/src/awkward/operations/str/ak_is_decimal.py +++ b/src/awkward/operations/str/ak_is_decimal.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_decimal(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/ak_is_digit.py index 1fc5fafe59..2c8ba67827 100644 --- a/src/awkward/operations/str/ak_is_digit.py +++ b/src/awkward/operations/str/ak_is_digit.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_digit(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_in.py b/src/awkward/operations/str/ak_is_in.py index 528f8f8558..2df2dfe74d 100644 --- a/src/awkward/operations/str/ak_is_in.py +++ b/src/awkward/operations/str/ak_is_in.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_in(array, value_set, *, skip_nones=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_lower.py 
b/src/awkward/operations/str/ak_is_lower.py index 39afa0e21e..f9cbb78fb7 100644 --- a/src/awkward/operations/str/ak_is_lower.py +++ b/src/awkward/operations/str/ak_is_lower.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_lower(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_numeric.py b/src/awkward/operations/str/ak_is_numeric.py index 438e6fb01a..3f1817c169 100644 --- a/src/awkward/operations/str/ak_is_numeric.py +++ b/src/awkward/operations/str/ak_is_numeric.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_numeric(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/ak_is_printable.py index 5fdd3b50d4..574439723d 100644 --- a/src/awkward/operations/str/ak_is_printable.py +++ b/src/awkward/operations/str/ak_is_printable.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_printable(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/ak_is_space.py index a3acce9a91..884521eb45 100644 --- a/src/awkward/operations/str/ak_is_space.py +++ b/src/awkward/operations/str/ak_is_space.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_space(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_title.py b/src/awkward/operations/str/ak_is_title.py index d8f050725f..38b105224e 100644 --- a/src/awkward/operations/str/ak_is_title.py +++ b/src/awkward/operations/str/ak_is_title.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def 
is_title(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_is_upper.py b/src/awkward/operations/str/ak_is_upper.py index f814d77d35..a8a301a65f 100644 --- a/src/awkward/operations/str/ak_is_upper.py +++ b/src/awkward/operations/str/ak_is_upper.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def is_upper(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_join.py b/src/awkward/operations/str/ak_join.py index b04f6a1dd9..8504e8ccb3 100644 --- a/src/awkward/operations/str/ak_join.py +++ b/src/awkward/operations/str/ak_join.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def join(array, separator, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_join_element_wise.py b/src/awkward/operations/str/ak_join_element_wise.py index 80efa60b18..f78a354b4f 100644 --- a/src/awkward/operations/str/ak_join_element_wise.py +++ b/src/awkward/operations/str/ak_join_element_wise.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def join_element_wise(*arrays, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_length.py b/src/awkward/operations/str/ak_length.py index fba1e2f4a6..3471d61c12 100644 --- a/src/awkward/operations/str/ak_length.py +++ b/src/awkward/operations/str/ak_length.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def length(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_lower.py b/src/awkward/operations/str/ak_lower.py index 61453bb0a4..098aa24423 100644 --- a/src/awkward/operations/str/ak_lower.py +++ b/src/awkward/operations/str/ak_lower.py @@ -8,7 +8,7 @@ from 
awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def lower(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_lpad.py b/src/awkward/operations/str/ak_lpad.py index 11575a34ea..2398463eab 100644 --- a/src/awkward/operations/str/ak_lpad.py +++ b/src/awkward/operations/str/ak_lpad.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def lpad(array, width, padding=" ", *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/ak_ltrim.py index 4164700111..f5f4dca355 100644 --- a/src/awkward/operations/str/ak_ltrim.py +++ b/src/awkward/operations/str/ak_ltrim.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def ltrim(array, characters, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_ltrim_whitespace.py b/src/awkward/operations/str/ak_ltrim_whitespace.py index d095f93247..73e4624ced 100644 --- a/src/awkward/operations/str/ak_ltrim_whitespace.py +++ b/src/awkward/operations/str/ak_ltrim_whitespace.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def ltrim_whitespace(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_match_like.py b/src/awkward/operations/str/ak_match_like.py index 7b688f69a2..95db65ba7e 100644 --- a/src/awkward/operations/str/ak_match_like.py +++ b/src/awkward/operations/str/ak_match_like.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def match_like(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_match_substring.py 
b/src/awkward/operations/str/ak_match_substring.py index f81f1a7fcd..3bf474c050 100644 --- a/src/awkward/operations/str/ak_match_substring.py +++ b/src/awkward/operations/str/ak_match_substring.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def match_substring( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_match_substring_regex.py b/src/awkward/operations/str/ak_match_substring_regex.py index 4a8d4b515f..3a0b65d11f 100644 --- a/src/awkward/operations/str/ak_match_substring_regex.py +++ b/src/awkward/operations/str/ak_match_substring_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def match_substring_regex( array, pattern, *, ignore_case=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_repeat.py b/src/awkward/operations/str/ak_repeat.py index 7110721729..c2e4704dad 100644 --- a/src/awkward/operations/str/ak_repeat.py +++ b/src/awkward/operations/str/ak_repeat.py @@ -13,7 +13,7 @@ np = NumpyMetadata.instance() -@high_level_function() +@high_level_function(module="ak.str") def repeat(array, num_repeats, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py index c1c478c435..573359e140 100644 --- a/src/awkward/operations/str/ak_replace_slice.py +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py index 328c8a36ac..595f606787 100644 --- a/src/awkward/operations/str/ak_replace_substring.py +++ 
b/src/awkward/operations/str/ak_replace_substring.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def replace_substring( array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py index 68ef66ad40..e20f1e662c 100644 --- a/src/awkward/operations/str/ak_replace_substring_regex.py +++ b/src/awkward/operations/str/ak_replace_substring_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def replace_substring_regex( array, pattern, replacement, *, max_replacements=None, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_reverse.py b/src/awkward/operations/str/ak_reverse.py index a360970404..bcc249e7e4 100644 --- a/src/awkward/operations/str/ak_reverse.py +++ b/src/awkward/operations/str/ak_reverse.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def reverse(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_rpad.py b/src/awkward/operations/str/ak_rpad.py index 69499caa75..e46e43a5d8 100644 --- a/src/awkward/operations/str/ak_rpad.py +++ b/src/awkward/operations/str/ak_rpad.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def rpad(array, width, padding=" ", *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/ak_rtrim.py index 4aca4c0e8c..88f562a1a5 100644 --- a/src/awkward/operations/str/ak_rtrim.py +++ b/src/awkward/operations/str/ak_rtrim.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def rtrim(array, 
characters, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_rtrim_whitespace.py b/src/awkward/operations/str/ak_rtrim_whitespace.py index e61037574d..e438a98363 100644 --- a/src/awkward/operations/str/ak_rtrim_whitespace.py +++ b/src/awkward/operations/str/ak_rtrim_whitespace.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def rtrim_whitespace(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_slice.py b/src/awkward/operations/str/ak_slice.py index dc7e9df98c..06b67c59e7 100644 --- a/src/awkward/operations/str/ak_slice.py +++ b/src/awkward/operations/str/ak_slice.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def slice(array, start, stop=None, step=1, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/ak_split_pattern.py index e967106c4f..d2ef682562 100644 --- a/src/awkward/operations/str/ak_split_pattern.py +++ b/src/awkward/operations/str/ak_split_pattern.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def split_pattern( array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_split_pattern_regex.py b/src/awkward/operations/str/ak_split_pattern_regex.py index e74e8c05b1..373a1c0db6 100644 --- a/src/awkward/operations/str/ak_split_pattern_regex.py +++ b/src/awkward/operations/str/ak_split_pattern_regex.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def split_pattern_regex( array, pattern, *, max_splits=None, reverse=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_split_whitespace.py 
b/src/awkward/operations/str/ak_split_whitespace.py index aa4ffeaf78..f534de5bd5 100644 --- a/src/awkward/operations/str/ak_split_whitespace.py +++ b/src/awkward/operations/str/ak_split_whitespace.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def split_whitespace( array, *, max_splits=None, reverse=False, highlevel=True, behavior=None ): diff --git a/src/awkward/operations/str/ak_starts_with.py b/src/awkward/operations/str/ak_starts_with.py index 69e9192a65..d055c93355 100644 --- a/src/awkward/operations/str/ak_starts_with.py +++ b/src/awkward/operations/str/ak_starts_with.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def starts_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_swapcase.py b/src/awkward/operations/str/ak_swapcase.py index 208c384c51..e5eb0ee52e 100644 --- a/src/awkward/operations/str/ak_swapcase.py +++ b/src/awkward/operations/str/ak_swapcase.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def swapcase(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_title.py b/src/awkward/operations/str/ak_title.py index 87e8feaca3..aac266547b 100644 --- a/src/awkward/operations/str/ak_title.py +++ b/src/awkward/operations/str/ak_title.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def title(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py index 4d05fa8c98..1796fbaeb5 100644 --- a/src/awkward/operations/str/ak_trim.py +++ b/src/awkward/operations/str/ak_trim.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() 
+@high_level_function(module="ak.str") def trim(array, characters, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_trim_whitespace.py b/src/awkward/operations/str/ak_trim_whitespace.py index edb8e22878..6568249969 100644 --- a/src/awkward/operations/str/ak_trim_whitespace.py +++ b/src/awkward/operations/str/ak_trim_whitespace.py @@ -9,7 +9,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def trim_whitespace(array, *, highlevel=True, behavior=None): """ Args: diff --git a/src/awkward/operations/str/ak_upper.py b/src/awkward/operations/str/ak_upper.py index 8132071295..2391b439af 100644 --- a/src/awkward/operations/str/ak_upper.py +++ b/src/awkward/operations/str/ak_upper.py @@ -8,7 +8,7 @@ from awkward._layout import wrap_layout -@high_level_function() +@high_level_function(module="ak.str") def upper(array, *, highlevel=True, behavior=None): """ Args: From 51a5c5c9f18efef11640b6046fd71a477ed4cf1f Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 8 Aug 2023 10:25:20 +0100 Subject: [PATCH 67/73] docs: homogenize docstrings --- src/awkward/operations/str/ak_capitalize.py | 11 +++++---- src/awkward/operations/str/ak_center.py | 17 +++++++++----- .../operations/str/ak_count_substring.py | 13 +++++++---- .../str/ak_count_substring_regex.py | 13 +++++++---- src/awkward/operations/str/ak_ends_with.py | 13 +++++++---- .../operations/str/ak_extract_regex.py | 13 +++++------ .../operations/str/ak_find_substring.py | 14 +++++++---- .../operations/str/ak_find_substring_regex.py | 16 ++++++++----- src/awkward/operations/str/ak_index_in.py | 14 +++++++---- src/awkward/operations/str/ak_is_alnum.py | 10 ++++---- src/awkward/operations/str/ak_is_alpha.py | 10 ++++---- src/awkward/operations/str/ak_is_ascii.py | 10 ++++---- src/awkward/operations/str/ak_is_decimal.py | 10 ++++---- src/awkward/operations/str/ak_is_digit.py | 10 ++++---- src/awkward/operations/str/ak_is_in.py | 12 
++++++---- src/awkward/operations/str/ak_is_lower.py | 10 ++++---- src/awkward/operations/str/ak_is_numeric.py | 10 ++++---- src/awkward/operations/str/ak_is_printable.py | 10 ++++---- src/awkward/operations/str/ak_is_space.py | 10 ++++---- src/awkward/operations/str/ak_is_title.py | 18 ++++++++++----- src/awkward/operations/str/ak_is_upper.py | 10 ++++---- src/awkward/operations/str/ak_join.py | 13 +++++++---- .../operations/str/ak_join_element_wise.py | 6 +++-- src/awkward/operations/str/ak_length.py | 7 +++--- src/awkward/operations/str/ak_lower.py | 7 +++--- src/awkward/operations/str/ak_lpad.py | 17 +++++++++----- src/awkward/operations/str/ak_ltrim.py | 16 ++++++++----- .../operations/str/ak_ltrim_whitespace.py | 4 ++-- src/awkward/operations/str/ak_match_like.py | 20 +++++++++++----- .../operations/str/ak_match_substring.py | 11 +++++---- .../str/ak_match_substring_regex.py | 11 +++++---- src/awkward/operations/str/ak_repeat.py | 8 ++++--- .../operations/str/ak_replace_slice.py | 14 +++++++---- .../operations/str/ak_replace_substring.py | 10 ++++---- .../str/ak_replace_substring_regex.py | 10 ++++---- src/awkward/operations/str/ak_reverse.py | 9 +++++--- src/awkward/operations/str/ak_rpad.py | 17 +++++++++----- src/awkward/operations/str/ak_rtrim.py | 13 +++++++---- .../operations/str/ak_rtrim_whitespace.py | 4 ++-- src/awkward/operations/str/ak_slice.py | 14 +++++++---- .../operations/str/ak_split_pattern.py | 14 +++++++---- .../operations/str/ak_split_pattern_regex.py | 17 +++++++++----- .../operations/str/ak_split_whitespace.py | 23 +++++++++++-------- src/awkward/operations/str/ak_starts_with.py | 13 +++++++---- src/awkward/operations/str/ak_swapcase.py | 10 ++++---- src/awkward/operations/str/ak_title.py | 12 ++++++---- src/awkward/operations/str/ak_trim.py | 16 ++++++++----- .../operations/str/ak_trim_whitespace.py | 7 +++--- src/awkward/operations/str/ak_upper.py | 10 ++++---- 49 files changed, 369 insertions(+), 218 deletions(-) diff --git 
a/src/awkward/operations/str/ak_capitalize.py b/src/awkward/operations/str/ak_capitalize.py index 6592da247a..1c33e480f8 100644 --- a/src/awkward/operations/str/ak_capitalize.py +++ b/src/awkward/operations/str/ak_capitalize.py @@ -18,12 +18,15 @@ def capitalize(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with a capitalized version (correctly transforming Unicode characters), with the first character uppercased and the others lowercased. + Replaces any string-valued data with a capitalized version + (correctly transforming Unicode characters), with the first character + uppercased and the others lowercased. - Replaces any bytestring-valued data with a capitalized version (transforming ASCII characters only). + Replaces any bytestring-valued data with a capitalized version + (transforming ASCII characters only). - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_capitalize](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_capitalize.html) diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/ak_center.py index 8ccc51c3fb..d7d1801136 100644 --- a/src/awkward/operations/str/ak_center.py +++ b/src/awkward/operations/str/ak_center.py @@ -15,20 +15,25 @@ def center(array, width, padding=" ", *, highlevel=True, behavior=None): Args: array: Array-like data (anything #ak.to_layout recognizes). width (int): Desired string length. - padding (str or bytes): What to pad the string with. Should be one codepoint or byte. + padding (str or bytes): What to pad the string with. Should be one + codepoint or byte. 
highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string or bytestring-valued data with centered strings/bytestrings of a given `width`, padding both sides with the given `padding` codepoint or byte. + Replaces any string or bytestring-valued data with centered + strings/bytestrings of a given `width`, padding both sides with the given + `padding` codepoint or byte. - If the data are strings, `width` is measured in codepoints and `padding` must be one codepoint. + If the data are strings, `width` is measured in codepoints and `padding` + must be one codepoint. - If the data are bytestrings, `width` is measured in bytes and `padding` must be one byte. + If the data are bytestrings, `width` is measured in bytes and `padding` + must be one byte. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_center](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_center.html) diff --git a/src/awkward/operations/str/ak_count_substring.py b/src/awkward/operations/str/ak_count_substring.py index 6ac2608f0d..36cd8febca 100644 --- a/src/awkward/operations/str/ak_count_substring.py +++ b/src/awkward/operations/str/ak_count_substring.py @@ -16,16 +16,21 @@ def count_substring( """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str, or bytes): Substring pattern to look for inside the given array. - ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + pattern (str or bytes): Substring pattern to count for each string in + `array`. 
+ ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - For each string in the array, count the number of occurrences of the given literal pattern. + Counts the number of occurrences of the given literal `pattern` in every + string in `array`. Depending upon the value of `ignore_case`, the matching + function will be case-insensitive. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). diff --git a/src/awkward/operations/str/ak_count_substring_regex.py b/src/awkward/operations/str/ak_count_substring_regex.py index f84d898eff..113d8acb9b 100644 --- a/src/awkward/operations/str/ak_count_substring_regex.py +++ b/src/awkward/operations/str/ak_count_substring_regex.py @@ -16,16 +16,21 @@ def count_substring_regex( """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str, or bytes): Substring pattern to look for inside the given array. - ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + pattern (str or bytes): Regular expression that matches substrings to + count for each string in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. 
- For each string in the array, count the number of occurrences of the given regular expression pattern. + Counts the number of occurrences of the given regular expression `pattern` + in every string in `array`. Depending upon the value of `ignore_case`, the + matching function will be case-insensitive. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.count_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring_regex.html). diff --git a/src/awkward/operations/str/ak_ends_with.py b/src/awkward/operations/str/ak_ends_with.py index 76cd93949f..ed68476a1f 100644 --- a/src/awkward/operations/str/ak_ends_with.py +++ b/src/awkward/operations/str/ak_ends_with.py @@ -14,16 +14,21 @@ def ends_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=Non """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str, or bytes): Substring pattern to look for inside the given array. - ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + pattern (str or bytes): Substring pattern to test against the ending + of each string in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - For each string in the array, determine whether it ends with the given literal suffix. + Returns True for every string in `array` if it ends with the given literal + suffix `pattern`. Depending upon the value of `ignore_case`, the matching + function will be case-insensitive. 
- Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.ends_with](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ends_with.html). diff --git a/src/awkward/operations/str/ak_extract_regex.py b/src/awkward/operations/str/ak_extract_regex.py index 29f2a05dce..2592ba1268 100644 --- a/src/awkward/operations/str/ak_extract_regex.py +++ b/src/awkward/operations/str/ak_extract_regex.py @@ -20,7 +20,9 @@ def extract_regex(array, pattern, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with None if the `pattern` does not match or records whose fields are named capture groups and the substrings they've captured if `pattern` does match. + Returns None for every string in `array` if it does not match `pattern`; + otherwise, a record whose fields are named capture groups and whose + contents are the substrings they've captured. Uses [Google RE2](https://github.com/google/re2/wiki/Syntax), and `pattern` must contain named groups. The syntax for a named group is `(?P<...>...)` in which @@ -44,14 +46,11 @@ def extract_regex(array, pattern, *, highlevel=True, behavior=None): Regular expressions with unnamed groups or features not implemented by RE2 raise an error. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. 
Requires the pyarrow library and calls - [pyarrow.compute.extract_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.extract_regex.html) - or - [pyarrow.compute.extract_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.extract_regex.html) - on strings and bytestrings, respectively. + [pyarrow.compute.extract_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.extract_regex.html). """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_find_substring.py b/src/awkward/operations/str/ak_find_substring.py index 626e027fb2..28d5d86aec 100644 --- a/src/awkward/operations/str/ak_find_substring.py +++ b/src/awkward/operations/str/ak_find_substring.py @@ -14,17 +14,21 @@ def find_substring(array, pattern, *, ignore_case=False, highlevel=True, behavio """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str, or bytes): Substring pattern to look for inside the given array. - ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + pattern (str or bytes): Substring pattern to find inside each string + in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - For each string in the array, determine the index at which the first occurrence of the given literal pattern is - found. If the literal pattern is not found inside the string, the index is taken to be -1. + Returns the index of the first occurrence of the given literal `pattern` + for each string in `array`. If the literal pattern is not found inside the + string, the index is taken to be -1. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. 
+ Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.find_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring.html). diff --git a/src/awkward/operations/str/ak_find_substring_regex.py b/src/awkward/operations/str/ak_find_substring_regex.py index 68b206d5ba..90ab671ce5 100644 --- a/src/awkward/operations/str/ak_find_substring_regex.py +++ b/src/awkward/operations/str/ak_find_substring_regex.py @@ -16,20 +16,24 @@ def find_substring_regex( """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str, or bytes): Substring pattern to look for inside the given array. - ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + pattern (str or bytes): Regular expression that matches substrings to + find inside each string in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - For each string in the array, determine the index at which the first occurrence of the given regular expression - pattern. is found. If the regular expression pattern is not found inside the string, the index is taken to be -1. + Returns the index of the first occurrence of the given regular expression + `pattern` for each string in `array`. If the pattern is not found + inside the string, the index is taken to be -1. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data.
Requires the pyarrow library and calls - [pyarrow.compute.find_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring_regex.html). + [pyarrow.compute.find_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring_regex.html). """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_index_in.py b/src/awkward/operations/str/ak_index_in.py index 559856feb8..84f1a2af1d 100644 --- a/src/awkward/operations/str/ak_index_in.py +++ b/src/awkward/operations/str/ak_index_in.py @@ -14,17 +14,21 @@ def index_in(array, value_set, *, skip_nones=False, highlevel=True, behavior=Non """ Args: array: Array-like data (anything #ak.to_layout recognizes). - value_set: Array-like data (anything #ak.to_layout recognizes), set of values to search for. - skip_nones (bool): If True, None values in `array` are not matched against `value_set`; otherwise, they are. + value_set: Array-like data (anything #ak.to_layout recognizes), set of + values to search for in `array`. + skip_nones (bool): If True, None values in `array` are not matched + against `value_set`; otherwise, None is considered a legal value. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - For each string in the array, determine where it is found within the given set of values. If the string is - not found within the value set, the index is set to None. + Returns the index of the first pattern in `value_set` that each string in + `array` matches. If the string is not found within `value_set`, then the + index is set to None. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data.
Requires the pyarrow library and calls [pyarrow.compute.index_in](https://arrow.apache.org/docs/python/generated/pyarrow.compute.index_in.html). diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/ak_is_alnum.py index d3a9b8e0a1..d18d42f2a9 100644 --- a/src/awkward/operations/str/ak_is_alnum.py +++ b/src/awkward/operations/str/ak_is_alnum.py @@ -18,12 +18,14 @@ def is_alnum(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True if the string is non-empty and consists only of alphanumeric Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and + consists only of alphanumeric Unicode characters, False otherwise. - Replaces any bytestring-valued data with True if the string is non-empty and consists only of alphanumeric ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty + and consists only of alphanumeric ASCII characters, False otherwise. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_is_alnum](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alnum.html) diff --git a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/ak_is_alpha.py index 987538ca95..892336f661 100644 --- a/src/awkward/operations/str/ak_is_alpha.py +++ b/src/awkward/operations/str/ak_is_alpha.py @@ -18,12 +18,14 @@ def is_alpha(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. 
- Replaces any string-valued data with True if the string is non-empty and consists only of alphabetic Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and + consists only of alphabetic Unicode characters, False otherwise. - Replaces any bytestring-valued data with True if the string is non-empty and consists only of alphabetic ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty + and consists only of alphabetic ASCII characters, False otherwise. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_is_alpha](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_alpha.html) diff --git a/src/awkward/operations/str/ak_is_ascii.py b/src/awkward/operations/str/ak_is_ascii.py index 4fcdd9d518..db7092c842 100644 --- a/src/awkward/operations/str/ak_is_ascii.py +++ b/src/awkward/operations/str/ak_is_ascii.py @@ -18,12 +18,14 @@ def is_ascii(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True iff the string consists only of ASCII characters, False otherwise. + Replaces any string-valued data with True iff the string consists only of + ASCII characters, False otherwise. - Replaces any bytestring-valued data with True iff the string consists only of ASCII characters, False otherwise. + Replaces any bytestring-valued data with True iff the string consists only + of ASCII characters, False otherwise. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. 
+ Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.string_is_ascii](https://arrow.apache.org/docs/python/generated/pyarrow.compute.string_is_ascii.html) diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/ak_is_decimal.py index 8d5b607791..7599f150d2 100644 --- a/src/awkward/operations/str/ak_is_decimal.py +++ b/src/awkward/operations/str/ak_is_decimal.py @@ -18,12 +18,14 @@ def is_decimal(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True if the string is non-empty and consists only of decimal Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and + consists only of decimal Unicode characters, False otherwise. - Replaces any bytestring-valued data with True if the string is non-empty and consists only of decimal ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty + and consists only of decimal ASCII characters, False otherwise. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_is_decimal](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_decimal.html) diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/ak_is_digit.py index 2c8ba67827..2838d5e39a 100644 --- a/src/awkward/operations/str/ak_is_digit.py +++ b/src/awkward/operations/str/ak_is_digit.py @@ -18,12 +18,14 @@ def is_digit(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. 
- Replaces any string-valued data with True if the string is non-empty and consists only of Unicode digits, False otherwise. + Replaces any string-valued data with True if the string is non-empty and + consists only of Unicode digits, False otherwise. - Replaces any bytestring-valued data with True if the string is non-empty and consists only of Unicode digits, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty + and consists only of Unicode digits, False otherwise. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_is_digit](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_digit.html) diff --git a/src/awkward/operations/str/ak_is_in.py b/src/awkward/operations/str/ak_is_in.py index 2df2dfe74d..3ac79b44ec 100644 --- a/src/awkward/operations/str/ak_is_in.py +++ b/src/awkward/operations/str/ak_is_in.py @@ -14,16 +14,20 @@ def is_in(array, value_set, *, skip_nones=False, highlevel=True, behavior=None): """ Args: array: Array-like data (anything #ak.to_layout recognizes). - value_set: Array-like data (anything #ak.to_layout recognizes), set of values to search for. - skip_nones (bool): If True, None values in `array` are not matched against `value_set`; otherwise, they are. + value_set: Array-like data (anything #ak.to_layout recognizes), set of + values to search for in `array`. + skip_nones (bool): If True, None values in `array` are not matched + against `value_set`; otherwise, None is considered a legal value. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. 
- For each string in the array, determine whether it is found within the given set of values. + Returns True for each string in `array` if it matches any pattern in + `value_set`; otherwise, returns False. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.is_in](https://arrow.apache.org/docs/python/generated/pyarrow.compute.is_in.html). diff --git a/src/awkward/operations/str/ak_is_lower.py b/src/awkward/operations/str/ak_is_lower.py index f9cbb78fb7..5b502ec5e6 100644 --- a/src/awkward/operations/str/ak_is_lower.py +++ b/src/awkward/operations/str/ak_is_lower.py @@ -18,12 +18,14 @@ def is_lower(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True if the string is non-empty and consists only of lowercase Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and + consists only of lowercase Unicode characters, False otherwise. - Replaces any bytestring-valued data with True if the string is non-empty and consists only of lowercase ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty + and consists only of lowercase ASCII characters, False otherwise. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. 
Requires the pyarrow library and calls [pyarrow.compute.utf8_is_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_lower.html) diff --git a/src/awkward/operations/str/ak_is_numeric.py b/src/awkward/operations/str/ak_is_numeric.py index 3f1817c169..e7e5a0db49 100644 --- a/src/awkward/operations/str/ak_is_numeric.py +++ b/src/awkward/operations/str/ak_is_numeric.py @@ -18,12 +18,14 @@ def is_numeric(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True if the string is non-empty and consists only of numeric Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and + consists only of numeric Unicode characters, False otherwise. - Replaces any bytestring-valued data with True if the string is non-empty and consists only of numeric Unicode characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty + and consists only of numeric Unicode characters, False otherwise. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_is_numeric](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_numeric.html) diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/ak_is_printable.py index 574439723d..3b825acf6a 100644 --- a/src/awkward/operations/str/ak_is_printable.py +++ b/src/awkward/operations/str/ak_is_printable.py @@ -18,12 +18,14 @@ def is_printable(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. 
- Replaces any string-valued data with True if the string is non-empty and consists only of printable Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and + consists only of printable Unicode characters, False otherwise. - Replaces any bytestring-valued data with True if the string is non-empty and consists only of printable ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty + and consists only of printable ASCII characters, False otherwise. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_is_printable](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_printable.html) diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/ak_is_space.py index 884521eb45..624691cdf5 100644 --- a/src/awkward/operations/str/ak_is_space.py +++ b/src/awkward/operations/str/ak_is_space.py @@ -18,12 +18,14 @@ def is_space(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True if the string is non-empty and consists only of whitespace Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and + consists only of whitespace Unicode characters, False otherwise. - Replaces any bytestring-valued data with True if the string is non-empty and consists only of whitespace ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty + and consists only of whitespace ASCII characters, False otherwise. 
- Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_is_space](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_space.html) diff --git a/src/awkward/operations/str/ak_is_title.py b/src/awkward/operations/str/ak_is_title.py index 38b105224e..05a13377dc 100644 --- a/src/awkward/operations/str/ak_is_title.py +++ b/src/awkward/operations/str/ak_is_title.py @@ -18,12 +18,18 @@ def is_title(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True if the string is title-cased, i.e. it has at least one cased character, each uppercase character follows an uncased character, and each lowercase character follows an uppercase character, otherwise False. - - Replaces any bytestring-valued data with True if the string is title-cased, i.e. it has at least one cased character, each uppercase character follows an uncased character, and each lowercase character follows an uppercase character, otherwise False. - - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Replaces any string-valued data with True if the string is title-cased, + i.e. it has at least one cased character, each uppercase character follows + an uncased character, and each lowercase character follows an uppercase + character, otherwise False. + + Replaces any bytestring-valued data with True if the string is + title-cased, i.e. it has at least one cased character, each uppercase + character follows an uncased character, and each lowercase character + follows an uppercase character, otherwise False. 
+ + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_is_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_title.html) diff --git a/src/awkward/operations/str/ak_is_upper.py b/src/awkward/operations/str/ak_is_upper.py index a8a301a65f..b37aa1c843 100644 --- a/src/awkward/operations/str/ak_is_upper.py +++ b/src/awkward/operations/str/ak_is_upper.py @@ -18,12 +18,14 @@ def is_upper(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with True if the string is non-empty and consists only of uppercase Unicode characters, False otherwise. + Replaces any string-valued data with True if the string is non-empty and + consists only of uppercase Unicode characters, False otherwise. - Replaces any bytestring-valued data with True if the string is non-empty and consists only of uppercase ASCII characters, False otherwise. + Replaces any bytestring-valued data with True if the string is non-empty + and consists only of uppercase ASCII characters, False otherwise. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_is_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_is_upper.html) diff --git a/src/awkward/operations/str/ak_join.py b/src/awkward/operations/str/ak_join.py index 8504e8ccb3..062067b3db 100644 --- a/src/awkward/operations/str/ak_join.py +++ b/src/awkward/operations/str/ak_join.py @@ -13,17 +13,20 @@ def join(array, separator, *, highlevel=True, behavior=None): """ Args: array: Array-like data (anything #ak.to_layout recognizes). 
- separator (str, bytes, or array of them to broadcast): separator to insert - between strings. If array-like, `separator` is broadcast against `array` - which permits a unique separator for each list of strings. + separator (str, bytes, or array of them to broadcast): separator to + insert between strings. If array-like, `separator` is broadcast + against `array`. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Concatenate the strings in `array`. The separator is inserted between each string. + Concatenate the strings in `array`. The `separator` is inserted between + each string. If array-like, `separator` is broadcast against `array` which + permits a unique separator for each list of strings in `array`. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.binary_join](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_join.html). diff --git a/src/awkward/operations/str/ak_join_element_wise.py b/src/awkward/operations/str/ak_join_element_wise.py index f78a354b4f..2026bb8ae0 100644 --- a/src/awkward/operations/str/ak_join_element_wise.py +++ b/src/awkward/operations/str/ak_join_element_wise.py @@ -18,9 +18,11 @@ def join_element_wise(*arrays, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Broadcasts and concatenates all but the last array of strings in `arrays`; the last is used as a separator. + Broadcasts and concatenates all but the last array of strings in `arrays`; + the last is used as a separator. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. 
+ Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.binary_join_element_wise](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_join_element_wise.html). diff --git a/src/awkward/operations/str/ak_length.py b/src/awkward/operations/str/ak_length.py index 3471d61c12..700dbe534c 100644 --- a/src/awkward/operations/str/ak_length.py +++ b/src/awkward/operations/str/ak_length.py @@ -18,12 +18,13 @@ def length(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with its length in Unicode characters (not its length in bytes). + Replaces any string-valued data with its length in Unicode characters + (not its length in bytes). Replaces any bytestring-valued data with its length of bytes. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_length](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_length.html) diff --git a/src/awkward/operations/str/ak_lower.py b/src/awkward/operations/str/ak_lower.py index 098aa24423..ade17e10ac 100644 --- a/src/awkward/operations/str/ak_lower.py +++ b/src/awkward/operations/str/ak_lower.py @@ -18,12 +18,13 @@ def lower(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with a lowercase version (correctly transforming Unicode characters). + Replaces any string-valued data with a lowercase version (correctly + transforming Unicode characters). Replaces any bytestring-valued data with a lowercase version (transforming ASCII characters only). 
- Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_lower](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_lower.html) diff --git a/src/awkward/operations/str/ak_lpad.py b/src/awkward/operations/str/ak_lpad.py index 2398463eab..431557d086 100644 --- a/src/awkward/operations/str/ak_lpad.py +++ b/src/awkward/operations/str/ak_lpad.py @@ -15,20 +15,25 @@ def lpad(array, width, padding=" ", *, highlevel=True, behavior=None): Args: array: Array-like data (anything #ak.to_layout recognizes). width (int): Desired string length. - padding (str or bytes): What to pad the string with. Should be one codepoint or byte. + padding (str or bytes): What to pad the string with. Should be one + codepoint or byte. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string or bytestring-valued data with right-aligned strings/bytestrings of a given `width`, padding the left side with the given `padding` codepoint or byte. + Replaces any string or bytestring-valued data with right-aligned + strings/bytestrings of a given `width`, padding the left side with the + given `padding` codepoint or byte. - If the data are strings, `width` is measured in codepoints and `padding` must be one codepoint. + If the data are strings, `width` is measured in codepoints and `padding` + must be one codepoint. - If the data are bytestrings, `width` is measured in bytes and `padding` must be one byte. + If the data are bytestrings, `width` is measured in bytes and `padding` + must be one byte. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. 
+ Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_lpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_lpad.html) diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/ak_ltrim.py index f5f4dca355..5274ed5ec7 100644 --- a/src/awkward/operations/str/ak_ltrim.py +++ b/src/awkward/operations/str/ak_ltrim.py @@ -14,20 +14,24 @@ def ltrim(array, characters, *, highlevel=True, behavior=None): """ Args: array: Array-like data (anything #ak.to_layout recognizes). - characters (str or bytes): Individual characters to be trimmed from the string. + characters (str or bytes): Individual characters to be trimmed + from the string. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Removes any leading characters of `characters` from any string or bytestring-valued data. + Removes any leading characters of `characters` from any string or + bytestring-valued data. - If the data are strings, `characters` are interpreted as unordered, individual codepoints. + If the data are strings, `characters` are interpreted as unordered, + individual codepoints. - If the data are bytestrings, `characters` are interpreted as unordered, individual bytes. + If the data are bytestrings, `characters` are interpreted as unordered, + individual bytes. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. 
Requires the pyarrow library and calls [pyarrow.compute.utf8_ltrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_ltrim.html) diff --git a/src/awkward/operations/str/ak_ltrim_whitespace.py b/src/awkward/operations/str/ak_ltrim_whitespace.py index 73e4624ced..ca7f917e4e 100644 --- a/src/awkward/operations/str/ak_ltrim_whitespace.py +++ b/src/awkward/operations/str/ak_ltrim_whitespace.py @@ -21,8 +21,8 @@ def ltrim_whitespace(array, *, highlevel=True, behavior=None): Removes any leading whitespace from any string or bytestring-valued data. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_ltrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_ltrim_whitespace.html) diff --git a/src/awkward/operations/str/ak_match_like.py b/src/awkward/operations/str/ak_match_like.py index 95db65ba7e..6cc83443bb 100644 --- a/src/awkward/operations/str/ak_match_like.py +++ b/src/awkward/operations/str/ak_match_like.py @@ -14,18 +14,26 @@ def match_like(array, pattern, *, ignore_case=False, highlevel=True, behavior=No """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str, or bytes): Substring pattern to look for inside the given array. - ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + pattern (str or bytes): SQL-style LIKE pattern to match against + strings in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. 
- For each string in the array, determine whether it matches the given SQL-style LIKE pattern. - '%' matches any number of characters, '_' matches exactly one character, and any other character matches itself. - To match a literal '%', '_', or "'", the character must be preceded with a backslash. + For each string in the array, determine whether it matches the given + SQL-style LIKE pattern, which obeys the following rules: - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + - '%' matches any number of characters. + - '_' matches exactly one character. + - Any other character matches itself. + - To match a literal '%', '_', or "'", the character must be preceded + with a backslash. + + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.match_like](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_like.html). diff --git a/src/awkward/operations/str/ak_match_substring.py b/src/awkward/operations/str/ak_match_substring.py index 3bf474c050..0b5ac3421e 100644 --- a/src/awkward/operations/str/ak_match_substring.py +++ b/src/awkward/operations/str/ak_match_substring.py @@ -16,16 +16,19 @@ def match_substring( """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str, or bytes): Substring pattern to look for inside the given array. - ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + pattern (str or bytes): Substring pattern to look for inside `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. 
- For each string in the array, determine whether it contains the given literal pattern. + For each string in the array, determine whether it contains the given + literal `pattern`. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.match_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_substring.html). diff --git a/src/awkward/operations/str/ak_match_substring_regex.py b/src/awkward/operations/str/ak_match_substring_regex.py index 3a0b65d11f..f909936a42 100644 --- a/src/awkward/operations/str/ak_match_substring_regex.py +++ b/src/awkward/operations/str/ak_match_substring_regex.py @@ -16,16 +16,19 @@ def match_substring_regex( """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str, or bytes): Substring pattern to look for inside the given array. - ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + pattern (str or bytes): Regular expression to search for inside `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - For each string in the array, determine whether it contains the given regular expression pattern. + For each string in the array, determine whether any substring matches the + given regular expression `pattern`. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data.
Requires the pyarrow library and calls [pyarrow.compute.match_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_substring_regex.html). diff --git a/src/awkward/operations/str/ak_repeat.py b/src/awkward/operations/str/ak_repeat.py index c2e4704dad..3d0edaa755 100644 --- a/src/awkward/operations/str/ak_repeat.py +++ b/src/awkward/operations/str/ak_repeat.py @@ -24,10 +24,12 @@ def repeat(array, num_repeats, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued or bytestring-valued data with the same value repeated `num_repeats` times, which can be a scalar integer or a (broadcasted) array of integers. + Replaces any string-valued or bytestring-valued data with the same value + repeated `num_repeats` times, which can be a scalar integer or a + (broadcasted) array of integers. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.binary_repeat](https://arrow.apache.org/docs/python/generated/pyarrow.compute.binary_repeat.html) diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/ak_replace_slice.py index 573359e140..fc0668daf9 100644 --- a/src/awkward/operations/str/ak_replace_slice.py +++ b/src/awkward/operations/str/ak_replace_slice.py @@ -22,14 +22,18 @@ def replace_slice(array, start, stop, replacement, *, highlevel=True, behavior=N behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces slices of any string or bytestring-valued data with `replacement` between `start` and `stop` indexes; `start` is inclusive and `stop` is exclusive and both are 0-indexed. 
+ Replaces slices of any string or bytestring-valued data with `replacement` + between `start` and `stop` indexes; `start` is inclusive and `stop` is + exclusive and both are 0-indexed. - For strings, `start` and `stop` are measured in Unicode characters; for bytestrings, `start` and `stop` are measured in bytes. + For strings, `start` and `stop` are measured in Unicode characters; for + bytestrings, `start` and `stop` are measured in bytes. - The `start`, `stop`, and `replacement` are scalars; they cannot be different for each string/bytestring in the sample. + The `start`, `stop`, and `replacement` are scalars; they cannot be + different for each string/bytestring in the sample. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_replace_slice](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_replace_slice.html) diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py index 595f606787..9214c94199 100644 --- a/src/awkward/operations/str/ak_replace_substring.py +++ b/src/awkward/operations/str/ak_replace_substring.py @@ -26,12 +26,14 @@ def replace_substring( behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces non-overlapping subsequences of any string or bytestring-valued data that match a literal `pattern` with `replacement`. + Replaces non-overlapping subsequences of any string or bytestring-valued + data that match a literal `pattern` with `replacement`. - The `pattern` and `replacement` are scalars; they cannot be different for each string/bytestring in the sample. + The `pattern` and `replacement` are scalars; they cannot be different for + each string/bytestring in the sample. 
- Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.replace_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring.html) diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py index e20f1e662c..ae2d0ff043 100644 --- a/src/awkward/operations/str/ak_replace_substring_regex.py +++ b/src/awkward/operations/str/ak_replace_substring_regex.py @@ -26,12 +26,14 @@ def replace_substring_regex( behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces non-overlapping subsequences of any string or bytestring-valued data that match a regular expression `pattern` with `replacement`. + Replaces non-overlapping subsequences of any string or bytestring-valued + data that match a regular expression `pattern` with `replacement`. - The `pattern` and `replacement` are scalars; they cannot be different for each string/bytestring in the sample. + The `pattern` and `replacement` are scalars; they cannot be different + for each string/bytestring in the sample. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. 
Requires the pyarrow library and calls [pyarrow.compute.replace_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring_regex.html) diff --git a/src/awkward/operations/str/ak_reverse.py b/src/awkward/operations/str/ak_reverse.py index bcc249e7e4..bd5e6c79f2 100644 --- a/src/awkward/operations/str/ak_reverse.py +++ b/src/awkward/operations/str/ak_reverse.py @@ -18,12 +18,15 @@ def reverse(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Reverses the order of Unicode characters in any string-valued data. (This function operates on Unicode codepoints, not grapheme clusters. Hence, it will not correctly reverse grapheme clusters composed of multiple codepoints.) + Reverses the order of Unicode characters in any string-valued data. + (This function operates on Unicode codepoints, not grapheme clusters. + Hence, it will not correctly reverse grapheme clusters composed of + multiple codepoints.) Reverses the order of bytes in any bytestring-valued data. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_reverse](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_reverse.html) diff --git a/src/awkward/operations/str/ak_rpad.py b/src/awkward/operations/str/ak_rpad.py index e46e43a5d8..99fe323d60 100644 --- a/src/awkward/operations/str/ak_rpad.py +++ b/src/awkward/operations/str/ak_rpad.py @@ -15,20 +15,25 @@ def rpad(array, width, padding=" ", *, highlevel=True, behavior=None): Args: array: Array-like data (anything #ak.to_layout recognizes). width (int): Desired string length. - padding (str or bytes): What to pad the string with. Should be one codepoint or byte. 
+ padding (str or bytes): What to pad the string with. Should be one + codepoint or byte. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string or bytestring-valued data with left-aligned strings/bytestrings of a given `width`, padding the right side with the given `padding` codepoint or byte. + Replaces any string or bytestring-valued data with left-aligned + strings/bytestrings of a given `width`, padding the right side with the + given `padding` codepoint or byte. - If the data are strings, `width` is measured in codepoints and `padding` must be one codepoint. + If the data are strings, `width` is measured in codepoints and `padding` + must be one codepoint. - If the data are bytestrings, `width` is measured in bytes and `padding` must be one byte. + If the data are bytestrings, `width` is measured in bytes and `padding` + must be one byte. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_rpad](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_rpad.html) diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/ak_rtrim.py index 88f562a1a5..db96fe2061 100644 --- a/src/awkward/operations/str/ak_rtrim.py +++ b/src/awkward/operations/str/ak_rtrim.py @@ -20,14 +20,17 @@ def rtrim(array, characters, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Removes any trailing characters of `characters` from any string or bytestring-valued data. + Removes any trailing characters of `characters` from any string or + bytestring-valued data. 
- If the data are strings, `characters` are interpreted as unordered, individual codepoints. + If the data are strings, `characters` are interpreted as unordered, + individual codepoints. - If the data are bytestrings, `characters` are interpreted as unordered, individual bytes. + If the data are bytestrings, `characters` are interpreted as unordered, + individual bytes. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_rtrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_rtrim.html) diff --git a/src/awkward/operations/str/ak_rtrim_whitespace.py b/src/awkward/operations/str/ak_rtrim_whitespace.py index e438a98363..1216d3d5ce 100644 --- a/src/awkward/operations/str/ak_rtrim_whitespace.py +++ b/src/awkward/operations/str/ak_rtrim_whitespace.py @@ -21,8 +21,8 @@ def rtrim_whitespace(array, *, highlevel=True, behavior=None): Removes any trailing whitespace from any string or bytestring-valued data. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_rtrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_rtrim_whitespace.html) diff --git a/src/awkward/operations/str/ak_slice.py b/src/awkward/operations/str/ak_slice.py index 06b67c59e7..89385c1a03 100644 --- a/src/awkward/operations/str/ak_slice.py +++ b/src/awkward/operations/str/ak_slice.py @@ -23,14 +23,18 @@ def slice(array, start, stop=None, step=1, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. 
- Replaces any string or bytestring-valued data with a slice between `start` and `stop` indexes; `start` is inclusive and `stop` is exclusive and both are 0-indexed. + Replaces any string or bytestring-valued data with a slice between `start` + and `stop` indexes; `start` is inclusive and `stop` is exclusive and both + are 0-indexed. - For strings, `start` and `stop` are measured in Unicode characters; for bytestrings, `start` and `stop` are measured in bytes. + For strings, `start` and `stop` are measured in Unicode characters; for + bytestrings, `start` and `stop` are measured in bytes. - The `start`, `stop`, and `replacement` are scalars; they cannot be different for each string/bytestring in the sample. + The `start`, `stop`, and `step` are scalars; they cannot be + different for each string/bytestring in the sample. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_slice_codeunits](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_slice_codeunits.html) diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/ak_split_pattern.py index d2ef682562..604532edc0 100644 --- a/src/awkward/operations/str/ak_split_pattern.py +++ b/src/awkward/operations/str/ak_split_pattern.py @@ -17,17 +17,21 @@ def split_pattern( Args: array: Array-like data (anything #ak.to_layout recognizes). pattern (str or bytes): Pattern of characters/bytes to split on. - max_splits (None or int): Maximum number of splits for each input value. If None, unlimited. - reverse (bool): If True, start splitting from the end of each input value; otherwise, start splitting - from the beginning of each value. This flag only has an effect if `max_splits` is not None.
+ max_splits (None or int): Maximum number of splits for each input + value. If None, unlimited. + reverse (bool): If True, start splitting from the end of each input + value; otherwise, start splitting from the beginning of each + value. This flag only has an effect if `max_splits` is not None. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Splits any string or bytestring-valued data into a list of substrings according to the given separator. + Splits any string or bytestring-valued data into a list of substrings + according to the given separator. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). diff --git a/src/awkward/operations/str/ak_split_pattern_regex.py b/src/awkward/operations/str/ak_split_pattern_regex.py index 373a1c0db6..69a88e9b1e 100644 --- a/src/awkward/operations/str/ak_split_pattern_regex.py +++ b/src/awkward/operations/str/ak_split_pattern_regex.py @@ -16,18 +16,23 @@ def split_pattern_regex( """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str or bytes): Regular expression of characters/bytes to split on. - max_splits (None or int): Maximum number of splits for each input value. If None, unlimited. - reverse (bool): If True, start splitting from the end of each input value; otherwise, start splitting - from the beginning of each value. This flag only has an effect if `max_splits` is not None. + pattern (str or bytes): Regular expression of characters/bytes to + split on. + max_splits (None or int): Maximum number of splits for each input + value. 
If None, unlimited. + reverse (bool): If True, start splitting from the end of each input + value; otherwise, start splitting from the beginning of each + value. This flag only has an effect if `max_splits` is not None. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Splits any string or bytestring-valued data into a list of substrings according to the given regular expression. + Splits any string or bytestring-valued data into a list of substrings + according to the given regular expression. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). diff --git a/src/awkward/operations/str/ak_split_whitespace.py b/src/awkward/operations/str/ak_split_whitespace.py index f534de5bd5..bf35a2c1b9 100644 --- a/src/awkward/operations/str/ak_split_whitespace.py +++ b/src/awkward/operations/str/ak_split_whitespace.py @@ -16,24 +16,29 @@ def split_whitespace( """ Args: array: Array-like data (anything #ak.to_layout recognizes). - max_splits (None or int): Maximum number of splits for each input value. If None, unlimited. - reverse (bool): If True, start splitting from the end of each input value; otherwise, start splitting - from the beginning of each value. This flag only has an effect if `max_splits` is not None. + max_splits (None or int): Maximum number of splits for each input + value. If None, unlimited. + reverse (bool): If True, start splitting from the end of each input + value; otherwise, start splitting from the beginning of each + value. This flag only has an effect if `max_splits` is not None. 
highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Splits any string or bytestring-valued data into a list of substrings according to any non-zero length sequence of + Splits any string or bytestring-valued data into a list of substrings + according to any non-zero length sequence of whitespace characters. - For strings, a split is performed for every sequence of Unicode whitespace characters; for bytestrings, splitting - is performed for sequences of ascii whitespace characters. + For strings, a split is performed for every sequence of Unicode whitespace + characters; for bytestrings, splitting is performed for sequences of ASCII + whitespace characters. - The `max_splits`, and `reverse` arguments are scalars; they cannot be different for each string/bytestring in the - sample. + The `max_splits` and `reverse` arguments are scalars; they cannot be + different for each string/bytestring in the sample. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_split_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_split_whitespace.html) diff --git a/src/awkward/operations/str/ak_starts_with.py b/src/awkward/operations/str/ak_starts_with.py index d055c93355..e035c53920 100644 --- a/src/awkward/operations/str/ak_starts_with.py +++ b/src/awkward/operations/str/ak_starts_with.py @@ -14,16 +14,21 @@ def starts_with(array, pattern, *, ignore_case=False, highlevel=True, behavior=N """ Args: array: Array-like data (anything #ak.to_layout recognizes). - pattern (str, or bytes): Substring pattern to look for inside the given array.
- ignore_case (bool): If True, perform a case-insensitive match; otherwise, the match is case-sensitive. + pattern (str or bytes): Substring pattern to test against the start + of each string in `array`. + ignore_case (bool): If True, perform a case-insensitive match; + otherwise, the match is case-sensitive. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - For each string in the array, determine whether it starts with the given literal suffix. + Returns True for every string in `array` that starts with the given literal + prefix `pattern`. If `ignore_case` is True, the match is + case-insensitive; otherwise, it is case-sensitive. - Note: this function does not raise an error if the `array` does not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.starts_with](https://arrow.apache.org/docs/python/generated/pyarrow.compute.starts_with.html). diff --git a/src/awkward/operations/str/ak_swapcase.py b/src/awkward/operations/str/ak_swapcase.py index e5eb0ee52e..1629c65fdc 100644 --- a/src/awkward/operations/str/ak_swapcase.py +++ b/src/awkward/operations/str/ak_swapcase.py @@ -18,12 +18,14 @@ def swapcase(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with uppercase characters transformed to lowercase and vice-versa (correctly transforming Unicode characters). + Replaces any string-valued data with uppercase characters transformed to + lowercase and vice-versa (correctly transforming Unicode characters). - Replaces any bytestring-valued data with uppercase characters transformed to lowercase and vice-versa (transforming ASCII characters only).
+ Replaces any bytestring-valued data with uppercase characters transformed + to lowercase and vice-versa (transforming ASCII characters only). - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_swapcase](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_swapcase.html) diff --git a/src/awkward/operations/str/ak_title.py b/src/awkward/operations/str/ak_title.py index aac266547b..8c7d0361b4 100644 --- a/src/awkward/operations/str/ak_title.py +++ b/src/awkward/operations/str/ak_title.py @@ -18,12 +18,16 @@ def title(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with a titlecase version (correctly transforming Unicode characters). Each word in the output will start with an uppercase character and its remaining characters will be lowercase. + Replaces any string-valued data with a titlecase version (correctly + transforming Unicode characters). Each word in the output will start with + an uppercase character and its remaining characters will be lowercase. - Replaces any bytestring-valued data with a titlecase version (transforming ASCII characters only). Each word in the output will start with an uppercase character and its remaining characters will be lowercase. + Replaces any bytestring-valued data with a titlecase version (transforming + ASCII characters only). Each word in the output will start with an + uppercase character and its remaining characters will be lowercase. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. 
Requires the pyarrow library and calls [pyarrow.compute.utf8_title](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_title.html) diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py index 1796fbaeb5..ecf7d14f90 100644 --- a/src/awkward/operations/str/ak_trim.py +++ b/src/awkward/operations/str/ak_trim.py @@ -14,20 +14,24 @@ def trim(array, characters, *, highlevel=True, behavior=None): """ Args: array: Array-like data (anything #ak.to_layout recognizes). - characters (str or bytes): Individual characters to be trimmed from the string. + characters (str or bytes): Individual characters to be trimmed from + the string. highlevel (bool): If True, return an #ak.Array; otherwise, return a low-level #ak.contents.Content subclass. behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Removes any leading or trailing characters of `characters` from any string or bytestring-valued data. + Removes any leading or trailing characters of `characters` from any string + or bytestring-valued data. - If the data are strings, `characters` are interpreted as unordered, individual codepoints. + If the data are strings, `characters` are interpreted as unordered, + individual codepoints. - If the data are bytestrings, `characters` are interpreted as unordered, individual bytes. + If the data are bytestrings, `characters` are interpreted as unordered, + individual bytes. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. 
Requires the pyarrow library and calls [pyarrow.compute.utf8_trim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_trim.html) diff --git a/src/awkward/operations/str/ak_trim_whitespace.py b/src/awkward/operations/str/ak_trim_whitespace.py index 6568249969..de34c5fa87 100644 --- a/src/awkward/operations/str/ak_trim_whitespace.py +++ b/src/awkward/operations/str/ak_trim_whitespace.py @@ -19,10 +19,11 @@ def trim_whitespace(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Removes any leading or trailing whitespace from any string or bytestring-valued data. + Removes any leading or trailing whitespace from any string or + bytestring-valued data. - Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_trim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_trim_whitespace.html) diff --git a/src/awkward/operations/str/ak_upper.py b/src/awkward/operations/str/ak_upper.py index 2391b439af..4f0a8bf920 100644 --- a/src/awkward/operations/str/ak_upper.py +++ b/src/awkward/operations/str/ak_upper.py @@ -18,12 +18,14 @@ def upper(array, *, highlevel=True, behavior=None): behavior (None or dict): Custom #ak.behavior for the output array, if high-level. - Replaces any string-valued data with a uppercase version (correctly transforming Unicode characters). + Replaces any string-valued data with an uppercase version (correctly + transforming Unicode characters). - Replaces any bytestring-valued data with a uppercase version (transforming ASCII characters only). + Replaces any bytestring-valued data with an uppercase version (transforming + ASCII characters only).
- Note: this function does not raise an error if the `array` does - not contain any string or bytestring data. + Note: this function does not raise an error if the `array` does not + contain any string or bytestring data. Requires the pyarrow library and calls [pyarrow.compute.utf8_upper](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_upper.html) From 447cde7d2ba2e7cb01dfe9e54e5e858fc706b37b Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 8 Aug 2023 11:02:07 +0100 Subject: [PATCH 68/73] docs: add see also --- src/awkward/operations/str/ak_count_substring.py | 2 ++ src/awkward/operations/str/ak_count_substring_regex.py | 2 ++ src/awkward/operations/str/ak_find_substring.py | 2 ++ src/awkward/operations/str/ak_find_substring_regex.py | 2 ++ src/awkward/operations/str/ak_ltrim.py | 2 ++ src/awkward/operations/str/ak_ltrim_whitespace.py | 2 ++ src/awkward/operations/str/ak_match_substring.py | 2 ++ src/awkward/operations/str/ak_match_substring_regex.py | 2 ++ src/awkward/operations/str/ak_replace_substring.py | 2 ++ src/awkward/operations/str/ak_replace_substring_regex.py | 2 ++ src/awkward/operations/str/ak_rtrim.py | 2 ++ src/awkward/operations/str/ak_rtrim_whitespace.py | 2 ++ src/awkward/operations/str/ak_split_pattern.py | 2 ++ src/awkward/operations/str/ak_split_pattern_regex.py | 2 ++ src/awkward/operations/str/ak_split_whitespace.py | 2 ++ src/awkward/operations/str/ak_trim.py | 2 ++ src/awkward/operations/str/ak_trim_whitespace.py | 2 ++ 17 files changed, 34 insertions(+) diff --git a/src/awkward/operations/str/ak_count_substring.py b/src/awkward/operations/str/ak_count_substring.py index 36cd8febca..8bbc44bcd7 100644 --- a/src/awkward/operations/str/ak_count_substring.py +++ b/src/awkward/operations/str/ak_count_substring.py @@ -34,6 +34,8 @@ def count_substring( Requires the pyarrow library and calls [pyarrow.compute.count_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring.html). 
+ + See also: #ak.str.count_substring_regex. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_count_substring_regex.py b/src/awkward/operations/str/ak_count_substring_regex.py index 113d8acb9b..4cd7f3fe8a 100644 --- a/src/awkward/operations/str/ak_count_substring_regex.py +++ b/src/awkward/operations/str/ak_count_substring_regex.py @@ -34,6 +34,8 @@ def count_substring_regex( Requires the pyarrow library and calls [pyarrow.compute.count_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.count_substring_regex.html). + + See also: #ak.str.count_substring. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_find_substring.py b/src/awkward/operations/str/ak_find_substring.py index 28d5d86aec..f936bdac86 100644 --- a/src/awkward/operations/str/ak_find_substring.py +++ b/src/awkward/operations/str/ak_find_substring.py @@ -32,6 +32,8 @@ def find_substring(array, pattern, *, ignore_case=False, highlevel=True, behavio Requires the pyarrow library and calls [pyarrow.compute.find_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring.html). + + See also: #ak.str.find_substring_regex. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_find_substring_regex.py b/src/awkward/operations/str/ak_find_substring_regex.py index 90ab671ce5..e5059f846a 100644 --- a/src/awkward/operations/str/ak_find_substring_regex.py +++ b/src/awkward/operations/str/ak_find_substring_regex.py @@ -34,6 +34,8 @@ def find_substring_regex( Requires the pyarrow library and calls [pyarrow.compute.find_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.find_substring.html). + + See also: #ak.str.find_substring.
""" # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/ak_ltrim.py index 5274ed5ec7..a6904c1d11 100644 --- a/src/awkward/operations/str/ak_ltrim.py +++ b/src/awkward/operations/str/ak_ltrim.py @@ -38,6 +38,8 @@ def ltrim(array, characters, *, highlevel=True, behavior=None): or [pyarrow.compute.ascii_ltrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_ltrim.html) on strings and bytestrings, respectively. + + See also: #ak.str.ltrim_whitespace. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_ltrim_whitespace.py b/src/awkward/operations/str/ak_ltrim_whitespace.py index ca7f917e4e..060af89288 100644 --- a/src/awkward/operations/str/ak_ltrim_whitespace.py +++ b/src/awkward/operations/str/ak_ltrim_whitespace.py @@ -29,6 +29,8 @@ def ltrim_whitespace(array, *, highlevel=True, behavior=None): or [pyarrow.compute.ascii_ltrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_ltrim_whitespace.html) on strings and bytestrings, respectively. + + See also: #ak.str.ltrim. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_match_substring.py b/src/awkward/operations/str/ak_match_substring.py index 0b5ac3421e..29778364bb 100644 --- a/src/awkward/operations/str/ak_match_substring.py +++ b/src/awkward/operations/str/ak_match_substring.py @@ -32,6 +32,8 @@ def match_substring( Requires the pyarrow library and calls [pyarrow.compute.match_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_substring.html). + + See also: #ak.str.match_substring_regex.
""" # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_match_substring_regex.py b/src/awkward/operations/str/ak_match_substring_regex.py index f909936a42..85bbe38eb5 100644 --- a/src/awkward/operations/str/ak_match_substring_regex.py +++ b/src/awkward/operations/str/ak_match_substring_regex.py @@ -32,6 +32,8 @@ def match_substring_regex( Requires the pyarrow library and calls [pyarrow.compute.match_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.match_substring_regex.html). + + See also: #ak.str.match_substring. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/ak_replace_substring.py index 9214c94199..38f00cbe42 100644 --- a/src/awkward/operations/str/ak_replace_substring.py +++ b/src/awkward/operations/str/ak_replace_substring.py @@ -40,6 +40,8 @@ def replace_substring( or [pyarrow.compute.replace_substring](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring.html) on strings and bytestrings, respectively. + + See also: #ak.str.replace_substring_regex. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/ak_replace_substring_regex.py index ae2d0ff043..832bf83552 100644 --- a/src/awkward/operations/str/ak_replace_substring_regex.py +++ b/src/awkward/operations/str/ak_replace_substring_regex.py @@ -40,6 +40,8 @@ def replace_substring_regex( or [pyarrow.compute.replace_substring_regex](https://arrow.apache.org/docs/python/generated/pyarrow.compute.replace_substring_regex.html) on strings and bytestrings, respectively. + + See also: #ak.str.replace_substring.
""" # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/ak_rtrim.py index db96fe2061..816605de40 100644 --- a/src/awkward/operations/str/ak_rtrim.py +++ b/src/awkward/operations/str/ak_rtrim.py @@ -37,6 +37,8 @@ def rtrim(array, characters, *, highlevel=True, behavior=None): or [pyarrow.compute.ascii_rtrim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_rtrim.html) on strings and bytestrings, respectively. + + See also: #ak.str.rtrim_whitespace. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_rtrim_whitespace.py b/src/awkward/operations/str/ak_rtrim_whitespace.py index 1216d3d5ce..19c18677b0 100644 --- a/src/awkward/operations/str/ak_rtrim_whitespace.py +++ b/src/awkward/operations/str/ak_rtrim_whitespace.py @@ -29,6 +29,8 @@ def rtrim_whitespace(array, *, highlevel=True, behavior=None): or [pyarrow.compute.ascii_rtrim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_rtrim_whitespace.html) on strings and bytestrings, respectively. + + See also: #ak.str.rtrim. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/ak_split_pattern.py index 604532edc0..e311ade93d 100644 --- a/src/awkward/operations/str/ak_split_pattern.py +++ b/src/awkward/operations/str/ak_split_pattern.py @@ -35,6 +35,8 @@ def split_pattern( Requires the pyarrow library and calls [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). + + See also: #ak.str.split_whitespace, #ak.str.split_pattern_regex. 
""" # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_split_pattern_regex.py b/src/awkward/operations/str/ak_split_pattern_regex.py index 69a88e9b1e..dd71e8b9b5 100644 --- a/src/awkward/operations/str/ak_split_pattern_regex.py +++ b/src/awkward/operations/str/ak_split_pattern_regex.py @@ -36,6 +36,8 @@ def split_pattern_regex( Requires the pyarrow library and calls [pyarrow.compute.split_pattern](https://arrow.apache.org/docs/python/generated/pyarrow.compute.split_pattern.html). + + See also: #ak.str.split_whitespace, #ak.str.split_pattern. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_split_whitespace.py b/src/awkward/operations/str/ak_split_whitespace.py index bf35a2c1b9..5bfb9e77a8 100644 --- a/src/awkward/operations/str/ak_split_whitespace.py +++ b/src/awkward/operations/str/ak_split_whitespace.py @@ -44,6 +44,8 @@ def split_whitespace( [pyarrow.compute.utf8_split_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.utf8_split_whitespace.html) or [pyarrow.compute.ascii_split_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_split_whitespace.html) on strings and bytestrings, respectively. + + See also: #ak.str.split_pattern, #ak.str.split_pattern_regex. """ # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/ak_trim.py index ecf7d14f90..aa5352cd5e 100644 --- a/src/awkward/operations/str/ak_trim.py +++ b/src/awkward/operations/str/ak_trim.py @@ -38,6 +38,8 @@ def trim(array, characters, *, highlevel=True, behavior=None): or [pyarrow.compute.ascii_trim](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_trim.html) on strings and bytestrings, respectively. + + See also: #ak.str.trim_whitespace. 
""" # Dispatch yield (array,) diff --git a/src/awkward/operations/str/ak_trim_whitespace.py b/src/awkward/operations/str/ak_trim_whitespace.py index de34c5fa87..200118fdb7 100644 --- a/src/awkward/operations/str/ak_trim_whitespace.py +++ b/src/awkward/operations/str/ak_trim_whitespace.py @@ -30,6 +30,8 @@ def trim_whitespace(array, *, highlevel=True, behavior=None): or [pyarrow.compute.ascii_trim_whitespace](https://arrow.apache.org/docs/python/generated/pyarrow.compute.ascii_trim_whitespace.html) on strings and bytestrings, respectively. + + See also: #ak.str.trim. """ # Dispatch yield (array,) From cbba554e2a534c68389dac5de35d65223f0c720c Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 8 Aug 2023 11:02:17 +0100 Subject: [PATCH 69/73] docs: include `ak.str` in toctree --- docs/prepare_docstrings.py | 1 + docs/reference/toctree.txt | 73 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/docs/prepare_docstrings.py b/docs/prepare_docstrings.py index 963a3cb408..fa1f6c81b5 100644 --- a/docs/prepare_docstrings.py +++ b/docs/prepare_docstrings.py @@ -303,6 +303,7 @@ def dofunction(link, linelink, shortname, name, astfcn): .replace(".behaviors.string", "") ) shortname = re.sub(r"\.operations\.ak_\w+", "", shortname) + shortname = re.sub(r"\.operations\.str\.ak_\w+", ".str", shortname) shortname = re.sub(r"\.(contents|types|forms)\.\w+", r".\1", shortname) if ( diff --git a/docs/reference/toctree.txt b/docs/reference/toctree.txt index f442d9cb2c..2304bba695 100644 --- a/docs/reference/toctree.txt +++ b/docs/reference/toctree.txt @@ -145,6 +145,79 @@ generated/ak.argcartesian generated/ak.argcombinations +.. 
toctree:: + :caption: String predicates + + generated/ak.str.is_alnum + generated/ak.str.is_alpha + generated/ak.str.is_ascii + generated/ak.str.is_decimal + generated/ak.str.is_digit + generated/ak.str.is_lower + generated/ak.str.is_numeric + generated/ak.str.is_printable + generated/ak.str.is_space + generated/ak.str.is_title + generated/ak.str.is_upper + +.. toctree:: + :caption: String transforms + + generated/ak.str.capitalize + generated/ak.str.length + generated/ak.str.lower + generated/ak.str.repeat + generated/ak.str.replace_slice + generated/ak.str.replace_substring + generated/ak.str.replace_substring_regex + generated/ak.str.reverse + generated/ak.str.swapcase + generated/ak.str.title + generated/ak.str.upper + +.. toctree:: + :caption: String padding and trimming + + generated/ak.str.center + generated/ak.str.lpad + generated/ak.str.rpad + generated/ak.str.ltrim + generated/ak.str.ltrim_whitespace + generated/ak.str.rtrim + generated/ak.str.rtrim_whitespace + generated/ak.str.trim + generated/ak.str.trim_whitespace + +.. toctree:: + :caption: String splitting and joining + + generated/ak.str.split_pattern + generated/ak.str.split_pattern_regex + generated/ak.str.split_whitespace + generated/ak.str.join + generated/ak.str.join_element_wise + +.. toctree:: + :caption: String slicing and decomposition + + generated/ak.str.slice + generated/ak.str.extract_regex + +.. toctree:: + :caption: String containment tests + + generated/ak.str.count_substring + generated/ak.str.count_substring_regex + generated/ak.str.ends_with + generated/ak.str.find_substring + generated/ak.str.find_substring_regex + generated/ak.str.index_in + generated/ak.str.is_in + generated/ak.str.match_like + generated/ak.str.match_substring + generated/ak.str.match_substring_regex + generated/ak.str.starts_with + .. 
toctree:: :caption: Value and type conversions From 6e39bf168c564b1d4e4cbbd30cad00ae803bfb63 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 09:28:33 +0100 Subject: [PATCH 70/73] chore: update pre-commit hooks (#2619) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> From 9fee3fc809948b07b1b0d16e90e8f317efabbc74 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 8 Aug 2023 12:05:40 +0100 Subject: [PATCH 71/73] refactor: cleanup error handling --- src/awkward/operations/str/ak_index_in.py | 4 ++-- src/awkward/operations/str/ak_is_in.py | 4 ++-- src/awkward/operations/str/ak_join.py | 14 +++----------- src/awkward/operations/str/ak_join_element_wise.py | 6 +++--- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/awkward/operations/str/ak_index_in.py b/src/awkward/operations/str/ak_index_in.py index 84f1a2af1d..3c71e0a281 100644 --- a/src/awkward/operations/str/ak_index_in.py +++ b/src/awkward/operations/str/ak_index_in.py @@ -58,12 +58,12 @@ def _impl(array, value_set, skip_nones, highlevel, behavior): value_set_layout = ak.to_layout(value_set, allow_record=False, allow_other=True) if not _is_maybe_optional_list_of_string(value_set_layout): - raise TypeError("`value_set` must be 1D array of (maybe missing) strings") + raise TypeError("`value_set` must be 1D array of (possibly missing) strings") behavior = behavior_of(array, value_set, behavior=behavior) def apply(layout, **kwargs): - if _is_maybe_optional_list_of_string(layout) and layout.purelist_depth == 1: + if _is_maybe_optional_list_of_string(layout): return ak.from_arrow( pc.index_in( ak.to_arrow(layout, extensionarray=False), diff --git a/src/awkward/operations/str/ak_is_in.py b/src/awkward/operations/str/ak_is_in.py index 3ac79b44ec..99223cc5e0 100644 --- a/src/awkward/operations/str/ak_is_in.py +++ b/src/awkward/operations/str/ak_is_in.py @@ -57,12 +57,12 @@ def 
_impl(array, value_set, skip_nones, highlevel, behavior): value_set_layout = ak.to_layout(value_set, allow_record=False, allow_other=True) if not _is_maybe_optional_list_of_string(value_set_layout): - raise TypeError("`value_set` must be 1D array of (maybe missing) strings") + raise TypeError("`value_set` must be 1D array of (possibly missing) strings") behavior = behavior_of(array, value_set, behavior=behavior) def apply(layout, **kwargs): - if _is_maybe_optional_list_of_string(layout) and layout.purelist_depth == 1: + if _is_maybe_optional_list_of_string(layout): return ak.from_arrow( pc.is_in( ak.to_arrow(layout, extensionarray=False), diff --git a/src/awkward/operations/str/ak_join.py b/src/awkward/operations/str/ak_join.py index 062067b3db..7f0f93e5fb 100644 --- a/src/awkward/operations/str/ak_join.py +++ b/src/awkward/operations/str/ak_join.py @@ -57,14 +57,9 @@ def _impl(array, separator, highlevel, behavior): import pyarrow.compute as pc def apply_unary(layout, **kwargs): - if not (layout.is_list and layout.purelist_depth == 2): + if not (layout.is_list and _is_maybe_optional_list_of_string(layout.content)): return - if not _is_maybe_optional_list_of_string(layout.content): - return - - # We have (maybe option/indexed type wrapping) strings - arrow_array = to_arrow( # Arrow needs an option type here layout.copy(content=ak.contents.UnmaskedArray.simplified(layout.content)), @@ -80,15 +75,12 @@ def apply_unary(layout, **kwargs): def apply_binary(layouts, **kwargs): layout, separator_layout = layouts - if not (layout.is_list and layout.purelist_depth == 2): - return - - if not _is_maybe_optional_list_of_string(layout.content): + if not (layout.is_list and _is_maybe_optional_list_of_string(layout.content)): return if not _is_maybe_optional_list_of_string(separator_layout): raise TypeError( - f"separator must be a list of strings, not {type(separator_layout)}" + f"`separator` must be a list of (possibly missing) strings, not {ak.type(separator_layout)}" ) # We 
have (maybe option/indexed type wrapping) strings diff --git a/src/awkward/operations/str/ak_join_element_wise.py b/src/awkward/operations/str/ak_join_element_wise.py index 2026bb8ae0..cde4eef163 100644 --- a/src/awkward/operations/str/ak_join_element_wise.py +++ b/src/awkward/operations/str/ak_join_element_wise.py @@ -48,12 +48,12 @@ def _impl(arrays, highlevel, behavior): import pyarrow.compute as pc - layouts = [ak.to_layout(x) for x in arrays] - behavior = behavior_of(*arrays, behavior=behavior) - if len(arrays) < 1: raise TypeError("at least one array is required") + layouts = [ak.to_layout(x) for x in arrays] + behavior = behavior_of(*arrays, behavior=behavior) + def action(layouts, **kwargs): if all( x.is_list and x.parameter("__array__") in ("string", "bytestring") From c5f5cb73fee92d16aa537953a2e25d2086934132 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Tue, 8 Aug 2023 10:04:41 -0500 Subject: [PATCH 72/73] Rename ak_*.py modules -> akstr_*.py. --- src/awkward/operations/str/__init__.py | 98 +++++++++---------- .../{ak_capitalize.py => akstr_capitalize.py} | 0 .../str/{ak_center.py => akstr_center.py} | 0 ..._substring.py => akstr_count_substring.py} | 0 ...egex.py => akstr_count_substring_regex.py} | 0 .../{ak_ends_with.py => akstr_ends_with.py} | 0 ...xtract_regex.py => akstr_extract_regex.py} | 0 ...d_substring.py => akstr_find_substring.py} | 0 ...regex.py => akstr_find_substring_regex.py} | 0 .../str/{ak_index_in.py => akstr_index_in.py} | 0 .../str/{ak_is_alnum.py => akstr_is_alnum.py} | 0 .../str/{ak_is_alpha.py => akstr_is_alpha.py} | 0 .../str/{ak_is_ascii.py => akstr_is_ascii.py} | 0 .../{ak_is_decimal.py => akstr_is_decimal.py} | 0 .../str/{ak_is_digit.py => akstr_is_digit.py} | 0 .../str/{ak_is_in.py => akstr_is_in.py} | 0 .../str/{ak_is_lower.py => akstr_is_lower.py} | 0 .../{ak_is_numeric.py => akstr_is_numeric.py} | 0 ..._is_printable.py => akstr_is_printable.py} | 0 .../str/{ak_is_space.py => akstr_is_space.py} | 0 
.../str/{ak_is_title.py => akstr_is_title.py} | 0 .../str/{ak_is_upper.py => akstr_is_upper.py} | 0 .../str/{ak_join.py => akstr_join.py} | 0 ...ent_wise.py => akstr_join_element_wise.py} | 0 .../str/{ak_length.py => akstr_length.py} | 0 .../str/{ak_lower.py => akstr_lower.py} | 0 .../str/{ak_lpad.py => akstr_lpad.py} | 0 .../str/{ak_ltrim.py => akstr_ltrim.py} | 0 ...hitespace.py => akstr_ltrim_whitespace.py} | 0 .../{ak_match_like.py => akstr_match_like.py} | 0 ..._substring.py => akstr_match_substring.py} | 0 ...egex.py => akstr_match_substring_regex.py} | 0 .../str/{ak_repeat.py => akstr_repeat.py} | 0 ...eplace_slice.py => akstr_replace_slice.py} | 0 ...ubstring.py => akstr_replace_substring.py} | 0 ...ex.py => akstr_replace_substring_regex.py} | 0 .../str/{ak_reverse.py => akstr_reverse.py} | 0 .../str/{ak_rpad.py => akstr_rpad.py} | 0 .../str/{ak_rtrim.py => akstr_rtrim.py} | 0 ...hitespace.py => akstr_rtrim_whitespace.py} | 0 .../str/{ak_slice.py => akstr_slice.py} | 0 ...plit_pattern.py => akstr_split_pattern.py} | 0 ..._regex.py => akstr_split_pattern_regex.py} | 0 ...hitespace.py => akstr_split_whitespace.py} | 0 ...ak_starts_with.py => akstr_starts_with.py} | 0 .../str/{ak_swapcase.py => akstr_swapcase.py} | 0 .../str/{ak_title.py => akstr_title.py} | 0 .../str/{ak_trim.py => akstr_trim.py} | 0 ...whitespace.py => akstr_trim_whitespace.py} | 0 .../str/{ak_upper.py => akstr_upper.py} | 0 50 files changed, 49 insertions(+), 49 deletions(-) rename src/awkward/operations/str/{ak_capitalize.py => akstr_capitalize.py} (100%) rename src/awkward/operations/str/{ak_center.py => akstr_center.py} (100%) rename src/awkward/operations/str/{ak_count_substring.py => akstr_count_substring.py} (100%) rename src/awkward/operations/str/{ak_count_substring_regex.py => akstr_count_substring_regex.py} (100%) rename src/awkward/operations/str/{ak_ends_with.py => akstr_ends_with.py} (100%) rename src/awkward/operations/str/{ak_extract_regex.py => akstr_extract_regex.py} (100%) 
rename src/awkward/operations/str/{ak_find_substring.py => akstr_find_substring.py} (100%) rename src/awkward/operations/str/{ak_find_substring_regex.py => akstr_find_substring_regex.py} (100%) rename src/awkward/operations/str/{ak_index_in.py => akstr_index_in.py} (100%) rename src/awkward/operations/str/{ak_is_alnum.py => akstr_is_alnum.py} (100%) rename src/awkward/operations/str/{ak_is_alpha.py => akstr_is_alpha.py} (100%) rename src/awkward/operations/str/{ak_is_ascii.py => akstr_is_ascii.py} (100%) rename src/awkward/operations/str/{ak_is_decimal.py => akstr_is_decimal.py} (100%) rename src/awkward/operations/str/{ak_is_digit.py => akstr_is_digit.py} (100%) rename src/awkward/operations/str/{ak_is_in.py => akstr_is_in.py} (100%) rename src/awkward/operations/str/{ak_is_lower.py => akstr_is_lower.py} (100%) rename src/awkward/operations/str/{ak_is_numeric.py => akstr_is_numeric.py} (100%) rename src/awkward/operations/str/{ak_is_printable.py => akstr_is_printable.py} (100%) rename src/awkward/operations/str/{ak_is_space.py => akstr_is_space.py} (100%) rename src/awkward/operations/str/{ak_is_title.py => akstr_is_title.py} (100%) rename src/awkward/operations/str/{ak_is_upper.py => akstr_is_upper.py} (100%) rename src/awkward/operations/str/{ak_join.py => akstr_join.py} (100%) rename src/awkward/operations/str/{ak_join_element_wise.py => akstr_join_element_wise.py} (100%) rename src/awkward/operations/str/{ak_length.py => akstr_length.py} (100%) rename src/awkward/operations/str/{ak_lower.py => akstr_lower.py} (100%) rename src/awkward/operations/str/{ak_lpad.py => akstr_lpad.py} (100%) rename src/awkward/operations/str/{ak_ltrim.py => akstr_ltrim.py} (100%) rename src/awkward/operations/str/{ak_ltrim_whitespace.py => akstr_ltrim_whitespace.py} (100%) rename src/awkward/operations/str/{ak_match_like.py => akstr_match_like.py} (100%) rename src/awkward/operations/str/{ak_match_substring.py => akstr_match_substring.py} (100%) rename 
src/awkward/operations/str/{ak_match_substring_regex.py => akstr_match_substring_regex.py} (100%) rename src/awkward/operations/str/{ak_repeat.py => akstr_repeat.py} (100%) rename src/awkward/operations/str/{ak_replace_slice.py => akstr_replace_slice.py} (100%) rename src/awkward/operations/str/{ak_replace_substring.py => akstr_replace_substring.py} (100%) rename src/awkward/operations/str/{ak_replace_substring_regex.py => akstr_replace_substring_regex.py} (100%) rename src/awkward/operations/str/{ak_reverse.py => akstr_reverse.py} (100%) rename src/awkward/operations/str/{ak_rpad.py => akstr_rpad.py} (100%) rename src/awkward/operations/str/{ak_rtrim.py => akstr_rtrim.py} (100%) rename src/awkward/operations/str/{ak_rtrim_whitespace.py => akstr_rtrim_whitespace.py} (100%) rename src/awkward/operations/str/{ak_slice.py => akstr_slice.py} (100%) rename src/awkward/operations/str/{ak_split_pattern.py => akstr_split_pattern.py} (100%) rename src/awkward/operations/str/{ak_split_pattern_regex.py => akstr_split_pattern_regex.py} (100%) rename src/awkward/operations/str/{ak_split_whitespace.py => akstr_split_whitespace.py} (100%) rename src/awkward/operations/str/{ak_starts_with.py => akstr_starts_with.py} (100%) rename src/awkward/operations/str/{ak_swapcase.py => akstr_swapcase.py} (100%) rename src/awkward/operations/str/{ak_title.py => akstr_title.py} (100%) rename src/awkward/operations/str/{ak_trim.py => akstr_trim.py} (100%) rename src/awkward/operations/str/{ak_trim_whitespace.py => akstr_trim_whitespace.py} (100%) rename src/awkward/operations/str/{ak_upper.py => akstr_upper.py} (100%) diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 7d4357d12a..610a99de4b 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -3,75 +3,75 @@ # https://arrow.apache.org/docs/python/api/compute.html#string-predicates # string predicates -from awkward.operations.str.ak_is_alnum import * 
-from awkward.operations.str.ak_is_alpha import * -from awkward.operations.str.ak_is_decimal import * -from awkward.operations.str.ak_is_digit import * -from awkward.operations.str.ak_is_lower import * -from awkward.operations.str.ak_is_numeric import * -from awkward.operations.str.ak_is_printable import * -from awkward.operations.str.ak_is_space import * -from awkward.operations.str.ak_is_upper import * -from awkward.operations.str.ak_is_title import * -from awkward.operations.str.ak_is_ascii import * +from awkward.operations.str.akstr_is_alnum import * +from awkward.operations.str.akstr_is_alpha import * +from awkward.operations.str.akstr_is_decimal import * +from awkward.operations.str.akstr_is_digit import * +from awkward.operations.str.akstr_is_lower import * +from awkward.operations.str.akstr_is_numeric import * +from awkward.operations.str.akstr_is_printable import * +from awkward.operations.str.akstr_is_space import * +from awkward.operations.str.akstr_is_upper import * +from awkward.operations.str.akstr_is_title import * +from awkward.operations.str.akstr_is_ascii import * # string transforms -from awkward.operations.str.ak_capitalize import * -from awkward.operations.str.ak_length import * -from awkward.operations.str.ak_lower import * -from awkward.operations.str.ak_swapcase import * -from awkward.operations.str.ak_title import * -from awkward.operations.str.ak_upper import * -from awkward.operations.str.ak_repeat import * -from awkward.operations.str.ak_replace_slice import * -from awkward.operations.str.ak_reverse import * -from awkward.operations.str.ak_replace_substring import * -from awkward.operations.str.ak_replace_substring_regex import * +from awkward.operations.str.akstr_capitalize import * +from awkward.operations.str.akstr_length import * +from awkward.operations.str.akstr_lower import * +from awkward.operations.str.akstr_swapcase import * +from awkward.operations.str.akstr_title import * +from awkward.operations.str.akstr_upper import * 
+from awkward.operations.str.akstr_repeat import * +from awkward.operations.str.akstr_replace_slice import * +from awkward.operations.str.akstr_reverse import * +from awkward.operations.str.akstr_replace_substring import * +from awkward.operations.str.akstr_replace_substring_regex import * # string padding -from awkward.operations.str.ak_center import * -from awkward.operations.str.ak_lpad import * -from awkward.operations.str.ak_rpad import * +from awkward.operations.str.akstr_center import * +from awkward.operations.str.akstr_lpad import * +from awkward.operations.str.akstr_rpad import * # string trimming -from awkward.operations.str.ak_ltrim import * -from awkward.operations.str.ak_ltrim_whitespace import * -from awkward.operations.str.ak_rtrim import * -from awkward.operations.str.ak_rtrim_whitespace import * -from awkward.operations.str.ak_trim import * -from awkward.operations.str.ak_trim_whitespace import * +from awkward.operations.str.akstr_ltrim import * +from awkward.operations.str.akstr_ltrim_whitespace import * +from awkward.operations.str.akstr_rtrim import * +from awkward.operations.str.akstr_rtrim_whitespace import * +from awkward.operations.str.akstr_trim import * +from awkward.operations.str.akstr_trim_whitespace import * # string splitting -from awkward.operations.str.ak_split_whitespace import * -from awkward.operations.str.ak_split_pattern import * -from awkward.operations.str.ak_split_pattern_regex import * +from awkward.operations.str.akstr_split_whitespace import * +from awkward.operations.str.akstr_split_pattern import * +from awkward.operations.str.akstr_split_pattern_regex import * # string component extraction -from awkward.operations.str.ak_extract_regex import * +from awkward.operations.str.akstr_extract_regex import * # string joining -from awkward.operations.str.ak_join import * -from awkward.operations.str.ak_join_element_wise import * +from awkward.operations.str.akstr_join import * +from 
awkward.operations.str.akstr_join_element_wise import * # string slicing -from awkward.operations.str.ak_slice import * +from awkward.operations.str.akstr_slice import * # containment tests -from awkward.operations.str.ak_count_substring import * -from awkward.operations.str.ak_count_substring_regex import * -from awkward.operations.str.ak_ends_with import * -from awkward.operations.str.ak_find_substring import * -from awkward.operations.str.ak_find_substring_regex import * -from awkward.operations.str.ak_index_in import * -from awkward.operations.str.ak_is_in import * -from awkward.operations.str.ak_match_like import * -from awkward.operations.str.ak_match_substring import * -from awkward.operations.str.ak_match_substring_regex import * -from awkward.operations.str.ak_starts_with import * +from awkward.operations.str.akstr_count_substring import * +from awkward.operations.str.akstr_count_substring_regex import * +from awkward.operations.str.akstr_ends_with import * +from awkward.operations.str.akstr_find_substring import * +from awkward.operations.str.akstr_find_substring_regex import * +from awkward.operations.str.akstr_index_in import * +from awkward.operations.str.akstr_is_in import * +from awkward.operations.str.akstr_match_like import * +from awkward.operations.str.akstr_match_substring import * +from awkward.operations.str.akstr_match_substring_regex import * +from awkward.operations.str.akstr_starts_with import * def _get_ufunc_action( diff --git a/src/awkward/operations/str/ak_capitalize.py b/src/awkward/operations/str/akstr_capitalize.py similarity index 100% rename from src/awkward/operations/str/ak_capitalize.py rename to src/awkward/operations/str/akstr_capitalize.py diff --git a/src/awkward/operations/str/ak_center.py b/src/awkward/operations/str/akstr_center.py similarity index 100% rename from src/awkward/operations/str/ak_center.py rename to src/awkward/operations/str/akstr_center.py diff --git a/src/awkward/operations/str/ak_count_substring.py 
b/src/awkward/operations/str/akstr_count_substring.py similarity index 100% rename from src/awkward/operations/str/ak_count_substring.py rename to src/awkward/operations/str/akstr_count_substring.py diff --git a/src/awkward/operations/str/ak_count_substring_regex.py b/src/awkward/operations/str/akstr_count_substring_regex.py similarity index 100% rename from src/awkward/operations/str/ak_count_substring_regex.py rename to src/awkward/operations/str/akstr_count_substring_regex.py diff --git a/src/awkward/operations/str/ak_ends_with.py b/src/awkward/operations/str/akstr_ends_with.py similarity index 100% rename from src/awkward/operations/str/ak_ends_with.py rename to src/awkward/operations/str/akstr_ends_with.py diff --git a/src/awkward/operations/str/ak_extract_regex.py b/src/awkward/operations/str/akstr_extract_regex.py similarity index 100% rename from src/awkward/operations/str/ak_extract_regex.py rename to src/awkward/operations/str/akstr_extract_regex.py diff --git a/src/awkward/operations/str/ak_find_substring.py b/src/awkward/operations/str/akstr_find_substring.py similarity index 100% rename from src/awkward/operations/str/ak_find_substring.py rename to src/awkward/operations/str/akstr_find_substring.py diff --git a/src/awkward/operations/str/ak_find_substring_regex.py b/src/awkward/operations/str/akstr_find_substring_regex.py similarity index 100% rename from src/awkward/operations/str/ak_find_substring_regex.py rename to src/awkward/operations/str/akstr_find_substring_regex.py diff --git a/src/awkward/operations/str/ak_index_in.py b/src/awkward/operations/str/akstr_index_in.py similarity index 100% rename from src/awkward/operations/str/ak_index_in.py rename to src/awkward/operations/str/akstr_index_in.py diff --git a/src/awkward/operations/str/ak_is_alnum.py b/src/awkward/operations/str/akstr_is_alnum.py similarity index 100% rename from src/awkward/operations/str/ak_is_alnum.py rename to src/awkward/operations/str/akstr_is_alnum.py diff --git 
a/src/awkward/operations/str/ak_is_alpha.py b/src/awkward/operations/str/akstr_is_alpha.py similarity index 100% rename from src/awkward/operations/str/ak_is_alpha.py rename to src/awkward/operations/str/akstr_is_alpha.py diff --git a/src/awkward/operations/str/ak_is_ascii.py b/src/awkward/operations/str/akstr_is_ascii.py similarity index 100% rename from src/awkward/operations/str/ak_is_ascii.py rename to src/awkward/operations/str/akstr_is_ascii.py diff --git a/src/awkward/operations/str/ak_is_decimal.py b/src/awkward/operations/str/akstr_is_decimal.py similarity index 100% rename from src/awkward/operations/str/ak_is_decimal.py rename to src/awkward/operations/str/akstr_is_decimal.py diff --git a/src/awkward/operations/str/ak_is_digit.py b/src/awkward/operations/str/akstr_is_digit.py similarity index 100% rename from src/awkward/operations/str/ak_is_digit.py rename to src/awkward/operations/str/akstr_is_digit.py diff --git a/src/awkward/operations/str/ak_is_in.py b/src/awkward/operations/str/akstr_is_in.py similarity index 100% rename from src/awkward/operations/str/ak_is_in.py rename to src/awkward/operations/str/akstr_is_in.py diff --git a/src/awkward/operations/str/ak_is_lower.py b/src/awkward/operations/str/akstr_is_lower.py similarity index 100% rename from src/awkward/operations/str/ak_is_lower.py rename to src/awkward/operations/str/akstr_is_lower.py diff --git a/src/awkward/operations/str/ak_is_numeric.py b/src/awkward/operations/str/akstr_is_numeric.py similarity index 100% rename from src/awkward/operations/str/ak_is_numeric.py rename to src/awkward/operations/str/akstr_is_numeric.py diff --git a/src/awkward/operations/str/ak_is_printable.py b/src/awkward/operations/str/akstr_is_printable.py similarity index 100% rename from src/awkward/operations/str/ak_is_printable.py rename to src/awkward/operations/str/akstr_is_printable.py diff --git a/src/awkward/operations/str/ak_is_space.py b/src/awkward/operations/str/akstr_is_space.py similarity index 100% 
rename from src/awkward/operations/str/ak_is_space.py rename to src/awkward/operations/str/akstr_is_space.py diff --git a/src/awkward/operations/str/ak_is_title.py b/src/awkward/operations/str/akstr_is_title.py similarity index 100% rename from src/awkward/operations/str/ak_is_title.py rename to src/awkward/operations/str/akstr_is_title.py diff --git a/src/awkward/operations/str/ak_is_upper.py b/src/awkward/operations/str/akstr_is_upper.py similarity index 100% rename from src/awkward/operations/str/ak_is_upper.py rename to src/awkward/operations/str/akstr_is_upper.py diff --git a/src/awkward/operations/str/ak_join.py b/src/awkward/operations/str/akstr_join.py similarity index 100% rename from src/awkward/operations/str/ak_join.py rename to src/awkward/operations/str/akstr_join.py diff --git a/src/awkward/operations/str/ak_join_element_wise.py b/src/awkward/operations/str/akstr_join_element_wise.py similarity index 100% rename from src/awkward/operations/str/ak_join_element_wise.py rename to src/awkward/operations/str/akstr_join_element_wise.py diff --git a/src/awkward/operations/str/ak_length.py b/src/awkward/operations/str/akstr_length.py similarity index 100% rename from src/awkward/operations/str/ak_length.py rename to src/awkward/operations/str/akstr_length.py diff --git a/src/awkward/operations/str/ak_lower.py b/src/awkward/operations/str/akstr_lower.py similarity index 100% rename from src/awkward/operations/str/ak_lower.py rename to src/awkward/operations/str/akstr_lower.py diff --git a/src/awkward/operations/str/ak_lpad.py b/src/awkward/operations/str/akstr_lpad.py similarity index 100% rename from src/awkward/operations/str/ak_lpad.py rename to src/awkward/operations/str/akstr_lpad.py diff --git a/src/awkward/operations/str/ak_ltrim.py b/src/awkward/operations/str/akstr_ltrim.py similarity index 100% rename from src/awkward/operations/str/ak_ltrim.py rename to src/awkward/operations/str/akstr_ltrim.py diff --git 
a/src/awkward/operations/str/ak_ltrim_whitespace.py b/src/awkward/operations/str/akstr_ltrim_whitespace.py similarity index 100% rename from src/awkward/operations/str/ak_ltrim_whitespace.py rename to src/awkward/operations/str/akstr_ltrim_whitespace.py diff --git a/src/awkward/operations/str/ak_match_like.py b/src/awkward/operations/str/akstr_match_like.py similarity index 100% rename from src/awkward/operations/str/ak_match_like.py rename to src/awkward/operations/str/akstr_match_like.py diff --git a/src/awkward/operations/str/ak_match_substring.py b/src/awkward/operations/str/akstr_match_substring.py similarity index 100% rename from src/awkward/operations/str/ak_match_substring.py rename to src/awkward/operations/str/akstr_match_substring.py diff --git a/src/awkward/operations/str/ak_match_substring_regex.py b/src/awkward/operations/str/akstr_match_substring_regex.py similarity index 100% rename from src/awkward/operations/str/ak_match_substring_regex.py rename to src/awkward/operations/str/akstr_match_substring_regex.py diff --git a/src/awkward/operations/str/ak_repeat.py b/src/awkward/operations/str/akstr_repeat.py similarity index 100% rename from src/awkward/operations/str/ak_repeat.py rename to src/awkward/operations/str/akstr_repeat.py diff --git a/src/awkward/operations/str/ak_replace_slice.py b/src/awkward/operations/str/akstr_replace_slice.py similarity index 100% rename from src/awkward/operations/str/ak_replace_slice.py rename to src/awkward/operations/str/akstr_replace_slice.py diff --git a/src/awkward/operations/str/ak_replace_substring.py b/src/awkward/operations/str/akstr_replace_substring.py similarity index 100% rename from src/awkward/operations/str/ak_replace_substring.py rename to src/awkward/operations/str/akstr_replace_substring.py diff --git a/src/awkward/operations/str/ak_replace_substring_regex.py b/src/awkward/operations/str/akstr_replace_substring_regex.py similarity index 100% rename from 
src/awkward/operations/str/ak_replace_substring_regex.py rename to src/awkward/operations/str/akstr_replace_substring_regex.py diff --git a/src/awkward/operations/str/ak_reverse.py b/src/awkward/operations/str/akstr_reverse.py similarity index 100% rename from src/awkward/operations/str/ak_reverse.py rename to src/awkward/operations/str/akstr_reverse.py diff --git a/src/awkward/operations/str/ak_rpad.py b/src/awkward/operations/str/akstr_rpad.py similarity index 100% rename from src/awkward/operations/str/ak_rpad.py rename to src/awkward/operations/str/akstr_rpad.py diff --git a/src/awkward/operations/str/ak_rtrim.py b/src/awkward/operations/str/akstr_rtrim.py similarity index 100% rename from src/awkward/operations/str/ak_rtrim.py rename to src/awkward/operations/str/akstr_rtrim.py diff --git a/src/awkward/operations/str/ak_rtrim_whitespace.py b/src/awkward/operations/str/akstr_rtrim_whitespace.py similarity index 100% rename from src/awkward/operations/str/ak_rtrim_whitespace.py rename to src/awkward/operations/str/akstr_rtrim_whitespace.py diff --git a/src/awkward/operations/str/ak_slice.py b/src/awkward/operations/str/akstr_slice.py similarity index 100% rename from src/awkward/operations/str/ak_slice.py rename to src/awkward/operations/str/akstr_slice.py diff --git a/src/awkward/operations/str/ak_split_pattern.py b/src/awkward/operations/str/akstr_split_pattern.py similarity index 100% rename from src/awkward/operations/str/ak_split_pattern.py rename to src/awkward/operations/str/akstr_split_pattern.py diff --git a/src/awkward/operations/str/ak_split_pattern_regex.py b/src/awkward/operations/str/akstr_split_pattern_regex.py similarity index 100% rename from src/awkward/operations/str/ak_split_pattern_regex.py rename to src/awkward/operations/str/akstr_split_pattern_regex.py diff --git a/src/awkward/operations/str/ak_split_whitespace.py b/src/awkward/operations/str/akstr_split_whitespace.py similarity index 100% rename from 
src/awkward/operations/str/ak_split_whitespace.py rename to src/awkward/operations/str/akstr_split_whitespace.py diff --git a/src/awkward/operations/str/ak_starts_with.py b/src/awkward/operations/str/akstr_starts_with.py similarity index 100% rename from src/awkward/operations/str/ak_starts_with.py rename to src/awkward/operations/str/akstr_starts_with.py diff --git a/src/awkward/operations/str/ak_swapcase.py b/src/awkward/operations/str/akstr_swapcase.py similarity index 100% rename from src/awkward/operations/str/ak_swapcase.py rename to src/awkward/operations/str/akstr_swapcase.py diff --git a/src/awkward/operations/str/ak_title.py b/src/awkward/operations/str/akstr_title.py similarity index 100% rename from src/awkward/operations/str/ak_title.py rename to src/awkward/operations/str/akstr_title.py diff --git a/src/awkward/operations/str/ak_trim.py b/src/awkward/operations/str/akstr_trim.py similarity index 100% rename from src/awkward/operations/str/ak_trim.py rename to src/awkward/operations/str/akstr_trim.py diff --git a/src/awkward/operations/str/ak_trim_whitespace.py b/src/awkward/operations/str/akstr_trim_whitespace.py similarity index 100% rename from src/awkward/operations/str/ak_trim_whitespace.py rename to src/awkward/operations/str/akstr_trim_whitespace.py diff --git a/src/awkward/operations/str/ak_upper.py b/src/awkward/operations/str/akstr_upper.py similarity index 100% rename from src/awkward/operations/str/ak_upper.py rename to src/awkward/operations/str/akstr_upper.py From 7bcb12c5e30959452859871896b955321a377783 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 8 Aug 2023 16:12:30 +0100 Subject: [PATCH 73/73] docs: be explicit about `ak_str_` --- docs/prepare_docstrings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/prepare_docstrings.py b/docs/prepare_docstrings.py index fa1f6c81b5..35756f516a 100644 --- a/docs/prepare_docstrings.py +++ b/docs/prepare_docstrings.py @@ -303,7 +303,7 @@ def dofunction(link, 
linelink, shortname, name, astfcn): .replace(".behaviors.string", "") ) shortname = re.sub(r"\.operations\.ak_\w+", "", shortname) - shortname = re.sub(r"\.operations\.str\.ak_\w+", ".str", shortname) + shortname = re.sub(r"\.operations\.str\.akstr_\w+", ".str", shortname) shortname = re.sub(r"\.(contents|types|forms)\.\w+", r".\1", shortname) if (