scikit-hep · jpivarski · Aug 8, 2023 · Aug 4, 2023 · Aug 4, 2023 · Aug 4, 2023
diff --git a/docs/prepare_docstrings.py b/docs/prepare_docstrings.py
@@ -303,6 +303,7 @@ def dofunction(link, linelink, shortname, name, astfcn):
  .replace(".behaviors.string", "")
  )
  shortname = re.sub(r"\.operations\.ak_\w+", "", shortname)
+ shortname = re.sub(r"\.operations\.str\.akstr_\w+", ".str", shortname)
  shortname = re.sub(r"\.(contents|types|forms)\.\w+", r".\1", shortname)
 
  if (

diff --git a/docs/reference/toctree.txt b/docs/reference/toctree.txt
@@ -145,6 +145,79 @@
  generated/ak.argcartesian
  generated/ak.argcombinations
 
+.. toctree::
+ :caption: String predicates
+
+ generated/ak.str.is_alnum
+ generated/ak.str.is_alpha
+ generated/ak.str.is_ascii
+ generated/ak.str.is_decimal
+ generated/ak.str.is_digit
+ generated/ak.str.is_lower
+ generated/ak.str.is_numeric
+ generated/ak.str.is_printable
+ generated/ak.str.is_space
+ generated/ak.str.is_title
+ generated/ak.str.is_upper
+
+.. toctree::
+ :caption: String transforms
+
+ generated/ak.str.capitalize
+ generated/ak.str.length
+ generated/ak.str.lower
+ generated/ak.str.repeat
+ generated/ak.str.replace_slice
+ generated/ak.str.replace_substring
+ generated/ak.str.replace_substring_regex
+ generated/ak.str.reverse
+ generated/ak.str.swapcase
+ generated/ak.str.title
+ generated/ak.str.upper
+
+.. toctree::
+ :caption: String padding and trimming
+
+ generated/ak.str.center
+ generated/ak.str.lpad
+ generated/ak.str.rpad
+ generated/ak.str.ltrim
+ generated/ak.str.ltrim_whitespace
+ generated/ak.str.rtrim
+ generated/ak.str.rtrim_whitespace
+ generated/ak.str.trim
+ generated/ak.str.trim_whitespace
+
+.. toctree::
+ :caption: String splitting and joining
+
+ generated/ak.str.split_pattern
+ generated/ak.str.split_pattern_regex
+ generated/ak.str.split_whitespace
+ generated/ak.str.join
+ generated/ak.str.join_element_wise
+
+.. toctree::
+ :caption: String slicing and decomposition
+
+ generated/ak.str.slice
+ generated/ak.str.extract_regex
+
+.. toctree::
+ :caption: String containment tests
+
+ generated/ak.str.count_substring
+ generated/ak.str.count_substring_regex
+ generated/ak.str.ends_with
+ generated/ak.str.find_substring
+ generated/ak.str.find_substring_regex
+ generated/ak.str.index_in
+ generated/ak.str.is_in
+ generated/ak.str.match_like
+ generated/ak.str.match_substring
+ generated/ak.str.match_substring_regex
+ generated/ak.str.starts_with
+
 .. toctree::
  :caption: Value and type conversions
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -309,7 +309,8 @@ mccabe.max-complexity = 100
 "src/awkward/_connect/*" = ["TID251"]
 "src/awkward/__init__.py" = ["E402", "F401", "F403", "I001"]
 "src/awkward/_ext.py" = ["F401"]
-"src/awkward/operations/__init__.py" = ["F403"]
+"src/awkward/operations/__init__.py" = ["F401", "F403"]
+"src/awkward/operations/str/__init__.py" = ["F401", "F403", "I001"]
 "src/awkward/_nplikes/*" = ["TID251"]
 "src/awkward/_operators.py" = ["TID251"]
 "tests*/*" = ["T20", "TID251"]

diff --git a/src/awkward/_connect/pyarrow.py b/src/awkward/_connect/pyarrow.py
@@ -1,7 +1,9 @@
 # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+from __future__ import annotations
 
 import json
 from collections.abc import Iterable, Sized
+from types import ModuleType
 
 from packaging.version import parse as parse_version
 
@@ -36,13 +38,13 @@
  error_message = "pyarrow 7.0.0 or later required for {0}"
 
 
-def import_pyarrow(name):
+def import_pyarrow(name: str) -> ModuleType:
  if pyarrow is None:
  raise ImportError(error_message.format(name))
  return pyarrow
 
 
-def import_pyarrow_parquet(name):
+def import_pyarrow_parquet(name: str) -> ModuleType:
  if pyarrow is None:
  raise ImportError(error_message.format(name))
 
@@ -51,7 +53,16 @@ def import_pyarrow_parquet(name):
  return out
 
 
-def import_fsspec(name):
+def import_pyarrow_compute(name: str) -> ModuleType:  # Return the pyarrow.compute module, or raise ImportError; `name` only appears in the error text.
+ if pyarrow is None:  # module-level `pyarrow`; presumably None when pyarrow is missing or too old -- confirm where it is assigned
+ raise ImportError(error_message.format(name))  # `name` fills the "{0}" slot of error_message
+
+ import pyarrow.compute as out  # imported at call time, only reached once the guard above has passed
+
+ return out
+
+
+def import_fsspec(name: str) -> ModuleType:
  try:
  import fsspec
 

diff --git a/src/awkward/contents/unmaskedarray.py b/src/awkward/contents/unmaskedarray.py
@@ -491,7 +491,7 @@ def _remove_structure(self, backend, options):
  return [self]
 
  def _drop_none(self) -> Content:
- return self.to_ByteMaskedArray(True)._drop_none()
+ return self.content
 
  def _recursively_apply(
  self, action, behavior, depth, depth_context, lateral_context, options

diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py
@@ -1,6 +1,6 @@
 # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
-# ruff: noqa: F401
 
+import awkward.operations.str
 from awkward.operations.ak_all import *
 from awkward.operations.ak_almost_equal import *
 from awkward.operations.ak_any import *

diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py
@@ -0,0 +1,205 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+# https://arrow.apache.org/docs/python/api/compute.html#string-predicates
+
+# string predicates
+from awkward.operations.str.akstr_is_alnum import *
+from awkward.operations.str.akstr_is_alpha import *
+from awkward.operations.str.akstr_is_decimal import *
+from awkward.operations.str.akstr_is_digit import *
+from awkward.operations.str.akstr_is_lower import *
+from awkward.operations.str.akstr_is_numeric import *
+from awkward.operations.str.akstr_is_printable import *
+from awkward.operations.str.akstr_is_space import *
+from awkward.operations.str.akstr_is_upper import *
+from awkward.operations.str.akstr_is_title import *
+from awkward.operations.str.akstr_is_ascii import *
+
+# string transforms
+from awkward.operations.str.akstr_capitalize import *
+from awkward.operations.str.akstr_length import *
+from awkward.operations.str.akstr_lower import *
+from awkward.operations.str.akstr_swapcase import *
+from awkward.operations.str.akstr_title import *
+from awkward.operations.str.akstr_upper import *
+from awkward.operations.str.akstr_repeat import *
+from awkward.operations.str.akstr_replace_slice import *
+from awkward.operations.str.akstr_reverse import *
+from awkward.operations.str.akstr_replace_substring import *
+from awkward.operations.str.akstr_replace_substring_regex import *
+
+# string padding
+from awkward.operations.str.akstr_center import *
+from awkward.operations.str.akstr_lpad import *
+from awkward.operations.str.akstr_rpad import *
+
+# string trimming
+from awkward.operations.str.akstr_ltrim import *
+from awkward.operations.str.akstr_ltrim_whitespace import *
+from awkward.operations.str.akstr_rtrim import *
+from awkward.operations.str.akstr_rtrim_whitespace import *
+from awkward.operations.str.akstr_trim import *
+from awkward.operations.str.akstr_trim_whitespace import *
+
+# string splitting
+from awkward.operations.str.akstr_split_whitespace import *
+from awkward.operations.str.akstr_split_pattern import *
+from awkward.operations.str.akstr_split_pattern_regex import *
+
+# string component extraction
+
+from awkward.operations.str.akstr_extract_regex import *
+
+# string joining
+
+from awkward.operations.str.akstr_join import *
+from awkward.operations.str.akstr_join_element_wise import *
+
+# string slicing
+
+from awkward.operations.str.akstr_slice import *
+
+# containment tests
+
+from awkward.operations.str.akstr_count_substring import *
+from awkward.operations.str.akstr_count_substring_regex import *
+from awkward.operations.str.akstr_ends_with import *
+from awkward.operations.str.akstr_find_substring import *
+from awkward.operations.str.akstr_find_substring_regex import *
+from awkward.operations.str.akstr_index_in import *
+from awkward.operations.str.akstr_is_in import *
+from awkward.operations.str.akstr_match_like import *
+from awkward.operations.str.akstr_match_substring import *
+from awkward.operations.str.akstr_match_substring_regex import *
+from awkward.operations.str.akstr_starts_with import *
+
+
+def _get_ufunc_action(  # Build and return `action`, a per-layout callback that sends string-like layouts through an Arrow function.
+ utf8_function,  # called on the Arrow form of layouts whose "__array__" parameter is "string"
+ ascii_function,  # called on the Arrow form of layouts whose "__array__" parameter is "bytestring"
+ *args,  # extra positional arguments forwarded to whichever of the two functions is called
+ bytestring_to_string=False,  # if true, bytestrings are relabelled as strings before conversion and string results are relabelled back
+ **kwargs,  # extra keyword arguments forwarded to whichever of the two functions is called
+):
+ from awkward.operations.ak_from_arrow import from_arrow  # function-scope import; presumably avoids an import cycle -- confirm
+ from awkward.operations.ak_to_arrow import to_arrow
+
+ def action(layout, **absorb):  # **absorb swallows extra keyword arguments (none are read); other layouts fall through and yield None
+ if layout.is_list and layout.parameter("__array__") == "string":
+ return from_arrow(  # round trip: layout -> Arrow -> utf8_function -> low-level layout (highlevel=False)
+ utf8_function(to_arrow(layout, extensionarray=False), *args, **kwargs),
+ highlevel=False,
+ )
+
+ elif layout.is_list and layout.parameter("__array__") == "bytestring":
+ if bytestring_to_string:
+ out = from_arrow(
+ ascii_function(  # note: ascii_function, not utf8_function, even though the input is relabelled as a string
+ to_arrow(
+ layout.copy(  # relabelled copy: the list is tagged "string" and its content "char"
+ content=layout.content.copy(
+ parameters={"__array__": "char"}
+ ),
+ parameters={"__array__": "string"},
+ ),
+ extensionarray=False,
+ ),
+ *args,
+ **kwargs,
+ ),
+ highlevel=False,
+ )
+ if out.is_list and out.parameter("__array__") == "string":  # only string-typed results are relabelled back to bytestring/byte
+ out = out.copy(
+ content=out.content.copy(parameters={"__array__": "byte"}),
+ parameters={"__array__": "bytestring"},
+ )
+ return out  # NOTE(review): indentation is flattened in this view; presumably at the `if bytestring_to_string:` level -- confirm
+
+ else:  # NOTE(review): presumably pairs with `if bytestring_to_string:`; the bytestring goes to Arrow without relabelling
+ return from_arrow(
+ ascii_function(
+ to_arrow(layout, extensionarray=False), *args, **kwargs
+ ),
+ highlevel=False,
+ )
+
+ return action  # the closure keeps utf8_function, ascii_function, args, kwargs and bytestring_to_string
+
+
+def _erase_list_option(layout):  # Remove an option-type wrapper sitting directly under a list layout; otherwise return the layout as is.
+ from awkward.contents.unmaskedarray import UnmaskedArray  # function-scope import; presumably avoids an import cycle -- confirm
+
+ assert layout.is_list  # invariant check: only list layouts are accepted
+ if layout.content.is_option:
+ assert isinstance(layout.content, UnmaskedArray)  # any option type other than UnmaskedArray trips this assert
+ return layout.copy(content=layout.content.content)  # copy of the list with the wrapped content spliced in directly
+ else:
+ return layout  # nothing to erase; the same object is returned
+
+
+def _get_split_action(  # Build and return `action`, a per-layout callback for Arrow functions whose result is a list of strings.
+ utf8_function, ascii_function, *args, bytestring_to_string=False, **kwargs  # utf8_function handles "string" layouts, ascii_function handles "bytestring" layouts; *args/**kwargs are forwarded to the one called
+):
+ from awkward.operations.ak_from_arrow import from_arrow  # function-scope import; presumably avoids an import cycle -- confirm
+ from awkward.operations.ak_to_arrow import to_arrow
+
+ def action(layout, **absorb):  # **absorb swallows extra keyword arguments (none are read); other layouts fall through and yield None
+ if layout.is_list and layout.parameter("__array__") == "string":
+ return _erase_list_option(  # every converted result is passed through _erase_list_option before being returned
+ from_arrow(
+ utf8_function(
+ to_arrow(layout, extensionarray=False),
+ *args,
+ **kwargs,
+ ),
+ highlevel=False,
+ )
+ )
+
+ elif layout.is_list and layout.parameter("__array__") == "bytestring":
+ if bytestring_to_string:
+ out = _erase_list_option(
+ from_arrow(
+ ascii_function(  # note: ascii_function, not utf8_function, even though the input is relabelled as a string
+ to_arrow(
+ layout.copy(  # relabelled copy: the list is tagged "string" and its content "char"
+ content=layout.content.copy(
+ parameters={"__array__": "char"}
+ ),
+ parameters={"__array__": "string"},
+ ),
+ extensionarray=False,
+ ),
+ *args,
+ **kwargs,
+ ),
+ highlevel=False,
+ )
+ )
+ assert out.is_list  # NOTE(review): indentation is flattened in this view; L354-367 presumably sit inside `if bytestring_to_string:` -- confirm
+
+ assert (  # the result must be a list whose content is a list tagged "string"
+ out.content.is_list
+ and out.content.parameter("__array__") == "string"
+ )
+ return out.copy(  # relabelling happens one level down: inner list -> "bytestring", its content -> "byte"
+ content=out.content.copy(
+ content=out.content.content.copy(
+ parameters={"__array__": "byte"}
+ ),
+ parameters={"__array__": "bytestring"},
+ ),
+ )
+
+ else:  # NOTE(review): presumably pairs with `if bytestring_to_string:`; the bytestring goes to Arrow without relabelling
+ return _erase_list_option(
+ from_arrow(
+ ascii_function(
+ to_arrow(layout, extensionarray=False), *args, **kwargs
+ ),
+ highlevel=False,
+ )
+ )
+
+ return action  # the closure keeps utf8_function, ascii_function, args, kwargs and bytestring_to_string