From c209daeb10dad9b153e0fbcde873c304951ff158 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 7 Nov 2024 08:52:24 -0800
Subject: [PATCH] Add io.text APIs to pylibcudf (#17232)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/17232
---
 .../api_docs/pylibcudf/io/index.rst           |   1 +
 .../user_guide/api_docs/pylibcudf/io/text.rst |   6 +
 python/cudf/cudf/_lib/text.pyx                |  82 +++-----
 python/pylibcudf/pylibcudf/io/CMakeLists.txt  |   2 +-
 python/pylibcudf/pylibcudf/io/__init__.pxd    |   2 +-
 python/pylibcudf/pylibcudf/io/__init__.py     |   2 +-
 python/pylibcudf/pylibcudf/io/text.pxd        |  30 +++
 python/pylibcudf/pylibcudf/io/text.pyx        | 193 ++++++++++++++++++
 .../pylibcudf/pylibcudf/tests/io/test_text.py |  29 +++
 9 files changed, 285 insertions(+), 62 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst
 create mode 100644 python/pylibcudf/pylibcudf/io/text.pxd
 create mode 100644 python/pylibcudf/pylibcudf/io/text.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_text.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index 53638f071cc..cd5c5a5f77e 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -19,4 +19,5 @@ I/O Functions
     csv
     json
     parquet
+    text
     timezone
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst
new file mode 100644
index 00000000000..327ca043f36
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst
@@ -0,0 +1,6 @@
+====
+text
+====
+
+.. automodule:: pylibcudf.io.text
+   :members:
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
index b2c7232f549..7942d067c2b 100644
--- a/python/cudf/cudf/_lib/text.pyx
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -1,33 +1,20 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from io import TextIOBase
+from libcpp cimport bool
 
-from cython.operator cimport dereference
-from libc.stdint cimport uint64_t
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
+from io import TextIOBase
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.io.text cimport (
-    byte_range_info,
-    data_chunk_source,
-    make_source,
-    make_source_from_bgzip_file,
-    make_source_from_file,
-    multibyte_split,
-    parse_options,
-)
+import pylibcudf as plc
 
 from cudf._lib.column cimport Column
 
 
 def read_text(object filepaths_or_buffers,
-              object delimiter=None,
-              object byte_range=None,
-              object strip_delimiters=False,
-              object compression=None,
-              object compression_offsets=None):
+              str delimiter,
+              object byte_range,
+              bool strip_delimiters,
+              object compression,
+              object compression_offsets):
     """
     Cython function to call into libcudf API, see `multibyte_split`.
 
@@ -35,24 +22,11 @@ def read_text(object filepaths_or_buffers,
     --------
     cudf.io.text.read_text
     """
-    cdef string delim = delimiter.encode()
-
-    cdef unique_ptr[data_chunk_source] datasource
-    cdef unique_ptr[column] c_col
-
-    cdef size_t c_byte_range_offset
-    cdef size_t c_byte_range_size
-    cdef uint64_t c_compression_begin_offset
-    cdef uint64_t c_compression_end_offset
-    cdef parse_options c_options
-
     if compression is None:
         if isinstance(filepaths_or_buffers, TextIOBase):
-            datasource = move(make_source(
-                filepaths_or_buffers.read().encode()))
+            datasource = plc.io.text.make_source(filepaths_or_buffers.read())
         else:
-            datasource = move(make_source_from_file(
-                filepaths_or_buffers.encode()))
+            datasource = plc.io.text.make_source_from_file(filepaths_or_buffers)
     elif compression == "bgzip":
         if isinstance(filepaths_or_buffers, TextIOBase):
             raise ValueError("bgzip compression requires a file path")
@@ -60,30 +34,20 @@ def read_text(object filepaths_or_buffers,
             if len(compression_offsets) != 2:
                 raise ValueError(
                     "compression offsets need to consist of two elements")
-            c_compression_begin_offset = compression_offsets[0]
-            c_compression_end_offset = compression_offsets[1]
-            datasource = move(make_source_from_bgzip_file(
-                filepaths_or_buffers.encode(),
-                c_compression_begin_offset,
-                c_compression_end_offset))
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepaths_or_buffers,
+                compression_offsets[0],
+                compression_offsets[1]
+            )
         else:
-            datasource = move(make_source_from_bgzip_file(
-                filepaths_or_buffers.encode()))
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepaths_or_buffers,
+            )
     else:
         raise ValueError("Only bgzip compression is supported at the moment")
 
-    c_options = parse_options()
-    if byte_range is not None:
-        c_byte_range_offset = byte_range[0]
-        c_byte_range_size = byte_range[1]
-        c_options.byte_range = byte_range_info(
-            c_byte_range_offset,
-            c_byte_range_size)
-    c_options.strip_delimiters = strip_delimiters
-    with nogil:
-        c_col = move(multibyte_split(
-            dereference(datasource),
-            delim,
-            c_options))
-
-    return Column.from_unique_ptr(move(c_col))
+    options = plc.io.text.ParseOptions(
+        byte_range=byte_range, strip_delimiters=strip_delimiters
+    )
+    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
index 965724a47b1..f78d97ef4d1 100644
--- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx
-                   types.pyx
+                   text.pyx types.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd
index 1bcc0a3f963..6ba7f78a013 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/io/__init__.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 # CSV is removed since it is def not cpdef (to force kw-only arguments)
-from . cimport avro, datasource, json, orc, parquet, timezone, types
+from . cimport avro, datasource, json, orc, parquet, timezone, text, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py
index 2e4f215b12c..0fc77dd0f57 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.py
+++ b/python/pylibcudf/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, csv, datasource, json, orc, parquet, timezone, types
+from . import avro, csv, datasource, json, orc, parquet, timezone, text, types
 from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd
new file mode 100644
index 00000000000..051e9bc0cde
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pxd
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source
+
+cdef class ParseOptions:
+    cdef parse_options c_options  # libcudf options struct handed to multibyte_split
+
+cdef class DataChunkSource:
+    cdef unique_ptr[data_chunk_source] c_source  # assigned by the make_source* functions
+    cdef string data_ref  # encoded filename or text the source was created from
+
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=*
+)
+
+cpdef DataChunkSource make_source(str data)
+
+cpdef DataChunkSource make_source_from_file(str filename)
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=*,
+    int virtual_end=*,
+)
diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
new file mode 100644
index 00000000000..667a054baaa
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -0,0 +1,193 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libc.stdint cimport uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.io cimport text as cpp_text
+
+cdef class ParseOptions:
+    """
+    Parsing options for `multibyte_split` (all arguments are keyword-only).
+
+    Parameters
+    ----------
+    byte_range : list | tuple, default None
+        An ``(offset, size)`` pair; only rows starting inside this byte
+        range will be part of the output column. ``None`` sets no range.
+
+    strip_delimiters : bool, default False
+        Whether delimiters at the end of rows should
+        be stripped from the output column.
+    """
+    def __init__(
+        self,
+        *,
+        byte_range=None,
+        strip_delimiters=False,
+    ):
+        self.c_options = cpp_text.parse_options()
+        if byte_range is not None:
+            c_byte_range_offset = byte_range[0]
+            c_byte_range_size = byte_range[1]
+            self.c_options.byte_range = cpp_text.byte_range_info(
+                c_byte_range_offset,
+                c_byte_range_size
+            )
+        self.c_options.strip_delimiters = strip_delimiters
+
+
+cdef class DataChunkSource:
+    """
+    Data source for `multibyte_split`; build one with the ``make_source*`` functions.
+
+    Parameters
+    ----------
+    data : str
+        Filename or the text itself; stored encoded (``str.encode()``) in ``data_ref``.
+    """
+
+    def __cinit__(self, str data):
+        # Keep the encoded bytes on the object: the make_source* factories read data_ref
+        self.data_ref = data.encode()
+
+
+cpdef DataChunkSource make_source(str data):
+    """
+    Creates a data source capable of producing device-buffered views
+    of the given string.
+
+    Parameters
+    ----------
+    data : str
+        The host text to be exposed as a data chunk source (encoded first).
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided host data.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(data)  # encodes data into dcs.data_ref
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source(dcs.data_ref))
+    return dcs
+
+
+cpdef DataChunkSource make_source_from_file(str filename):
+    """
+    Creates a data source capable of producing device-buffered views of the file.
+
+    Parameters
+    ----------
+    filename : str
+        The path of the file to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(filename)  # data_ref holds the encoded path
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source_from_file(dcs.data_ref))
+    return dcs
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    int virtual_begin=-1,
+    int virtual_end=-1,
+):
+    """
+    Creates a data source capable of producing device-buffered views of
+    a BGZIP compressed file with virtual record offsets.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the BGZIP-compressed file to be exposed as a data chunk source.
+
+    virtual_begin : int, default -1
+        The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits
+        describe the offset into the compressed file, its lower 16 bits describe the
+        block-local offset. -1 means "not given"; pass it together with virtual_end.
+
+    virtual_end : int, default -1
+        The virtual (Tabix) offset one past the last byte to be read; -1 means "not given".
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef uint64_t c_virtual_begin  # NOTE(review): params are C int; confirm large offsets fit
+    cdef uint64_t c_virtual_end
+    cdef DataChunkSource dcs = DataChunkSource(filename)
+
+    if virtual_begin == -1 and virtual_end == -1:  # no offsets: filename-only overload
+        with nogil:
+            dcs.c_source = move(cpp_text.make_source_from_bgzip_file(dcs.data_ref))
+    elif virtual_begin != -1 and virtual_end != -1:  # both offsets supplied
+        c_virtual_begin = virtual_begin
+        c_virtual_end = virtual_end
+        with nogil:
+            dcs.c_source = move(
+                cpp_text.make_source_from_bgzip_file(
+                    dcs.data_ref,
+                    c_virtual_begin,
+                    c_virtual_end,
+                )
+            )
+    else:  # exactly one of the two offsets was left at -1
+        raise ValueError(
+            "virtual_begin and virtual_end must both be None or both be int"
+        )
+    return dcs
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=None
+):
+    """
+    Splits the source text into a strings column using a multiple byte delimiter.
+
+    For details, see :cpp:func:`cudf::io::text::multibyte_split`
+
+    Parameters
+    ----------
+    source : DataChunkSource
+        The source text; its underlying source is moved out of ``source`` by this call.
+
+    delimiter : str
+        Delimiter (encoded with ``str.encode()``) for which to find offsets in the source.
+
+    options : ParseOptions, default None
+        The parsing options to use (including byte range); None uses ``ParseOptions()``.
+
+    Returns
+    -------
+    Column
+        The strings found by splitting the source by the delimiter
+        within the relevant byte range.
+    """
+    cdef unique_ptr[column] c_result
+    cdef unique_ptr[data_chunk_source] c_source = move(source.c_source)  # takes ownership
+    cdef string c_delimiter = delimiter.encode()
+
+    if options is None:
+        options = ParseOptions()
+
+    cdef cpp_text.parse_options c_options = options.c_options
+
+    with nogil:
+        c_result = cpp_text.multibyte_split(
+            dereference(c_source),
+            c_delimiter,
+            c_options
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py
new file mode 100644
index 00000000000..f69e940e34e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "source_func",
+    [
+        "make_source",
+        "make_source_from_file",
+    ],
+)
+@pytest.mark.parametrize("options", [None, plc.io.text.ParseOptions()])
+def test_multibyte_split(source_func, options, tmp_path):
+    data = "x::y::z"
+    func = getattr(plc.io.text, source_func)
+    if source_func == "make_source":
+        source = func(data)  # in-memory text
+    elif source_func == "make_source_from_file":
+        fle = tmp_path / "fle.txt"
+        fle.write_text(data)
+        source = func(str(fle))  # same text, read from a temporary file
+    result = plc.io.text.multibyte_split(source, "::", options)
+    expected = pa.array(["x::", "y::", "z"])  # strip_delimiters defaults to False
+    assert_column_eq(result, expected)