From c209daeb10dad9b153e0fbcde873c304951ff158 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 7 Nov 2024 08:52:24 -0800
Subject: [PATCH] Add io.text APIs to pylibcudf (#17232)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/17232
---
 .../api_docs/pylibcudf/io/index.rst           |   1 +
 .../user_guide/api_docs/pylibcudf/io/text.rst |   6 +
 python/cudf/cudf/_lib/text.pyx                |  82 +++-----
 python/pylibcudf/pylibcudf/io/CMakeLists.txt  |   2 +-
 python/pylibcudf/pylibcudf/io/__init__.pxd    |   2 +-
 python/pylibcudf/pylibcudf/io/__init__.py     |   2 +-
 python/pylibcudf/pylibcudf/io/text.pxd        |  30 +++
 python/pylibcudf/pylibcudf/io/text.pyx        | 193 ++++++++++++++++++
 .../pylibcudf/pylibcudf/tests/io/test_text.py |  29 +++
 9 files changed, 285 insertions(+), 62 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst
 create mode 100644 python/pylibcudf/pylibcudf/io/text.pxd
 create mode 100644 python/pylibcudf/pylibcudf/io/text.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_text.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index 53638f071cc..cd5c5a5f77e 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -19,4 +19,5 @@ I/O Functions
     csv
     json
     parquet
+    text
     timezone
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst
new file mode 100644
index 00000000000..327ca043f36
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst
@@ -0,0 +1,6 @@
+====
+text
+====
+
+.. automodule:: pylibcudf.io.text
+    :members:
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
index b2c7232f549..7942d067c2b 100644
--- a/python/cudf/cudf/_lib/text.pyx
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -1,33 +1,20 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from io import TextIOBase
+from libcpp cimport bool
 
-from cython.operator cimport dereference
-from libc.stdint cimport uint64_t
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
+from io import TextIOBase
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.io.text cimport (
-    byte_range_info,
-    data_chunk_source,
-    make_source,
-    make_source_from_bgzip_file,
-    make_source_from_file,
-    multibyte_split,
-    parse_options,
-)
+import pylibcudf as plc
 
 from cudf._lib.column cimport Column
 
 
 def read_text(object filepaths_or_buffers,
-              object delimiter=None,
-              object byte_range=None,
-              object strip_delimiters=False,
-              object compression=None,
-              object compression_offsets=None):
+              str delimiter,
+              object byte_range,
+              bool strip_delimiters,
+              object compression,
+              object compression_offsets):
     """
     Cython function to call into libcudf API, see `multibyte_split`.
 
@@ -35,24 +22,11 @@ def read_text(object filepaths_or_buffers,
     --------
     cudf.io.text.read_text
     """
-    cdef string delim = delimiter.encode()
-
-    cdef unique_ptr[data_chunk_source] datasource
-    cdef unique_ptr[column] c_col
-
-    cdef size_t c_byte_range_offset
-    cdef size_t c_byte_range_size
-    cdef uint64_t c_compression_begin_offset
-    cdef uint64_t c_compression_end_offset
-    cdef parse_options c_options
-
     if compression is None:
         if isinstance(filepaths_or_buffers, TextIOBase):
-            datasource = move(make_source(
-                filepaths_or_buffers.read().encode()))
+            datasource = plc.io.text.make_source(filepaths_or_buffers.read())
         else:
-            datasource = move(make_source_from_file(
-                filepaths_or_buffers.encode()))
+            datasource = plc.io.text.make_source_from_file(filepaths_or_buffers)
     elif compression == "bgzip":
         if isinstance(filepaths_or_buffers, TextIOBase):
             raise ValueError("bgzip compression requires a file path")
@@ -60,30 +34,20 @@ def read_text(object filepaths_or_buffers,
             if len(compression_offsets) != 2:
                 raise ValueError(
                     "compression offsets need to consist of two elements")
-            c_compression_begin_offset = compression_offsets[0]
-            c_compression_end_offset = compression_offsets[1]
-            datasource = move(make_source_from_bgzip_file(
-                filepaths_or_buffers.encode(),
-                c_compression_begin_offset,
-                c_compression_end_offset))
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepaths_or_buffers,
+                compression_offsets[0],
+                compression_offsets[1]
+            )
         else:
-            datasource = move(make_source_from_bgzip_file(
-                filepaths_or_buffers.encode()))
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepaths_or_buffers,
+            )
     else:
         raise ValueError("Only bgzip compression is supported at the moment")
 
-    c_options = parse_options()
-    if byte_range is not None:
-        c_byte_range_offset = byte_range[0]
-        c_byte_range_size = byte_range[1]
-        c_options.byte_range = byte_range_info(
-            c_byte_range_offset,
-            c_byte_range_size)
-    c_options.strip_delimiters = strip_delimiters
-    with nogil:
-        c_col = move(multibyte_split(
-            dereference(datasource),
-            delim,
-            c_options))
-
-    return Column.from_unique_ptr(move(c_col))
+    options = plc.io.text.ParseOptions(
+        byte_range=byte_range, strip_delimiters=strip_delimiters
+    )
+    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
index 965724a47b1..f78d97ef4d1 100644
--- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx
-                   types.pyx
+                   text.pyx types.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd
index 1bcc0a3f963..6ba7f78a013 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/io/__init__.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 # CSV is removed since it is def not cpdef (to force kw-only arguments)
-from . cimport avro, datasource, json, orc, parquet, timezone, types
+from . cimport avro, datasource, json, orc, parquet, timezone, text, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py
index 2e4f215b12c..0fc77dd0f57 100644
--- a/python/pylibcudf/pylibcudf/io/__init__.py
+++ b/python/pylibcudf/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, csv, datasource, json, orc, parquet, timezone, types
+from . import avro, csv, datasource, json, orc, parquet, timezone, text, types
 from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd
new file mode 100644
index 00000000000..051e9bc0cde
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pxd
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source
+
+cdef class ParseOptions:
+    cdef parse_options c_options
+
+cdef class DataChunkSource:
+    cdef unique_ptr[data_chunk_source] c_source
+    cdef string data_ref
+
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=*
+)
+
+cpdef DataChunkSource make_source(str data)
+
+cpdef DataChunkSource make_source_from_file(str filename)
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    long long virtual_begin=*,
+    long long virtual_end=*,
+)
diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx
new file mode 100644
index 00000000000..667a054baaa
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/text.pyx
@@ -0,0 +1,193 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libc.stdint cimport uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.io cimport text as cpp_text
+
+cdef class ParseOptions:
+    """
+    Parsing options for `multibyte_split`
+
+    Parameters
+    ----------
+    byte_range : list | tuple, default None
+        ``(offset, size)`` pair; only rows starting inside this byte range
+        will be part of the output column.
+
+    strip_delimiters : bool, default False
+        Whether delimiters at the end of rows should
+        be stripped from the output column.
+    """
+    def __init__(
+        self,
+        *,
+        byte_range=None,
+        strip_delimiters=False,
+    ):
+        self.c_options = cpp_text.parse_options()
+        if byte_range is not None:
+            c_byte_range_offset = byte_range[0]
+            c_byte_range_size = byte_range[1]
+            self.c_options.byte_range = cpp_text.byte_range_info(
+                c_byte_range_offset,
+                c_byte_range_size
+            )
+        self.c_options.strip_delimiters = strip_delimiters
+
+
+cdef class DataChunkSource:
+    """
+    Data source for `multibyte_split`
+
+    Parameters
+    ----------
+    data : str
+        Filename or data itself.
+    """
+
+    def __cinit__(self, str data):
+        # Need to keep a reference alive for make_source
+        self.data_ref = data.encode()
+
+
+cpdef DataChunkSource make_source(str data):
+    """
+    Creates a data source capable of producing device-buffered views
+    of the given string.
+
+    Parameters
+    ----------
+    data : str
+        The host data to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided host data.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(data)
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source(dcs.data_ref))
+    return dcs
+
+
+cpdef DataChunkSource make_source_from_file(str filename):
+    """
+    Creates a data source capable of producing device-buffered views of the file.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the file to be exposed as a data chunk source.
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef DataChunkSource dcs = DataChunkSource(filename)
+    with nogil:
+        dcs.c_source = move(cpp_text.make_source_from_file(dcs.data_ref))
+    return dcs
+
+cpdef DataChunkSource make_source_from_bgzip_file(
+    str filename,
+    long long virtual_begin=-1,
+    long long virtual_end=-1,
+):
+    """
+    Creates a data source capable of producing device-buffered views of
+    a BGZIP compressed file with virtual record offsets.
+
+    Parameters
+    ----------
+    filename : str
+        The filename of the BGZIP-compressed file to be exposed as a data chunk source.
+
+    virtual_begin : int, default -1
+        The virtual (Tabix) offset of the first byte to be read. Its upper 48 bits
+        describe the offset into the compressed file, its lower 16 bits describe the
+        block-local offset. Must be given together with ``virtual_end``.
+
+    virtual_end : int, default -1
+        The virtual (Tabix) offset one past the last byte to be read
+
+    Returns
+    -------
+    DataChunkSource
+        The data chunk source for the provided filename.
+    """
+    cdef uint64_t c_virtual_begin
+    cdef uint64_t c_virtual_end
+    cdef DataChunkSource dcs = DataChunkSource(filename)
+
+    if virtual_begin == -1 and virtual_end == -1:
+        with nogil:
+            dcs.c_source = move(cpp_text.make_source_from_bgzip_file(dcs.data_ref))
+    elif virtual_begin != -1 and virtual_end != -1:
+        c_virtual_begin = virtual_begin
+        c_virtual_end = virtual_end
+        with nogil:
+            dcs.c_source = move(
+                cpp_text.make_source_from_bgzip_file(
+                    dcs.data_ref,
+                    c_virtual_begin,
+                    c_virtual_end,
+                )
+            )
+    else:
+        raise ValueError(
+            "virtual_begin and virtual_end must both be -1 or both be set"
+        )
+    return dcs
+
+cpdef Column multibyte_split(
+    DataChunkSource source,
+    str delimiter,
+    ParseOptions options=None
+):
+    """
+    Splits the source text into a strings column using a multiple byte delimiter.
+
+    For details, see :cpp:func:`cudf::io::text::multibyte_split`
+
+    Parameters
+    ----------
+    source : DataChunkSource
+        The data chunk source to split.
+
+    delimiter : str
+        UTF-8 encoded string for which to find offsets in the source.
+
+    options : ParseOptions, default None
+        The parsing options to use (including byte range).
+
+    Returns
+    -------
+    Column
+        The strings found by splitting the source by the delimiter
+        within the relevant byte range.
+    """
+    cdef unique_ptr[column] c_result
+    cdef string c_delimiter = delimiter.encode()
+
+    if options is None:
+        options = ParseOptions()
+
+    cdef cpp_text.parse_options c_options = options.c_options
+
+    # Borrow (don't move) the source so the same object can be split again.
+    with nogil:
+        c_result = cpp_text.multibyte_split(
+            dereference(source.c_source),
+            c_delimiter,
+            c_options
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_text.py b/python/pylibcudf/pylibcudf/tests/io/test_text.py
new file mode 100644
index 00000000000..f69e940e34e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/io/test_text.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "source_func",
+    [
+        "make_source",
+        "make_source_from_file",
+    ],
+)
+@pytest.mark.parametrize("options", [None, plc.io.text.ParseOptions()])
+def test_multibyte_split(source_func, options, tmp_path):
+    data = "x::y::z"
+    func = getattr(plc.io.text, source_func)
+    if source_func == "make_source":
+        source = func(data)
+    elif source_func == "make_source_from_file":
+        fle = tmp_path / "fle.txt"
+        fle.write_text(data)
+        source = func(str(fle))
+    result = plc.io.text.multibyte_split(source, "::", options)
+    expected = pa.array(["x::", "y::", "z"])
+    assert_column_eq(result, expected)