Migrate NVtext subword tokenizing APIs to pylibcudf (#17096)
Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17096
Showing 11 changed files with 202 additions and 43 deletions.
@@ -13,4 +13,5 @@ nvtext
 normalize
 replace
 stemmer
+subword_tokenize
 tokenize
docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
================
subword_tokenize
================

.. automodule:: pylibcudf.nvtext.subword_tokenize
    :members:
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.subword_tokenize cimport hashed_vocabulary


cdef class HashedVocabulary:
    cdef unique_ptr[hashed_vocabulary] c_obj


cpdef tuple[Column, Column, Column] subword_tokenize(
    Column input,
    HashedVocabulary vocabulary_table,
    uint32_t max_sequence_length,
    uint32_t stride,
    bool do_lower_case,
    bool do_truncate,
)
@@ -0,0 +1,84 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libc.stdint cimport uint32_t
from libcpp cimport bool
from libcpp.string cimport string
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
    load_vocabulary_file as cpp_load_vocabulary_file,
    move as tr_move,
    subword_tokenize as cpp_subword_tokenize,
    tokenizer_result as cpp_tokenizer_result,
)


cdef class HashedVocabulary:
    """The vocabulary data for use with the subword_tokenize function.

    For details, see :cpp:class:`cudf::nvtext::hashed_vocabulary`.
    """
    def __cinit__(self, hash_file):
        cdef string c_hash_file = <string>str(hash_file).encode()
        with nogil:
            self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))


cpdef tuple[Column, Column, Column] subword_tokenize(
    Column input,
    HashedVocabulary vocabulary_table,
    uint32_t max_sequence_length,
    uint32_t stride,
    bool do_lower_case,
    bool do_truncate,
):
""" | ||
Creates a tokenizer that cleans the text, splits it into | ||
tokens and returns token-ids from an input vocabulary. | ||
For details, see cpp:func:`subword_tokenize` | ||
Parameters | ||
---------- | ||
input : Column | ||
The input strings to tokenize. | ||
vocabulary_table : HashedVocabulary | ||
The vocabulary table pre-loaded into this object. | ||
max_sequence_length : uint32_t | ||
Limit of the number of token-ids per row in final tensor for each string. | ||
stride : uint32_t | ||
Each row in the output token-ids will replicate | ||
``max_sequence_length`` - ``stride`` the token-ids | ||
from the previous row, unless it is the first string. | ||
do_lower_case : bool | ||
If true, the tokenizer will convert uppercase characters in the | ||
input stream to lower-case and strip accents from those characters. | ||
If false, accented and uppercase characters are not transformed. | ||
do_truncate : bool | ||
If true, the tokenizer will discard all the token-ids after | ||
``max_sequence_length`` for each input string. If false, it | ||
will use a new row in the output token-ids to continue | ||
generating the output. | ||
Returns | ||
------- | ||
tuple[Column, Column, Column] | ||
A tuple of three columns containing the | ||
tokens, masks, and metadata. | ||
""" | ||
    cdef cpp_tokenizer_result c_result
    with nogil:
        c_result = tr_move(
            cpp_subword_tokenize(
                input.view(),
                dereference(vocabulary_table.c_obj.get()),
                max_sequence_length,
                stride,
                do_lower_case,
                do_truncate,
            )
        )
    cdef Column tokens = Column.from_libcudf(move(c_result.tensor_token_ids))
    cdef Column masks = Column.from_libcudf(move(c_result.tensor_attention_mask))
    cdef Column metadata = Column.from_libcudf(move(c_result.tensor_metadata))
    return tokens, masks, metadata
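
As a quick orientation for reviewers, here is a minimal usage sketch of the new Python-level API. The vocabulary path "hashed_vocab.txt" is a placeholder (any file in cudf's hashed-vocabulary format works; the test fixture below writes a tiny one), and the 64/16 values for max_sequence_length/stride are illustrative only.

import pyarrow as pa

import pylibcudf as plc

# "hashed_vocab.txt" is a placeholder path for a pre-hashed vocabulary file.
vocab = plc.nvtext.subword_tokenize.HashedVocabulary("hashed_vocab.txt")

strings = plc.interop.from_arrow(pa.array(["This is a test"]))
tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize(
    strings,
    vocab,
    64,     # max_sequence_length
    16,     # stride
    True,   # do_lower_case
    True,   # do_truncate
)

# Each result is a pylibcudf Column; convert back to pyarrow to inspect.
print(plc.interop.to_arrow(tokens))
print(plc.interop.to_arrow(metadata))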
python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import pylibcudf as plc


@pytest.fixture
def vocab_file(tmpdir):
    # Write a tiny hashed-vocabulary file in the format loaded by HashedVocabulary.
    hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt")
    content = "1\n0\n10\n"
    coefficients = [65559] * 10
    for c in coefficients:
        content = content + str(c) + " 0\n"
    table = [0] * 10
    table[0] = 3015668
    content = content + "10\n"
    for v in table:
        content = content + str(v) + "\n"
    content = content + "100\n101\n102\n\n"
    hash_file.write(content)
    return str(hash_file)


@pytest.fixture
def column_input():
    return pa.array(["This is a test"])


@pytest.mark.parametrize("max_sequence_length", [64, 128])
@pytest.mark.parametrize("stride", [32, 64])
@pytest.mark.parametrize("do_lower_case", [True, False])
@pytest.mark.parametrize("do_truncate", [True, False])
def test_subword_tokenize(
    vocab_file,
    column_input,
    max_sequence_length,
    stride,
    do_lower_case,
    do_truncate,
):
    vocab = plc.nvtext.subword_tokenize.HashedVocabulary(vocab_file)
    tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize(
        plc.interop.from_arrow(column_input),
        vocab,
        max_sequence_length,
        stride,
        do_lower_case,
        do_truncate,
    )
    # Each of the four input words maps to token-id 100 with this vocabulary;
    # the remaining positions in the row are padding.
    expected_tokens = pa.array(
        [100] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
    )
    expected_masks = pa.array(
        [1] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
    )
    expected_metadata = pa.array([0, 0, 3], type=pa.uint32())

    assert_column_eq(tokens, expected_tokens)
    assert_column_eq(masks, expected_masks)
    assert_column_eq(metadata, expected_metadata)
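
The test builds its hashed-vocabulary file by hand. For real vocabularies, cudf ships a hashing helper that produces the same format; a hedged sketch follows (the module path cudf.utils.hash_vocab_utils and the file names are assumptions, not part of this change).

from cudf.utils.hash_vocab_utils import hash_vocab

# Convert a plain one-token-per-line vocabulary (e.g. a BERT vocab.txt)
# into the hashed format that HashedVocabulary loads.
hash_vocab("vocab.txt", "hashed_vocab.txt")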