Migrate NVtext subword tokenizing APIs to pylibcudf (#17096)
Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17096
Matt711 authored Oct 31, 2024
1 parent a69de57 commit a0711d0
Showing 11 changed files with 202 additions and 43 deletions.
@@ -13,4 +13,5 @@ nvtext
normalize
replace
stemmer
subword_tokenize
tokenize
@@ -0,0 +1,6 @@
================
subword_tokenize
================

.. automodule:: pylibcudf.nvtext.subword_tokenize
:members:
50 changes: 13 additions & 37 deletions python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
@@ -5,35 +5,16 @@ from libc.stdint cimport uint32_t
from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
hashed_vocabulary as cpp_hashed_vocabulary,
load_vocabulary_file as cpp_load_vocabulary_file,
move as tr_move,
subword_tokenize as cpp_subword_tokenize,
tokenizer_result as cpp_tokenizer_result,
)

from cudf._lib.column cimport Column


cdef class Hashed_Vocabulary:
cdef unique_ptr[cpp_hashed_vocabulary] c_obj

def __cinit__(self, hash_file):
cdef string c_hash_file = <string>str(hash_file).encode()
with nogil:
self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))
from pylibcudf import nvtext


@acquire_spill_lock()
def subword_tokenize_inmem_hash(
Column strings,
Hashed_Vocabulary hashed_vocabulary,
object hashed_vocabulary,
uint32_t max_sequence_length=64,
uint32_t stride=48,
bool do_lower=True,
@@ -42,21 +23,16 @@ def subword_tokenize_inmem_hash(
"""
Subword tokenizes text series by using the pre-loaded hashed vocabulary
"""
cdef column_view c_strings = strings.view()
cdef cpp_tokenizer_result c_result
with nogil:
c_result = tr_move(
cpp_subword_tokenize(
c_strings,
hashed_vocabulary.c_obj.get()[0],
max_sequence_length,
stride,
do_lower,
do_truncate,
)
)
result = nvtext.subword_tokenize.subword_tokenize(
strings.to_pylibcudf(mode="read"),
hashed_vocabulary,
max_sequence_length,
stride,
do_lower,
do_truncate,
)
# return the 3 tensor components
tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids))
masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
tokens = Column.from_pylibcudf(result[0])
masks = Column.from_pylibcudf(result[1])
metadata = Column.from_pylibcudf(result[2])
return tokens, masks, metadata
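
For orientation, a rough sketch (not part of the diff) of how the slimmed-down internal wrapper can be driven from cudf's Python layer after the migration; the vocabulary file name is hypothetical and the wrapper itself remains a private API:

```python
# Illustrative only (not part of this diff): driving the migrated internal
# wrapper. Assumes a built cudf development environment; the vocabulary
# file name is hypothetical.
import cudf
import pylibcudf as plc
from cudf._lib.nvtext.subword_tokenize import subword_tokenize_inmem_hash

vocab = plc.nvtext.subword_tokenize.HashedVocabulary("hashed_vocab.txt")
sr = cudf.Series(["This is a test"])

# The wrapper now accepts the pylibcudf HashedVocabulary directly and
# rebuilds cudf columns with Column.from_pylibcudf instead of
# Column.from_unique_ptr.
tokens, masks, metadata = subword_tokenize_inmem_hash(
    sr._column,
    vocab,
    max_sequence_length=64,
    stride=48,
    do_lower=True,
    do_truncate=True,
)
```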
7 changes: 5 additions & 2 deletions python/cudf/cudf/core/subword_tokenizer.py
@@ -6,8 +6,9 @@

import cupy as cp

import pylibcudf as plc

from cudf._lib.nvtext.subword_tokenize import (
Hashed_Vocabulary as cpp_hashed_vocabulary,
subword_tokenize_inmem_hash as cpp_subword_tokenize,
)

@@ -50,7 +51,9 @@ class SubwordTokenizer:

def __init__(self, hash_file: str, do_lower_case: bool = True):
self.do_lower_case = do_lower_case
self.vocab_file = cpp_hashed_vocabulary(hash_file)
self.vocab_file = plc.nvtext.subword_tokenize.HashedVocabulary(
hash_file
)

def __call__(
self,
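
The public `SubwordTokenizer` behaviour is unchanged; only the vocabulary is now loaded through pylibcudf. A usage sketch assuming the pre-existing call signature (which this PR leaves untouched), with a hypothetical hash-file name:

```python
# Usage sketch of the public API after this change; the hash file name is
# hypothetical, and the call arguments follow the pre-existing
# SubwordTokenizer interface, which this PR does not modify.
import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer

ser = cudf.Series(["This is a test"])
tokenizer = SubwordTokenizer("bert_hash_table.txt", do_lower_case=True)
output = tokenizer(
    ser,
    max_length=64,
    max_num_rows=len(ser),
    padding="max_length",
    truncation=True,
    return_tensors="cp",  # return CuPy arrays
)
# output holds "input_ids", "attention_mask", and "metadata" entries
print(output["input_ids"].shape)
```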
@@ -9,14 +9,14 @@ from pylibcudf.libcudf.column.column_view cimport column_view


cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
cdef cppclass tokenizer_result "nvtext::tokenizer_result":
cdef cppclass tokenizer_result:
uint32_t nrows_tensor
uint32_t sequence_length
unique_ptr[column] tensor_token_ids
unique_ptr[column] tensor_attention_mask
unique_ptr[column] tensor_metadata

cdef struct hashed_vocabulary "nvtext::hashed_vocabulary":
cdef cppclass hashed_vocabulary:
uint16_t first_token_id
uint16_t separator_token_id
uint16_t unknown_token_id
@@ -26,14 +26,16 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
unique_ptr[column] table
unique_ptr[column] bin_coefficients
unique_ptr[column] bin_offsets
unique_ptr[column] cp_metadata
unique_ptr[column] aux_cp_table

cdef unique_ptr[hashed_vocabulary] load_vocabulary_file(
const string &filename_hashed_vocabulary
) except +

cdef tokenizer_result subword_tokenize(
const column_view & strings,
hashed_vocabulary & hashed_vocablary_obj,
hashed_vocabulary & hashed_vocabulary_obj,
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower,
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -14,7 +14,7 @@

set(cython_sources
edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx
replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx
)

set(linked_libraries cudf::cudf)
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -10,6 +10,7 @@ from . cimport (
normalize,
replace,
stemmer,
subword_tokenize,
tokenize,
)

@@ -23,5 +24,6 @@ __all__ = [
"normalize",
"replace",
"stemmer",
"subword_tokenize",
"tokenize",
]
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -10,6 +10,7 @@
normalize,
replace,
stemmer,
subword_tokenize,
tokenize,
)

@@ -23,5 +24,6 @@
"normalize",
"replace",
"stemmer",
"subword_tokenize",
"tokenize",
]
20 changes: 20 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.subword_tokenize cimport hashed_vocabulary


cdef class HashedVocabulary:
cdef unique_ptr[hashed_vocabulary] c_obj

cpdef tuple[Column, Column, Column] subword_tokenize(
Column input,
HashedVocabulary vocabulary_table,
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
)
84 changes: 84 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx
@@ -0,0 +1,84 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libc.stdint cimport uint32_t
from libcpp cimport bool
from libcpp.string cimport string
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
load_vocabulary_file as cpp_load_vocabulary_file,
move as tr_move,
subword_tokenize as cpp_subword_tokenize,
tokenizer_result as cpp_tokenizer_result,
)


cdef class HashedVocabulary:
"""The vocabulary data for use with the subword_tokenize function.
For details, see :cpp:class:`cudf::nvtext::hashed_vocabulary`.
"""
def __cinit__(self, hash_file):
cdef string c_hash_file = <string>str(hash_file).encode()
with nogil:
self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))

cpdef tuple[Column, Column, Column] subword_tokenize(
Column input,
HashedVocabulary vocabulary_table,
uint32_t max_sequence_length,
uint32_t stride,
bool do_lower_case,
bool do_truncate,
):
"""
Creates a tokenizer that cleans the text, splits it into
tokens and returns token-ids from an input vocabulary.
For details, see :cpp:func:`subword_tokenize`.

Parameters
----------
input : Column
The input strings to tokenize.
vocabulary_table : HashedVocabulary
The vocabulary table pre-loaded into this object.
max_sequence_length : uint32_t
Limit of the number of token-ids per row in the final tensor for each string.
stride : uint32_t
Each row in the output token-ids will repeat the last
``max_sequence_length - stride`` token-ids from the
previous row, unless it is the first string.
do_lower_case : bool
If true, the tokenizer will convert uppercase characters in the
input stream to lower-case and strip accents from those characters.
If false, accented and uppercase characters are not transformed.
do_truncate : bool
If true, the tokenizer will discard all the token-ids after
``max_sequence_length`` for each input string. If false, it
will use a new row in the output token-ids to continue
generating the output.

Returns
-------
tuple[Column, Column, Column]
A tuple of three columns containing the
tokens, masks, and metadata.
"""
cdef cpp_tokenizer_result c_result
with nogil:
c_result = tr_move(
cpp_subword_tokenize(
input.view(),
dereference(vocabulary_table.c_obj.get()),
max_sequence_length,
stride,
do_lower_case,
do_truncate,
)
)
cdef Column tokens = Column.from_libcudf(move(c_result.tensor_token_ids))
cdef Column masks = Column.from_libcudf(move(c_result.tensor_attention_mask))
cdef Column metadata = Column.from_libcudf(move(c_result.tensor_metadata))
return tokens, masks, metadata
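
The new pylibcudf entry point can also be called directly. A minimal sketch mirroring the test below; the vocabulary file path is hypothetical:

```python
# Minimal sketch of calling the new pylibcudf API directly; the vocabulary
# file path is hypothetical. Mirrors the usage exercised in the test below.
import pyarrow as pa
import pylibcudf as plc

vocab = plc.nvtext.subword_tokenize.HashedVocabulary("hashed_vocab.txt")
strings = plc.interop.from_arrow(pa.array(["This is a test"]))
tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize(
    strings,
    vocab,
    64,    # max_sequence_length
    48,    # stride
    True,  # do_lower_case
    True,  # do_truncate
)
# Each result is a pylibcudf Column; convert back to Arrow for inspection.
print(plc.interop.to_arrow(tokens))
```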
63 changes: 63 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py
@@ -0,0 +1,63 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import pylibcudf as plc


@pytest.fixture
def vocab_file(tmpdir):
hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt")
content = "1\n0\n10\n"
coefficients = [65559] * 10
for c in coefficients:
content = content + str(c) + " 0\n"
table = [0] * 10
table[0] = 3015668
content = content + "10\n"
for v in table:
content = content + str(v) + "\n"
content = content + "100\n101\n102\n\n"
hash_file.write(content)
return str(hash_file)


@pytest.fixture
def column_input():
return pa.array(["This is a test"])


@pytest.mark.parametrize("max_sequence_length", [64, 128])
@pytest.mark.parametrize("stride", [32, 64])
@pytest.mark.parametrize("do_lower_case", [True, False])
@pytest.mark.parametrize("do_truncate", [True, False])
def test_subword_tokenize(
vocab_file,
column_input,
max_sequence_length,
stride,
do_lower_case,
do_truncate,
):
vocab = plc.nvtext.subword_tokenize.HashedVocabulary(vocab_file)
tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize(
plc.interop.from_arrow(column_input),
vocab,
max_sequence_length,
stride,
do_lower_case,
do_truncate,
)
expected_tokens = pa.array(
[100] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
)
expected_masks = pa.array(
[1] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
)
expected_metadata = pa.array([0, 0, 3], type=pa.uint32())

assert_column_eq(tokens, expected_tokens)
assert_column_eq(masks, expected_masks)
assert_column_eq(metadata, expected_metadata)
