Migrate NVtext subword tokenizing APIs to pylibcudf (#17096)
Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17096
Showing 11 changed files with 202 additions and 43 deletions.
@@ -13,4 +13,5 @@ nvtext
 normalize
 replace
 stemmer
+subword_tokenize
 tokenize
docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
================
subword_tokenize
================

.. automodule:: pylibcudf.nvtext.subword_tokenize
    :members:
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.subword_tokenize cimport hashed_vocabulary


cdef class HashedVocabulary:
    cdef unique_ptr[hashed_vocabulary] c_obj


cpdef tuple[Column, Column, Column] subword_tokenize(
    Column input,
    HashedVocabulary vocabulary_table,
    uint32_t max_sequence_length,
    uint32_t stride,
    bool do_lower_case,
    bool do_truncate,
)
@@ -0,0 +1,84 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libc.stdint cimport uint32_t
from libcpp cimport bool
from libcpp.string cimport string
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
    load_vocabulary_file as cpp_load_vocabulary_file,
    move as tr_move,
    subword_tokenize as cpp_subword_tokenize,
    tokenizer_result as cpp_tokenizer_result,
)


cdef class HashedVocabulary:
    """The vocabulary data for use with the subword_tokenize function.

    For details, see :cpp:class:`cudf::nvtext::hashed_vocabulary`.
    """
    def __cinit__(self, hash_file):
        cdef string c_hash_file = <string>str(hash_file).encode()
        with nogil:
            self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))


cpdef tuple[Column, Column, Column] subword_tokenize(
    Column input,
    HashedVocabulary vocabulary_table,
    uint32_t max_sequence_length,
    uint32_t stride,
    bool do_lower_case,
    bool do_truncate,
):
""" | ||
Creates a tokenizer that cleans the text, splits it into | ||
tokens and returns token-ids from an input vocabulary. | ||
For details, see cpp:func:`subword_tokenize` | ||
Parameters | ||
---------- | ||
input : Column | ||
The input strings to tokenize. | ||
vocabulary_table : HashedVocabulary | ||
The vocabulary table pre-loaded into this object. | ||
max_sequence_length : uint32_t | ||
Limit of the number of token-ids per row in final tensor for each string. | ||
stride : uint32_t | ||
Each row in the output token-ids will replicate | ||
``max_sequence_length`` - ``stride`` the token-ids | ||
from the previous row, unless it is the first string. | ||
do_lower_case : bool | ||
If true, the tokenizer will convert uppercase characters in the | ||
input stream to lower-case and strip accents from those characters. | ||
If false, accented and uppercase characters are not transformed. | ||
do_truncate : bool | ||
If true, the tokenizer will discard all the token-ids after | ||
``max_sequence_length`` for each input string. If false, it | ||
will use a new row in the output token-ids to continue | ||
generating the output. | ||
Returns | ||
------- | ||
tuple[Column, Column, Column] | ||
A tuple of three columns containing the | ||
tokens, masks, and metadata. | ||
""" | ||
    cdef cpp_tokenizer_result c_result
    with nogil:
        c_result = tr_move(
            cpp_subword_tokenize(
                input.view(),
                dereference(vocabulary_table.c_obj.get()),
                max_sequence_length,
                stride,
                do_lower_case,
                do_truncate,
            )
        )
    cdef Column tokens = Column.from_libcudf(move(c_result.tensor_token_ids))
    cdef Column masks = Column.from_libcudf(move(c_result.tensor_attention_mask))
    cdef Column metadata = Column.from_libcudf(move(c_result.tensor_metadata))
    return tokens, masks, metadata
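
As a quick orientation for reviewers, here is a minimal usage sketch of the new Python-level API. The vocabulary path "hashed_vocab.txt" is a placeholder (any file in cudf's hashed-vocabulary format works; the test fixture below writes a tiny one), and the 64/16 values for max_sequence_length/stride are illustrative only.

import pyarrow as pa

import pylibcudf as plc

# "hashed_vocab.txt" is a placeholder path for a pre-hashed vocabulary file.
vocab = plc.nvtext.subword_tokenize.HashedVocabulary("hashed_vocab.txt")

strings = plc.interop.from_arrow(pa.array(["This is a test"]))
tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize(
    strings,
    vocab,
    64,     # max_sequence_length
    16,     # stride
    True,   # do_lower_case
    True,   # do_truncate
)

# Each result is a pylibcudf Column; convert back to pyarrow to inspect.
print(plc.interop.to_arrow(tokens))
print(plc.interop.to_arrow(metadata))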
python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import pylibcudf as plc


@pytest.fixture
def vocab_file(tmpdir):
    # Write a tiny hashed-vocabulary file in the format loaded by HashedVocabulary.
    hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt")
    content = "1\n0\n10\n"
    coefficients = [65559] * 10
    for c in coefficients:
        content = content + str(c) + " 0\n"
    table = [0] * 10
    table[0] = 3015668
    content = content + "10\n"
    for v in table:
        content = content + str(v) + "\n"
    content = content + "100\n101\n102\n\n"
    hash_file.write(content)
    return str(hash_file)


@pytest.fixture
def column_input():
    return pa.array(["This is a test"])


@pytest.mark.parametrize("max_sequence_length", [64, 128])
@pytest.mark.parametrize("stride", [32, 64])
@pytest.mark.parametrize("do_lower_case", [True, False])
@pytest.mark.parametrize("do_truncate", [True, False])
def test_subword_tokenize(
    vocab_file,
    column_input,
    max_sequence_length,
    stride,
    do_lower_case,
    do_truncate,
):
    vocab = plc.nvtext.subword_tokenize.HashedVocabulary(vocab_file)
    tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize(
        plc.interop.from_arrow(column_input),
        vocab,
        max_sequence_length,
        stride,
        do_lower_case,
        do_truncate,
    )
    # Each of the four input words maps to token-id 100 with this vocabulary;
    # the remaining positions in the row are padding.
    expected_tokens = pa.array(
        [100] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
    )
    expected_masks = pa.array(
        [1] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
    )
    expected_metadata = pa.array([0, 0, 3], type=pa.uint32())

    assert_column_eq(tokens, expected_tokens)
    assert_column_eq(masks, expected_masks)
    assert_column_eq(metadata, expected_metadata)
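
The test builds its hashed-vocabulary file by hand. For real vocabularies, cudf ships a hashing helper that produces the same format; a hedged sketch follows (the module path cudf.utils.hash_vocab_utils and the file names are assumptions, not part of this change).

from cudf.utils.hash_vocab_utils import hash_vocab

# Convert a plain one-token-per-line vocabulary (e.g. a BERT vocab.txt)
# into the hashed format that HashedVocabulary loads.
hash_vocab("vocab.txt", "hashed_vocab.txt")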