
Commit

add test_tokenize_document_with_slow_tokenizer_and_windowing()
ArneBinder committed Jan 11, 2024
1 parent e626582 commit dcb7330
Showing 1 changed file with 18 additions and 1 deletion.
19 changes: 18 additions & 1 deletion tests/document/processing/test_tokenization.py
@@ -647,8 +647,25 @@ def test_tokenize_document_partition(text_document, tokenizer):
 def test_tokenize_document_with_slow_tokenizer():
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
-    text_document = TextBasedDocument(text="Hello World")
+    text_document = TextBasedDocument(text="Alice has a cat. Bob has a dog.")
 
     tokenized_docs = tokenize_document(
         text_document, tokenizer=tokenizer, result_document_type=TokenBasedDocument
     )
     assert len(tokenized_docs) == 1
+
+
+def test_tokenize_document_with_slow_tokenizer_and_windowing():
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+    text_document = TextBasedDocument(text="Alice has a cat. Bob has a dog.")
+
+    tokenized_docs = tokenize_document(
+        text_document,
+        tokenizer=tokenizer,
+        result_document_type=TokenBasedDocument,
+        max_length=5,
+        return_overflowing_tokens=True,
+    )
+    assert (
+        len(tokenized_docs) == 3
+    )  # the input text gets tokenized into 12 tokens and max_length is 5
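For reference, a minimal sketch (not part of the commit) that reproduces the token count mentioned in the assert comment above, assuming the transformers package is installed and "bert-base-cased" can be loaded:

    from transformers import AutoTokenizer

    # Same slow tokenizer as in the test above.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

    # "Alice has a cat. Bob has a dog." splits into 10 word-piece tokens;
    # with [CLS] and [SEP] added, that gives the 12 tokens referenced in the
    # assert comment, which tokenize_document then windows with max_length=5.
    encoding = tokenizer("Alice has a cat. Bob has a dog.")
    print(len(encoding["input_ids"]))  # 12

How tokenize_document distributes those tokens across the 3 resulting documents (placement of special tokens, stride) is up to its implementation; the sketch only verifies the raw token count.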
