From 3bfaf5c0b094bbcd868636515f2bf8086b2d4814 Mon Sep 17 00:00:00 2001
From: Hk669
Date: Wed, 5 Jun 2024 20:26:10 +0530
Subject: [PATCH] fix: tests_train_bpe_w_special_tokens

---
 tests/test_tokenizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index bafea1e..834e64e 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -61,8 +61,8 @@ def test_train_bpe_w_special_tokens():
     texts = "<|startoftext|> Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.<|endoftext|>"
     tokenizer.train(texts, vocab_size=310, verbose=False)
 
-    assert len(tokenizer.vocab) == 281
-    assert len(tokenizer.merges) == 25
+    assert len(tokenizer.vocab) == 310
+    assert len(tokenizer.merges) == 54
     assert tokenizer.decode(tokenizer.encode(texts)) == texts
     assert tokenizer.inverse_special_tokens == {v: k for k,v in special_tokens.items()}
     assert tokenizer.special_tokens == special_tokens
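The corrected numbers follow from how byte-level BPE training is usually counted: the vocabulary starts from the 256 single-byte tokens, and every merge adds exactly one new token, so training to `vocab_size=310` performs `310 - 256 = 54` merges and leaves `len(vocab) == 310`. The test asserting `tokenizer.special_tokens` separately suggests special tokens live in their own map rather than being counted in `vocab`. Below is a minimal sketch of that arithmetic, assuming a minbpe-style byte-level trainer; `expected_counts` is a hypothetical helper, not part of this repo's API.

```python
# Sketch: why the patched assertions hold for a byte-level BPE trainer
# (assumption: vocab = 256 base byte tokens + one token per merge).

def expected_counts(vocab_size: int, num_base_bytes: int = 256) -> tuple[int, int]:
    """Return (expected vocab length, expected merge count) after training."""
    num_merges = vocab_size - num_base_bytes  # each merge mints one token
    return num_base_bytes + num_merges, num_merges

vocab_len, merge_count = expected_counts(310)
assert vocab_len == 310   # matches the corrected len(tokenizer.vocab)
assert merge_count == 54  # matches the corrected len(tokenizer.merges)
```

Under that model, the old expected values (281 and 25) would only hold for a smaller training target, which explains why they failed once the test trained to `vocab_size=310`.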