Commit d07d002

[TESTS][zeta.tokenizers]

Kye committed Dec 23, 2023
1 parent 99ad2f9 commit d07d002

Showing 18 changed files with 398 additions and 214 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "zetascale"
version = "1.2.3"
version = "1.2.4"
description = "Transformers at zeta scales"
authors = ["Zeta Team <[email protected]>"]
license = "MIT"
3 changes: 2 additions & 1 deletion tests/nn/modules/test_simple_res_block.py
@@ -2,6 +2,7 @@
import pytest
from zeta.nn.modules.simple_resblock import SimpleResBlock


def test_simple_resblock():
# Initialize a SimpleResBlock with 10 channels
resblock = SimpleResBlock(10)
@@ -20,4 +21,4 @@ def test_simple_resblock():
assert not torch.all(torch.eq(output, x))

# Check that the output is a tensor
assert isinstance(output, torch.Tensor)
5 changes: 4 additions & 1 deletion tests/structs/test_autoregressive_wrapper.py
@@ -3,6 +3,7 @@
from zeta.structs.auto_regressive_wrapper import AutoregressiveWrapper
from torch import nn


def test_autoregressive_wrapper_initialization():
net = nn.Linear(10, 10)
wrapper = AutoregressiveWrapper(net)
@@ -14,6 +15,7 @@ def test_autoregressive_wrapper_initialization():
assert wrapper.ignore_index == -100
assert wrapper.mask_prob == 0.0


def test_autoregressive_wrapper_forward():
net = nn.Linear(10, 10)
wrapper = AutoregressiveWrapper(net)
@@ -24,6 +26,7 @@ def test_autoregressive_wrapper_forward():
assert isinstance(logits, torch.Tensor)
assert logits.shape == torch.Size([1, 10, 10])


def test_autoregressive_wrapper_generate():
net = nn.Linear(10, 10)
wrapper = AutoregressiveWrapper(net)
@@ -32,4 +35,4 @@ def test_autoregressive_wrapper_generate():
generated = wrapper.generate(x, 10)

assert isinstance(generated, torch.Tensor)
assert generated.shape == torch.Size([1, 10])
5 changes: 4 additions & 1 deletion tests/structs/test_encoder_decoder.py
@@ -3,6 +3,7 @@
from zeta.structs.encoder_decoder import EncoderDecoder
from argparse import Namespace


def test_encoder_decoder_initialization():
args = Namespace(share_all_embeddings=True)
encoder_decoder = EncoderDecoder(args)
@@ -12,6 +13,7 @@ def test_encoder_decoder_initialization():
assert encoder_decoder.args.share_all_embeddings is True
assert encoder_decoder.args.share_decoder_input_output_embed is True


def test_encoder_decoder_forward():
args = Namespace(share_all_embeddings=True)
encoder_decoder = EncoderDecoder(args)
@@ -24,6 +26,7 @@ def test_encoder_decoder_forward():
assert isinstance(output, torch.Tensor)
assert output.shape == prev_output_tokens.shape


def test_encoder_decoder_forward_features_only():
args = Namespace(share_all_embeddings=True)
encoder_decoder = EncoderDecoder(args)
@@ -34,4 +37,4 @@ def test_encoder_decoder_forward_features_only():
output = encoder_decoder(src_tokens, prev_output_tokens, features_only=True)

assert isinstance(output, torch.Tensor)
assert output.shape == prev_output_tokens.shape
41 changes: 41 additions & 0 deletions tests/tokenizers/test_gptx.py
@@ -0,0 +1,41 @@
import torch
import pytest
from zeta.tokenizers.gptx_tokenizer import LanguageTokenizerGPTX


def test_language_tokenizer_gptx_initialization():
tokenizer = LanguageTokenizerGPTX()

assert isinstance(tokenizer, LanguageTokenizerGPTX)
assert tokenizer.tokenizer.eos_token == "<eos>"
assert tokenizer.tokenizer.pad_token == "<pad>"
assert tokenizer.tokenizer.model_max_length == 8192


def test_language_tokenizer_gptx_tokenize_texts():
tokenizer = LanguageTokenizerGPTX()

texts = ["Hello, world!", "Goodbye, world!"]
tokenized_texts = tokenizer.tokenize_texts(texts)

assert isinstance(tokenized_texts, torch.Tensor)
assert tokenized_texts.shape[0] == len(texts)


def test_language_tokenizer_gptx_decode():
tokenizer = LanguageTokenizerGPTX()

texts = ["Hello, world!", "Goodbye, world!"]
tokenized_texts = tokenizer.tokenize_texts(texts)
decoded_texts = tokenizer.decode(tokenized_texts[0])

assert isinstance(decoded_texts, str)


def test_language_tokenizer_gptx_len():
tokenizer = LanguageTokenizerGPTX()

num_tokens = len(tokenizer)

assert isinstance(num_tokens, int)
assert num_tokens > 0
59 changes: 59 additions & 0 deletions tests/tokenizers/test_multimodal_tokenizer.py
@@ -0,0 +1,59 @@
from PIL import Image
import torch
import pytest
from zeta.tokenizers.multi_modal_tokenizer import MultiModalTokenizer


def test_multi_modal_tokenizer_initialization():
tokenizer = MultiModalTokenizer()

assert isinstance(tokenizer, MultiModalTokenizer)
assert tokenizer.max_length == 8192
assert tokenizer.tokenizer.eos_token == "<eos>"
assert tokenizer.tokenizer.pad_token == "<pad>"
assert tokenizer.tokenizer.model_max_length == tokenizer.max_length
assert tokenizer.im_idx == tokenizer.tokenizer.convert_tokens_to_ids(
"<image>"
)
assert tokenizer.im_end_idx == tokenizer.tokenizer.convert_tokens_to_ids(
"</image>"
)


def test_multi_modal_tokenizer_tokenize_texts():
tokenizer = MultiModalTokenizer()

texts = ["Hello, world!", "Goodbye, world!"]
tokenized_texts, only_text_tokens = tokenizer.tokenize_texts(texts)

assert isinstance(tokenized_texts, torch.Tensor)
assert tokenized_texts.shape[0] == len(texts)
assert isinstance(only_text_tokens, torch.Tensor)
assert only_text_tokens.shape[0] == len(texts)


def test_multi_modal_tokenizer_tokenize_images():
tokenizer = MultiModalTokenizer()

# Assuming images is a list of PIL Image objects
images = [Image.new("RGB", (60, 30), color="red") for _ in range(2)]
tokenized_images = tokenizer.tokenize_images(images)

assert isinstance(tokenized_images, torch.Tensor)
assert tokenized_images.shape[0] == len(images)


def test_multi_modal_tokenizer_tokenize():
tokenizer = MultiModalTokenizer()

sample = {
"target_text": ["Hello, world!", "Goodbye, world!"],
"image": [Image.new("RGB", (60, 30), color="red") for _ in range(2)],
}
tokenized_sample = tokenizer.tokenize(sample)

assert isinstance(tokenized_sample, dict)
assert "text_tokens" in tokenized_sample
assert "images" in tokenized_sample
assert "labels" in tokenized_sample
assert "attention_mask" in tokenized_sample
64 changes: 64 additions & 0 deletions tests/tokenizers/test_sentencepiece.py
@@ -0,0 +1,64 @@
import pytest
import os
from zeta.tokenizers.sentence_piece import SentencePieceTokenizer


def test_sentence_piece_tokenizer_initialization():
model_path = "/path/to/your/model" # replace with your actual model path
assert os.path.isfile(model_path), "Model file does not exist"

tokenizer = SentencePieceTokenizer(model_path)

assert isinstance(tokenizer, SentencePieceTokenizer)
assert tokenizer.n_words == tokenizer.sp_model.vocab_size()
assert tokenizer.bos_id == tokenizer.sp_model.bos_id()
assert tokenizer.eos_id == tokenizer.sp_model.eos_id()
assert tokenizer.pad_id == tokenizer.sp_model.pad_id()


def test_sentence_piece_tokenizer_encode():
model_path = "/path/to/your/model" # replace with your actual model path
tokenizer = SentencePieceTokenizer(model_path)

text = "Hello, world!"
encoded_text = tokenizer.encode(text, bos=True, eos=True)

assert isinstance(encoded_text, list)
assert encoded_text[0] == tokenizer.bos_id
assert encoded_text[-1] == tokenizer.eos_id


def test_sentence_piece_tokenizer_decode():
model_path = "/path/to/your/model" # replace with your actual model path
tokenizer = SentencePieceTokenizer(model_path)

text = "Hello, world!"
encoded_text = tokenizer.encode(text, bos=True, eos=True)
decoded_text = tokenizer.decode(encoded_text)

assert isinstance(decoded_text, str)
assert decoded_text == text


def test_sentence_piece_tokenizer_encode_infilling():
model_path = "/path/to/your/model" # replace with your actual model path
tokenizer = SentencePieceTokenizer(model_path)

text = "Hello, world!"
encoded_text = tokenizer.encode_infilling(text)

assert isinstance(encoded_text, list)


def test_sentence_piece_tokenizer_decode_infilling():
model_path = "/path/to/your/model" # replace with your actual model path
tokenizer = SentencePieceTokenizer(model_path)

text = "Hello, world!"
encoded_text = tokenizer.encode_infilling(text)
decoded_text = tokenizer.decode_infilling(encoded_text)

assert isinstance(decoded_text, str)
assert (
decoded_text == text[1:]
) # the first character is removed in decode_infilling
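
Note: the SentencePiece tests above hard-code a placeholder model path, so they fail unless every "/path/to/your/model" is edited by hand. A minimal sketch of one way to centralize this, assuming a hypothetical ZETA_SP_MODEL environment variable (not part of the repository) and pytest's built-in skip mechanism; each test would then take sp_model_path as a fixture argument instead of defining model_path inline:

import os

import pytest


@pytest.fixture
def sp_model_path():
    # Hypothetical fixture: ZETA_SP_MODEL is an assumed environment variable,
    # not something the repository defines. Skip when no model file is available.
    path = os.environ.get("ZETA_SP_MODEL")
    if not path or not os.path.isfile(path):
        pytest.skip("SentencePiece model file not available")
    return path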
145 changes: 145 additions & 0 deletions tests/tokenizers/test_tokenmonster.py
@@ -0,0 +1,145 @@
import pytest
from zeta.tokenizers.tokenmonster import TokenMonster


def test_token_monster_initialization():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")

assert isinstance(tokenizer, TokenMonster)
assert tokenizer.vocab is not None


def test_token_monster_set_local_directory():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
tokenizer.set_local_directory(
"/path/to/your/directory"
) # replace with your actual directory

# There's no direct way to assert the effect of this method as it doesn't return anything
# and it doesn't change any accessible state of the TokenMonster object.
# You might need to check manually if the directory is set correctly.


def test_token_monster_load():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
tokenizer.load("englishcode-32000-consistent-v1")

assert tokenizer.vocab is not None


def test_token_monster_load_multiprocess_safe():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
tokenizer.load_multiprocess_safe("englishcode-32000-consistent-v1")

assert tokenizer.vocab is not None


def test_token_monster_new():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
yaml = """
tokens:
- token: " "
score: 0
- token: "e"
score: 1
- token: "t"
score: 2
"""
tokenizer.new(yaml)

assert tokenizer.vocab is not None


def test_token_monster_save():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
tokenizer.save("/path/to/your/file") # replace with your actual file path

# There's no direct way to assert the effect of this method as it doesn't return anything
# and it doesn't change any accessible state of the TokenMonster object.
# You might need to check manually if the file is saved correctly.


def test_token_monster_export_yaml():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
yaml = tokenizer.export_yaml()

assert isinstance(yaml, bytes)


def test_token_monster_tokenize():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
tokens = tokenizer.tokenize("Hello world!")

assert isinstance(tokens, list)


def test_token_monster_tokenize_count():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
count = tokenizer.tokenize_count("Hello world!")

assert isinstance(count, int)


def test_token_monster_decode():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
tokens = tokenizer.tokenize("Hello world!")
text = tokenizer.decode(tokens)

assert isinstance(text, str)
assert text == "Hello world!"


def test_token_monster_decoder():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
decoder = tokenizer.decoder()

assert decoder is not None


def test_token_monster_get_dictionary():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
dictionary = tokenizer.get_dictionary()

assert isinstance(dictionary, list)


def test_token_monster_charset():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
charset = tokenizer.charset()

assert isinstance(charset, str)


def test_token_monster_normalization():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
normalization = tokenizer.normalization()

assert isinstance(normalization, str)


def test_token_monster_capcode():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
capcode = tokenizer.capcode()

assert isinstance(capcode, int)


def test_token_monster_mode():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
mode = tokenizer.mode()

assert isinstance(mode, int)


def test_token_monster_id_to_token():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
token = tokenizer.id_to_token(1)

assert isinstance(token, str)


def test_token_monster_id_to_token_decoded():
tokenizer = TokenMonster("englishcode-32000-consistent-v1")
token = tokenizer.id_to_token_decoded(1)

assert isinstance(token, str)