-
-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Kye
committed
Dec 23, 2023
1 parent
99ad2f9
commit d07d002
Showing
18 changed files
with
398 additions
and
214 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "zetascale" | ||
version = "1.2.3" | ||
version = "1.2.4" | ||
description = "Transformers at zeta scales" | ||
authors = ["Zeta Team <[email protected]>"] | ||
license = "MIT" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import torch | ||
import pytest | ||
from zeta.tokenizers.gptx_tokenizer import LanguageTokenizerGPTX | ||
|
||
|
||
def test_language_tokenizer_gptx_initialization():
    """Check the special tokens and length limit of a default-constructed tokenizer."""
    gptx = LanguageTokenizerGPTX()
    assert isinstance(gptx, LanguageTokenizerGPTX)

    # The wrapped tokenizer object carries the actual configuration.
    inner = gptx.tokenizer
    assert inner.eos_token == "<eos>"
    assert inner.pad_token == "<pad>"
    assert inner.model_max_length == 8192
|
||
|
||
def test_language_tokenizer_gptx_tokenize_texts():
    """Check that ``tokenize_texts`` returns a tensor whose first dimension matches the batch size."""
    gptx = LanguageTokenizerGPTX()
    batch = ["Hello, world!", "Goodbye, world!"]

    encoded = gptx.tokenize_texts(batch)

    assert isinstance(encoded, torch.Tensor)
    assert encoded.shape[0] == len(batch)
|
||
|
||
def test_language_tokenizer_gptx_decode():
    """Check that decoding the first tokenized entry of a batch yields a string."""
    gptx = LanguageTokenizerGPTX()
    batch = ["Hello, world!", "Goodbye, world!"]
    encoded = gptx.tokenize_texts(batch)

    first_decoded = gptx.decode(encoded[0])

    assert isinstance(first_decoded, str)
|
||
|
||
def test_language_tokenizer_gptx_len():
    """Check that ``len()`` of the tokenizer is a positive integer."""
    gptx = LanguageTokenizerGPTX()

    size = len(gptx)

    assert isinstance(size, int)
    assert size > 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from PIL import Image | ||
import torch | ||
import pytest | ||
from zeta.tokenizers.multi_modal_tokenizer import MultiModalTokenizer | ||
|
||
|
||
def test_multi_modal_tokenizer_initialization():
    """Check the default configuration of a freshly constructed ``MultiModalTokenizer``."""
    mm = MultiModalTokenizer()
    assert isinstance(mm, MultiModalTokenizer)
    assert mm.max_length == 8192

    # Special tokens and length limit live on the wrapped tokenizer object.
    inner = mm.tokenizer
    assert inner.eos_token == "<eos>"
    assert inner.pad_token == "<pad>"
    assert inner.model_max_length == mm.max_length

    # The stored image-marker indices must agree with a fresh token-to-id lookup.
    assert mm.im_idx == inner.convert_tokens_to_ids("<image>")
    assert mm.im_end_idx == inner.convert_tokens_to_ids("</image>")
|
||
|
||
def test_multi_modal_tokenizer_tokenize_texts():
    """Check that ``tokenize_texts`` returns two tensors, each with one entry per input text."""
    mm = MultiModalTokenizer()
    batch = ["Hello, world!", "Goodbye, world!"]

    full_tokens, text_only = mm.tokenize_texts(batch)

    assert isinstance(full_tokens, torch.Tensor)
    assert full_tokens.shape[0] == len(batch)
    assert isinstance(text_only, torch.Tensor)
    assert text_only.shape[0] == len(batch)
|
||
|
||
def test_multi_modal_tokenizer_tokenize_images():
    """Check that ``tokenize_images`` returns a tensor with one entry per input image."""
    mm = MultiModalTokenizer()

    # Two small solid-red PIL images serve as the input batch.
    red_images = [Image.new("RGB", (60, 30), color="red") for _ in range(2)]
    encoded = mm.tokenize_images(red_images)

    assert isinstance(encoded, torch.Tensor)
    assert encoded.shape[0] == len(red_images)
|
||
|
||
def test_multi_modal_tokenizer_tokenize():
    """Check that ``tokenize`` returns a dict containing the expected output keys."""
    mm = MultiModalTokenizer()
    captions = ["Hello, world!", "Goodbye, world!"]
    red_images = [Image.new("RGB", (60, 30), color="red") for _ in range(2)]

    result = mm.tokenize({"target_text": captions, "image": red_images})

    assert isinstance(result, dict)
    # Checked in the same order as the individual assertions they replace.
    for expected_key in ("text_tokens", "images", "labels", "attention_mask"):
        assert expected_key in result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import pytest | ||
import os | ||
from zeta.tokenizers.sentence_piece import SentencePieceTokenizer | ||
|
||
|
||
def test_sentence_piece_tokenizer_initialization():
    """Check that the tokenizer's size and special ids match its underlying model.

    The test is skipped, rather than failed, when no model file exists at
    ``model_path``: a missing fixture is an environment problem, not a defect
    in ``SentencePieceTokenizer``.
    """
    model_path = "/path/to/your/model"  # replace with your actual model path
    if not os.path.isfile(model_path):
        pytest.skip("Model file does not exist")

    tokenizer = SentencePieceTokenizer(model_path)

    assert isinstance(tokenizer, SentencePieceTokenizer)
    assert tokenizer.n_words == tokenizer.sp_model.vocab_size()
    assert tokenizer.bos_id == tokenizer.sp_model.bos_id()
    assert tokenizer.eos_id == tokenizer.sp_model.eos_id()
    assert tokenizer.pad_id == tokenizer.sp_model.pad_id()
|
||
|
||
def test_sentence_piece_tokenizer_encode():
    """Check that ``encode`` with ``bos=True, eos=True`` wraps the ids in BOS/EOS.

    Skipped when no model file exists at ``model_path`` so that a missing
    fixture does not surface as a test error.
    """
    model_path = "/path/to/your/model"  # replace with your actual model path
    if not os.path.isfile(model_path):
        pytest.skip("Model file does not exist")

    tokenizer = SentencePieceTokenizer(model_path)

    text = "Hello, world!"
    encoded_text = tokenizer.encode(text, bos=True, eos=True)

    assert isinstance(encoded_text, list)
    assert encoded_text[0] == tokenizer.bos_id
    assert encoded_text[-1] == tokenizer.eos_id
|
||
|
||
def test_sentence_piece_tokenizer_decode():
    """Check that decoding an encoded string returns the original text.

    Skipped when no model file exists at ``model_path`` so that a missing
    fixture does not surface as a test error.
    """
    model_path = "/path/to/your/model"  # replace with your actual model path
    if not os.path.isfile(model_path):
        pytest.skip("Model file does not exist")

    tokenizer = SentencePieceTokenizer(model_path)

    text = "Hello, world!"
    encoded_text = tokenizer.encode(text, bos=True, eos=True)
    decoded_text = tokenizer.decode(encoded_text)

    assert isinstance(decoded_text, str)
    assert decoded_text == text
|
||
|
||
def test_sentence_piece_tokenizer_encode_infilling():
    """Check that ``encode_infilling`` returns a list.

    Skipped when no model file exists at ``model_path`` so that a missing
    fixture does not surface as a test error.
    """
    model_path = "/path/to/your/model"  # replace with your actual model path
    if not os.path.isfile(model_path):
        pytest.skip("Model file does not exist")

    tokenizer = SentencePieceTokenizer(model_path)

    text = "Hello, world!"
    encoded_text = tokenizer.encode_infilling(text)

    assert isinstance(encoded_text, list)
|
||
|
||
def test_sentence_piece_tokenizer_decode_infilling():
    """Check the result of an ``encode_infilling`` / ``decode_infilling`` round trip.

    Skipped when no model file exists at ``model_path`` so that a missing
    fixture does not surface as a test error.
    """
    model_path = "/path/to/your/model"  # replace with your actual model path
    if not os.path.isfile(model_path):
        pytest.skip("Model file does not exist")

    tokenizer = SentencePieceTokenizer(model_path)

    text = "Hello, world!"
    encoded_text = tokenizer.encode_infilling(text)
    decoded_text = tokenizer.decode_infilling(encoded_text)

    assert isinstance(decoded_text, str)
    # NOTE(review): this expectation assumes the round trip drops the first
    # character of the input — confirm against the decode_infilling
    # implementation once a real model file is available.
    assert (
        decoded_text == text[1:]
    )  # the first character is removed in decode_infilling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
import pytest | ||
from zeta.tokenizers.tokenmonster import TokenMonster | ||
|
||
|
||
def test_token_monster_initialization():
    """Check that constructing a ``TokenMonster`` leaves a vocabulary attached."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    assert isinstance(monster, TokenMonster)
    assert monster.vocab is not None
|
||
|
||
def test_token_monster_set_local_directory():
    """Check that ``set_local_directory`` accepts an existing, writable directory.

    A real temporary directory replaces the former placeholder path, which does
    not exist on a normal machine.
    """
    import tempfile

    tokenizer = TokenMonster("englishcode-32000-consistent-v1")

    # mkdtemp() is used instead of TemporaryDirectory() so the directory is not
    # deleted when the test ends.
    # NOTE(review): presumably the configured directory stays in effect after
    # this call, so it must keep existing — confirm against the implementation.
    local_dir = tempfile.mkdtemp()
    tokenizer.set_local_directory(local_dir)

    # There's no direct way to assert the effect of this method as it doesn't return anything
    # and it doesn't change any accessible state of the TokenMonster object.
    # You might need to check manually if the directory is set correctly.
|
||
|
||
def test_token_monster_load():
    """Check that a vocabulary is present after an explicit ``load`` call."""
    vocab_name = "englishcode-32000-consistent-v1"
    monster = TokenMonster(vocab_name)

    monster.load(vocab_name)

    assert monster.vocab is not None
|
||
|
||
def test_token_monster_load_multiprocess_safe():
    """Check that a vocabulary is present after ``load_multiprocess_safe``."""
    vocab_name = "englishcode-32000-consistent-v1"
    monster = TokenMonster(vocab_name)

    monster.load_multiprocess_safe(vocab_name)

    assert monster.vocab is not None
|
||
|
||
def test_token_monster_new():
    """Check that a vocabulary is still present after calling ``new`` with a YAML definition."""
    monster = TokenMonster("englishcode-32000-consistent-v1")
    # NOTE(review): whitespace inside this literal follows conventional YAML
    # nesting — confirm it matches the committed file exactly.
    vocab_yaml = """
    tokens:
      - token: " "
        score: 0
      - token: "e"
        score: 1
      - token: "t"
        score: 2
    """

    monster.new(vocab_yaml)

    assert monster.vocab is not None
|
||
|
||
def test_token_monster_save():
    """Check that ``save`` writes the vocabulary to the requested path.

    The former placeholder path does not exist on a normal machine, so the
    vocabulary is saved into a temporary directory that is removed afterwards.
    """
    import os
    import tempfile

    tokenizer = TokenMonster("englishcode-32000-consistent-v1")

    with tempfile.TemporaryDirectory() as tmp_dir:
        target = os.path.join(tmp_dir, "saved.vocab")
        tokenizer.save(target)

        # NOTE(review): assumes save() creates a file at exactly the path it
        # was given — confirm against the TokenMonster documentation.
        assert os.path.isfile(target)
|
||
|
||
def test_token_monster_export_yaml():
    """Check that ``export_yaml`` returns a ``bytes`` object."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    exported = monster.export_yaml()

    assert isinstance(exported, bytes)
|
||
|
||
def test_token_monster_tokenize():
    """Check that ``tokenize`` returns a list for a short string."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    token_ids = monster.tokenize("Hello world!")

    assert isinstance(token_ids, list)
|
||
|
||
def test_token_monster_tokenize_count():
    """Check that ``tokenize_count`` returns an integer for a short string."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    n_tokens = monster.tokenize_count("Hello world!")

    assert isinstance(n_tokens, int)
|
||
|
||
def test_token_monster_decode():
    """Check that tokenizing then decoding a string reproduces it exactly."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    token_ids = monster.tokenize("Hello world!")
    round_tripped = monster.decode(token_ids)

    assert isinstance(round_tripped, str)
    assert round_tripped == "Hello world!"
|
||
|
||
def test_token_monster_decoder():
    """Check that ``decoder`` returns an object rather than ``None``."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    decoder_obj = monster.decoder()

    assert decoder_obj is not None
|
||
|
||
def test_token_monster_get_dictionary():
    """Check that ``get_dictionary`` returns a list."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    entries = monster.get_dictionary()

    assert isinstance(entries, list)
|
||
|
||
def test_token_monster_charset():
    """Check that ``charset`` returns a string."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    charset_name = monster.charset()

    assert isinstance(charset_name, str)
|
||
|
||
def test_token_monster_normalization():
    """Check that ``normalization`` returns a string."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    normalization_value = monster.normalization()

    assert isinstance(normalization_value, str)
|
||
|
||
def test_token_monster_capcode():
    """Check that ``capcode`` returns an integer."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    capcode_value = monster.capcode()

    assert isinstance(capcode_value, int)
|
||
|
||
def test_token_monster_mode():
    """Check that ``mode`` returns an integer."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    mode_value = monster.mode()

    assert isinstance(mode_value, int)
|
||
|
||
def test_token_monster_id_to_token():
    """Check that ``id_to_token`` returns a string for token id 1."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    token_text = monster.id_to_token(1)

    assert isinstance(token_text, str)
|
||
|
||
def test_token_monster_id_to_token_decoded():
    """Check that ``id_to_token_decoded`` returns a string for token id 1."""
    monster = TokenMonster("englishcode-32000-consistent-v1")

    decoded_token = monster.id_to_token_decoded(1)

    assert isinstance(decoded_token, str)
Oops, something went wrong.