From 187b420e9ef2d126bed4e2024abc406d4b147c8e Mon Sep 17 00:00:00 2001
From: Kye
Date: Sat, 23 Sep 2023 21:46:43 -0400
Subject: [PATCH] tokenmonster

---
 pyproject.toml                    |   1 +
 zeta/tokenizers/__init__.py       |   3 +-
 zeta/tokenizers/token_monsters.py | 333 ++++++++++++++++++++++++++++++
 3 files changed, 336 insertions(+), 1 deletion(-)
 create mode 100644 zeta/tokenizers/token_monsters.py

diff --git a/pyproject.toml b/pyproject.toml
index 183f68b5..43a65c8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ lion-pytorch = "*"
 sentencepiece = "*"
 colt5-attention = "0.10.14"
 vector-quantize-pytorch = "1.1.5"
+tokenmonster = "*"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/zeta/tokenizers/__init__.py b/zeta/tokenizers/__init__.py
index 9a57b855..507f8000 100644
--- a/zeta/tokenizers/__init__.py
+++ b/zeta/tokenizers/__init__.py
@@ -1,3 +1,4 @@
 from zeta.tokenizers.language_tokenizer import LanguageTokenizerGPTX
 from zeta.tokenizers.multi_modal_tokenizer import MultiModalTokenizer
-from zeta.tokenizers.sentence_piece import SentencePieceTokenizer
\ No newline at end of file
+from zeta.tokenizers.sentence_piece import SentencePieceTokenizer
+from zeta.tokenizers.token_monsters import TokenMonster
\ No newline at end of file
diff --git a/zeta/tokenizers/token_monsters.py b/zeta/tokenizers/token_monsters.py
new file mode 100644
index 00000000..5fef7a66
--- /dev/null
+++ b/zeta/tokenizers/token_monsters.py
@@ -0,0 +1,333 @@
+import numpy as np
+import tokenmonster
+
+
+class TokenMonster:
+    """
+    A class that encapsulates the functionality of the tokenmonster library.
+
+    >>> from zeta.tokenizers import TokenMonster
+    >>> tokenizer = TokenMonster("englishcode-32000-consistent-v1")
+    >>> tokenizer.tokenize("Hello world!")
+    """
+
+    def __init__(self, path):
+        """
+        Initializes the TokenMonster class and loads a vocabulary.
+
+        Args:
+            path (str): A filepath, URL, or pre-built vocabulary name.
+        """
+        self.vocab = tokenmonster.load(path)
+
+    def set_local_directory(self, dir=None):
+        """
+        Sets the local directory for TokenMonster.
+
+        Args:
+            dir (str, optional): The local directory to use. Defaults to None.
+        """
+        tokenmonster.set_local_directory(dir)
+
+    def load(self, path):
+        """
+        Loads a TokenMonster vocabulary from file, URL, or by name.
+
+        Args:
+            path (str): A filepath, URL, or pre-built vocabulary name.
+        """
+        self.vocab = tokenmonster.load(path)
+
+    def load_multiprocess_safe(self, path):
+        """
+        Loads a TokenMonster vocabulary from file, URL, or by name. This variant
+        is safe for multiprocessing, but vocabulary modification is disabled and
+        tokenization is slightly slower.
+
+        Args:
+            path (str): A filepath, URL, or pre-built vocabulary name.
+        """
+        self.vocab = tokenmonster.load_multiprocess_safe(path)
+
+    def new(self, yaml):
+        """
+        Creates a new vocabulary from a YAML string.
+
+        Args:
+            yaml (str): The YAML string describing the vocabulary.
+        """
+        self.vocab = tokenmonster.new(yaml)
+
+    def save(self, fname):
+        """
+        Saves the current vocabulary to a file.
+
+        Args:
+            fname (str): The filename to save the vocabulary to.
+        """
+        self.vocab.save(fname)
+
+    def export_yaml(self, order_by_score=False):
+        """
+        Exports the vocabulary as a YAML file, which is returned as a bytes string.
+
+        Args:
+            order_by_score (bool, optional): If True, the tokens are ordered by
+                score instead of alphabetically. Defaults to False.
+
+        Returns:
+            bytes: The vocabulary in YAML format.
+        """
+        return self.vocab.export_yaml(order_by_score)
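+
+    # Illustrative persistence sketch (comments only, not executed; the
+    # vocabulary is a pre-built name and the filename is hypothetical):
+    #
+    #     vocab = TokenMonster("englishcode-32000-consistent-v1")
+    #     vocab.save("english.vocab")       # write the vocabulary to disk
+    #     yaml_bytes = vocab.export_yaml()  # snapshot as YAML bytes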
+
+    def tokenize(self, text):
+        """
+        Tokenizes a string into tokens according to the vocabulary.
+
+        Args:
+            text (str): A string or bytes string, or a list of strings or bytes strings.
+
+        Returns:
+            numpy array: The token IDs.
+        """
+        return self.vocab.tokenize(text)
+
+    def tokenize_count(self, text):
+        """
+        Same as tokenize, but returns only the number of tokens.
+
+        Args:
+            text (str): A string or bytes string, or a list of strings or bytes strings.
+
+        Returns:
+            int: The number of tokens for each input string.
+        """
+        return self.vocab.tokenize_count(text)
+
+    def decode(self, tokens):
+        """
+        Decodes tokens into a string.
+
+        Args:
+            tokens (int, list of int, or numpy array): The tokens to decode into a string.
+
+        Returns:
+            str: The string composed from the input tokens.
+        """
+        return self.vocab.decode(tokens)
+
+    def decoder(self):
+        """
+        Returns a new decoder instance used for decoding tokens into text.
+
+        Returns:
+            tokenmonster.DecoderInstance: A new decoder instance.
+        """
+        return self.vocab.decoder()
+
+    def get_dictionary(self):
+        """
+        Returns a dictionary of all tokens in the vocabulary.
+
+        Returns:
+            list: A list of dictionaries, indexed by token ID, each describing one token.
+        """
+        return self.vocab.get_dictionary()
+
+    def charset(self):
+        """
+        Returns the character set used by the vocabulary.
+
+        Returns:
+            str: The character set used by the vocabulary. Possible values are
+                "UTF-8" and "None".
+        """
+        return self.vocab.charset()
+
+    def normalization(self):
+        """
+        Returns the normalization of the vocabulary.
+
+        Returns:
+            str: The normalization of the vocabulary. Possible values are "None",
+                "NFD", "Lowercase", "Accents", "Quotemarks", "Collapse", "Trim",
+                "LeadingSpace", and "UnixLines".
+        """
+        return self.vocab.normalization()
+
+    def capcode(self):
+        """
+        Returns the capcode level of the vocabulary.
+
+        Returns:
+            int: The capcode level (0-2).
+        """
+        return self.vocab.capcode()
+
+    def mode(self):
+        """
+        Returns the optimization mode of the vocabulary.
+
+        Returns:
+            int: The optimization mode (0-5).
+        """
+        return self.vocab.mode()
+
+    def id_to_token(self, id):
+        """
+        Gets the token string from a single token ID, in its capcode-encoded form.
+
+        Args:
+            id (int): The token ID.
+
+        Returns:
+            str or None: The token string corresponding to the input ID, or None
+                if the ID is not in the vocabulary.
+        """
+        return self.vocab.id_to_token(id)
+
+    def id_to_token_decoded(self, id):
+        """
+        Gets the token string from a single token ID, in its capcode-decoded form.
+
+        Args:
+            id (int): The token ID.
+
+        Returns:
+            str or None: The token string corresponding to the input ID, or None
+                if the ID is not in the vocabulary.
+        """
+        return self.vocab.id_to_token_decoded(id)
+
+    def token_to_id(self, token):
+        """
+        Returns the ID of a single token.
+
+        Args:
+            token (str): The token to get the ID for.
+
+        Returns:
+            int or None: The ID of the token, or None if the token is not in the vocabulary.
+        """
+        return self.vocab.token_to_id(token)
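+
+    # Illustrative encode/decode sketch (comments only, not executed; actual
+    # token IDs depend on the loaded vocabulary):
+    #
+    #     vocab = TokenMonster("englishcode-32000-consistent-v1")
+    #     ids = vocab.tokenize("Hello world!")  # numpy array of token IDs
+    #     text = vocab.decode(ids)              # back to "Hello world!"
+    #     tid = vocab.token_to_id("hello")      # int, or None if not present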
+
+    def modify(self, add_special_tokens=None, add_regular_tokens=None, delete_tokens=None, resize=None, change_unk=None):
+        """
+        Modifies the vocabulary.
+
+        Args:
+            add_special_tokens (str or list of str, optional): Special tokens to add to the vocabulary.
+            add_regular_tokens (str or list of str, optional): Regular tokens to add to the vocabulary.
+            delete_tokens (str or list of str, optional): Regular or special tokens to delete.
+            resize (int, optional): Resizes the vocabulary to this size.
+            change_unk (bool, optional): If set, enables or disables the UNK token.
+
+        Returns:
+            int: The new size of the vocabulary.
+        """
+        return self.vocab.modify(add_special_tokens, add_regular_tokens, delete_tokens, resize, change_unk)
+
+    def add_token(self, token):
+        """
+        Adds one or more regular tokens.
+
+        Args:
+            token (str or list of str): The regular tokens to add.
+
+        Returns:
+            int: The new size of the vocabulary.
+        """
+        return self.vocab.add_token(token)
+
+    def delete_token(self, token):
+        """
+        Deletes one or more regular or special tokens.
+
+        Args:
+            token (str or list of str): The tokens to delete.
+
+        Returns:
+            int: The new size of the vocabulary.
+        """
+        return self.vocab.delete_token(token)
+
+    def delete_token_by_id(self, id):
+        """
+        Deletes one or more regular or special tokens by specifying the token IDs.
+
+        Args:
+            id (int or list of int): The IDs of the tokens to delete.
+
+        Returns:
+            int: The new size of the vocabulary.
+        """
+        return self.vocab.delete_token_by_id(id)
+
+    def add_special_token(self, token):
+        """
+        Adds one or more special tokens.
+
+        Args:
+            token (str or list of str): The special tokens to add.
+
+        Returns:
+            int: The new size of the vocabulary.
+        """
+        return self.vocab.add_special_token(token)
+
+    def resize(self, size):
+        """
+        Changes the size of the vocabulary.
+
+        Args:
+            size (int): The new size of the vocabulary.
+
+        Returns:
+            int: The new size of the vocabulary.
+        """
+        return self.vocab.resize(size)
+
+    def reset_token_ids(self):
+        """
+        Resets the token IDs to be sequential, beginning from zero.
+        """
+        self.vocab.reset_token_ids()
+
+    def enable_unk_token(self):
+        """
+        Enables the UNK token.
+
+        Returns:
+            int: The new size of the vocabulary.
+        """
+        return self.vocab.enable_unk_token()
+
+    def disable_unk_token(self):
+        """
+        Disables the UNK token.
+
+        Returns:
+            int: The new size of the vocabulary.
+        """
+        return self.vocab.disable_unk_token()
+
+    def disconnect(self):
+        """
+        Disconnects from and closes tokenmonsterserver.
+        """
+        tokenmonster.disconnect()
+
+    def serialize_tokens(self, integer_list):
+        """
+        Serializes tokens from a list of ints or a numpy array into a binary string.
+
+        Args:
+            integer_list (list of int or numpy array): The tokens to serialize.
+
+        Returns:
+            bytes: The serialized binary string.
+        """
+        return self.vocab.serialize_tokens(integer_list)
+
+    def deserialize_tokens(self, binary_string):
+        """
+        Deserializes a binary string into a numpy array of token IDs.
+
+        Args:
+            binary_string (bytes): The binary string to deserialize.
+
+        Returns:
+            np.array: The deserialized tokens.
+        """
+        return self.vocab.deserialize_tokens(binary_string)
\ No newline at end of file
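
Usage sketch (illustrative, not part of the patch; the pre-built vocabulary
name and the sample text are assumptions). It exercises the wrapper's own
serialize/deserialize round trip defined above:

    from zeta.tokenizers import TokenMonster

    tokenizer = TokenMonster("englishcode-32000-consistent-v1")
    ids = tokenizer.tokenize("Hello world!")    # numpy array of token IDs
    blob = tokenizer.serialize_tokens(ids)      # binary string for storage
    back = tokenizer.deserialize_tokens(blob)   # numpy array again
    assert tokenizer.decode(back) == "Hello world!"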