From f8606f2903520ff821cf8a584fe3daed41a8c96b Mon Sep 17 00:00:00 2001
From: Hk669
Date: Thu, 6 Jun 2024 14:17:22 +0530
Subject: [PATCH 1/5] feat: starttime-endtime added with the throughput on verbose

---
 bpetokenizer/tokenizer.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/bpetokenizer/tokenizer.py b/bpetokenizer/tokenizer.py
index 505eb5e..1a75f83 100644
--- a/bpetokenizer/tokenizer.py
+++ b/bpetokenizer/tokenizer.py
@@ -17,6 +17,7 @@
 from .base import Tokenizer, get_stats, merge
 import regex as re
 import os
+import time
 
 # from the openai/tiktoken (used in gpt4 tokenizer)
 GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" # raw string
@@ -31,7 +32,7 @@ def __init__(self, pattern=None, special_tokens=None):
         self.compiled_pattern = re.compile(self.pattern)
         self.special_tokens = {} if special_tokens is None else special_tokens
         self.inverse_special_tokens = {} if special_tokens is None else {v: k for k, v in special_tokens.items()}
-
+        self.vocab_size = len(self.vocab) if self.vocab else 0
 
     @classmethod
     def from_pretrained(cls,
@@ -63,9 +64,9 @@ def train(self, texts, vocab_size, verbose=False, min_frequency=1) -> None:
         text_chunks = re.findall(self.compiled_pattern, texts) # handles the desired pattern of tokens with regex pattern
         ids = [list(tokens.encode("utf-8")) for tokens in text_chunks] # List[List[int]]
 
-        merges = {}
-        vocab = {idx: bytes([idx]) for idx in range(256)} # vocab for first 255 bytes
+        self.vocab = {idx: bytes([idx]) for idx in range(256)} # vocab for the first 256 bytes (0-255)
 
+        start_time = time.time()
         # bpe algorithm
         for i in range(num_merges):
             stats = {}
@@ -78,14 +79,22 @@
             idx = 256 + i
             ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids] # merge the max occurring pair in each chunk in ids
-            merges[pair] = idx
-            vocab[idx] = vocab[pair[0]] + vocab[pair[1]] # concat of bytes
+            self.merges[pair] = idx
+            self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]] # concat of bytes
 
             if verbose:
-                print(f"merging {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} frequency")
+                print(f"merging {i+1}/{num_merges}: {pair} -> {idx} ({self.vocab[idx]}) had {stats[pair]} frequency")
+
+        end_time = time.time()
+        total_time = end_time - start_time
 
-        self.merges = merges
-        self.vocab = vocab
+        # Calculate throughput
+        total_chunks = len(text_chunks)
+        throughput_chunks = total_chunks / total_time
+
+        if verbose:
+            print(f"Total time taken: {total_time:.2f} seconds")
+            print(f"Throughput: {throughput_chunks:.2f} chunks/second")
 
 
     def _encode(self, _bytes) -> list:

From 1a8c2105f4a007ce0971894ecfc88e81ebe71dfa Mon Sep 17 00:00:00 2001
From: Hk669
Date: Thu, 6 Jun 2024 14:23:53 +0530
Subject: [PATCH 2/5] fix: ZeroDivisionError

---
 bpetokenizer/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bpetokenizer/tokenizer.py b/bpetokenizer/tokenizer.py
index 1a75f83..688adf6 100644
--- a/bpetokenizer/tokenizer.py
+++ b/bpetokenizer/tokenizer.py
@@ -90,7 +90,7 @@ def train(self, texts, vocab_size, verbose=False, min_frequency=1) -> None:
 
         # Calculate throughput
         total_chunks = len(text_chunks)
-        throughput_chunks = total_chunks / total_time
+        throughput_chunks = (total_chunks / total_time) if total_time != 0 else 0
 
         if verbose:
             print(f"Total time taken: {total_time:.2f} seconds")
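A minimal, self-contained sketch of the timing pattern that PATCH 1/5 and PATCH 2/5 introduce: wall-clock timing around a per-chunk loop, with the throughput division guarded against a zero elapsed time. The helper and its workload are hypothetical names for illustration, not part of `bpetokenizer`:

```py
import time

def timed_throughput(chunks, work):
    """Time a per-chunk workload and report chunks/second,
    guarding the division the same way PATCH 2/5 does."""
    start_time = time.time()
    for chunk in chunks:
        work(chunk)
    total_time = time.time() - start_time
    # avoid ZeroDivisionError when the loop finishes within one clock tick
    throughput = (len(chunks) / total_time) if total_time != 0 else 0
    print(f"Total time taken: {total_time:.2f} seconds")
    print(f"Throughput: {throughput:.2f} chunks/second")
    return throughput

# usage: a trivial per-chunk step over two byte chunks
timed_throughput([b"hello", b"world"], lambda chunk: list(chunk))
```

`time.perf_counter()` would be the more precise choice for short intervals, but the sketch keeps `time.time()` to match the patch.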
From 7d82651fc38158ae0caed167c34e20b09d123c73 Mon Sep 17 00:00:00 2001
From: Hk669
Date: Thu, 6 Jun 2024 23:43:31 +0530
Subject: [PATCH 3/5] update: more optimized and fix enc

---
 bpetokenizer/base.py                      |  2 +-
 .../pretrained/wi17k_base/wi17k_base.json | 17 ++++----
 bpetokenizer/tokenizer.py                 | 42 +++++++++----------
 bpetokenizer/version.py                   |  2 +-
 4 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/bpetokenizer/base.py b/bpetokenizer/base.py
index e0d8123..71a022a 100644
--- a/bpetokenizer/base.py
+++ b/bpetokenizer/base.py
@@ -169,7 +169,7 @@ def load(self, file_name, mode="json"):
             self.merges = {tuple(map(int, k.strip('()').split(','))): v for k, v in merges.items()}
             vocab = data["vocab"]
             self.vocab = {int(k): v.encode("utf-8") for k, v in vocab.items()}
-            self.inverse_vocab = {v.decode("utf-8"): k for k, v in self.vocab.items()}
+            self.inverse_vocab = {str(v.decode("utf-8")): k for k, v in self.vocab.items()}
 
 
 
diff --git a/bpetokenizer/pretrained/wi17k_base/wi17k_base.json b/bpetokenizer/pretrained/wi17k_base/wi17k_base.json
index 45fe235..aca5ff9 100644
--- a/bpetokenizer/pretrained/wi17k_base/wi17k_base.json
+++ b/bpetokenizer/pretrained/wi17k_base/wi17k_base.json
@@ -2,12 +2,13 @@
   "version": "1.0.4",
   "pattern": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
   "special_tokens": {
-    "": 17311,
-    "": 17312,
-    "": 17313,
-    "": 17314,
+    "": 17317,
+    "": 17318,
+    "": 17319,
+    "": 17320,
     "<|startoftext|>": 17315,
-    "<|endoftext|>": 17316
+    "<|endoftext|>": 17316,
+    "\n": 17317
   },
   "merges": {
     "(32, 116)": 256,
@@ -18091,7 +18092,7 @@
     "1021": " pers",
     "1022": "pect",
     "1023": " mov",
-    "1024": " def",
+    "1024": "def",
     "1025": "view",
     "1026": " several",
     "1027": "ros",
@@ -34377,6 +34378,8 @@
     "17307": " Lourinh�",
     "17308": " Lourinhã",
     "17309": " differs",
-    "17310": " allosaurid"
+    "17311": " def",
+    "17312": "_stats",
+    "17313": " get"
   }
 }
\ No newline at end of file
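One note on the `load` hunk above: `str(v.decode("utf-8"))` is equivalent to plain `v.decode("utf-8")`, since `bytes.decode` already returns a `str`; the substantive piece is the string-to-id reverse map itself, which the encode changes later in this commit rely on. A toy sketch of that map, with illustrative data (kept ASCII-only so every `decode` below is safe):

```py
# id -> bytes vocab, shaped like the one load() rebuilds from JSON
vocab = {idx: bytes([idx]) for idx in range(128)}
vocab[128] = b" the"  # a merged token, as produced by BPE training

# string -> id reverse map: what the edited line builds (minus the redundant str())
inverse_vocab = {v.decode("utf-8"): k for k, v in vocab.items()}

assert inverse_vocab[" the"] == 128  # a whole text chunk resolves to its id in one lookup
```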
diff --git a/bpetokenizer/tokenizer.py b/bpetokenizer/tokenizer.py
index 688adf6..7d7be60 100644
--- a/bpetokenizer/tokenizer.py
+++ b/bpetokenizer/tokenizer.py
@@ -46,6 +46,9 @@ def from_pretrained(cls,
         if not os.path.exists(tokenizer_file):
             raise FileNotFoundError(f"tokenizer file not found: {tokenizer_file}. Please check the tokenizer name")
         tokenizer.load(tokenizer_file, mode="json")
+        if verbose:
+            print('---\nSpecial tokens: ', tokenizer.special_tokens)
+            print('---\nLength of Vocab: ', len(tokenizer.vocab))
         return tokenizer
 
 
@@ -60,7 +63,8 @@ def train(self, texts, vocab_size, verbose=False, min_frequency=1) -> None:
         """
         assert vocab_size >= 256
         num_merges = vocab_size - 256
-
+        assert num_merges > 0
+
         text_chunks = re.findall(self.compiled_pattern, texts) # handles the desired pattern of tokens with regex pattern
         ids = [list(tokens.encode("utf-8")) for tokens in text_chunks] # List[List[int]]
 
@@ -119,6 +123,8 @@ def encode_ord(self, text) -> list:
         for chunk in text_chunks:
             if chunk in self.vocab:
                 ids.append(self.vocab[chunk])
+            elif chunk in self.special_tokens:
+                ids.append(self.special_tokens[chunk])
             else:
                 _bytes = chunk.encode("utf-8")
                 chunk_ids = self._encode(_bytes)
@@ -143,19 +149,18 @@ def encode(self, text, special_tokens="none") -> list:
             assert all(token not in text for token in self.special_tokens)
         else:
             raise ValueError(f"invalid special tokens argument: {special_tokens}")
+
-        if not special:
-            return self.encode_ord(text)
-
-        special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
-        text_chunks = re.split(special_pattern, text)
+        text_chunks = re.findall(self.compiled_pattern, text)
         ids = []
         for chunk in text_chunks:
-            if chunk in special:
-                ids.append(special[chunk])
+            if chunk in self.inverse_vocab:
+                ids.append(self.inverse_vocab[chunk])
+            elif chunk in self.special_tokens:
+                ids.append(self.special_tokens[chunk])
             else:
-                chunkids = self._encode(chunk.encode("utf-8"))
-                ids.extend(chunkids)
+                chunk_ids = self._encode(chunk.encode("utf-8"))
+                ids.extend(chunk_ids)
         return ids
@@ -184,16 +189,11 @@ def _special_tokens(self, special_tokens) -> None:
 
     def tokens(self, text, verbose=False) -> list:
         text_chunks = re.findall(self.compiled_pattern, text)
-
-        _tokens = []
-        for chunk in text_chunks:
-            _bytes = chunk.encode("utf-8")
-            chunk_ids = self._encode(_bytes)
-            chunk_tokens = [self.vocab[idx].decode("utf-8", errors="replace") if idx in self.vocab else f"[UNK{idx}]" for idx in chunk_ids]
-            _tokens.extend(chunk_tokens)
+        ids = self.encode(text, special_tokens="all")
         if verbose:
-            print(f"---\nlength: {len(text_chunks)}\n")
-            print(f"---\ntext chunks: {text_chunks}\n")
-            print(f"---\npattern: {self.pattern}\n")
-        return _tokens
+            print(f"---\nText chunks: {text_chunks}\n")
+            print(f"---\nLength Text chunks: {len(text_chunks)}\n")
+            print(f"---\nIDs: {ids}")
+            print(f"---\nLength: {len(ids)}\n")
+        return ids
\ No newline at end of file
diff --git a/bpetokenizer/version.py b/bpetokenizer/version.py
index 4a2bfa8..42cf7cd 100644
--- a/bpetokenizer/version.py
+++ b/bpetokenizer/version.py
@@ -1 +1 @@
-__version__ = "1.2.0"
\ No newline at end of file
+__version__ = "1.2.1"
\ No newline at end of file
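The rewritten `encode` above replaces the special-token `re.split` pre-pass with a single `findall` over the compiled pattern and a three-way lookup per chunk: known token, special token, else the byte-level merge loop. A self-contained sketch of that control flow, with a stub standing in for `_encode`; note that a special token such as `<|endoftext|>` only reaches the `elif` if the split pattern keeps it intact as one chunk:

```py
import regex as re  # the possessive quantifiers (?+ and ++) need `regex`, not stdlib `re`

# the gpt4 split pattern defined at the top of tokenizer.py
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

def encode_sketch(text, inverse_vocab, special_tokens):
    """Per-chunk three-way lookup, mirroring the rewritten encode()."""
    ids = []
    for chunk in re.findall(GPT4_SPLIT_PATTERN, text):
        if chunk in inverse_vocab:         # whole chunk is a known merged token
            ids.append(inverse_vocab[chunk])
        elif chunk in special_tokens:      # whole chunk is a registered special token
            ids.append(special_tokens[chunk])
        else:                              # stub fallback: raw UTF-8 bytes, where the
            ids.extend(chunk.encode("utf-8"))  # real code runs the BPE merge loop
    return ids

print(encode_sketch("hi there", {" there": 300}, {}))  # -> [104, 105, 300]
```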
From 211f5d8e371d7bd83ec4665639accc85bdf2523a Mon Sep 17 00:00:00 2001
From: Hk669
Date: Thu, 6 Jun 2024 23:44:07 +0530
Subject: [PATCH 4/5] feat: pretrained version 1.2.1

---
 README.md | 28 +++++++++++++++++++++++++---
 setup.py  |  2 +-
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 8e0bc80..0ea0930 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # bpetokenizer
 
-A Byte Pair Encoding (BPE) tokenizer, which algorithmically follows along the GPT tokenizer. The tokenizer is capable of handling special tokens and uses a customizable regex pattern for tokenization(includes the gpt4 regex pattern). supports `save` and `load` tokenizers in the `json` and `file` format.
+A Byte Pair Encoding (BPE) tokenizer which algorithmically follows along the GPT tokenizer (tiktoken) and allows you to train your own tokenizer. The tokenizer is capable of handling special tokens and uses a customizable regex pattern for tokenization (includes the gpt4 regex pattern). Supports `save` and `load` of tokenizers in the `json` and `file` formats. The `bpetokenizer` also supports [pretrained](bpetokenizer/pretrained/) tokenizers.
 
 ### Overview
 
@@ -31,7 +31,7 @@ Every LLM(LLama, Gemini, Mistral..) use their own Tokenizers trained on their ow
 
 2. [BPETokenizer](bpetokenizer/tokenizer.py): This class emphasizes the real power of the tokenizer (used in the gpt4 tokenizer, [tiktoken](https://github.com/openai/tiktoken)). It uses the `GPT4_SPLIT_PATTERN` to split the text as mentioned in the gpt4 tokenizer and also handles the `special_tokens` (refer to [sample_bpetokenizer](sample/bpetokenizer/sample_bpetokenizer.py)). It inherits the `save` and `load` functionalities to save and load the tokenizer, respectively.
 
-3. [PreTrained Tokenizer](pretrained/wi17k_base.json): PreTrained Tokenizer wi17k_base, has a 17316 vocabulary. trained with the wikitext dataset (len: 1000000). with 6 special_tokens.
+3. [PreTrained Tokenizer](bpetokenizer/pretrained/wi17k_base): the pretrained tokenizer wi17k_base has a vocabulary of 17,316 tokens, trained on the wikitext dataset (len: 1000000) with 6 special_tokens.
 
 ### Usage
 
@@ -121,6 +121,28 @@ print("tokens: ", tokens)
 ```
 refer to the [load_json_vocab](sample/load_json_vocab/) and run the `bpetokenizer_json` to get an overview of `vocab`, `merges`, and `special_tokens`; to view the tokens that are split by the tokenizer using the pattern, look at [tokens](sample/load_json_vocab/tokens.py)
 
+#### To load the pretrained tokenizers
+
+```py
+from bpetokenizer import BPETokenizer
+
+tokenizer = BPETokenizer.from_pretrained("wi17k_base", verbose=True)
+
+texts = """
+def get_stats(tokens, counts=None) -> dict:
+    "Get statistics of the tokens. Includes the frequency of each consecutive pair of tokens"
+    counts = {} if counts is None else counts
+    for pair in zip(tokens, tokens[1:]):
+        counts[pair] = counts.get(pair, 0) + 1
+    return counts
+"""
+tokenizer.tokens(texts, verbose=True)
+
+```
+for now, we only have a single 17k vocab tokenizer `wi17k_base` at [pretrained](/bpetokenizer/pretrained/)
+
 ### Run Tests
 
 the tests folder `tests/` includes the tests of the tokenizer and uses pytest.
@@ -138,7 +160,7 @@ Contributions to the BPE Tokenizer are most welcomed! If you would like to contr
 
 - Star and Fork the repository.
 - Create a new branch (git checkout -b feature/your-feature).
-- Commit your changes (git commit -am 'Add some feature').
+- Commit your changes (git commit -m 'Add some feature').
 - Push to the branch (git push origin feature/your-feature).
 - Create a new Pull Request.

diff --git a/setup.py b/setup.py
index 60f95e3..f80d535 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 setup(
     name="bpetokenizer",
     version=__version__,
-    description="Byte Pair Encoding Tokenizer with special tokens and regex pattern",
+    description="A Byte Pair Encoding (BPE) tokenizer which algorithmically follows along the GPT tokenizer (tiktoken) and allows you to train your own tokenizer. The tokenizer is capable of handling special tokens and uses a customizable regex pattern for tokenization (includes the gpt4 regex pattern). Supports `save` and `load` of tokenizers in the `json` and `file` formats. The `bpetokenizer` also supports [pretrained](bpetokenizer/pretrained/) tokenizers.",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/Hk669/bpetokenizer",
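A caveat on the README additions above: after PATCH 3/5, `tokens()` returns encoded ids (it delegates to `encode(text, special_tokens="all")`), not decoded token strings, so the pretrained example prints ids. Mapping ids back to printable tokens can be done with a helper reconstructed from the logic this series removed from `tokens()`; `ids_to_tokens` is illustrative, not a library function:

```py
def ids_to_tokens(ids, vocab):
    """Map ids back to printable token strings; unknown ids get a
    marker, as the old tokens() implementation did."""
    return [vocab[i].decode("utf-8", errors="replace") if i in vocab
            else f"[UNK{i}]"
            for i in ids]

# usage with a toy id -> bytes vocab
print(ids_to_tokens([104, 105, 300], {104: b"h", 105: b"i"}))  # ['h', 'i', '[UNK300]']
```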
From e16edab6d0c3d1853996df0c00472c2050407aa1 Mon Sep 17 00:00:00 2001
From: Hk669
Date: Thu, 6 Jun 2024 23:57:22 +0530
Subject: [PATCH 5/5] fix: self.inverse_vocab init

---
 bpetokenizer/base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bpetokenizer/base.py b/bpetokenizer/base.py
index 71a022a..01b5a0a 100644
--- a/bpetokenizer/base.py
+++ b/bpetokenizer/base.py
@@ -67,6 +67,7 @@ def __init__(self, special_tokens=None):
         self.compiled_pattern = re.compile(self.pattern) if self.pattern else ""
         self.special_tokens = special_tokens if special_tokens else {}
         self.vocab = self._build_vocab() if self.merges else {}
+        self.inverse_vocab = {str(v.decode("utf-8")): k for k, v in self.vocab.items()} if self.vocab else {}
 
     def _build_vocab(self) -> dict:
         """Build the vocab from the merges and special tokens. This will be used to encode/decode the tokens."""
@@ -169,7 +170,7 @@ def load(self, file_name, mode="json"):
             self.merges = {tuple(map(int, k.strip('()').split(','))): v for k, v in merges.items()}
             vocab = data["vocab"]
             self.vocab = {int(k): v.encode("utf-8") for k, v in vocab.items()}
-            self.inverse_vocab = {str(v.decode("utf-8")): k for k, v in self.vocab.items()}
+            self.inverse_vocab = {str(v.decode("utf-8")): k for k, v in self.vocab.items()} if self.vocab else {}
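This last fix initializes `self.inverse_vocab` in `__init__`, so the `encode` path introduced in PATCH 3/5, which looks chunks up in `inverse_vocab`, no longer hits an `AttributeError` on a tokenizer that is constructed or trained rather than loaded from JSON; the trailing `if self.vocab else {}` is belt-and-braces, since a comprehension over an empty vocab already yields `{}`. A condensed sketch of that initialization order, using an illustrative class that is not the library's actual base class:

```py
class TokenizerSketch:
    def __init__(self, merges=None):
        self.merges = merges or {}
        # vocab may legitimately be empty until train() or load() runs
        self.vocab = self._build_vocab() if self.merges else {}
        # always define the reverse map at construction, so later
        # encode() lookups cannot raise AttributeError
        self.inverse_vocab = (
            {v.decode("utf-8"): k for k, v in self.vocab.items()}
            if self.vocab else {}
        )

    def _build_vocab(self):
        vocab = {idx: bytes([idx]) for idx in range(128)}  # ASCII-only toy base
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]  # concat of bytes, as in train()
        return vocab

TokenizerSketch()  # fresh construction: empty maps, no crash
assert TokenizerSketch({(104, 105): 128}).inverse_vocab["hi"] == 128
```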