Updates for the pretrained tokenizers. #11

Merged · merged 5 commits · Jun 6, 2024

Changes from all commits
28 changes: 25 additions & 3 deletions README.md
@@ -1,6 +1,6 @@
# bpetokenizer

A Byte Pair Encoding (BPE) tokenizer, which algorithmically follows along the GPT tokenizer. The tokenizer is capable of handling special tokens and uses a customizable regex pattern for tokenization(includes the gpt4 regex pattern). supports `save` and `load` tokenizers in the `json` and `file` format.
A Byte Pair Encoding (BPE) tokenizer that algorithmically follows the GPT tokenizer (tiktoken) and lets you train your own tokenizer. The tokenizer handles special tokens and uses a customizable regex pattern for tokenization (including the GPT-4 regex pattern). Supports `save` and `load` of tokenizers in the `json` and `file` formats. The `bpetokenizer` also supports [pretrained](bpetokenizer/pretrained/) tokenizers.


### Overview
@@ -31,7 +31,7 @@ Every LLM(LLama, Gemini, Mistral..) use their own Tokenizers trained on their ow

2. [BPETokenizer](bpetokenizer/tokenizer.py): This class shows the real power of the tokenizer (the approach used in the GPT-4 tokenizer, [tiktoken](https://github.com/openai/tiktoken)). It uses the `GPT4_SPLIT_PATTERN` to split text the way the GPT-4 tokenizer does and also handles `special_tokens` (refer to [sample_bpetokenizer](sample/bpetokenizer/sample_bpetokenizer.py)); see the short sketch after this list. It inherits the `save` and `load` functionalities to save and load the tokenizer.

3. [PreTrained Tokenizer](pretrained/wi17k_base.json): PreTrained Tokenizer wi17k_base, has a 17316 vocabulary. trained with the wikitext dataset (len: 1000000). with 6 special_tokens.
3. [PreTrained Tokenizer](bpetokenizer/pretrained/wi17k_base): The pretrained tokenizer `wi17k_base` has a vocabulary of 17,316 tokens, trained on the wikitext dataset (len: 1000000) with 6 special_tokens.
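
As a quick illustration of the `BPETokenizer` class, here is a minimal sketch based on the constructor and the `train`/`encode` signatures in [tokenizer.py](bpetokenizer/tokenizer.py), assuming the default GPT-4 split pattern; the training text and the special-token ids below are placeholders, not part of the repository:

```py
from bpetokenizer import BPETokenizer

# placeholder training text and special-token ids, chosen only for illustration
texts = "Byte Pair Encoding merges the most frequent pair of bytes iteratively, building a subword vocabulary."
special_tokens = {"<|startoftext|>": 300, "<|endoftext|>": 301}

tokenizer = BPETokenizer(special_tokens=special_tokens)
tokenizer.train(texts, vocab_size=270, verbose=False)  # learns vocab_size - 256 = 14 merges

ids = tokenizer.encode("Byte Pair Encoding", special_tokens="none")
print(ids)
```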


### Usage
@@ -121,6 +121,28 @@ print("tokens: ", tokens)
```
Refer to [load_json_vocab](sample/load_json_vocab/) and run `bpetokenizer_json` for an overview of `vocab`, `merges`, and `special_tokens`; to view the tokens split by the tokenizer's regex pattern, see [tokens](sample/load_json_vocab/tokens.py)
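
For reference, loading a tokenizer saved in `json` mode looks roughly like the sketch below; it relies on the `load(file_name, mode="json")` method from [base.py](bpetokenizer/base.py), and the file path here is only illustrative:

```py
from bpetokenizer import BPETokenizer

tokenizer = BPETokenizer()
# illustrative path: point this at a JSON file produced by `save`
tokenizer.load("sample/load_json_vocab/sample_bpetokenizer.json", mode="json")

print(len(tokenizer.vocab), len(tokenizer.merges), tokenizer.special_tokens)
```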


#### To load the pretrained tokenizers

```py
from bpetokenizer import BPETokenizer

tokenizer = BPETokenizer.from_pretrained("wi17k_base", verbose=True)

texts = """
def get_stats(tokens, counts=None) -> dict:
"Get statistics of the tokens. Includes the frequency of each consecutive pair of tokens"
counts = if counts is None else counts
for pair in zip(tokens, tokens[1:]):
counts[pair] = counts.get(pair, 0) + 1
return counts
"""
tokenizer.tokens(texts, verbose=True)

```
For now, there is a single ~17k-vocab pretrained tokenizer, `wi17k_base`, at [pretrained](/bpetokenizer/pretrained/)
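
The same pretrained tokenizer can also be used to encode text into ids; the following is a minimal sketch based on the `from_pretrained` and `encode(text, special_tokens=...)` signatures in [tokenizer.py](bpetokenizer/tokenizer.py), with an arbitrary input string:

```py
from bpetokenizer import BPETokenizer

tokenizer = BPETokenizer.from_pretrained("wi17k_base")

# special_tokens="all" enables the tokenizer's special-token handling during encoding
ids = tokenizer.encode("def get_stats(tokens, counts=None) -> dict:", special_tokens="all")
print(ids)
```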


### Run Tests

The `tests/` folder contains the tests for the tokenizer and uses pytest.
@@ -138,7 +160,7 @@ Contributions to the BPE Tokenizer are most welcomed! If you would like to contr

- Star and Fork the repository.
- Create a new branch (git checkout -b feature/your-feature).
- Commit your changes (git commit -am 'Add some feature').
- Commit your changes (git commit -m 'Add some feature').
- Push to the branch (git push origin feature/your-feature).
- Create a new Pull Request.

3 changes: 2 additions & 1 deletion bpetokenizer/base.py
@@ -67,6 +67,7 @@ def __init__(self, special_tokens=None):
self.compiled_pattern = re.compile(self.pattern) if self.pattern else ""
self.special_tokens = special_tokens if special_tokens else {}
self.vocab = self._build_vocab() if self.merges else {}
self.inverse_vocab = {str(v.decode("utf-8")): k for k, v in self.vocab.items()} if self.vocab else {}

def _build_vocab(self) -> dict:
"""Build the vocab from the merges and special tokens. This will be used to encode/decode the tokens."""
@@ -169,7 +170,7 @@ def load(self, file_name, mode="json"):
self.merges = {tuple(map(int, k.strip('()').split(','))): v for k, v in merges.items()}
vocab = data["vocab"]
self.vocab = {int(k): v.encode("utf-8") for k, v in vocab.items()}
self.inverse_vocab = {v.decode("utf-8"): k for k, v in self.vocab.items()}
self.inverse_vocab = {str(v.decode("utf-8")): k for k, v in self.vocab.items()} if self.vocab else {}



17 changes: 10 additions & 7 deletions bpetokenizer/pretrained/wi17k_base/wi17k_base.json
@@ -2,12 +2,13 @@
"version": "1.0.4",
"pattern": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
"special_tokens": {
"<PAD>": 17311,
"<BOS>": 17312,
"<EOS>": 17313,
"<UNK>": 17314,
"<PAD>": 17317,
"<BOS>": 17318,
"<EOS>": 17319,
"<UNK>": 17320,
"<|startoftext|>": 17315,
"<|endoftext|>": 17316
"<|endoftext|>": 17316,
"\n": 17317
},
"merges": {
"(32, 116)": 256,
@@ -18091,7 +18092,7 @@
"1021": " pers",
"1022": "pect",
"1023": " mov",
"1024": " def",
"1024": "def",
"1025": "view",
"1026": " several",
"1027": "ros",
@@ -34377,6 +34378,8 @@
"17307": " Lourinh�",
"17308": " Lourinhã",
"17309": " differs",
"17310": " allosaurid"
"17311": " def",
"17312": "_stats",
"17313": " get"
}
}
67 changes: 38 additions & 29 deletions bpetokenizer/tokenizer.py
@@ -17,6 +17,7 @@
from .base import Tokenizer, get_stats, merge
import regex as re
import os
import time

# from the openai/tiktoken (used in gpt4 tokenizer)
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" # raw string
@@ -31,7 +32,7 @@ def __init__(self, pattern=None, special_tokens=None):
self.compiled_pattern = re.compile(self.pattern)
self.special_tokens = {} if special_tokens is None else special_tokens
self.inverse_special_tokens = {} if special_tokens is None else {v: k for k, v in special_tokens.items()}

self.vocab_size = len(self.vocab) if self.vocab else 0

@classmethod
def from_pretrained(cls,
@@ -45,6 +46,9 @@ def from_pretrained(cls,
if not os.path.exists(tokenizer_file):
raise FileNotFoundError(f"tokenizer file not found: {tokenizer_file}. Please check the tokenizer name")
tokenizer.load(tokenizer_file, mode="json")
if verbose:
print('---\nSpecial tokens: ', tokenizer.special_tokens)
print('---\nLength of Vocab: ', len(tokenizer.vocab))
return tokenizer


@@ -59,13 +63,14 @@ def train(self, texts, vocab_size, verbose=False, min_frequency=1) -> None:
"""
assert vocab_size >= 256
num_merges = vocab_size - 256

assert num_merges > 0

text_chunks = re.findall(self.compiled_pattern, texts) # handles the desired pattern of tokens with regex pattern

ids = [list(tokens.encode("utf-8")) for tokens in text_chunks] # List[List[int]]
merges = {}
vocab = {idx: bytes([idx]) for idx in range(256)} # vocab for first 255 bytes
self.vocab = {idx: bytes([idx]) for idx in range(256)} # vocab for first 255 bytes

start_time = time.time()
# bpe algorithm
for i in range(num_merges):
stats = {}
@@ -78,14 +83,22 @@

idx = 256 + i
ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids] # merge all the max occuring pair in the each chunk in ids
merges[pair] = idx
vocab[idx] = vocab[pair[0]] + vocab[pair[1]] # concat of bytes
self.merges[pair] = idx
self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]] # concat of bytes

if verbose:
print(f"merging {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} frequency")
print(f"merging {i+1}/{num_merges}: {pair} -> {idx} ({self.vocab[idx]}) had {stats[pair]} frequency")

self.merges = merges
self.vocab = vocab
end_time = time.time()
total_time = end_time - start_time

# Calculate throughput
total_chunks = len(text_chunks)
throughput_chunks = (total_chunks / total_time) if total_time != 0 else 0

if verbose:
print(f"Total time taken: {total_time:.2f} seconds")
print(f"Throughput: {throughput_chunks:.2f} chunks/second")


def _encode(self, _bytes) -> list:
@@ -110,6 +123,8 @@ def encode_ord(self, text) -> list:
for chunk in text_chunks:
if chunk in self.vocab:
ids.append(self.vocab[chunk])
elif chunk in self.special_tokens:
ids.append(self.special_tokens[chunk])
else:
_bytes = chunk.encode("utf-8")
chunk_ids = self._encode(_bytes)
@@ -134,19 +149,18 @@ def encode(self, text, special_tokens="none") -> list:
assert all(token not in text for token in self.special_tokens)
else:
raise ValueError(f"invalid special tokens argument: {special_tokens}")


if not special:
return self.encode_ord(text)

special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
text_chunks = re.split(special_pattern, text)
text_chunks = re.findall(self.compiled_pattern, text)
ids = []
for chunk in text_chunks:
if chunk in special:
ids.append(special[chunk])
if chunk in self.inverse_vocab:
ids.append(self.inverse_vocab[chunk])
elif chunk in self.special_tokens:
ids.append(self.special_tokens[chunk])
else:
chunkids = self._encode(chunk.encode("utf-8"))
ids.extend(chunkids)
chunk_ids = self._encode(chunk.encode("utf-8"))
ids.extend(chunk_ids)
return ids


@@ -175,16 +189,11 @@ def _special_tokens(self, special_tokens) -> None:

def tokens(self, text, verbose=False) -> list:
text_chunks = re.findall(self.compiled_pattern, text)

_tokens = []
for chunk in text_chunks:
_bytes = chunk.encode("utf-8")
chunk_ids = self._encode(_bytes)
chunk_tokens = [self.vocab[idx].decode("utf-8", errors="replace") if idx in self.vocab else f"[UNK{idx}]" for idx in chunk_ids]
_tokens.extend(chunk_tokens)
ids = self.encode(text, special_tokens="all")
if verbose:
print(f"---\nlength: {len(text_chunks)}\n")
print(f"---\ntext chunks: {text_chunks}\n")
print(f"---\npattern: {self.pattern}\n")
return _tokens
print(f"---\nText chunks: {text_chunks}\n")
print(f"---\nLength Text chunks: {len(text_chunks)}\n")
print(f"---\nIDs: {ids}")
print(f"---\nLength: {len(ids)}\n")
return ids

2 changes: 1 addition & 1 deletion bpetokenizer/version.py
@@ -1 +1 @@
__version__ = "1.2.0"
__version__ = "1.2.1"
2 changes: 1 addition & 1 deletion setup.py
@@ -17,7 +17,7 @@
setup(
name="bpetokenizer",
version=__version__,
description="Byte Pair Encoding Tokenizer with special tokens and regex pattern",
description="A Byte Pair Encoding (BPE) tokenizer, which algorithmically follows along the GPT tokenizer(tiktoken), allows you to train your own tokenizer. The tokenizer is capable of handling special tokens and uses a customizable regex pattern for tokenization(includes the gpt4 regex pattern). supports `save` and `load` tokenizers in the `json` and `file` format. The `bpetokenizer` also supports [pretrained](bpetokenizer/pretrained/) tokenizers.",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/Hk669/bpetokenizer",