Commit

Fix SegFault on AddedTokens For BPE tokenizer
apaniukov committed Nov 20, 2023
1 parent e54b42e commit 6c3bae3
Showing 4 changed files with 25 additions and 8 deletions.
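
In a Hugging Face tokenizer.json, entries under "added_tokens" can carry IDs at or beyond the size of the base BPE vocabulary. If only the base vocabulary reaches the tokenizer, a lookup of such an ID runs past the end of the vocab, which is the likely source of the segfault named in the commit title. The BPETokenizationStep changes below collect those out-of-range tokens and append them to the vocabulary in ID order. A minimal, self-contained sketch of that selection and extension logic (the tokenizer_json dict here is a made-up stand-in, not a real tokenizer file):

from typing import Any, Dict, List

# Hypothetical, trimmed-down stand-in for the "model"/"added_tokens" sections
# of a Hugging Face tokenizer.json; real files carry many more fields.
tokenizer_json: Dict[str, Any] = {
    "model": {"vocab": {"<unk>": 0, "hello": 1, "world": 2}},
    "added_tokens": [
        {"id": 1, "content": "hello"},          # already inside the base vocab
        {"id": 3, "content": "<|endoftext|>"},  # beyond the base vocab, must be appended
    ],
}

# Base vocabulary ordered by token id, as in from_hf_json.
vocab: List[str] = [
    token for token, _ in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])
]

# Same selection as the diff: keep only added tokens whose id falls outside the base vocab.
added_tokens: Dict[int, str] = {
    token["id"]: token["content"]
    for token in tokenizer_json["added_tokens"]
    if token["id"] >= len(vocab)
}

# Appending them in id order (extend_vocab_with_added_tokens) keeps list index == token id.
for _idx, token in sorted(added_tokens.items()):
    vocab.append(token)

assert vocab[3] == "<|endoftext|>"
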
@@ -341,7 +341,7 @@ def convert_sentencepiece_model_tokenizer(
hf_tokenizer.save_pretrained(tmp)
vocab_file = Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"]

-if (is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer"):
+if is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer":
    add_tokens_to_sentencepiece_model(vocab_file, hf_tokenizer)

sp_model = np.fromfile(vocab_file, dtype=np.uint8)
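
The only change in the file above is dropping the redundant parentheses around the assignment expression; because := binds more loosely than ==, both spellings bind is_chatglm to the boolean result of the comparison. A quick stand-alone check of that precedence (get_name is a made-up stand-in for the getattr call):

def get_name() -> str:
    # Made-up stand-in for getattr(hf_tokenizer, "name", None).
    return "GLMTokenizer"

# Parenthesized or not, the comparison runs first and the walrus binds its result.
if is_chatglm := get_name() == "GLMTokenizer":
    pass
assert is_chatglm is True
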
@@ -326,20 +326,37 @@ class BPETokenizationStep(TokenizationModelStep):
suffix_indicator: str = ""
end_suffix: str = ""
byte_fallback: bool = False
+added_tokens: Optional[Dict[int, str]] = None
+
+def __post_init__(self):
+    if self.added_tokens is not None:
+        self.extend_vocab_with_added_tokens()
+
+def extend_vocab_with_added_tokens(self) -> None:
+    for idx, token in sorted(self.added_tokens.items()):
+        self.vocab.append(token)

@classmethod
def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep":
+    vocab = [token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])]
    return cls(
        unk_token=tokenizer_json["model"]["unk_token"] or "",
        fuse_unk=tokenizer_json["model"]["fuse_unk"] or False,
        suffix_indicator=tokenizer_json["model"]["continuing_subword_prefix"] or "",
        end_suffix=tokenizer_json["model"]["end_of_word_suffix"] or "",
-        vocab=[token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])],
+        vocab=vocab,
        merges=tokenizer_json["model"]["merges"],
+        added_tokens={
+            token["id"]: token["content"] for token in tokenizer_json["added_tokens"] if token["id"] >= len(vocab)
+        },
    )

@classmethod
-def from_tiktoken_encoding(cls, encoding: "Encoding") -> "BPETokenizationStep": # noqa
+def from_tiktoken_encoding(
+    cls,
+    encoding: "Encoding", # noqa
+    added_tokens: Optional[Dict[int, str]] = None,
+) -> "BPETokenizationStep":
    from .tiktoken_parser import generate_vocab_and_merges

    vocab, merges = generate_vocab_and_merges(encoding)
@@ -350,6 +367,7 @@ def from_tiktoken_encoding(cls, encoding: "Encoding") -> "BPETokenizationStep":
        end_suffix="",
        vocab=[token for token, idx in sorted(vocab.items(), key=lambda x: x[1])],
        merges=merges,
+        added_tokens=added_tokens,
    )

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
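
Putting the vocabulary extension in __post_init__ means every construction path, from_hf_json and from_tiktoken_encoding alike, gets the extended vocab without repeating the loop. A trimmed-down sketch of that pattern (VocabStep and its fields are illustrative stand-ins, not the project's classes):

from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class VocabStep:
    # Only the two fields needed to show the pattern; the real step carries many more.
    vocab: List[str] = field(default_factory=list)
    added_tokens: Optional[Dict[int, str]] = None

    def __post_init__(self) -> None:
        # Runs after every constructor, so any classmethod that passes added_tokens
        # gets the vocab extended automatically, in token-id order.
        if self.added_tokens is not None:
            for _idx, token in sorted(self.added_tokens.items()):
                self.vocab.append(token)

step = VocabStep(vocab=["<unk>", "a", "b"], added_tokens={3: "<pad>", 4: "<mask>"})
assert step.vocab[3] == "<pad>" and step.vocab[4] == "<mask>"
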
@@ -3,8 +3,8 @@
"tokenizers_test.py::test_hf_wordpiece_tokenizers_multiple_strings": 0.641025641025641,
"tokenizers_test.py::test_sentencepiece_model_tokenizer": 0.6875,
"tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5525,
-"tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875,
-"tokenizers_test.py::test_bpe_detokenizer": 0.93125,
+"tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.88,
+"tokenizers_test.py::test_bpe_detokenizer": 0.9529411764705882,
"tokenizers_test.py::test_tiktoken_tokenizers": 0.9,
-"tokenizers_test.py::test_": 0.8078960038517092
+"tokenizers_test.py::test_": 0.8124118476727785
}
@@ -86,7 +86,7 @@
"microsoft/deberta-base",
"bigscience/bloom",
"laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
-# "Salesforce/codegen-16B-multi", # Segfalts on ""A lot\t\tof whitespaces!""
+"Salesforce/codegen-16B-multi",
# "google/flan-t5-xxl", # needs Precompiled/CharsMap
# "jinmang2/textcnn-ko-dialect-classifier", # Needs Metaspace Pretokenizer
# "hyunwoongko/blenderbot-9B", # hf script to get fast tokenizer doesn't work
@@ -284,7 +284,6 @@ def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string):
    assert ov_output == hf_output


-# @pytest.mark.skip(reason="tiktoken tokenizer is WIP")
@pytest.mark.parametrize(
    "test_string",
    [
