From 6c3bae38391737b231282623a9fea82986886ff1 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Mon, 20 Nov 2023 15:53:12 +0000 Subject: [PATCH] Fix SegFault on AddedTokens For BPE tokenizer --- .../python/ov_tokenizer/hf_parser.py | 2 +- .../python/ov_tokenizer/tokenizer_pipeline.py | 22 +++++++++++++++++-- .../tokenizer/python/tests/pass_rates.json | 6 ++--- .../tokenizer/python/tests/tokenizers_test.py | 3 +-- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py index 71f8d8e89..53c647d7c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py @@ -341,7 +341,7 @@ def convert_sentencepiece_model_tokenizer( hf_tokenizer.save_pretrained(tmp) vocab_file = Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"] - if (is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer"): + if is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer": add_tokens_to_sentencepiece_model(vocab_file, hf_tokenizer) sp_model = np.fromfile(vocab_file, dtype=np.uint8) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py index dc2867797..d155bf93c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py @@ -326,20 +326,37 @@ class BPETokenizationStep(TokenizationModelStep): suffix_indicator: str = "" end_suffix: str = "" byte_fallback: bool = False + added_tokens: Optional[Dict[int, str]] = None + + def __post_init__(self): + if self.added_tokens is not None: + self.extend_vocab_with_added_tokens() + + def extend_vocab_with_added_tokens(self) -> None: + for idx, token in sorted(self.added_tokens.items()): + self.vocab.append(token) @classmethod def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep": + vocab = [token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])] return cls( unk_token=tokenizer_json["model"]["unk_token"] or "", fuse_unk=tokenizer_json["model"]["fuse_unk"] or False, suffix_indicator=tokenizer_json["model"]["continuing_subword_prefix"] or "", end_suffix=tokenizer_json["model"]["end_of_word_suffix"] or "", - vocab=[token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])], + vocab=vocab, merges=tokenizer_json["model"]["merges"], + added_tokens={ + token["id"]: token["content"] for token in tokenizer_json["added_tokens"] if token["id"] >= len(vocab) + }, ) @classmethod - def from_tiktoken_encoding(cls, encoding: "Encoding") -> "BPETokenizationStep": # noqa + def from_tiktoken_encoding( + cls, + encoding: "Encoding", # noqa + added_tokens: Optional[Dict[int, str]] = None, + ) -> "BPETokenizationStep": from .tiktoken_parser import generate_vocab_and_merges vocab, merges = generate_vocab_and_merges(encoding) @@ -350,6 +367,7 @@ def from_tiktoken_encoding(cls, encoding: "Encoding") -> "BPETokenizationStep": end_suffix="", vocab=[token for token, idx in sorted(vocab.items(), key=lambda x: x[1])], merges=merges, + added_tokens=added_tokens, ) def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]: diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json index fbfe0189c..1ec7a932d 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json @@ -3,8 +3,8 @@ "tokenizers_test.py::test_hf_wordpiece_tokenizers_multiple_strings": 0.641025641025641, "tokenizers_test.py::test_sentencepiece_model_tokenizer": 0.6875, "tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5525, - "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875, - "tokenizers_test.py::test_bpe_detokenizer": 0.93125, + "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.88, + "tokenizers_test.py::test_bpe_detokenizer": 0.9529411764705882, "tokenizers_test.py::test_tiktoken_tokenizers": 0.9, - "tokenizers_test.py::test_": 0.8078960038517092 + "tokenizers_test.py::test_": 0.8124118476727785 } \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py index 288e77a1a..ca2611a44 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py @@ -86,7 +86,7 @@ "microsoft/deberta-base", "bigscience/bloom", "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", - # "Salesforce/codegen-16B-multi", # Segfalts on ""A lot\t\tof whitespaces!"" + "Salesforce/codegen-16B-multi", # "google/flan-t5-xxl", # needs Precompiled/CharsMap # "jinmang2/textcnn-ko-dialect-classifier", # Needs Metaspace Pretokenizer # "hyunwoongko/blenderbot-9B", # hf script to get fast tokenizer doesn't work @@ -284,7 +284,6 @@ def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string): assert ov_output == hf_output -# @pytest.mark.skip(reason="tiktoken tokenizer is WIP") @pytest.mark.parametrize( "test_string", [