Commit

Fix SegFault on AddedTokens For BPE tokenizer
apaniukov committed Nov 20, 2023
1 parent e54b42e commit 6c3bae3
Showing 4 changed files with 25 additions and 8 deletions.
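
In a Hugging Face tokenizer.json, entries under "added_tokens" can carry IDs at or beyond the size of the base BPE vocabulary. If only the base vocabulary reaches the tokenizer, a lookup of such an ID runs past the end of the vocab, which is the likely source of the segfault named in the commit title. The BPETokenizationStep changes below collect those out-of-range tokens and append them to the vocabulary in ID order. A minimal, self-contained sketch of that selection and extension logic (the tokenizer_json dict here is a made-up stand-in, not a real tokenizer file):

from typing import Any, Dict, List

# Hypothetical, trimmed-down stand-in for the "model"/"added_tokens" sections
# of a Hugging Face tokenizer.json; real files carry many more fields.
tokenizer_json: Dict[str, Any] = {
    "model": {"vocab": {"<unk>": 0, "hello": 1, "world": 2}},
    "added_tokens": [
        {"id": 1, "content": "hello"},          # already inside the base vocab
        {"id": 3, "content": "<|endoftext|>"},  # beyond the base vocab, must be appended
    ],
}

# Base vocabulary ordered by token id, as in from_hf_json.
vocab: List[str] = [
    token for token, _ in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])
]

# Same selection as the diff: keep only added tokens whose id falls outside the base vocab.
added_tokens: Dict[int, str] = {
    token["id"]: token["content"]
    for token in tokenizer_json["added_tokens"]
    if token["id"] >= len(vocab)
}

# Appending them in id order (extend_vocab_with_added_tokens) keeps list index == token id.
for _idx, token in sorted(added_tokens.items()):
    vocab.append(token)

assert vocab[3] == "<|endoftext|>"
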
@@ -341,7 +341,7 @@ def convert_sentencepiece_model_tokenizer(
hf_tokenizer.save_pretrained(tmp)
vocab_file = Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"]

-if (is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer"):
+if is_chatglm := getattr(hf_tokenizer, "name", None) == "GLMTokenizer":
    add_tokens_to_sentencepiece_model(vocab_file, hf_tokenizer)

sp_model = np.fromfile(vocab_file, dtype=np.uint8)
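
The only change in the file above is dropping the redundant parentheses around the assignment expression; because := binds more loosely than ==, both spellings bind is_chatglm to the boolean result of the comparison. A quick stand-alone check of that precedence (get_name is a made-up stand-in for the getattr call):

def get_name() -> str:
    # Made-up stand-in for getattr(hf_tokenizer, "name", None).
    return "GLMTokenizer"

# Parenthesized or not, the comparison runs first and the walrus binds its result.
if is_chatglm := get_name() == "GLMTokenizer":
    pass
assert is_chatglm is True
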
@@ -326,20 +326,37 @@ class BPETokenizationStep(TokenizationModelStep):
suffix_indicator: str = ""
end_suffix: str = ""
byte_fallback: bool = False
+added_tokens: Optional[Dict[int, str]] = None
+
+def __post_init__(self):
+    if self.added_tokens is not None:
+        self.extend_vocab_with_added_tokens()
+
+def extend_vocab_with_added_tokens(self) -> None:
+    for idx, token in sorted(self.added_tokens.items()):
+        self.vocab.append(token)

@classmethod
def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep":
+    vocab = [token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])]
    return cls(
        unk_token=tokenizer_json["model"]["unk_token"] or "",
        fuse_unk=tokenizer_json["model"]["fuse_unk"] or False,
        suffix_indicator=tokenizer_json["model"]["continuing_subword_prefix"] or "",
        end_suffix=tokenizer_json["model"]["end_of_word_suffix"] or "",
-        vocab=[token for token, index in sorted(tokenizer_json["model"]["vocab"].items(), key=lambda x: x[1])],
+        vocab=vocab,
        merges=tokenizer_json["model"]["merges"],
+        added_tokens={
+            token["id"]: token["content"] for token in tokenizer_json["added_tokens"] if token["id"] >= len(vocab)
+        },
    )

@classmethod
-def from_tiktoken_encoding(cls, encoding: "Encoding") -> "BPETokenizationStep": # noqa
+def from_tiktoken_encoding(
+    cls,
+    encoding: "Encoding", # noqa
+    added_tokens: Optional[Dict[int, str]] = None,
+) -> "BPETokenizationStep":
    from .tiktoken_parser import generate_vocab_and_merges

    vocab, merges = generate_vocab_and_merges(encoding)
@@ -350,6 +367,7 @@ def from_tiktoken_encoding(cls, encoding: "Encoding") -> "BPETokenizationStep":
        end_suffix="",
        vocab=[token for token, idx in sorted(vocab.items(), key=lambda x: x[1])],
        merges=merges,
+        added_tokens=added_tokens,
    )

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
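
Putting the vocabulary extension in __post_init__ means every construction path, from_hf_json and from_tiktoken_encoding alike, gets the extended vocab without repeating the loop. A trimmed-down sketch of that pattern (VocabStep and its fields are illustrative stand-ins, not the project's classes):

from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class VocabStep:
    # Only the two fields needed to show the pattern; the real step carries many more.
    vocab: List[str] = field(default_factory=list)
    added_tokens: Optional[Dict[int, str]] = None

    def __post_init__(self) -> None:
        # Runs after every constructor, so any classmethod that passes added_tokens
        # gets the vocab extended automatically, in token-id order.
        if self.added_tokens is not None:
            for _idx, token in sorted(self.added_tokens.items()):
                self.vocab.append(token)

step = VocabStep(vocab=["<unk>", "a", "b"], added_tokens={3: "<pad>", 4: "<mask>"})
assert step.vocab[3] == "<pad>" and step.vocab[4] == "<mask>"
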
@@ -3,8 +3,8 @@
"tokenizers_test.py::test_hf_wordpiece_tokenizers_multiple_strings": 0.641025641025641,
"tokenizers_test.py::test_sentencepiece_model_tokenizer": 0.6875,
"tokenizers_test.py::test_sentencepiece_model_detokenizer": 0.5525,
-"tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.846875,
-"tokenizers_test.py::test_bpe_detokenizer": 0.93125,
+"tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.88,
+"tokenizers_test.py::test_bpe_detokenizer": 0.9529411764705882,
"tokenizers_test.py::test_tiktoken_tokenizers": 0.9,
-"tokenizers_test.py::test_": 0.8078960038517092
+"tokenizers_test.py::test_": 0.8124118476727785
}
@@ -86,7 +86,7 @@
"microsoft/deberta-base",
"bigscience/bloom",
"laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
-# "Salesforce/codegen-16B-multi", # Segfalts on ""A lot\t\tof whitespaces!""
+"Salesforce/codegen-16B-multi",
# "google/flan-t5-xxl", # needs Precompiled/CharsMap
# "jinmang2/textcnn-ko-dialect-classifier", # Needs Metaspace Pretokenizer
# "hyunwoongko/blenderbot-9B", # hf script to get fast tokenizer doesn't work
@@ -284,7 +284,6 @@ def test_bpe_detokenizer(hf_and_ov_bpe_detokenizer, test_string):
    assert ov_output == hf_output


-# @pytest.mark.skip(reason="tiktoken tokenizer is WIP")
@pytest.mark.parametrize(
    "test_string",
    [
