From 83c0dadf7517937d84fbda75c74f607680868d98 Mon Sep 17 00:00:00 2001
From: Victor Lee
Date: Wed, 4 Oct 2023 16:37:21 -0400
Subject: [PATCH 1/2] recover `transformers 4.34 refactored`

Signed-off-by: Victor Lee
---
 ctransformers/transformers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ctransformers/transformers.py b/ctransformers/transformers.py
index fd9d0e6..e94b446 100644
--- a/ctransformers/transformers.py
+++ b/ctransformers/transformers.py
@@ -81,8 +81,8 @@ def forward(
 
 class CTransformersTokenizer(PreTrainedTokenizer):
     def __init__(self, llm: LLM, **kwargs):
-        super().__init__(**kwargs)
         self._llm = llm
+        super().__init__(**kwargs)
 
     @property
     def vocab_size(self) -> int:

From 790ea3564f539e509c4c07894b8843b35811797d Mon Sep 17 00:00:00 2001
From: Victor Lee
Date: Fri, 6 Oct 2023 17:40:07 -0400
Subject: [PATCH 2/2] implement def get_vocab(self) from transformers

Signed-off-by: Victor Lee
---
 ctransformers/transformers.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/ctransformers/transformers.py b/ctransformers/transformers.py
index e94b446..1d0e1fc 100644
--- a/ctransformers/transformers.py
+++ b/ctransformers/transformers.py
@@ -158,3 +158,10 @@ def _convert_id_to_token(self, index: int) -> str:
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
         return "".join(tokens)
+
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab