diff --git a/ctransformers/transformers.py b/ctransformers/transformers.py
index fd9d0e6..1d0e1fc 100644
--- a/ctransformers/transformers.py
+++ b/ctransformers/transformers.py
@@ -81,8 +81,8 @@ def forward(
 
 class CTransformersTokenizer(PreTrainedTokenizer):
     def __init__(self, llm: LLM, **kwargs):
-        super().__init__(**kwargs)
         self._llm = llm
+        super().__init__(**kwargs)
 
     @property
     def vocab_size(self) -> int:
@@ -158,3 +158,10 @@ def _convert_id_to_token(self, index: int) -> str:
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
         return "".join(tokens)
+
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
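
The reordering in the first hunk matters because recent `transformers` releases have `PreTrainedTokenizer.__init__` call back into subclass methods such as `get_vocab()`, which here depends on `self._llm` already being set. Below is a minimal, self-contained sketch of that failure mode; `ToyPreTrainedTokenizer` and `ToyTokenizer` are toy stand-ins for illustration, not the real `transformers`/`ctransformers` classes.

```python
class ToyPreTrainedTokenizer:
    """Stands in for PreTrainedTokenizer: its constructor calls get_vocab()."""

    def __init__(self, **kwargs):
        self._base_vocab = self.get_vocab()  # callback into the subclass


class ToyTokenizer(ToyPreTrainedTokenizer):
    def __init__(self, llm, **kwargs):
        self._llm = llm             # must come first ...
        super().__init__(**kwargs)  # ... because this triggers get_vocab()

    @property
    def vocab_size(self):
        return len(self._llm)

    def convert_ids_to_tokens(self, index):
        return self._llm[index]

    def get_vocab(self):
        # Same pattern as the method added in the diff: invert the
        # id -> token mapping into a token -> id dict.
        return {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}


tok = ToyTokenizer(["<s>", "</s>", "hello"])
print(tok.get_vocab())  # {'<s>': 0, '</s>': 1, 'hello': 2}

# With the old order (super().__init__ before self._llm = llm), the base
# constructor's call to get_vocab() would raise AttributeError, since
# self._llm would not yet exist.
```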