Skip to content

Commit

Permalink
quick fix middle token
Browse files: browse the repository at this point in the history
  • Loading branch information
baptisteroziere committed Dec 20, 2024
1 parent 5cac5e1 commit 8fd8d2b
Showing 1 changed file with 2 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/mistral_common/tokens/tokenizers/sentencepiece.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -332,6 +332,7 @@ def __init__(self, tokenizer: Tokenizer, mm_encoder: Optional[MultiModalEncoder]
self.BOS = self.tokenizer.get_control_token(SpecialTokens.bos.value)
self.PREFIX = self.tokenizer.get_control_token(SpecialTokens.prefix.value)
self.SUFFIX = self.tokenizer.get_control_token(SpecialTokens.suffix.value)
self.MIDDLE = self.tokenizer.get_control_token(SpecialTokens.middle.value)

def encode_user_message(
self,
Expand Down Expand Up @@ -453,6 +454,7 @@ def encode_fim(self, request: FIMRequest) -> Tokenized:
*suffix_tokens,
self.PREFIX,
*prefix_tokens,
self.MIDDLE,
]
return Tokenized(tokens=tokens, text=self.tokenizer.to_string(tokens))

Expand Down

0 comments on commit 8fd8d2b

Please sign in to comment.