diff --git a/pyproject.toml b/pyproject.toml index 2177dde..3c5523b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "mistral_common" -version = "1.3.1" +version = "1.3.2" description = "" authors = ["bam4d "] readme = "README.md" diff --git a/src/mistral_common/__init__.py b/src/mistral_common/__init__.py index 9c73af2..f708a9b 100644 --- a/src/mistral_common/__init__.py +++ b/src/mistral_common/__init__.py @@ -1 +1 @@ -__version__ = "1.3.1" +__version__ = "1.3.2" diff --git a/src/mistral_common/tokens/tokenizers/tekken.py b/src/mistral_common/tokens/tokenizers/tekken.py index 45a4017..864fa98 100644 --- a/src/mistral_common/tokens/tokenizers/tekken.py +++ b/src/mistral_common/tokens/tokenizers/tekken.py @@ -200,7 +200,17 @@ def _decode_all(self, tokens: List[int], special_token_policy: SpecialTokenPolic for is_special, group in groupby(tokens, lambda t: t < self.num_special_tokens): if is_special: if special_token_policy == SpecialTokenPolicy.RAISE: - raise ValueError(f"Special tokens not allowed in this context: {list(group)}") + raise ValueError( + f"Decoding `tokens` that contain special tokens ({list(group)}) is not allowed. \n" + "Either make sure `tokens` do not include any special tokens or, " + "if you want to decode `tokens` that includes special tokens, " + "change the tokenizer's special token policy to IGNORE or KEEP: \n" + "```\nfrom mistral_common.tokens.tokenizers.mistral import MistralTokenizer" + "\nfrom mistral_common.tokens.tokenizers.tekken import SpecialTokenPolicy" + "\n\ntokenizer = MistralTokenizer.v3(is_tekken=True)" + "\ntokenizer.special_token_policy = SpecialTokenPolicy.IGNORE # or SpecialTokenPolicy.KEEP" + "\n```" + ) elif special_token_policy == SpecialTokenPolicy.KEEP: decoded.extend(self._all_special_tokens[t] for t in group) elif special_token_policy == SpecialTokenPolicy.IGNORE: