diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py index 165aa3634a4c08..db280bbc156150 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama.py +++ b/src/transformers/models/code_llama/tokenization_code_llama.py @@ -149,9 +149,9 @@ def __init__( ): requires_backends(self, "protobuf") self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token self.use_default_system_prompt = use_default_system_prompt # mark tokens special to skip them diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py index 7c3d89a8dd584e..7452990ba75515 100644 --- a/tests/models/code_llama/test_tokenization_code_llama.py +++ b/tests/models/code_llama/test_tokenization_code_llama.py @@ -150,6 +150,8 @@ def test_save_pretrained(self): self.tokenizers_list = [ (self.rust_tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}), (self.tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}), + (self.tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}), + (self.rust_tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}), ] for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):