diff --git a/nemo/collections/common/tokenizers/tiktoken_tokenizer.py b/nemo/collections/common/tokenizers/tiktoken_tokenizer.py
index d24f333bc6b1..04396631cc07 100644
--- a/nemo/collections/common/tokenizers/tiktoken_tokenizer.py
+++ b/nemo/collections/common/tokenizers/tiktoken_tokenizer.py
@@ -87,6 +87,7 @@ def __init__(
         num_special_tokens: int = 1000,
         special_tokens: Optional[List[str]] = None,
     ):
+        self.vocab_file = vocab_file
         if not vocab_file or not os.path.exists(vocab_file):
            raise ValueError(f"vocab_file: {vocab_file} is invalid")
 
diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py
index a22ed72f4656..8efe5cdbd918 100644
--- a/nemo/collections/llm/gpt/data/fine_tuning.py
+++ b/nemo/collections/llm/gpt/data/fine_tuning.py
@@ -20,7 +20,7 @@
 import lightning.pytorch as pl
 from torch.utils.data import DataLoader
 
-from nemo.collections.common.tokenizers import AutoTokenizer
+from nemo.collections.common.tokenizers import AutoTokenizer, TiktokenTokenizer
 from nemo.collections.llm.gpt.data.core import create_sft_dataset
 from nemo.lightning.data import WrappedDataLoader
 from nemo.lightning.pytorch.plugins import MegatronDataSampler
@@ -309,6 +309,8 @@ def _extract_tokenizer_model_name(self) -> str:
             else:
                 # hf_org/hf_model => hf_org--hf_model
                 tokenizer_model_name = name.replace("/", "--")
+        elif isinstance(self.tokenizer, TiktokenTokenizer):
+            tokenizer_model_name = Path(self.tokenizer.vocab_file).name
         else:
             tokenizer_model_name = f"unknown_tokenizer_{hash(self.tokenizer)}"
        return tokenizer_model_name
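
With these two changes, _extract_tokenizer_model_name resolves to the basename of the Tiktoken vocab file (now stored on the tokenizer as self.vocab_file) instead of falling through to the unknown_tokenizer_<hash> branch. A minimal sketch of the resulting naming behavior, using a hypothetical vocab path for illustration:

    from pathlib import Path

    vocab_file = "/data/tokenizers/multilingual_vocab.json"  # hypothetical path, not from the PR
    tokenizer_model_name = Path(vocab_file).name  # mirrors the new elif branch in fine_tuning.py
    print(tokenizer_model_name)  # multilingual_vocab.json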