tiktoken tokenizer for packed sequence
Signed-off-by: Chen Cui <[email protected]>
cuichenx committed Dec 19, 2024
1 parent 9762bc4 commit ecbdc5c
Showing 2 changed files with 4 additions and 1 deletion.
1 change: 1 addition & 0 deletions nemo/collections/common/tokenizers/tiktoken_tokenizer.py
@@ -87,6 +87,7 @@ def __init__(
         num_special_tokens: int = 1000,
         special_tokens: Optional[List[str]] = None,
     ):
+        self.vocab_file = vocab_file
         if not vocab_file or not os.path.exists(vocab_file):
             raise ValueError(f"vocab_file: {vocab_file} is invalid")
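
Storing the vocab file path on the tokenizer instance lets downstream code recover which vocabulary a TiktokenTokenizer was built from; the fine-tuning data module below reads this attribute when deriving a tokenizer model name for packed-sequence data (see the sketch after the second file's diff).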

4 changes: 3 additions & 1 deletion nemo/collections/llm/gpt/data/fine_tuning.py
@@ -20,7 +20,7 @@
 import lightning.pytorch as pl
 from torch.utils.data import DataLoader
 
-from nemo.collections.common.tokenizers import AutoTokenizer
+from nemo.collections.common.tokenizers import AutoTokenizer, TiktokenTokenizer
 from nemo.collections.llm.gpt.data.core import create_sft_dataset
 from nemo.lightning.data import WrappedDataLoader
 from nemo.lightning.pytorch.plugins import MegatronDataSampler
@@ -309,6 +309,8 @@ def _extract_tokenizer_model_name(self) -> str:
             else:
                 # hf_org/hf_model => hf_org--hf_model
                 tokenizer_model_name = name.replace("/", "--")
+        elif isinstance(self.tokenizer, TiktokenTokenizer):
+            tokenizer_model_name = Path(self.tokenizer.vocab_file).name
         else:
             tokenizer_model_name = f"unknown_tokenizer_{hash(self.tokenizer)}"
         return tokenizer_model_name
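
A minimal runnable sketch of the name-resolution behavior added above, assuming only the vocab_file attribute introduced by this commit; the stand-in class and the file path are hypothetical, and the real TiktokenTokenizer constructor takes more arguments than this diff shows.

from pathlib import Path

# Stand-in exposing only the attribute the new branch relies on; the real
# TiktokenTokenizer (nemo.collections.common.tokenizers) has a richer __init__.
class FakeTiktokenTokenizer:
    def __init__(self, vocab_file: str):
        self.vocab_file = vocab_file

tokenizer = FakeTiktokenTokenizer("/data/tokenizers/multilingual_vocab.json")  # hypothetical path
tokenizer_model_name = Path(tokenizer.vocab_file).name
print(tokenizer_model_name)  # -> multilingual_vocab.json

Before this change a TiktokenTokenizer fell through to the unknown_tokenizer_{hash(self.tokenizer)} fallback; since the default object hash is not stable across processes, using the vocab file name gives packed-sequence artifacts a reproducible identifier instead.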
