Allow using vocab size from config (#11718)
* Allow using vocab size from config

This PR allows the vocab size to be read from the model config. In several cases, once the indexed dataset has been created, the tokenizer is needed only to provide the vocab size.
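A minimal sketch of the intended usage, assuming the GPTConfig/GPTModel classes defined in nemo/collections/llm/gpt/model/base.py; the constructor fields and values below are illustrative, not taken from this commit:

```python
from nemo.collections import llm

# Build a GPT config and preset the vocab size directly on it, so a tokenizer
# is not required just to size the embedding table (illustrative values).
config = llm.GPTConfig(
    num_layers=12,
    hidden_size=768,
    num_attention_heads=12,
    seq_length=2048,
)
config.vocab_size = 50304  # preset; configure_model picks this up via hasattr(self, 'vocab_size')

# With the preset vocab size, the model can be constructed without a tokenizer.
model = llm.GPTModel(config, tokenizer=None)
```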

Signed-off-by: Shanmugam Ramasamy <[email protected]>

* Apply isort and black reformatting

Signed-off-by: shanmugamr1992 <[email protected]>

---------

Signed-off-by: Shanmugam Ramasamy <[email protected]>
Signed-off-by: shanmugamr1992 <[email protected]>
Co-authored-by: shanmugamr1992 <[email protected]>
shanmugamr1992 and shanmugamr1992 authored Dec 31, 2024
1 parent 805a70e commit 54964fc
Showing 1 changed file with 16 additions and 5 deletions.
21 changes: 16 additions & 5 deletions nemo/collections/llm/gpt/model/base.py
@@ -206,10 +206,11 @@ def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MCoreGPTModel":

         if hasattr(self, 'vocab_size'):
             vocab_size = self.vocab_size
-            logging.info(
-                f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:"
-                f" {vocab_size - tokenizer.vocab_size}."
-            )
+            if tokenizer is not None:
+                logging.info(
+                    f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:"
+                    f" {vocab_size - tokenizer.vocab_size}."
+                )
         else:
             vocab_size = get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by)

@@ -405,11 +406,21 @@ def get_inference_wrapper(self, params_dtype, inference_batch_times_seqlen_threshold
         if mcore_model is None or type(mcore_model) is not MCoreGPTModel:
             raise ValueError("Exact McoreGPTModel instance not found in the model structure.")
 
+        vocab_size = None
+        if self.tokenizer is not None:
+            vocab_size = self.tokenizer.vocab_size
+        elif hasattr(self.config, 'vocab_size'):
+            vocab_size = self.config.vocab_size
+        else:
+            raise ValueError(
+                'Unable to find vocab size. Either pass in a tokenizer with vocab size, or set vocab size in the model config'
+            )
+
         inference_wrapper_config = InferenceWrapperConfig(
             hidden_size=mcore_model.config.hidden_size,
             params_dtype=params_dtype,
             inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold,
-            padded_vocab_size=self.tokenizer.vocab_size,
+            padded_vocab_size=vocab_size,
         )
 
         model_inference_wrapper = GPTInferenceWrapper(mcore_model, inference_wrapper_config)
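The net effect for inference: a model without an attached tokenizer can still build its inference wrapper as long as the config carries a vocab size. A hedged sketch of such a call, reusing the `model` from the example above; the threshold value is illustrative and the model is assumed to already have its Megatron module configured (e.g., after restore):

```python
import torch

# Sketch only: with config.vocab_size set and tokenizer=None, the wrapper's
# padded_vocab_size falls back to self.config.vocab_size per the change above.
wrapper = model.get_inference_wrapper(
    params_dtype=torch.bfloat16,
    inference_batch_times_seqlen_threshold=1000,
)
```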
