
Commit

Merge branch 'main' into huiyingl/nemo2sftpeft_notebook
Signed-off-by: HuiyingLi <[email protected]>
HuiyingLi committed Nov 15, 2024
2 parents d3ca86f + 83a2166 commit 77f1f63
Showing 40 changed files with 2,892 additions and 70 deletions.
11 changes: 10 additions & 1 deletion nemo/collections/llm/api.py
@@ -24,7 +24,15 @@
from typing_extensions import Annotated

import nemo.lightning as nl
from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io
from nemo.lightning import (
AutoResume,
NeMoLogger,
OptimizerModule,
Trainer,
configure_no_restart_validation_training_loop,
io,
)
from nemo.lightning.base import NEMO_MODELS_CACHE
from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform
from nemo.utils import logging

@@ -492,6 +500,7 @@ def _setup(
tokenizer: Optional[TokenizerType],
model_transform: Optional[Union[PEFT, ModelTransform, Callable]],
) -> Any: # Return type is Any because app_state's type is not specified
configure_no_restart_validation_training_loop(trainer)
_log = log or NeMoLogger()
if resume and isinstance(model_transform, PEFT) and _log.ckpt:
logging.info("Disabling try_restore_best_ckpt restoration for adapters")
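The new import block pulls configure_no_restart_validation_training_loop out of nemo.lightning, and _setup() now applies it to the trainer before the logger and resume logic run. Below is a minimal sketch of that wiring with a placeholder Trainer configuration; the function name suggests it keeps the validation loop from being re-run when training restarts, but the exact semantics are not shown in this hunk.

import nemo.lightning as nl
from nemo.lightning import configure_no_restart_validation_training_loop

# Placeholder trainer arguments, for illustration only.
trainer = nl.Trainer(devices=1, max_steps=10)

# Patch the training loop before any logger/resume setup, as _setup() now does.
configure_no_restart_validation_training_loop(trainer)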
35 changes: 16 additions & 19 deletions nemo/collections/llm/gpt/data/hf_dataset.py
@@ -15,6 +15,7 @@
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader
from nemo.lightning.pytorch.plugins import MegatronDataSampler


class HfDatasetDataModule(pl.LightningDataModule):
@@ -24,6 +25,7 @@ def __init__(
num_workers=2,
pin_memory=True,
persistent_workers=True,
seq_length=1024,
micro_batch_size=2,
global_batch_size=2,
pad_token_id=0,
@@ -37,6 +39,7 @@ def __init__(
self.num_workers = num_workers
self.pin_memory = pin_memory
self.persistent_workers = persistent_workers
self.seq_length = seq_length
self.micro_batch_size = micro_batch_size
self.global_batch_size = global_batch_size
self.pad_token_id = pad_token_id
@@ -58,6 +61,7 @@ def pad_within_micro(batch, pad_token_id):
max_len = max(map(len, batch))
return [item + [pad_token_id] * (max_len - len(item)) for item in batch]

keys = list(filter(lambda x: x in batch[0], ['tokens', 'labels', 'position_ids', 'loss_mask']))
return {
key: batchify(
torch.LongTensor(
@@ -67,37 +71,30 @@ def pad_within_micro(batch, pad_token_id):
)
)
)
for key in ['tokens', 'labels']
for key in keys
}

def setup(self, stage: str):
if not self.use_mcore_sampler:
return
self.data_sampler = MegatronDataSampler(
seq_len=self.seq_length,
micro_batch_size=self.micro_batch_size,
global_batch_size=self.global_batch_size,
dataloader_type=self.mcore_dataloader_type,
)

def train_dataloader(self, collate_fn=None):
from nemo.lightning.data import add_megatron_sampler

if collate_fn is None:
collate_fn = lambda x: HfDatasetDataModule.collate_fn(x, pad_token_id=self.pad_token_id)

dataloader = DataLoader(
return DataLoader(
self.dataset,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
persistent_workers=self.persistent_workers,
collate_fn=collate_fn,
batch_size=self.micro_batch_size,
)
if not self.use_mcore_sampler:
return dataloader

rank = 0
world_size = 1
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()

return add_megatron_sampler(
dataloader,
self.micro_batch_size,
self.global_batch_size,
dataloader_type=self.mcore_dataloader_type,
rank=rank,
world_size=world_size,
)
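Two behavioural changes are visible above: collate_fn now keeps whichever of 'tokens', 'labels', 'position_ids', and 'loss_mask' are actually present in the batch (instead of always 'tokens' and 'labels'), and the Megatron sampling path moves from wrapping the DataLoader with add_megatron_sampler to building a MegatronDataSampler in setup(). The following is a self-contained sketch of the padding/collation behaviour; batchify is re-implemented locally as an assumption, since its body falls outside this hunk.

import torch

def collate_sketch(batch, pad_token_id=0):
    def batchify(tensor):
        # Assumed behaviour: ensure a leading batch dimension.
        return tensor.unsqueeze(0) if tensor.ndim == 1 else tensor

    def pad_within_micro(seqs, pad_token_id):
        # Pad every sequence to the longest one in this micro-batch.
        max_len = max(map(len, seqs))
        return [item + [pad_token_id] * (max_len - len(item)) for item in seqs]

    # Keep only the keys that the examples actually carry.
    keys = list(filter(lambda x: x in batch[0], ['tokens', 'labels', 'position_ids', 'loss_mask']))
    return {
        key: batchify(torch.LongTensor(pad_within_micro([ex[key] for ex in batch], pad_token_id)))
        for key in keys
    }

examples = [
    {'tokens': [5, 6, 7], 'labels': [6, 7, 8]},
    {'tokens': [9, 10], 'labels': [10, 11]},
]
print(collate_sketch(examples)['tokens'])  # tensor([[ 5,  6,  7], [ 9, 10,  0]])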
7 changes: 6 additions & 1 deletion nemo/collections/llm/gpt/model/llama.py
@@ -274,7 +274,12 @@ def make_vocab_size_divisible_by(vocab_size):
base //= 2
return base

output = LlamaConfig(
if getattr(source, 'rope_scaling', None) is not None and source.rope_scaling.get('rope_type') == 'llama3':
# Apply the Llama 3.1 custom RoPE scaling
cls = Llama31Config
else:
cls = LlamaConfig
output = cls(
num_layers=source.num_hidden_layers,
hidden_size=source.hidden_size,
ffn_hidden_size=source.intermediate_size,
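The HF-to-NeMo importer now dispatches on the Hugging Face config's rope_scaling entry: when rope_type is 'llama3' it builds a Llama31Config (with the Llama 3.1 RoPE scaling) rather than a plain LlamaConfig. Here is a minimal sketch of that dispatch using stand-in classes, since the real config classes take many more fields than this hunk shows.

from dataclasses import dataclass
from types import SimpleNamespace

@dataclass
class LlamaConfigStub:                       # stand-in for LlamaConfig
    num_layers: int

@dataclass
class Llama31ConfigStub(LlamaConfigStub):    # stand-in for Llama31Config
    pass

def pick_config_cls(source):
    # Same condition as in the hunk above.
    rope_scaling = getattr(source, 'rope_scaling', None)
    if rope_scaling is not None and rope_scaling.get('rope_type') == 'llama3':
        return Llama31ConfigStub
    return LlamaConfigStub

hf_config = SimpleNamespace(num_hidden_layers=32, rope_scaling={'rope_type': 'llama3'})
output = pick_config_cls(hf_config)(num_layers=hf_config.num_hidden_layers)
print(type(output).__name__)  # Llama31ConfigStub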
4 changes: 4 additions & 0 deletions nemo/collections/llm/recipes/__init__.py
@@ -26,6 +26,8 @@
llama3_70b,
llama3_70b_16k,
llama3_70b_64k,
llama31_8b,
llama31_70b,
llama31_405b,
mamba2_1_3b,
mamba2_2_7b,
@@ -65,6 +67,8 @@
"llama3_70b",
"llama3_70b_16k",
"llama3_70b_64k",
"llama31_8b",
"llama31_70b",
"llama31_405b",
"mamba2_130m",
"mamba2_370m",
19 changes: 17 additions & 2 deletions nemo/collections/llm/recipes/finetune_default.py
@@ -16,6 +16,7 @@

import nemo_run as run
import pytorch_lightning as pl
import torch

import nemo.lightning as nl
from nemo.collections import llm
@@ -82,7 +83,7 @@ def default_finetune_recipe(
def default_finetune_trainer(
tensor_parallelism=1,
pipeline_parallelism=1,
pipeline_parallelism_type=None,
pipeline_parallelism_type=torch.bfloat16,
virtual_pipeline_parallelism=None,
context_parallelism=1,
sequence_parallelism=False,
@@ -93,6 +94,19 @@ def default_finetune_trainer(
limit_val_batches=None,
val_check_interval=30,
):
"""
Create a default fine-tuning trainer for any model.
This function sets up a template for strategy and trainer.
Args:
See docstrings of MegatronStrategy and Trainer.
Returns:
run.Config: Config for a finetuning trainer.
See usages of this in recipes for further details.
"""
strategy = run.Config(
nl.MegatronStrategy,
tensor_model_parallel_size=tensor_parallelism,
@@ -125,7 +139,8 @@ def default_finetune_trainer(

def nemo_resume(model_id: str) -> run.Config[nl.AutoResume]:
"""
Configure automatic resumption from a NeMo checkpoint converted from Huggingface for https://huggingface.co/{model_id}.
Configure automatic resumption from a NeMo checkpoint converted from Huggingface for
https://huggingface.co/{model_id}.
This NeMo checkpoint should be converted from Huggingface beforehand, using nemo.collections.llm.import_ckpt.
When converting the checkpoint, the NeMo checkpoint will be saved in NEMO_HOME (set to ~/.cache/nemo by default).
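For context, a hedged sketch of how a recipe might call these helpers; the values are illustrative, only parameters visible in the hunks above are overridden (note that pipeline_parallelism_type now defaults to torch.bfloat16), and the model id passed to nemo_resume is a hypothetical example.

from nemo.collections.llm.recipes.finetune_default import default_finetune_trainer, nemo_resume

# Trainer template with explicit parallelism settings; other arguments keep their defaults.
trainer = default_finetune_trainer(
    tensor_parallelism=2,
    pipeline_parallelism=1,
    context_parallelism=1,
    sequence_parallelism=False,
    val_check_interval=30,
)

# Resume from a NeMo checkpoint previously converted with nemo.collections.llm.import_ckpt.
resume = nemo_resume("meta-llama/Meta-Llama-3.1-8B")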
2 changes: 1 addition & 1 deletion nemo/collections/llm/recipes/gemma_2b.py
@@ -278,7 +278,7 @@ def finetune_recipe(
model(), "google/gemma-2b", dir, name, num_nodes, num_gpus_per_node, packed_sequence
)
if peft_scheme is None or peft_scheme.lower() == 'none':
recipe.trainer.strategy.tensor_model_parallel_size = 2
recipe.trainer.strategy.context_parallel_size = 2
recipe.optim.config.lr = 5e-6
elif peft_scheme.lower() == 'lora':
recipe.peft = run.Config(LoRA)
161 changes: 161 additions & 0 deletions nemo/collections/llm/recipes/llama31_405b.py
@@ -24,13 +24,15 @@
from nemo import lightning as nl
from nemo.collections.llm.api import pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.gpt.model.llama import Llama31Config405B, LlamaModel
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192,
)
from nemo.lightning.pytorch.callbacks import GarbageCollectionCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

@@ -237,3 +239,162 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
)

return recipe


@run.cli.factory(target=finetune, name=NAME)
def finetune_recipe(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 3,
num_gpus_per_node: int = 8,
peft_scheme: Optional[str] = 'lora',
seq_length: Optional[int] = None,
packed_sequence: Optional[bool] = None,
performance_mode: bool = False,
) -> run.Partial:
"""
Create a fine-tuning recipe for Llama3.1 405B model.
This function sets up a complete configuration for fine-tuning, including
model, trainer, data, logging, optimization, and resumption settings.
The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.
Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the fine-tuning run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None.
seq_length (int): Maximum number of tokens per microbatch.
packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
maximum seq_length for better efficiency. By default, this value equals performance_mode.
performance_mode (bool): If true, enables optimizations for maximum performance.
Returns:
run.Partial: Partial configuration for fine-tuning.
Examples:
CLI usage:
$ nemo llm finetune --factory llama31_405b
$ nemo llm finetune --factory "llama31_405b(num_nodes=3, name='my_llama31_405b_finetune')"
Python API usage:
>>> recipe = finetune_recipe(name="llama31_405b_finetune", num_nodes=3)
>>> print(recipe)
Note:
This recipe uses the SQuAD dataset for fine-tuning. Be aware that fine-tuning a 405B model
requires substantial computational resources.
"""
if packed_sequence is None:
packed_sequence = performance_mode

if seq_length is None:
seq_length = 2048

if num_nodes is None:
if peft_scheme is None or peft_scheme.lower() == 'none':
num_nodes = 12
elif peft_scheme.lower() == 'lora':
num_nodes = 3

recipe = default_finetune_recipe(
model(), "meta-llama/Llama-3.1-405B", dir, name, num_nodes, num_gpus_per_node, packed_sequence
)
if peft_scheme is None or peft_scheme.lower() == 'none':
recipe.trainer.strategy.tensor_model_parallel_size = 8
recipe.trainer.strategy.pipeline_model_parallel_size = 14
recipe.data.global_batch_size = 6
recipe.optim.config.lr = 5e-6
elif peft_scheme.lower() == 'lora':
recipe.peft = run.Config(LoRA)
recipe.peft.dim = 16
recipe.peft.alpha = 32
recipe.peft.target_modules = ['linear_qkv']
recipe.optim.config.use_distributed_optimizer = False

# some settings currently do not function correctly with LoRA
recipe.model.config.cross_entropy_loss_fusion = False
recipe.trainer.strategy.tensor_model_parallel_size = 4
recipe.trainer.strategy.pipeline_model_parallel_size = 6
recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 7
recipe.data.global_batch_size = 6
recipe.optim.config.lr = 1e-4
else:
raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")

# Sequence length settings in the model and dataset must agree
recipe.model.config.seq_length = seq_length
recipe.data.seq_length = seq_length
if packed_sequence:
recipe.data.dataset_kwargs = {'pad_to_max_length': True}
recipe.data.packed_sequence_specs = run.Config(PackedSequenceSpecs, packed_sequence_size=seq_length)

if performance_mode:
recipe = finetune_performance_optimizations(recipe, peft_scheme)

return recipe


def finetune_performance_optimizations(
recipe: run.Partial,
peft_scheme: str,
) -> run.Partial:
"""
Modify the given recipe to optimize settings for performance.
This method enables performance optimizations that may not be suitable for all use cases.
Intended to build upon the standard fine-tuning recipe.
Args:
recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added
peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
Returns:
run.Partial: Partial configuration for performance-optimized fine-tuning.
Note:
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""

if not hasattr(recipe.trainer, "callbacks"):
recipe.trainer.callbacks = []

if peft_scheme is None or peft_scheme.lower() == 'none':
# Note: limited support. This is not necessarily the most optimized setting
recipe.trainer.strategy.tensor_model_parallel_size = 8
recipe.trainer.strategy.pipeline_model_parallel_size = 14
recipe.trainer.plugins.grad_reduce_in_fp32 = False
recipe.trainer.strategy.ddp = run.Config(
DistributedDataParallelConfig,
check_for_nan_in_grad=True,
grad_reduce_in_fp32=False,
overlap_grad_reduce=True,
overlap_param_gather=True,
average_in_collective=True,
)
recipe.trainer.callbacks.append(
run.Config(
MegatronCommOverlapCallback,
tp_comm_overlap=True,
defer_embedding_wgrad_compute=True,
wgrad_deferral_limit=22,
)
)
else:
recipe.trainer.strategy.tensor_model_parallel_size = 4
recipe.trainer.strategy.pipeline_model_parallel_size = 6
recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 7

recipe.trainer.strategy.sequence_parallel = True

recipe.trainer.callbacks.append(run.Config(TimingCallback))
recipe.trainer.callbacks.append(
run.Config(
GarbageCollectionCallback,
100,
100,
)
)

return recipe
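A hedged usage sketch mirroring the Python API example in the docstring above; the overrides shown are illustrative, not tuned settings.

from nemo.collections.llm.recipes import llama31_405b

# Default LoRA fine-tuning layout: 3 nodes x 8 GPUs.
recipe = llama31_405b.finetune_recipe(name="llama31_405b_lora", num_nodes=3)

# Enabling performance_mode also turns on packed sequences (packed_sequence defaults to
# performance_mode) and applies finetune_performance_optimizations().
perf_recipe = llama31_405b.finetune_recipe(
    name="llama31_405b_lora_perf",
    num_nodes=3,
    seq_length=2048,
    performance_mode=True,
)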