From 630e6544ef4b2927e489994caa56a1faf305be3a Mon Sep 17 00:00:00 2001
From: "Zhenghang (Max) Xu"
Date: Tue, 17 Dec 2024 14:17:03 -0800
Subject: [PATCH] Update nemotron_340b.yaml

Update the recipe arguments to achieve an MFU similar to the one reported
in the paper https://arxiv.org/pdf/2406.11704
---
 .../conf/training/nemotron/nemotron_340b.yaml | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/launcher_scripts/conf/training/nemotron/nemotron_340b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_340b.yaml
index 6003ea945..241d1198c 100644
--- a/launcher_scripts/conf/training/nemotron/nemotron_340b.yaml
+++ b/launcher_scripts/conf/training/nemotron/nemotron_340b.yaml
@@ -58,8 +58,8 @@ model:
   rampup_batch_size: null
   context_parallel_size: 1
   tensor_model_parallel_size: 8
-  pipeline_model_parallel_size: 12
-  virtual_pipeline_model_parallel_size: 8
+  pipeline_model_parallel_size: 8
+  virtual_pipeline_model_parallel_size: 12
   encoder_seq_length: 4096
   max_position_embeddings: 4096
   num_layers: 96
@@ -131,9 +131,17 @@ model:
   fsdp_sharding_strategy: 'full' # Method to shard model states. Available options are 'full', 'hybrid', and 'grad'.
   fsdp_grad_reduce_dtype: 32 # Gradient reduction data type.
   fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint.
+
+  defer_embedding_wgrad_compute: True
+  wgrad_deferral_limit: 22
+  cross_entropy_loss_fusion: True
+  enable_vboost: True
+  ub_tp_comm_overlap: True
+  apply_rope_fusion: True
+  deterministic_mode: False
+  overlap_p2p_comm: True # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
 
-  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
-  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
   num_query_groups: 8 # Number of query groups for group query attention. If None, normal attention is used.
 
   ## Network
@@ -188,4 +196,4 @@ model:
       - .0333
       - ${data_dir}/my-nemotron_00_text_document
       - .0333
-      - ${data_dir}/my-nemotron_00_text_document
\ No newline at end of file
+      - ${data_dir}/my-nemotron_00_text_document