update yaml and sh script
Signed-off-by: gaod <[email protected]>
gdengk committed Jul 23, 2024
1 parent 3fc0d65 commit 17480c7
Showing 4 changed files with 23 additions and 39 deletions.
11 changes: 1 addition & 10 deletions examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
@@ -6,21 +6,12 @@ DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
-training==mixtral/mixtral_8x3b \
+training=mixtral/mixtral_8x3b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x3b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
-training.model.pipeline_model_parallel_size=1 \
-training.model.virtual_pipeline_model_parallel_size=null \
-training.model.expert_model_parallel_size=8 \
-training.model.moe_grouped_gemm=False \
-training.model.gradient_accumulation_fusion=True \
-training.model.optim.name=mcore_distributed_optim \
-+training.model.optim.overlap_grad_sync=True \
-+training.model.optim.overlap_param_sync=True \
-+training.model.optim.grad_sync_dtype=bf16 \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
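
Note: the parallelism and optimizer overrides removed above now live in mixtral_8x3b.yaml, so the effective layout for this recipe is TP=1, PP=1 with no virtual pipeline, and an expert-parallel size of 8, spreading the eight experts across an expert-parallel group of eight ranks. The snippet below is a minimal sanity-check sketch, not part of the recipe: NUM_GPUS is a placeholder, and it assumes Megatron-Core's usual requirement that the expert-parallel size evenly divide the data-parallel size.

# Sanity-check sketch (illustrative only, not part of the recipe).
# NUM_GPUS is a placeholder; the check assumes Megatron-Core requires
# expert_model_parallel_size to evenly divide the data-parallel size.
NUM_GPUS=${NUM_GPUS:-64}
TP=1   # tensor_model_parallel_size (mixtral_8x3b.yaml)
PP=1   # pipeline_model_parallel_size (mixtral_8x3b.yaml)
EP=8   # expert_model_parallel_size (mixtral_8x3b.yaml)
DP=$(( NUM_GPUS / (TP * PP) ))
if (( DP % EP != 0 )); then
  echo "expert_model_parallel_size=${EP} does not divide data-parallel size ${DP}" >&2
  exit 1
fi
echo "layout: TP=${TP} PP=${PP} EP=${EP} DP=${DP}"
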
15 changes: 1 addition & 14 deletions examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
@@ -6,25 +6,12 @@ DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
-training==mixtral/mixtral_8x7b \
+training=mixtral/mixtral_8x7b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x7b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
-training.model.tensor_model_parallel_size=1 \
-training.model.pipeline_model_parallel_size=4 \
-training.model.virtual_pipeline_model_parallel_size=8 \
-training.model.expert_model_parallel_size=8 \
-training.model.sequence_parallel=False \
-training.model.moe_grouped_gemm=False \
-training.model.gradient_accumulation_fusion=True \
-training.model.overlap_p2p_comm=True \
-training.model.batch_p2p_comm=False \
-training.model.optim.name=mcore_distributed_optim \
-+training.model.optim.overlap_grad_sync=True \
-+training.model.optim.overlap_param_sync=True \
-+training.model.optim.grad_sync_dtype=bf16 \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
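
Note: with its overrides moved into mixtral_8x7b.yaml, this recipe runs with TP=1, PP=4, an interleaved pipeline schedule (virtual pipeline size 8), and expert-parallel size 8. As I understand it, overlap_p2p_comm=True combined with batch_p2p_comm=False lets the interleaved schedule overlap pipeline point-to-point traffic with compute, and NCCL_P2P_NET_CHUNKSIZE=2097152 raises NCCL's point-to-point network chunk size to 2 MiB. The sketch below only illustrates the layer-partitioning arithmetic the interleaved schedule implies; the values come from the YAML, and the divisibility rule is the one Megatron-Core normally enforces.

# Layer-partitioning sketch (illustrative only). Assumes Megatron-Core requires
# num_layers to split evenly across pipeline stages and virtual pipeline chunks.
NUM_LAYERS=32  # num_layers (mixtral_8x7b.yaml)
PP=4           # pipeline_model_parallel_size (mixtral_8x7b.yaml)
VPP=8          # virtual_pipeline_model_parallel_size (mixtral_8x7b.yaml)
if (( NUM_LAYERS % (PP * VPP) != 0 )); then
  echo "num_layers=${NUM_LAYERS} must be divisible by PP*VPP=$(( PP * VPP ))" >&2
  exit 1
fi
echo "layers per pipeline stage: $(( NUM_LAYERS / PP ))"
echo "layers per virtual chunk:  $(( NUM_LAYERS / (PP * VPP) ))"
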
15 changes: 9 additions & 6 deletions launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
@@ -46,17 +46,17 @@ exp_manager:

model:
mcore_gpt: true
-moe_grouped_gemm: true
+moe_grouped_gemm: false
moe_token_dispatcher_type: alltoall
moe_pad_expert_input_to_capacity: True
moe_expert_capacity_factor: 1.0
micro_batch_size: 1
global_batch_size: 128
rampup_batch_size: null
tensor_model_parallel_size: 1
-pipeline_model_parallel_size: 4
-expert_model_parallel_size: 1
-virtual_pipeline_model_parallel_size: 8
+pipeline_model_parallel_size: 1
+expert_model_parallel_size: 8
+virtual_pipeline_model_parallel_size: null
encoder_seq_length: 4096
max_position_embeddings: 32768
num_layers: 32
@@ -107,7 +107,7 @@ model:
megatron_amp_O2: True
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: true
-gradient_accumulation_fusion: false
+gradient_accumulation_fusion: true
bias_activation_fusion: true
bias_dropout_add_fusion: true
masked_softmax_fusion: false
@@ -146,7 +146,10 @@ model:
- 0
gen_shape: false
optim:
-name: distributed_fused_adam
+name: mcore_distributed_optim
+overlap_grad_sync: true
+overlap_param_sync: true
+grad_sync_dtype: bf16
lr: 0.0001
weight_decay: 0.1
betas:
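
Note: the optimizer block switches from distributed_fused_adam to the Megatron-Core distributed optimizer. To my understanding, the new keys ask for the gradient reduce-scatter to be overlapped with the backward pass, the parameter all-gather to be overlapped with the forward pass, and gradient synchronization to run in bf16 rather than fp32, roughly halving that traffic. Below is a back-of-envelope sketch of the bf16 saving; TOTAL_PARAMS is a placeholder, not a value taken from this config.

# Back-of-envelope sketch of the grad_sync_dtype=bf16 saving (illustrative only;
# TOTAL_PARAMS is a placeholder, not a value from the config).
TOTAL_PARAMS=${TOTAL_PARAMS:?set total parameter count}
echo "fp32 grad-sync volume: $(( TOTAL_PARAMS * 4 )) bytes"
echo "bf16 grad-sync volume: $(( TOTAL_PARAMS * 2 )) bytes"
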
21 changes: 12 additions & 9 deletions launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -46,18 +46,18 @@ exp_manager:

model:
mcore_gpt: true
-moe_grouped_gemm: true
+moe_grouped_gemm: false
moe_token_dispatcher_type: alltoall
moe_pad_expert_input_to_capacity: True
moe_expert_capacity_factor: 1.0
moe_aux_loss_coeff: 0.01
micro_batch_size: 1
global_batch_size: 256
rampup_batch_size: null
-tensor_model_parallel_size: 8
+tensor_model_parallel_size: 1
pipeline_model_parallel_size: 4
-expert_model_parallel_size: 1
-virtual_pipeline_model_parallel_size: null
+expert_model_parallel_size: 8
+virtual_pipeline_model_parallel_size: 8
encoder_seq_length: 4096
max_position_embeddings: 32768
num_layers: 32
@@ -92,8 +92,8 @@ model:
num_moe_experts: 8
attention_type: multihead
share_embeddings_and_output_weights: false
-overlap_p2p_comm: false
-batch_p2p_comm: true
+overlap_p2p_comm: true
+batch_p2p_comm: false
seq_len_interpolation_factor: null
num_query_groups: 8
tokenizer:
@@ -108,7 +108,7 @@ model:
megatron_amp_O2: True
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: true
-gradient_accumulation_fusion: false
+gradient_accumulation_fusion: true
bias_activation_fusion: true
bias_dropout_add_fusion: true
masked_softmax_fusion: false
@@ -125,7 +125,7 @@ model:
activations_checkpoint_num_layers: null
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
-sequence_parallel: true
+sequence_parallel: false
transformer_engine: true
fp8: false
fp8_e4m3: false
@@ -147,7 +147,10 @@ model:
- 0
gen_shape: false
optim:
-name: distributed_fused_adam
+name: mcore_distributed_optim
+overlap_grad_sync: true
+overlap_param_sync: true
+grad_sync_dtype: bf16
lr: 0.0001
weight_decay: 0.1
betas:
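
Note: several of the + overrides in the shell scripts now refer to keys that also exist in the updated YAMLs, and recent Hydra versions can reject appending a key that is already present, so printing the composed config before submitting anything is a quick check. Assuming launcher_scripts/main.py is a standard Hydra application, the usage sketch below (mirroring the 8x7B script) does that with Hydra's --cfg flag and launches nothing.

# Usage sketch: print the composed training config without launching a job.
# Assumes main.py accepts standard Hydra flags such as --cfg.
HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
    training=mixtral/mixtral_8x7b \
    stages=[training] \
    data_dir=${DATA_DIR} \
    launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
    base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
    training.model.tokenizer.model=${TOK_PATH} \
    --cfg job
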
