update yaml and sh script
Signed-off-by: gaod <[email protected]>
gdengk committed Jul 23, 2024
1 parent 3fc0d65 commit 17480c7
Showing 4 changed files with 23 additions and 39 deletions.
11 changes: 1 addition & 10 deletions examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
@@ -6,21 +6,12 @@ DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
-training==mixtral/mixtral_8x3b \
+training=mixtral/mixtral_8x3b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x3b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
-training.model.pipeline_model_parallel_size=1 \
-training.model.virtual_pipeline_model_parallel_size=null \
-training.model.expert_model_parallel_size=8 \
-training.model.moe_grouped_gemm=False \
-training.model.gradient_accumulation_fusion=True \
-training.model.optim.name=mcore_distributed_optim \
-+training.model.optim.overlap_grad_sync=True \
-+training.model.optim.overlap_param_sync=True \
-+training.model.optim.grad_sync_dtype=bf16 \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
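
Note: the parallelism and optimizer overrides removed above now live in mixtral_8x3b.yaml, so the effective layout for this recipe is TP=1, PP=1 with no virtual pipeline, and an expert-parallel size of 8, spreading the eight experts across an expert-parallel group of eight ranks. The snippet below is a minimal sanity-check sketch, not part of the recipe: NUM_GPUS is a placeholder, and it assumes Megatron-Core's usual requirement that the expert-parallel size evenly divide the data-parallel size.

# Sanity-check sketch (illustrative only, not part of the recipe).
# NUM_GPUS is a placeholder; the check assumes Megatron-Core requires
# expert_model_parallel_size to evenly divide the data-parallel size.
NUM_GPUS=${NUM_GPUS:-64}
TP=1   # tensor_model_parallel_size (mixtral_8x3b.yaml)
PP=1   # pipeline_model_parallel_size (mixtral_8x3b.yaml)
EP=8   # expert_model_parallel_size (mixtral_8x3b.yaml)
DP=$(( NUM_GPUS / (TP * PP) ))
if (( DP % EP != 0 )); then
  echo "expert_model_parallel_size=${EP} does not divide data-parallel size ${DP}" >&2
  exit 1
fi
echo "layout: TP=${TP} PP=${PP} EP=${EP} DP=${DP}"
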
15 changes: 1 addition & 14 deletions examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
@@ -6,25 +6,12 @@ DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
-training==mixtral/mixtral_8x7b \
+training=mixtral/mixtral_8x7b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x7b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
-training.model.tensor_model_parallel_size=1 \
-training.model.pipeline_model_parallel_size=4 \
-training.model.virtual_pipeline_model_parallel_size=8 \
-training.model.expert_model_parallel_size=8 \
-training.model.sequence_parallel=False \
-training.model.moe_grouped_gemm=False \
-training.model.gradient_accumulation_fusion=True \
-training.model.overlap_p2p_comm=True \
-training.model.batch_p2p_comm=False \
-training.model.optim.name=mcore_distributed_optim \
-+training.model.optim.overlap_grad_sync=True \
-+training.model.optim.overlap_param_sync=True \
-+training.model.optim.grad_sync_dtype=bf16 \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
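
Note: with its overrides moved into mixtral_8x7b.yaml, this recipe runs with TP=1, PP=4, an interleaved pipeline schedule (virtual pipeline size 8), and expert-parallel size 8. As I understand it, overlap_p2p_comm=True combined with batch_p2p_comm=False lets the interleaved schedule overlap pipeline point-to-point traffic with compute, and NCCL_P2P_NET_CHUNKSIZE=2097152 raises NCCL's point-to-point network chunk size to 2 MiB. The sketch below only illustrates the layer-partitioning arithmetic the interleaved schedule implies; the values come from the YAML, and the divisibility rule is the one Megatron-Core normally enforces.

# Layer-partitioning sketch (illustrative only). Assumes Megatron-Core requires
# num_layers to split evenly across pipeline stages and virtual pipeline chunks.
NUM_LAYERS=32  # num_layers (mixtral_8x7b.yaml)
PP=4           # pipeline_model_parallel_size (mixtral_8x7b.yaml)
VPP=8          # virtual_pipeline_model_parallel_size (mixtral_8x7b.yaml)
if (( NUM_LAYERS % (PP * VPP) != 0 )); then
  echo "num_layers=${NUM_LAYERS} must be divisible by PP*VPP=$(( PP * VPP ))" >&2
  exit 1
fi
echo "layers per pipeline stage: $(( NUM_LAYERS / PP ))"
echo "layers per virtual chunk:  $(( NUM_LAYERS / (PP * VPP) ))"
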
15 changes: 9 additions & 6 deletions launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
@@ -46,17 +46,17 @@ exp_manager:

model:
mcore_gpt: true
-moe_grouped_gemm: true
+moe_grouped_gemm: false
moe_token_dispatcher_type: alltoall
moe_pad_expert_input_to_capacity: True
moe_expert_capacity_factor: 1.0
micro_batch_size: 1
global_batch_size: 128
rampup_batch_size: null
tensor_model_parallel_size: 1
-pipeline_model_parallel_size: 4
-expert_model_parallel_size: 1
-virtual_pipeline_model_parallel_size: 8
+pipeline_model_parallel_size: 1
+expert_model_parallel_size: 8
+virtual_pipeline_model_parallel_size: null
encoder_seq_length: 4096
max_position_embeddings: 32768
num_layers: 32
@@ -107,7 +107,7 @@ model:
megatron_amp_O2: True
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: true
-gradient_accumulation_fusion: false
+gradient_accumulation_fusion: true
bias_activation_fusion: true
bias_dropout_add_fusion: true
masked_softmax_fusion: false
@@ -146,7 +146,10 @@ model:
- 0
gen_shape: false
optim:
-name: distributed_fused_adam
+name: mcore_distributed_optim
+overlap_grad_sync: true
+overlap_param_sync: true
+grad_sync_dtype: bf16
lr: 0.0001
weight_decay: 0.1
betas:
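
Note: the optimizer block switches from distributed_fused_adam to the Megatron-Core distributed optimizer. To my understanding, the new keys ask for the gradient reduce-scatter to be overlapped with the backward pass, the parameter all-gather to be overlapped with the forward pass, and gradient synchronization to run in bf16 rather than fp32, roughly halving that traffic. Below is a back-of-envelope sketch of the bf16 saving; TOTAL_PARAMS is a placeholder, not a value taken from this config.

# Back-of-envelope sketch of the grad_sync_dtype=bf16 saving (illustrative only;
# TOTAL_PARAMS is a placeholder, not a value from the config).
TOTAL_PARAMS=${TOTAL_PARAMS:?set total parameter count}
echo "fp32 grad-sync volume: $(( TOTAL_PARAMS * 4 )) bytes"
echo "bf16 grad-sync volume: $(( TOTAL_PARAMS * 2 )) bytes"
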
21 changes: 12 additions & 9 deletions launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -46,18 +46,18 @@ exp_manager:

model:
mcore_gpt: true
-moe_grouped_gemm: true
+moe_grouped_gemm: false
moe_token_dispatcher_type: alltoall
moe_pad_expert_input_to_capacity: True
moe_expert_capacity_factor: 1.0
moe_aux_loss_coeff: 0.01
micro_batch_size: 1
global_batch_size: 256
rampup_batch_size: null
-tensor_model_parallel_size: 8
+tensor_model_parallel_size: 1
pipeline_model_parallel_size: 4
-expert_model_parallel_size: 1
-virtual_pipeline_model_parallel_size: null
+expert_model_parallel_size: 8
+virtual_pipeline_model_parallel_size: 8
encoder_seq_length: 4096
max_position_embeddings: 32768
num_layers: 32
@@ -92,8 +92,8 @@ model:
num_moe_experts: 8
attention_type: multihead
share_embeddings_and_output_weights: false
-overlap_p2p_comm: false
-batch_p2p_comm: true
+overlap_p2p_comm: true
+batch_p2p_comm: false
seq_len_interpolation_factor: null
num_query_groups: 8
tokenizer:
@@ -108,7 +108,7 @@ model:
megatron_amp_O2: True
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: true
-gradient_accumulation_fusion: false
+gradient_accumulation_fusion: true
bias_activation_fusion: true
bias_dropout_add_fusion: true
masked_softmax_fusion: false
@@ -125,7 +125,7 @@ model:
activations_checkpoint_num_layers: null
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
-sequence_parallel: true
+sequence_parallel: false
transformer_engine: true
fp8: false
fp8_e4m3: false
@@ -147,7 +147,10 @@ model:
- 0
gen_shape: false
optim:
-name: distributed_fused_adam
+name: mcore_distributed_optim
+overlap_grad_sync: true
+overlap_param_sync: true
+grad_sync_dtype: bf16
lr: 0.0001
weight_decay: 0.1
betas:
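
Note: several of the + overrides in the shell scripts now refer to keys that also exist in the updated YAMLs, and recent Hydra versions can reject appending a key that is already present, so printing the composed config before submitting anything is a quick check. Assuming launcher_scripts/main.py is a standard Hydra application, the usage sketch below (mirroring the 8x7B script) does that with Hydra's --cfg flag and launches nothing.

# Usage sketch: print the composed training config without launching a job.
# Assumes main.py accepts standard Hydra flags such as --cfg.
HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
    training=mixtral/mixtral_8x7b \
    stages=[training] \
    data_dir=${DATA_DIR} \
    launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
    base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
    training.model.tokenizer.model=${TOK_PATH} \
    --cfg job
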
