Merge pull request #374 from gdengk/gaod/moe/mixtral-script-2405
Update mixtral scripts for performance
Showing 2 changed files with 56 additions and 0 deletions.
New file (+26 lines): launch script for Mixtral 8x3B BF16 training.

#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x3b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x3b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.pipeline_model_parallel_size=1 \
training.model.virtual_pipeline_model_parallel_size=null \
training.model.expert_model_parallel_size=8 \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
training.model.optim.name=mcore_distributed_optim \
+training.model.optim.overlap_grad_sync=True \
+training.model.optim.overlap_param_sync=True \
+training.model.optim.grad_sync_dtype=bf16 \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152
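
The script reads DATA_DIR and TOK_PATH from the environment rather than hard-coding them. A minimal invocation sketch follows; the paths and the script filename are hypothetical placeholders (the diff does not show the committed file path):

# Hypothetical example, not part of the PR: export the two required
# environment variables, then run the script. Replace the paths with your
# own preprocessed dataset directory and Mixtral tokenizer model; the
# script filename here is assumed from the run name.
export DATA_DIR=/path/to/preprocessed_data
export TOK_PATH=/path/to/tokenizer.model
bash mixtral_8x3b_bf16.sh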
New file (+30 lines): launch script for Mixtral 8x7B BF16 training.

#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x7b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x7b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.tensor_model_parallel_size=1 \
training.model.pipeline_model_parallel_size=4 \
training.model.virtual_pipeline_model_parallel_size=8 \
training.model.expert_model_parallel_size=8 \
training.model.sequence_parallel=False \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
training.model.overlap_p2p_comm=True \
training.model.batch_p2p_comm=False \
training.model.optim.name=mcore_distributed_optim \
+training.model.optim.overlap_grad_sync=True \
+training.model.optim.overlap_param_sync=True \
+training.model.optim.grad_sync_dtype=bf16 \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152
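
As a rough sizing check, these overrides imply a minimum world size: in Megatron-Core the total GPU count must be divisible by the product of the tensor, pipeline, and expert model parallel sizes (assuming expert tensor parallelism equals TP). A minimal sketch under that assumption, not part of the PR:

# Sketch only: smallest GPU counts compatible with the overrides above,
# assuming world_size must be divisible by TP * PP * EP.
TP=1; PP=4; EP=8                                      # mixtral_8x7b script
echo "min GPUs for mixtral_8x7b: $((TP * PP * EP))"   # prints 32
TP=1; PP=1; EP=8                                      # mixtral_8x3b script
echo "min GPUs for mixtral_8x3b: $((TP * PP * EP))"   # prints 8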