From b38f285883d07be17ea30f1d155cdfb64516500e Mon Sep 17 00:00:00 2001
From: gaod
Date: Tue, 9 Jul 2024 20:00:01 -0700
Subject: [PATCH 1/4] add mixtral scripts

Signed-off-by: gaod
---
 .../mixtral/h100/mixtral_8x3b_bf16.sh | 26 ++++++++++++++++++
 .../mixtral/h100/mixtral_8x7b_bf16.sh | 27 +++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
 create mode 100644 examples/training/mixtral/h100/mixtral_8x7b_bf16.sh

diff --git a/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh b/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
new file mode 100644
index 000000000..fe1931028
--- /dev/null
+++ b/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+#Users should setup their cluster type in /launcher_scripts/conf/config.yaml
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
+training==mixtral/mixtral_8x7b \
+stages=[training] \
+data_dir=${DATA_DIR} \
+launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
+base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
+training.run.name="mixtral_8x7b_bf16" \
+training.run.time_limit=0:30:00 \
+training.trainer.num_nodes=1 \
+training.model.tokenizer.model=${TOK_PATH} \
+training.model.pipeline_model_parallel_size=1 \
+training.model.virtual_pipeline_model_parallel_size=null \
+training.model.expert_model_parallel_size=8 \
+training.model.moe_grouped_gemm=False \
+training.model.gradient_accumulation_fusion=True \
+training.optim.name=mcore_distributed_optim \
++training.optim.overlap_grad_sync=True \
++training.optim.overlap_param_sync=True \
++training.model.optim.grad_sync_dtype=bf16 \
diff --git a/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh b/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
new file mode 100644
index 000000000..1ce5d3ea9
--- /dev/null
+++ b/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+#Users should setup their cluster type in /launcher_scripts/conf/config.yaml
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
+training==mixtral/mixtral_8x7b \
+stages=[training] \
+data_dir=${DATA_DIR} \
+launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
+base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
+training.run.name="mixtral_8x7b_bf16" \
+training.run.time_limit=0:30:00 \
+training.trainer.num_nodes=1 \
+training.model.tokenizer.model=${TOK_PATH} \
+training.model.tensor_model_parallel_size=1 \
+training.model.pipeline_model_parallel_size=4 \
+training.model.virtual_pipeline_model_parallel_size=8 \
+training.model.sequence_parallel=False \
+training.model.moe_grouped_gemm=False \
+training.model.gradient_accumulation_fusion=True \
+training.optim.name=mcore_distributed_optim \
++training.optim.overlap_grad_sync=True \
++training.optim.overlap_param_sync=True \
++training.model.optim.grad_sync_dtype=bf16 \

From 179e1d58dc3a73267328ae5060c857aec64cc1b8 Mon Sep 17 00:00:00 2001
From: gaod
Date: Wed, 10 Jul 2024 09:07:11 -0700
Subject: [PATCH 2/4] update configs

Signed-off-by: gaod
---
 examples/training/mixtral/h100/mixtral_8x3b_bf16.sh | 8 ++++----
 examples/training/mixtral/h100/mixtral_8x7b_bf16.sh | 9 +++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh b/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
index fe1931028..029b0b69b 100644
--- a/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
+++ b/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
@@ -13,14 +13,14 @@ launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
 base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
 training.run.name="mixtral_8x7b_bf16" \
 training.run.time_limit=0:30:00 \
-training.trainer.num_nodes=1 \
 training.model.tokenizer.model=${TOK_PATH} \
 training.model.pipeline_model_parallel_size=1 \
 training.model.virtual_pipeline_model_parallel_size=null \
 training.model.expert_model_parallel_size=8 \
 training.model.moe_grouped_gemm=False \
 training.model.gradient_accumulation_fusion=True \
-training.optim.name=mcore_distributed_optim \
-+training.optim.overlap_grad_sync=True \
-+training.optim.overlap_param_sync=True \
+training.model.optim.name=mcore_distributed_optim \
++training.model.optim.overlap_grad_sync=True \
++training.model.optim.overlap_param_sync=True \
 +training.model.optim.grad_sync_dtype=bf16 \
++env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
diff --git a/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh b/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
index 1ce5d3ea9..c5ac0f6d5 100644
--- a/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
+++ b/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
@@ -13,15 +13,16 @@ launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
 base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
 training.run.name="mixtral_8x7b_bf16" \
 training.run.time_limit=0:30:00 \
-training.trainer.num_nodes=1 \
 training.model.tokenizer.model=${TOK_PATH} \
 training.model.tensor_model_parallel_size=1 \
 training.model.pipeline_model_parallel_size=4 \
 training.model.virtual_pipeline_model_parallel_size=8 \
+training.model.expert_model_parallel_size=8 \
 training.model.sequence_parallel=False \
 training.model.moe_grouped_gemm=False \
 training.model.gradient_accumulation_fusion=True \
-training.optim.name=mcore_distributed_optim \
-+training.optim.overlap_grad_sync=True \
-+training.optim.overlap_param_sync=True \
+training.model.optim.name=mcore_distributed_optim \
++training.model.optim.overlap_grad_sync=True \
++training.model.optim.overlap_param_sync=True \
 +training.model.optim.grad_sync_dtype=bf16 \
++env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
\ No newline at end of file

From 9477feac417a92de291753a6e2b70c8f05365d2b Mon Sep 17 00:00:00 2001
From: gaod
Date: Wed, 10 Jul 2024 11:20:13 -0700
Subject: [PATCH 3/4] update P2P configs

Signed-off-by: gaod
---
 examples/training/mixtral/h100/mixtral_8x7b_bf16.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh b/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
index c5ac0f6d5..82cbd83f7 100644
--- a/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
+++ b/examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
@@ -21,6 +21,8 @@ training.model.expert_model_parallel_size=8 \
 training.model.sequence_parallel=False \
 training.model.moe_grouped_gemm=False \
 training.model.gradient_accumulation_fusion=True \
+training.model.overlap_p2p_comm=True \
+training.model.batch_p2p_comm=False \
 training.model.optim.name=mcore_distributed_optim \
 +training.model.optim.overlap_grad_sync=True \
 +training.model.optim.overlap_param_sync=True \

From 3383d82847ea4123be8b0ee1357bfd813a9a0996 Mon Sep 17 00:00:00 2001
From: gdengk <160076886+gdengk@users.noreply.github.com>
Date: Wed, 10 Jul 2024 11:29:36 -0700
Subject: [PATCH 4/4] Update mixtral_8x3b_bf16.sh

typo
---
 examples/training/mixtral/h100/mixtral_8x3b_bf16.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh b/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
index 029b0b69b..33c7299e2 100644
--- a/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
+++ b/examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
@@ -6,12 +6,12 @@ DATA_DIR=${DATA_DIR}
 TOK_PATH=${TOK_PATH}
 
 HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
-training==mixtral/mixtral_8x7b \
+training==mixtral/mixtral_8x3b \
 stages=[training] \
 data_dir=${DATA_DIR} \
 launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
 base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
-training.run.name="mixtral_8x7b_bf16" \
+training.run.name="mixtral_8x3b_bf16" \
 training.run.time_limit=0:30:00 \
 training.model.tokenizer.model=${TOK_PATH} \
 training.model.pipeline_model_parallel_size=1 \
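
For reference, a minimal sketch of how one of the added scripts might be invoked once this series is applied. It assumes the cluster type has already been set in launcher_scripts/conf/config.yaml, as the scripts' comments note; the DATA_DIR and TOK_PATH values below are placeholders for illustration, not paths shipped with this patch.

    # Illustrative usage only; the /path/to/... values are assumed, not part of this patch.
    export NEMO_FRAMEWORK_LAUNCHER_DIR=/opt/NeMo-Framework-Launcher   # default used inside the scripts
    export DATA_DIR=/path/to/preprocessed_mixtral_data                # assumed location of preprocessed training data
    export TOK_PATH=/path/to/tokenizer.model                          # assumed location of the tokenizer model file
    bash examples/training/mixtral/h100/mixtral_8x7b_bf16.sh

Both scripts read these environment variables at the top and pass everything else as Hydra overrides to launcher_scripts/main.py, so per-run changes (node count, run name, parallelism sizes) can be made by editing the override list rather than the launcher configs.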