diff --git a/examples/training/llama/h100/llama3_405b_bf16.sh b/examples/training/llama/h100/llama3_405b_bf16.sh
new file mode 100644
index 0000000000..027a4f3d42
--- /dev/null
+++ b/examples/training/llama/h100/llama3_405b_bf16.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+# Launches the Llama 3.1 405B BF16 training stage on 72 H100 nodes through
+# the NeMo Framework Launcher (conf/training/llama/llama3_1_405b.yaml).
+#
+# Environment variables:
+#   NEMO_FRAMEWORK_LAUNCHER_DIR  launcher checkout (default: /opt/NeMo-Framework-Launcher)
+#   DATA_DIR                     forwarded to the launcher as data_dir
+#   TOK_PATH                     forwarded as training.model.tokenizer.model
+#
+# Users should set up their cluster type in launcher_scripts/conf/config.yaml
+
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+# Overrides are quoted so paths with spaces stay a single argument and
+# "[training]" is never expanded as a filename glob.
+HYDRA_FULL_ERROR=1 python3 "${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py" \
+  training=llama/llama3_1_405b \
+  'stages=[training]' \
+  "data_dir=${DATA_DIR}" \
+  "launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts" \
+  "base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results" \
+  training.run.name="llama3_1_405b_bf16" \
+  training.run.time_limit=0:30:00 \
+  training.trainer.num_nodes=72 \
+  training.model.global_batch_size=252 \
+  "training.model.tokenizer.model=${TOK_PATH}" \
+  +training.model.optim.grad_sync_dtype=bf16
diff --git a/examples/training/llama/h100/llama3_405b_fp8.sh b/examples/training/llama/h100/llama3_405b_fp8.sh
new file mode 100644
index 0000000000..c902676562
--- /dev/null
+++ b/examples/training/llama/h100/llama3_405b_fp8.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#
+# Launches the Llama 3.1 405B FP8 training stage on 72 H100 nodes through
+# the NeMo Framework Launcher (conf/training/llama/llama3_1_405b.yaml).
+#
+# Environment variables:
+#   NEMO_FRAMEWORK_LAUNCHER_DIR  launcher checkout (default: /opt/NeMo-Framework-Launcher)
+#   DATA_DIR                     forwarded to the launcher as data_dir
+#   TOK_PATH                     forwarded as training.model.tokenizer.model
+#
+# Users should set up their cluster type in launcher_scripts/conf/config.yaml
+
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+# Overrides are quoted so paths with spaces stay a single argument and
+# "[training]" is never expanded as a filename glob.
+HYDRA_FULL_ERROR=1 python3 "${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py" \
+  training=llama/llama3_1_405b \
+  'stages=[training]' \
+  "data_dir=${DATA_DIR}" \
+  "launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts" \
+  "base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results" \
+  training.run.name="llama3_1_405b_fp8" \
+  training.run.time_limit=0:30:00 \
+  training.trainer.num_nodes=72 \
+  training.model.global_batch_size=252 \
+  training.model.fp8=True \
+  training.model.fp8_hybrid=True \
+  "training.model.tokenizer.model=${TOK_PATH}" \
+  +training.model.optim.grad_sync_dtype=bf16 \
+  training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192
diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml
index 121e370e55..3babf1cd3a 100644
--- a/launcher_scripts/conf/training/llama/llama2_13b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml
@@ -124,6 +124,8 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: 0
   activations_checkpoint_layers_per_pipeline: 0
   sequence_parallel: false
+  deterministic_mode: false
+  cross_entropy_loss_fusion: true
   transformer_engine: true
   fp8: false
   fp8_e4m3: false
diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml
index 381928a1cc..b5f7726517 100644
--- a/launcher_scripts/conf/training/llama/llama2_70b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml
@@ -58,7 +58,7 @@ model:
   rampup_batch_size: null
   tensor_model_parallel_size: 4
   pipeline_model_parallel_size: 4
-  virtual_pipeline_model_parallel_size: 20
+  virtual_pipeline_model_parallel_size: 5
   encoder_seq_length: 4096
   max_position_embeddings: 4096
   num_layers: 80
@@ -124,6 +124,10 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: 0
   activations_checkpoint_layers_per_pipeline: 0
   sequence_parallel: true
+  defer_embedding_wgrad_compute: true
+  wgrad_deferral_limit: 22
+  cross_entropy_loss_fusion: true
+  deterministic_mode: false
   transformer_engine: true
   fp8: false
   fp8_e4m3: false
diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml
index 00a4ec0fee..150037ab70 100644
--- a/launcher_scripts/conf/training/llama/llama2_7b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml
@@ -124,6 +124,8 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false
+  deterministic_mode: false
+  cross_entropy_loss_fusion: true
 
   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/llama/llama3_1_405b.yaml b/launcher_scripts/conf/training/llama/llama3_1_405b.yaml
index 4ec5d55245..2435f3cfd9 100644
--- a/launcher_scripts/conf/training/llama/llama3_1_405b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_1_405b.yaml
@@ -1,6 +1,6 @@
 defaults:
   - _self_
-  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h8192_tp4_mbs1_seqlen8192
+  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192
 
 hydra:
   searchpath:
@@ -124,6 +124,9 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: true
+  defer_embedding_wgrad_compute: true
+  wgrad_deferral_limit: 50
+  deterministic_mode: false
 
   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/llama/llama3_1_70b.yaml b/launcher_scripts/conf/training/llama/llama3_1_70b.yaml
index b6153ad4c5..ca255b2940 100644
--- a/launcher_scripts/conf/training/llama/llama3_1_70b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_1_70b.yaml
@@ -124,6 +124,9 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: true
+  defer_embedding_wgrad_compute: true
+  wgrad_deferral_limit: 22
+  deterministic_mode: false
 
   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/llama/llama3_1_8b.yaml b/launcher_scripts/conf/training/llama/llama3_1_8b.yaml
index 6228baf7c7..2f78386446 100644
--- a/launcher_scripts/conf/training/llama/llama3_1_8b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_1_8b.yaml
@@ -124,6 +124,7 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false
+  deterministic_mode: false
 
   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/llama/llama3_70b.yaml b/launcher_scripts/conf/training/llama/llama3_70b.yaml
index 6202bc52a3..a1690e5c81 100644
--- a/launcher_scripts/conf/training/llama/llama3_70b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_70b.yaml
@@ -59,7 +59,7 @@ model:
   rampup_batch_size: null
   tensor_model_parallel_size: 4
   pipeline_model_parallel_size: 4
-  virtual_pipeline_model_parallel_size: 10
+  virtual_pipeline_model_parallel_size: 5
   context_parallel_size: 2
   encoder_seq_length: 8192
   max_position_embeddings: 8192
@@ -123,6 +123,9 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: true
+  defer_embedding_wgrad_compute: true
+  wgrad_deferral_limit: 22
+  deterministic_mode: false
 
   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/llama/llama3_8b.yaml b/launcher_scripts/conf/training/llama/llama3_8b.yaml
index 44e2ff6190..019ed63275 100644
--- a/launcher_scripts/conf/training/llama/llama3_8b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_8b.yaml
@@ -123,6 +123,7 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false
+  deterministic_mode: false
 
   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml
new file mode 100644
index 0000000000..fd88f0a9c4
--- /dev/null
+++ b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml
@@ -0,0 +1,63 @@
+# UB communicator configurations
+# Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/BF16
+
+# Bulk overlap with AllGather / ReduceScatter
+qkv_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+qkv_wgrad:
+  method: bulk
+  num_sm: 24
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_wgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+## Ring
+qkv_fprop:
+  method: ring_exchange
+  aggregate: 1
+
+proj_dgrad:
+  method: ring_exchange
+  aggregate: 1
+
+fc1_fprop:
+  method: ring_exchange
+  aggregate: 1
+
+fc2_dgrad:
+  method: ring_exchange
+  aggregate: 1
+
+# Chunked
+proj_fprop:
+  method: pipeline
+  num_sm: 24
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+  fp8_buf: 0
+  atomic_gemm: 0
+
+fc2_fprop:
+  method: pipeline
+  num_sm: 8
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+  fp8_buf: 0
+  atomic_gemm: 0
diff --git a/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml
new file mode 100644
index 0000000000..7fabf039fb
--- /dev/null
+++ b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml
@@ -0,0 +1,63 @@
+# UB communicator configurations
+# Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/FP8
+
+# Bulk overlap with AllGather / ReduceScatter
+qkv_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+qkv_wgrad:
+  method: bulk
+  num_sm: 24
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_wgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+## Ring
+qkv_fprop:
+  method: ring_exchange
+  aggregate: 1
+
+proj_dgrad:
+  method: ring_exchange
+  aggregate: 1
+
+fc1_fprop:
+  method: ring_exchange
+  aggregate: 1
+
+fc2_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+# Chunked
+proj_fprop:
+  method: pipeline
+  num_sm: 24
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+  fp8_buf: 1
+  atomic_gemm: 0
+
+fc2_fprop:
+  method: pipeline
+  num_sm: 16
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+  fp8_buf: 1
+  atomic_gemm: 0