update llama2/3 configs
Signed-off-by: Sangkug Lym <[email protected]>
erhoo82 committed Aug 2, 2024
1 parent 8325688 commit fee1762
Showing 12 changed files with 190 additions and 3 deletions.
19 changes: 19 additions & 0 deletions examples/training/llama/h100/llama3_405b_bf16.sh
@@ -0,0 +1,19 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=llama/llama3_405b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="llama3_1_405b_bf16" \
training.run.time_limit=0:30:00 \
training.trainer.num_nodes=72 \
training.model.global_batch_size=252 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.optim.grad_sync_dtype=bf16 \
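For reference, a minimal invocation of this new script might look like the sketch below; the exported paths are illustrative placeholders, not values from this commit.

# Hypothetical usage sketch (paths are placeholders)
export NEMO_FRAMEWORK_LAUNCHER_DIR=/opt/NeMo-Framework-Launcher
export DATA_DIR=/path/to/preprocessed_llama_data
export TOK_PATH=/path/to/llama3_tokenizer.model
bash examples/training/llama/h100/llama3_405b_bf16.sh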
22 changes: 22 additions & 0 deletions examples/training/llama/h100/llama3_405b_fp8.sh
@@ -0,0 +1,22 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=llama/llama3_405b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="llama3_1_405b_fp8" \
training.run.time_limit=0:30:00 \
training.trainer.num_nodes=72 \
training.model.global_batch_size=252 \
training.model.fp8=True \
training.model.fp8_hybrid=True \
training.model.tokenizer.model=${TOK_PATH} \
training.model.optim.grad_sync_dtype=bf16 \
training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192 \
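The Hydra overrides in these scripts can be extended on the same command line. As a rough sketch, with illustrative values that are not tuned settings from this commit, one might scale the run by appending further overrides:

# Illustrative only: extra overrides appended to the launcher command above
    training.trainer.num_nodes=144 \
    training.model.global_batch_size=504 \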
2 changes: 2 additions & 0 deletions launcher_scripts/conf/training/llama/llama2_13b.yaml
@@ -124,6 +124,8 @@ model:
num_micro_batches_with_partial_activation_checkpoints: 0
activations_checkpoint_layers_per_pipeline: 0
sequence_parallel: false
+deterministic_mode: false
+cross_entropy_loss_fusion: true
transformer_engine: true
fp8: false
fp8_e4m3: false
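The new flags added here (and to several of the other Llama configs below) can also be flipped for a single run from the launcher command line instead of editing the YAML; a sketch in the override style of the scripts above:

# Sketch only: per-run toggles for the new model flags (values illustrative)
    training.model.deterministic_mode=True \
    training.model.cross_entropy_loss_fusion=False \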
6 changes: 5 additions & 1 deletion launcher_scripts/conf/training/llama/llama2_70b.yaml
@@ -58,7 +58,7 @@ model:
rampup_batch_size: null
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 4
-virtual_pipeline_model_parallel_size: 20
+virtual_pipeline_model_parallel_size: 5
encoder_seq_length: 4096
max_position_embeddings: 4096
num_layers: 80
@@ -124,6 +124,10 @@ model:
num_micro_batches_with_partial_activation_checkpoints: 0
activations_checkpoint_layers_per_pipeline: 0
sequence_parallel: true
+defer_embedding_wgrad_compute: true
+wgrad_deferral_limit: 22
+cross_entropy_loss_fusion: true
+deterministic_mode: false
transformer_engine: true
fp8: false
fp8_e4m3: false
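To experiment with these llama2_70b values without editing the YAML, the same keys could presumably be overridden through the launcher as sketched below (the data, tokenizer, and results arguments shown in the scripts above are omitted for brevity):

# Sketch only: command-line counterparts of the llama2_70b.yaml changes
HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
    training=llama/llama2_70b \
    stages=[training] \
    training.model.virtual_pipeline_model_parallel_size=5 \
    training.model.defer_embedding_wgrad_compute=True \
    training.model.wgrad_deferral_limit=22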
2 changes: 2 additions & 0 deletions launcher_scripts/conf/training/llama/llama2_7b.yaml
@@ -124,6 +124,8 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: false
+deterministic_mode: false
+cross_entropy_loss_fusion: true

## Transformer Engine
transformer_engine: true
5 changes: 4 additions & 1 deletion launcher_scripts/conf/training/llama/llama3_1_405b.yaml
@@ -1,6 +1,6 @@
defaults:
  - _self_
-  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h8192_tp4_mbs1_seqlen8192
+  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192

hydra:
searchpath:
@@ -124,6 +124,9 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: true
+defer_embedding_wgrad_compute: true
+wgrad_deferral_limit: 50
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/llama/llama3_1_70b.yaml
@@ -124,6 +124,9 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: true
+defer_embedding_wgrad_compute: true
+wgrad_deferral_limit: 22
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
2 changes: 2 additions & 0 deletions launcher_scripts/conf/training/llama/llama3_1_8b.yaml
@@ -124,6 +124,8 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: false
+sequence_parallel: false
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
5 changes: 4 additions & 1 deletion launcher_scripts/conf/training/llama/llama3_70b.yaml
@@ -59,7 +59,7 @@ model:
rampup_batch_size: null
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 4
-virtual_pipeline_model_parallel_size: 10
+virtual_pipeline_model_parallel_size: 5
context_parallel_size: 2
encoder_seq_length: 8192
max_position_embeddings: 8192
@@ -123,6 +123,9 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: true
+defer_embedding_wgrad_compute: true
+wgrad_deferral_limit: 22
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
1 change: 1 addition & 0 deletions launcher_scripts/conf/training/llama/llama3_8b.yaml
@@ -123,6 +123,7 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: false
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
63 changes: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# UB communicator configurations
# Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/BF16

# Bulk overlap with AllGather / ReduceScatter
qkv_dgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

qkv_wgrad:
  method: bulk
  num_sm: 24
  cga_size: 2
  set_sm_margin: 0

fc1_dgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

fc1_wgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

## Ring
qkv_fprop:
  method: ring_exchange
  aggregate: 1

proj_dgrad:
  method: ring_exchange
  aggregate: 1

fc1_fprop:
  method: ring_exchange
  aggregate: 1

fc2_dgrad:
  method: ring_exchange
  aggregate: 1

# Chunked
proj_fprop:
  method: pipeline
  num_sm: 24
  cga_size: 2
  num_splits: 4
  set_sm_margin: 1
  fp8_buf: 0
  atomic_gemm: 0

fc2_fprop:
  method: pipeline
  num_sm: 8
  cga_size: 2
  num_splits: 4
  set_sm_margin: 1
  fp8_buf: 0
  atomic_gemm: 0
63 changes: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# UB communicator configurations
# Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/FP8

# Bulk overlap with AllGather / ReduceScatter
qkv_dgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

qkv_wgrad:
  method: bulk
  num_sm: 24
  cga_size: 2
  set_sm_margin: 0

fc1_dgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

fc1_wgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

## Ring
qkv_fprop:
  method: ring_exchange
  aggregate: 1

proj_dgrad:
  method: ring_exchange
  aggregate: 1

fc1_fprop:
  method: ring_exchange
  aggregate: 1

fc2_dgrad:
  method: ring_exchange
  aggregate: 0

# Chunked
proj_fprop:
  method: pipeline
  num_sm: 24
  cga_size: 2
  num_splits: 4
  set_sm_margin: 1
  fp8_buf: 1
  atomic_gemm: 0

fc2_fprop:
  method: pipeline
  num_sm: 16
  cga_size: 2
  num_splits: 4
  set_sm_margin: 1
  fp8_buf: 1
  atomic_gemm: 0
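These two new communicator configs are pulled in through the training/tp_overlap config group: the bf16 variant is wired in as the optional default of llama3_1_405b.yaml above, while the FP8 variant is selected explicitly by llama3_405b_fp8.sh. As a sketch, either one can be chosen by name at launch time with an override of the same form as that script:

# Sketch only: pick a UB overlap config by name (names taken from this commit)
    training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192 \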
