From fee1762f2c6c66cf8c14935d7d9acfb211d51291 Mon Sep 17 00:00:00 2001
From: Sangkug Lym
Date: Fri, 2 Aug 2024 16:19:39 -0700
Subject: [PATCH] update llama2/3 configs

Signed-off-by: Sangkug Lym
---
 .../training/llama/h100/llama3_405b_bf16.sh   | 19 ++++++
 .../training/llama/h100/llama3_405b_fp8.sh    | 22 +++++++
 .../conf/training/llama/llama2_13b.yaml       |  2 +
 .../conf/training/llama/llama2_70b.yaml       |  6 +-
 .../conf/training/llama/llama2_7b.yaml        |  2 +
 .../conf/training/llama/llama3_1_405b.yaml    |  5 +-
 .../conf/training/llama/llama3_1_70b.yaml     |  3 +
 .../conf/training/llama/llama3_1_8b.yaml      |  2 +
 .../conf/training/llama/llama3_70b.yaml       |  5 +-
 .../conf/training/llama/llama3_8b.yaml        |  1 +
 ...0_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml | 63 +++++++++++++++++++
 ...00_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml | 63 +++++++++++++++++++
 12 files changed, 190 insertions(+), 3 deletions(-)
 create mode 100644 examples/training/llama/h100/llama3_405b_bf16.sh
 create mode 100644 examples/training/llama/h100/llama3_405b_fp8.sh
 create mode 100644 launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml
 create mode 100644 launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml

diff --git a/examples/training/llama/h100/llama3_405b_bf16.sh b/examples/training/llama/h100/llama3_405b_bf16.sh
new file mode 100644
index 0000000000..027a4f3d42
--- /dev/null
+++ b/examples/training/llama/h100/llama3_405b_bf16.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+#Users should setup their cluster type in /launcher_scripts/conf/config.yaml
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
+training=llama/llama3_405b \
+stages=[training] \
+data_dir=${DATA_DIR} \
+launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
+base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
+training.run.name="llama3_1_405b_bf16" \
+training.run.time_limit=0:30:00 \
+training.trainer.num_nodes=72 \
+training.model.global_batch_size=252 \
+training.model.tokenizer.model=${TOK_PATH} \
++training.model.optim.grad_sync_dtype=bf16 \
diff --git a/examples/training/llama/h100/llama3_405b_fp8.sh b/examples/training/llama/h100/llama3_405b_fp8.sh
new file mode 100644
index 0000000000..c902676562
--- /dev/null
+++ b/examples/training/llama/h100/llama3_405b_fp8.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+#Users should setup their cluster type in /launcher_scripts/conf/config.yaml
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
+training=llama/llama3_405b \
+stages=[training] \
+data_dir=${DATA_DIR} \
+launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
+base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
+training.run.name="llama3_1_405b_fp8" \
+training.run.time_limit=0:30:00 \
+training.trainer.num_nodes=72 \
+training.model.global_batch_size=252 \
+training.model.fp8=True \
+training.model.fp8_hybrid=True \
+training.model.tokenizer.model=${TOK_PATH} \
++training.model.optim.grad_sync_dtype=bf16 \
+training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192 \
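For reference, a minimal invocation of the new BF16 recipe script might look like the sketch below. The NEMO_FRAMEWORK_LAUNCHER_DIR, DATA_DIR, and TOK_PATH variables come from the script itself; the concrete paths shown are placeholders, not values taken from this patch.

    # Hypothetical setup; replace the placeholder paths with real ones.
    export NEMO_FRAMEWORK_LAUNCHER_DIR=/opt/NeMo-Framework-Launcher   # default used by the script when unset
    export DATA_DIR=/path/to/preprocessed_data                        # preprocessed training data
    export TOK_PATH=/path/to/tokenizer.model                          # Llama tokenizer model file

    # Runs llama3_405b_bf16.sh, which launches main.py with the overrides shown in the hunk above.
    bash ${NEMO_FRAMEWORK_LAUNCHER_DIR}/examples/training/llama/h100/llama3_405b_bf16.sh

The FP8 variant (llama3_405b_fp8.sh) is invoked the same way; it additionally enables training.model.fp8 and training.model.fp8_hybrid and selects the FP8 userbuffer overlap config added later in this patch.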
diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml
index 121e370e55..3babf1cd3a 100644
--- a/launcher_scripts/conf/training/llama/llama2_13b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml
@@ -124,6 +124,8 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: 0
   activations_checkpoint_layers_per_pipeline: 0
   sequence_parallel: false
+  deterministic_mode: false
+  cross_entropy_loss_fusion: true
   transformer_engine: true
   fp8: false
   fp8_e4m3: false
diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml
index 381928a1cc..b5f7726517 100644
--- a/launcher_scripts/conf/training/llama/llama2_70b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml
@@ -58,7 +58,7 @@ model:
   rampup_batch_size: null
   tensor_model_parallel_size: 4
   pipeline_model_parallel_size: 4
-  virtual_pipeline_model_parallel_size: 20
+  virtual_pipeline_model_parallel_size: 5
   encoder_seq_length: 4096
   max_position_embeddings: 4096
   num_layers: 80
@@ -124,6 +124,10 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: 0
   activations_checkpoint_layers_per_pipeline: 0
   sequence_parallel: true
+  defer_embedding_wgrad_compute: true
+  wgrad_deferral_limit: 22
+  cross_entropy_loss_fusion: true
+  deterministic_mode: false
   transformer_engine: true
   fp8: false
   fp8_e4m3: false
diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml
index 00a4ec0fee..150037ab70 100644
--- a/launcher_scripts/conf/training/llama/llama2_7b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml
@@ -124,6 +124,8 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false
+  deterministic_mode: false
+  cross_entropy_loss_fusion: true

   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/llama/llama3_1_405b.yaml b/launcher_scripts/conf/training/llama/llama3_1_405b.yaml
index 4ec5d55245..2435f3cfd9 100644
--- a/launcher_scripts/conf/training/llama/llama3_1_405b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_1_405b.yaml
@@ -1,6 +1,6 @@
 defaults:
   - _self_
-  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h8192_tp4_mbs1_seqlen8192
+  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192

 hydra:
   searchpath:
@@ -124,6 +124,9 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: true
+  defer_embedding_wgrad_compute: true
+  wgrad_deferral_limit: 50
+  deterministic_mode: false

   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/llama/llama3_1_70b.yaml b/launcher_scripts/conf/training/llama/llama3_1_70b.yaml
index b6153ad4c5..ca255b2940 100644
--- a/launcher_scripts/conf/training/llama/llama3_1_70b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_1_70b.yaml
@@ -124,6 +124,9 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: true
+  defer_embedding_wgrad_compute: true
+  wgrad_deferral_limit: 22
+  deterministic_mode: false

   ## Transformer Engine
   transformer_engine: true
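The model keys introduced above (deterministic_mode, cross_entropy_loss_fusion, defer_embedding_wgrad_compute, wgrad_deferral_limit) are ordinary Hydra config entries, so they can also be toggled per run from the command line instead of editing the YAML. A sketch in the override style of the example scripts, with illustrative values only:

    # Hypothetical one-off run that flips two of the new llama2_70b flags at launch time.
    HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
        training=llama/llama2_70b \
        stages=[training] \
        data_dir=${DATA_DIR} \
        launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
        base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
        training.model.tokenizer.model=${TOK_PATH} \
        training.model.deterministic_mode=True \
        training.model.cross_entropy_loss_fusion=False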
diff --git a/launcher_scripts/conf/training/llama/llama3_1_8b.yaml b/launcher_scripts/conf/training/llama/llama3_1_8b.yaml
index 6228baf7c7..2f78386446 100644
--- a/launcher_scripts/conf/training/llama/llama3_1_8b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_1_8b.yaml
@@ -124,6 +124,8 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false
+  sequence_parallel: false
+  deterministic_mode: false

   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/llama/llama3_70b.yaml b/launcher_scripts/conf/training/llama/llama3_70b.yaml
index 6202bc52a3..a1690e5c81 100644
--- a/launcher_scripts/conf/training/llama/llama3_70b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_70b.yaml
@@ -59,7 +59,7 @@ model:
   rampup_batch_size: null
   tensor_model_parallel_size: 4
   pipeline_model_parallel_size: 4
-  virtual_pipeline_model_parallel_size: 10
+  virtual_pipeline_model_parallel_size: 5
   context_parallel_size: 2
   encoder_seq_length: 8192
   max_position_embeddings: 8192
@@ -123,6 +123,9 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: true
+  defer_embedding_wgrad_compute: true
+  wgrad_deferral_limit: 22
+  deterministic_mode: false

   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/llama/llama3_8b.yaml b/launcher_scripts/conf/training/llama/llama3_8b.yaml
index 44e2ff6190..019ed63275 100644
--- a/launcher_scripts/conf/training/llama/llama3_8b.yaml
+++ b/launcher_scripts/conf/training/llama/llama3_8b.yaml
@@ -123,6 +123,7 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false
+  deterministic_mode: false

   ## Transformer Engine
   transformer_engine: true
diff --git a/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml
new file mode 100644
index 0000000000..fd88f0a9c4
--- /dev/null
+++ b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml
@@ -0,0 +1,63 @@
+# UB communicator configurations
+# Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/BF16
+
+# Bulk overlap with AllGather / ReduceScatter
+qkv_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+qkv_wgrad:
+  method: bulk
+  num_sm: 24
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_wgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+## Ring
+qkv_fprop:
+  method: ring_exchange
+  aggregate: 1
+
+proj_dgrad:
+  method: ring_exchange
+  aggregate: 1
+
+fc1_fprop:
+  method: ring_exchange
+  aggregate: 1
+
+fc2_dgrad:
+  method: ring_exchange
+  aggregate: 1
+
+# Chunked
+proj_fprop:
+  method: pipeline
+  num_sm: 24
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+  fp8_buf: 0
+  atomic_gemm: 0
+
+fc2_fprop:
+  method: pipeline
+  num_sm: 8
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+  fp8_buf: 0
+  atomic_gemm: 0
\ No newline at end of file
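As the llama3_1_405b.yaml defaults change above and the FP8 example script show, one of these userbuffer configs is attached to a model through the Hydra config-group override tp_overlap@model.ub_tp_comm_overlap_cfg. A sketch of selecting the new BF16 variant explicitly at launch time, mirroring the override used in llama3_405b_fp8.sh:

    # Hypothetical launch that pins the BF16 overlap config added above.
    HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
        training=llama/llama3_405b \
        stages=[training] \
        data_dir=${DATA_DIR} \
        launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
        base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
        training.model.tokenizer.model=${TOK_PATH} \
        training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192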
diff --git a/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml
new file mode 100644
index 0000000000..7fabf039fb
--- /dev/null
+++ b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml
@@ -0,0 +1,63 @@
+# UB communicator configurations
+# Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/FP8
+
+# Bulk overlap with AllGather / ReduceScatter
+qkv_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+qkv_wgrad:
+  method: bulk
+  num_sm: 24
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_wgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+## Ring
+qkv_fprop:
+  method: ring_exchange
+  aggregate: 1
+
+proj_dgrad:
+  method: ring_exchange
+  aggregate: 1
+
+fc1_fprop:
+  method: ring_exchange
+  aggregate: 1
+
+fc2_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+# Chunked
+proj_fprop:
+  method: pipeline
+  num_sm: 24
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+  fp8_buf: 1
+  atomic_gemm: 0
+
+fc2_fprop:
+  method: pipeline
+  num_sm: 16
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+  fp8_buf: 1
+  atomic_gemm: 0
\ No newline at end of file
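The BF16 and FP8 userbuffer configs are intentionally close to each other; a quick way to confirm what actually differs after checking out the branch (paths relative to the repository root):

    # Compare the two new overlap configs; expected deltas per this patch:
    # fc2_dgrad aggregate 1 -> 0, fp8_buf 0 -> 1 on both pipeline GEMMs,
    # fc2_fprop num_sm 8 -> 16, plus the BF16/FP8 header comment.
    diff \
        launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192.yaml \
        launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192.yaml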