From 271fda963f0624c3ef85b2ad112df70da28773e8 Mon Sep 17 00:00:00 2001
From: Sangkug Lym
Date: Thu, 18 Jul 2024 21:49:38 -0700
Subject: [PATCH] Add GPT 175B mlperf config

Signed-off-by: Sangkug Lym
---
 .../gpt3/{175b_fp8.yaml => 175b_mlperf.yaml}  | 24 +++++---
 ...b_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml | 11 ++--
 ...b_cfg_h100_h12288_tp4_mbs2_seqlen2048.yaml | 58 +++++++++++++++++++
 3 files changed, 78 insertions(+), 15 deletions(-)
 rename launcher_scripts/conf/training/gpt3/{175b_fp8.yaml => 175b_mlperf.yaml} (95%)
 create mode 100644 launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs2_seqlen2048.yaml

diff --git a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml b/launcher_scripts/conf/training/gpt3/175b_mlperf.yaml
similarity index 95%
rename from launcher_scripts/conf/training/gpt3/175b_fp8.yaml
rename to launcher_scripts/conf/training/gpt3/175b_mlperf.yaml
index f4cc1f6cf2..29b643e09a 100755
--- a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b_mlperf.yaml
@@ -3,7 +3,7 @@
 # convergence (e.g., 300B tokens) is not guaranteed.
 defaults:
   - _self_
-  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h12288_tp8_mbs2_seqlen2048
+  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h12288_tp4_mbs2_seqlen2048
 
 hydra:
   searchpath:
@@ -16,7 +16,7 @@ run:
   dependency: "singleton"
 
 trainer:
-  num_nodes: 128
+  num_nodes: 64
   devices: 8
   accelerator: gpu
   precision: bf16
@@ -61,7 +61,7 @@ model:
   micro_batch_size: 2
   global_batch_size: 2048
   context_parallel_size: 1
-  tensor_model_parallel_size: 8
+  tensor_model_parallel_size: 4
   pipeline_model_parallel_size: 8
   virtual_pipeline_model_parallel_size: 6 # interleaved pipeline, set to maximum
   resume_from_checkpoint: null # manually set the checkpoint file to load from
@@ -73,16 +73,20 @@ model:
   ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size.
   num_attention_heads: 96
   init_method_std: 0.006 # Standard deviation of the zero mean normal distribution used for weight initialization.')
-  hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
-  attention_dropout: 0.1
+  use_scaled_init_method: false
+  hidden_dropout: 0.0 # Dropout probability for hidden state transformer.
+  attention_dropout: 0.0
   kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
-  apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
+  apply_query_key_layer_scaling: false # scale Q * K^T by 1 / layer-number.
   layernorm_epsilon: 1e-5
   make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
   pre_process: True # add embedding
   post_process: True # add pooler
   persist_layer_norm: True # Use of persistent fused layer norm kernel.
   gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
+  normalization: layernorm1p
+  do_layer_norm_weight_decay: true
+  bias: true
 
   # Fusion
   grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
@@ -145,8 +149,7 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  fp8_wgrad: True
-  ub_tp_comm_overlap: False
+  ub_tp_comm_overlap: True
 
   # miscellaneous
   seed: 1234
@@ -168,10 +171,11 @@ model:
 
   optim:
     name: distributed_fused_adam
-    bucket_cap_mb: 220
+    bucket_cap_mb: 125
     overlap_grad_sync: True
     overlap_param_sync: true
     contiguous_grad_buffer: True
+    contiguous_param_buffer: True
     grad_sync_dtype: bf16
     lr: 0.9e-4
     weight_decay: 0.1
@@ -257,3 +261,5 @@ model:
       - .0334
       - ${data_dir}/my-gpt3_29_text_document
 
+env_vars:
+  NVTE_FUSED_ATTN: 1
diff --git a/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml
index 21d02f3dd2..c5849b2fc3 100644
--- a/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml
+++ b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml
@@ -10,7 +10,7 @@ qkv_dgrad:
 
 qkv_wgrad:
   method: bulk
-  num_sm: 8
+  num_sm: 4
   cga_size: 2
   set_sm_margin: 0
 
@@ -41,7 +41,7 @@ fc1_fprop:
 
 fc2_dgrad:
   method: ring_exchange
-  aggregate: 1
+  aggregate: 0
 
 # Chunked-collective overlap with ReduceScatter
 proj_fprop:
@@ -50,10 +50,9 @@ proj_fprop:
   cga_size: 2
   num_splits: 4
   set_sm_margin: 1
+  fp8_buf: 1
 
 fc2_fprop:
-  method: pipeline
-  num_sm: 20
-  cga_size: 2
-  num_splits: 4
+  method: ring_exchange
+  num_sm: 1
   set_sm_margin: 1
diff --git a/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs2_seqlen2048.yaml b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs2_seqlen2048.yaml
new file mode 100644
index 0000000000..c5849b2fc3
--- /dev/null
+++ b/launcher_scripts/conf/training/tp_overlap/ub_cfg_h100_h12288_tp4_mbs2_seqlen2048.yaml
@@ -0,0 +1,58 @@
+# UB communicator configurations
+# Model configs: H100/175B/TP4/MBS2/SeqLen2K/FP8
+
+# Bulk overlap with AllGather / ReduceScatter
+qkv_dgrad:
+  method: bulk
+  num_sm: 4
+  cga_size: 2
+  set_sm_margin: 0
+
+qkv_wgrad:
+  method: bulk
+  num_sm: 4
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_dgrad:
+  method: bulk
+  num_sm: 2
+  cga_size: 2
+  set_sm_margin: 0
+
+fc1_wgrad:
+  method: bulk
+  num_sm: 4
+  cga_size: 2
+  set_sm_margin: 0
+
+## Ring-exchange overlap with AllGather
+qkv_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+proj_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+fc1_fprop:
+  method: ring_exchange
+  aggregate: 0
+
+fc2_dgrad:
+  method: ring_exchange
+  aggregate: 0
+
+# Chunked-collective overlap with ReduceScatter
+proj_fprop:
+  method: pipeline
+  num_sm: 24
+  cga_size: 2
+  num_splits: 4
+  set_sm_margin: 1
+  fp8_buf: 1
+
+fc2_fprop:
+  method: ring_exchange
+  num_sm: 1
+  set_sm_margin: 1
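Usage note (editorial, not part of the patch): after the rename from 175b_fp8.yaml to 175b_mlperf.yaml, the config is selected through the launcher's Hydra "training" group. A minimal launch sketch follows, assuming the standard launcher_scripts/main.py entry point; the stage list and the overrides shown are illustrative placeholders taken from the values in this patch (64 nodes, global batch size 2048), not flags mandated by it, and cluster/container settings are omitted.

    # Illustrative invocation of the launcher with the renamed config (assumed CLI pattern).
    cd launcher_scripts
    python3 main.py \
        training=gpt3/175b_mlperf \
        stages=[training] \
        training.trainer.num_nodes=64 \
        training.model.global_batch_size=2048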