From 2e3563ebe6dbe0cb9c81cb9cd94cbc25af41996d Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 12 Jul 2023 05:29:12 -0700 Subject: [PATCH 01/62] add data preparation for llama, fix pile downloading url --- .gitignore | 1 + launcher_scripts/conf/cluster/bcm.yaml | 12 +++++----- launcher_scripts/conf/config.yaml | 13 ++++++----- .../gpt3/download_gpt3_pile.yaml | 2 +- .../llama/download_llama_pile.yaml | 23 +++++++++++++++++++ launcher_scripts/main.py | 2 +- .../pile_dataprep/conf/config.yaml | 2 ++ .../pile_dataprep/preprocess.py | 10 ++++++++ .../nemo_launcher/core/data_stages.py | 2 ++ 9 files changed, 53 insertions(+), 14 deletions(-) create mode 100755 launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml diff --git a/.gitignore b/.gitignore index b254c682e8..4517784c6b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ #*.ipynb output result +data *.pt tests/data/asr .DS_Store diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index e1f7b32c6b..8ff05b1fe3 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,9 +1,9 @@ -partition: null -account: null -exclusive: True +partition: luna +account: devtech +exclusive: true gpus_per_task: null -gpus_per_node: 8 +gpus_per_node: null mem: 0 -job_name_prefix: "nemo-megatron-" +job_name_prefix: 'devtech-gpt:' srun_args: - - "--no-container-mount-home" + - --no-container-mount-home diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 50ada47bdc..1fed0ca7ed 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -1,7 +1,7 @@ defaults: - _self_ - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. - - data_preparation: gpt3/download_gpt3_pile + - data_preparation: llama/download_llama_pile - training: gpt3/5b - conversion: gpt3/convert_gpt3 - fine_tuning: null @@ -20,13 +20,14 @@ hydra: debug: False stages: - - training - - conversion - - evaluation - - export + - data_preparation + #- training + #- conversion + #- evaluation + #- export cluster_type: bcm # bcm or bcp. If bcm, it must match - cluster above. -launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts +launcher_scripts_path: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/launcher_scripts # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts data_dir: ${launcher_scripts_path}/data # Location to store and read the data. base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. diff --git a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml index 632ccdadd2..ab6614480a 100755 --- a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml +++ b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml @@ -9,7 +9,7 @@ run: dataset: pile download_the_pile: True # Whether to download the pile dataset from the internet. -the_pile_url: "https://mystic.the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. +the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. 
file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. diff --git a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml new file mode 100755 index 0000000000..cc23a234a7 --- /dev/null +++ b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml @@ -0,0 +1,23 @@ +run: + name: download_llama_pile + results_dir: ${base_results_dir}/${.name} + time_limit: "1:00:00" + dependency: "singleton" + node_array_size: 30 + array: ${..file_numbers} + bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. + +dataset: pile +download_the_pile: True # Whether to download the pile dataset from the internet. +the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. +file_numbers: "0-1" # The pile dataset consists of 30 files (0-29), choose which ones to download. +preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. +#download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. +#download_merges_url: "https://huggingface.co/gpt2/resolve/main/merges.txt" # URL to download the merges from. +#vocab_save_dir: ${data_dir}/bpe +#merges_save_dir: ${data_dir}/bpe +#tokenizer_type: GPT2BPETokenizer +tokenizer_library: "sentencepiece" +tokenizer_model: "/lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/checkpoints/llama/7B/tokenizer.model" +rm_downloaded: False # Extract script will remove downloaded zst after extraction +rm_extracted: False # Preprocess script will remove extracted files after preproc. 
diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 4053328f2c..25d5a016ca 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -47,7 +47,7 @@ NeMoEvaluation: ["t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", "adapter_gpt3", "ia3_t5", "ia3_gpt3"], }, "data_preparation": { - PileDataPreparation: ["gpt3", "t5", "bert"], + PileDataPreparation: ["gpt3", "t5", "bert", "llama"], MC4DataPreparation: ["mt5"], CustomDataPreparation: ["generic"], }, diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml index 448dbb1dbc..14917628a8 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml @@ -9,3 +9,5 @@ rm_extracted: True tokenizer_type: null vocab_save_dir: null merges_save_dir: null +tokenizer_library: null +tokenizer_model: null diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py index 61a9e36560..44ef368c1b 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py @@ -28,6 +28,8 @@ def main(cfg): data_dir = cfg.get("data_dir") rm_extracted = cfg.get("rm_extracted") tokenizer_type = cfg.get("tokenizer_type") + tokenizer_library = cfg.get("tokenizer_library") + tokenizer_model = cfg.get("tokenizer_model") assert data_dir is not None, "data_dir must be a valid path" # Vocab @@ -67,6 +69,8 @@ def main(cfg): model_type = 'bert' elif 'gpt3' in data_config: model_type = 'gpt3' + elif 'llama' in data_config: + model_type = 'llama' output_prefix = os.path.join(data_dir, f"my-{model_type}_{file_number:02d}") @@ -77,6 +81,8 @@ def main(cfg): f"--dataset-impl mmap " f"--tokenizer-library megatron " f"--tokenizer-type {tokenizer_type} " + f"--tokenizer-library {tokenizer_library} " + f"--tokenizer-model {tokenizer_model} " f"--workers $SLURM_CPUS_ON_NODE " ) @@ -119,6 +125,8 @@ def main(cfg): model_type = 'bert' elif 'gpt3' in data_config: model_type = 'gpt3' + elif 'llama' in data_config: + model_type = 'llama' output_prefix = os.path.join(data_dir, f"my-{model_type}_{file_number:02d}") @@ -129,6 +137,8 @@ def main(cfg): f"--dataset-impl mmap " f"--tokenizer-library megatron " f"--tokenizer-type {tokenizer_type} " + f"--tokenizer-library {tokenizer_library} " + f"--tokenizer-model {tokenizer_model} " f"--workers {ncpus} " ) diff --git a/launcher_scripts/nemo_launcher/core/data_stages.py b/launcher_scripts/nemo_launcher/core/data_stages.py index c3713786e5..7158750900 100755 --- a/launcher_scripts/nemo_launcher/core/data_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_stages.py @@ -252,6 +252,8 @@ def _make_sub_stage_command(self, sub_stage: str) -> List[str]: rm_downloaded=self.stage_cfg.get("rm_downloaded"), rm_extracted=self.stage_cfg.get("rm_extracted"), tokenizer_type=self.stage_cfg.get("tokenizer_type"), + tokenizer_library=self.stage_cfg.get("tokenizer_library", "megatron"), + tokenizer_model=self.stage_cfg.get("tokenizer_model", None), vocab_save_dir=self.stage_cfg.get("vocab_save_dir"), merges_save_dir=self.stage_cfg.get("merges_save_dir"), ) From 69f9066085d20e064e668c455881c37ef58b2c67 Mon Sep 
17 00:00:00 2001 From: Hongbin Liu Date: Thu, 13 Jul 2023 00:32:09 -0700 Subject: [PATCH 02/62] add training for llama, add download_tokenizer_url for data preparation --- launcher_scripts/conf/config.yaml | 8 +- .../llama/download_llama_pile.yaml | 4 +- launcher_scripts/conf/training/llama/13b.yaml | 157 +++++++++++++ launcher_scripts/conf/training/llama/30b.yaml | 156 +++++++++++++ launcher_scripts/conf/training/llama/65b.yaml | 157 +++++++++++++ launcher_scripts/conf/training/llama/7b.yaml | 221 ++++++++++++++++++ .../nemo_launcher/core/data_stages.py | 9 + launcher_scripts/nemo_launcher/core/stages.py | 6 +- 8 files changed, 712 insertions(+), 6 deletions(-) create mode 100644 launcher_scripts/conf/training/llama/13b.yaml create mode 100644 launcher_scripts/conf/training/llama/30b.yaml create mode 100644 launcher_scripts/conf/training/llama/65b.yaml create mode 100755 launcher_scripts/conf/training/llama/7b.yaml diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 1fed0ca7ed..f9242678d9 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -2,7 +2,7 @@ defaults: - _self_ - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. - data_preparation: llama/download_llama_pile - - training: gpt3/5b + - training: llama/30b - conversion: gpt3/convert_gpt3 - fine_tuning: null - prompt_learning: null @@ -20,8 +20,8 @@ hydra: debug: False stages: - - data_preparation - #- training + #- data_preparation + - training #- conversion #- evaluation #- export @@ -34,7 +34,7 @@ container_mounts: # List of additional paths to mount to container. They will be - null container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3 -wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. +wandb_api_key_file: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/NeMo-Megatron-Launcher/wandb_api_key # File where the w&B api key is stored. Key must be on the first line. env_vars: NCCL_TOPO_FILE: null # Should be a path to an XML file describing the topology diff --git a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml index cc23a234a7..863f817661 100755 --- a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml +++ b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml @@ -12,12 +12,14 @@ download_the_pile: True # Whether to download the pile dataset from the interne the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. file_numbers: "0-1" # The pile dataset consists of 30 files (0-29), choose which ones to download. preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. +download_tokenizer_url: "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model" #download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. #download_merges_url: "https://huggingface.co/gpt2/resolve/main/merges.txt" # URL to download the merges from. 
#vocab_save_dir: ${data_dir}/bpe #merges_save_dir: ${data_dir}/bpe #tokenizer_type: GPT2BPETokenizer tokenizer_library: "sentencepiece" -tokenizer_model: "/lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/checkpoints/llama/7B/tokenizer.model" +tokenizer_save_dir: ${data_dir}/llama +tokenizer_model: ${.tokenizer_save_dir}/llama_tokenizer.model rm_downloaded: False # Extract script will remove downloaded zst after extraction rm_extracted: False # Preprocess script will remove extracted files after preproc. diff --git a/launcher_scripts/conf/training/llama/13b.yaml b/launcher_scripts/conf/training/llama/13b.yaml new file mode 100644 index 0000000000..cf6f8ec8cc --- /dev/null +++ b/launcher_scripts/conf/training/llama/13b.yaml @@ -0,0 +1,157 @@ +run: + name: llama_13b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-02:00:00 + dependency: singleton +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + replace_sampler_ddp: false + max_epochs: null + max_steps: 300000 + max_time: '5:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 50 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama + name: ${training.run.name} + resume_if_exists: true + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, + ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + micro_batch_size: 2 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 40 + hidden_size: 5120 + ffn_hidden_size: 13824 + num_attention_heads: 40 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: 
null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: false + transformer_engine: false + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: false + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 2048 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - 0.5 + - ${data_dir}/my-llama_00_text_document + - 0.5 + - ${data_dir}/my-llama_01_text_document diff --git a/launcher_scripts/conf/training/llama/30b.yaml b/launcher_scripts/conf/training/llama/30b.yaml new file mode 100644 index 0000000000..33ed5054c8 --- /dev/null +++ b/launcher_scripts/conf/training/llama/30b.yaml @@ -0,0 +1,156 @@ +run: + name: llama_30b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 16 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + replace_sampler_ddp: false + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 50 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama + name: ${training.run.name} + resume_if_exists: true + resume_ignore_no_checkpoint: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 60 + hidden_size: 6656 + ffn_hidden_size: 17920 + num_attention_heads: 52 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + 
normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 2 + activations_checkpoint_layers_per_pipeline: 32 + sequence_parallel: false + transformer_engine: false + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: false + optim: + name: fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + #bucket_cap_mb: 125 + #overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 2048 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .5 + - ${data_dir}/my-llama_00_text_document + - .5 + - ${data_dir}/my-llama_01_text_document diff --git a/launcher_scripts/conf/training/llama/65b.yaml b/launcher_scripts/conf/training/llama/65b.yaml new file mode 100644 index 0000000000..464af39c09 --- /dev/null +++ b/launcher_scripts/conf/training/llama/65b.yaml @@ -0,0 +1,157 @@ +run: + name: llama_65b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 16 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + replace_sampler_ddp: false + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 50 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama + name: ${training.run.name} + resume_if_exists: true + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + 
tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 8 + virtual_pipeline_model_parallel_size: 10 + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 22016 + num_attention_heads: 64 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 80 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: false + transformer_engine: false + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: false + optim: + name: fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + #bucket_cap_mb: 125 + #overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 2048 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .5 + - ${data_dir}/my-llama_00_text_document + - .5 + - ${data_dir}/my-llama_01_text_document + diff --git a/launcher_scripts/conf/training/llama/7b.yaml b/launcher_scripts/conf/training/llama/7b.yaml new file mode 100755 index 0000000000..fcc4aa58da --- /dev/null +++ b/launcher_scripts/conf/training/llama/7b.yaml @@ -0,0 +1,221 @@ +run: + name: llama_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-02:00:00" + dependency: "singleton" +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 50 + limit_test_batches: 50 + 
accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: True + wandb_logger_kwargs: + project: nemo_llama + name: ${training.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + micro_batch_size: 2 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 + num_attention_heads: 32 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'sentencepiece' + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false # does not support sequence parallel + + ## Transformer Engine + # fp8 training is currently not supported in the improved models + transformer_engine: False + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + use_flash_attention: false + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: False + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 2048 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .5 + - ${data_dir}/my-llama_00_text_document + - .5 + - ${data_dir}/my-llama_01_text_document + # - .0333 + # - ${data_dir}/my-gpt3_00_text_document + # - .0333 + # - ${data_dir}/my-gpt3_01_text_document + # - .0333 + # - ${data_dir}/my-gpt3_02_text_document + # - .0333 + # - ${data_dir}/my-gpt3_03_text_document + # - .0333 + # - ${data_dir}/my-gpt3_04_text_document + # - .0333 + # - ${data_dir}/my-gpt3_05_text_document + # - .0333 + # - ${data_dir}/my-gpt3_06_text_document + # - .0333 + # - ${data_dir}/my-gpt3_07_text_document + # - .0333 + # - ${data_dir}/my-gpt3_08_text_document + # - .0333 + # - ${data_dir}/my-gpt3_09_text_document + # - .0333 + # - ${data_dir}/my-gpt3_10_text_document + # - .0333 + # - ${data_dir}/my-gpt3_11_text_document + # - .0333 + # - ${data_dir}/my-gpt3_12_text_document + # - .0333 + # - ${data_dir}/my-gpt3_13_text_document + # - .0333 + # - ${data_dir}/my-gpt3_14_text_document + # - .0333 + # - ${data_dir}/my-gpt3_15_text_document + # - .0333 + # - ${data_dir}/my-gpt3_16_text_document + # - .0333 + # - ${data_dir}/my-gpt3_17_text_document + # - .0333 + # - ${data_dir}/my-gpt3_18_text_document + # - .0333 + # - ${data_dir}/my-gpt3_19_text_document + # - .0333 + # - ${data_dir}/my-gpt3_20_text_document + # - .0333 + # - ${data_dir}/my-gpt3_21_text_document + # - .0333 + # - ${data_dir}/my-gpt3_22_text_document + # - .0333 + # - ${data_dir}/my-gpt3_23_text_document + # - .0333 + # - ${data_dir}/my-gpt3_24_text_document + # - .0333 + # - ${data_dir}/my-gpt3_25_text_document + # - .0333 + # - ${data_dir}/my-gpt3_26_text_document + # - .0333 + # - ${data_dir}/my-gpt3_27_text_document + # - .0333 + # - ${data_dir}/my-gpt3_28_text_document + # - .0334 + # - ${data_dir}/my-gpt3_29_text_document + diff --git a/launcher_scripts/nemo_launcher/core/data_stages.py b/launcher_scripts/nemo_launcher/core/data_stages.py index 7158750900..989a06b263 100755 --- a/launcher_scripts/nemo_launcher/core/data_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_stages.py @@ -176,6 +176,15 @@ def setup_folder_and_data(self) -> None: download_merges_url = data_cfg.get("download_merges_url") vocab_save_dir = data_cfg.get("vocab_save_dir") merges_save_dir = data_cfg.get("merges_save_dir") + download_tokenizer_url = data_cfg.get("download_tokenizer_url") + tokenizer_save_dir = data_cfg.get("tokenizer_save_dir") + + if download_tokenizer_url is not None: + assert tokenizer_save_dir is not None, "tokenizer_save_dir must be a valid path." + download_single_file( + url=download_tokenizer_url, save_dir=tokenizer_save_dir, file_name="llama_tokenizer.model", + ) + # Download vocab if download_vocab_url is not None: assert vocab_save_dir is not None, "vocab_save_dir must be a valid path." 
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 4acd81daa7..b0319ae7ff 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -620,6 +620,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_pretraining.py", "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_pretraining.py", "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_pretraining.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_pretraining.py", "bert": self._nemo_code_path / "examples/nlp/language_modeling/megatron_bert_pretraining.py", } return model_type_to_code_path[model_type] @@ -683,7 +684,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: :return: path current stage's essential nemo scripts code :rtype: Path """ - if model_type == "gpt3": + if model_type == "gpt3" or model_type == "llama": raise NotImplementedError("Fine-tuning is not supported in NeMo Megatron GPT-3 models.") model_type_to_code_path = { "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", @@ -725,6 +726,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_prompt_learning.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_prompt_learning.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_prompt_learning.py", "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_prompt_learning.py", } @@ -748,6 +750,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py", } return model_type_to_code_path[model_type] @@ -770,6 +773,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py", } return model_type_to_code_path[model_type] From 9a757fbaff326dd91efc0014b9c417cfb183eff6 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 13 Jul 2023 02:14:33 -0700 Subject: [PATCH 03/62] add conversion for llama --- launcher_scripts/conf/config.yaml | 6 +++--- .../conf/conversion/llama/convert_llama.yaml | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) create mode 100755 launcher_scripts/conf/conversion/llama/convert_llama.yaml diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index f9242678d9..6ca1791c0b 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -3,7 +3,7 @@ defaults: - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. 
- data_preparation: llama/download_llama_pile - training: llama/30b - - conversion: gpt3/convert_gpt3 + - conversion: llama/convert_llama - fine_tuning: null - prompt_learning: null - adapter_learning: null @@ -21,8 +21,8 @@ debug: False stages: #- data_preparation - - training - #- conversion + #- training + - conversion #- evaluation #- export diff --git a/launcher_scripts/conf/conversion/llama/convert_llama.yaml b/launcher_scripts/conf/conversion/llama/convert_llama.yaml new file mode 100755 index 0000000000..ba8018ee85 --- /dev/null +++ b/launcher_scripts/conf/conversion/llama/convert_llama.yaml @@ -0,0 +1,21 @@ +run: + name: convert_${conversion.run.model_train_name} + nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node + time_limit: "1:00:00" + dependency: "singleton" + ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} + convert_name: convert_nemo + model_train_name: llama_7b + train_dir: ${base_results_dir}/${.model_train_name} + results_dir: ${.train_dir}/${.convert_name} + nemo_file_name: megatron_llama.nemo # name of nemo checkpoint; must be .nemo file + +model: + model_type: gpt # gpt or t5, use t5 for mt5 as well + checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints + checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_llama-*last.ckpt) + hparams_file: ${conversion.run.train_dir}/results/hparams.yaml + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + tokenizer_model: ${data_dir}/llama/llama_tokenizer.model From 6a004fd62731f1118ec76d461aed59dc04096aa5 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 19 Jul 2023 00:13:35 -0700 Subject: [PATCH 04/62] add convertion/PEFT/Evaluation for llama Signed-off-by: Hongbin Liu --- .../conf/adapter_learning/llama/squad.yaml | 107 +++++++++ launcher_scripts/conf/config.yaml | 23 +- .../conf/conversion/llama/convert_llama.yaml | 6 +- .../conf/evaluation/llama/evaluate_all.yaml | 24 ++ .../conf/evaluation/prompt_llama/squad.yaml | 21 ++ .../conf/ia3_learning/llama/squad.yaml | 98 ++++++++ .../conf/prompt_learning/llama/squad.yaml | 111 +++++++++ launcher_scripts/conf/training/llama/7b.yaml | 2 +- launcher_scripts/main.py | 2 +- .../dolly_dataprep/download.py | 2 +- .../collections/eval_harness/evaluate.py | 8 +- .../eval_harness/lm_eval/models/__init__.py | 4 +- .../eval_harness/lm_eval/models/nemo_llama.py | 218 ++++++++++++++++++ .../lm_eval/models/nemo_llama_prompt.py | 174 ++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 9 +- 15 files changed, 788 insertions(+), 21 deletions(-) create mode 100755 launcher_scripts/conf/adapter_learning/llama/squad.yaml create mode 100755 launcher_scripts/conf/evaluation/llama/evaluate_all.yaml create mode 100755 launcher_scripts/conf/evaluation/prompt_llama/squad.yaml create mode 100755 launcher_scripts/conf/ia3_learning/llama/squad.yaml create mode 100755 launcher_scripts/conf/prompt_learning/llama/squad.yaml create mode 100755 launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py create mode 100755 launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py diff --git a/launcher_scripts/conf/adapter_learning/llama/squad.yaml b/launcher_scripts/conf/adapter_learning/llama/squad.yaml new file mode 100755 index 0000000000..9907d52635 --- /dev/null +++ b/launcher_scripts/conf/adapter_learning/llama/squad.yaml 
@@ -0,0 +1,107 @@ +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama_7b + convert_dir: ${base_results_dir}/${adapter_learning.run.model_train_name}/${adapter_learning.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/adapter_learning_${.task_name} + +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 4 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: 0.1 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + + +exp_manager: + explicit_log_dir: ${adapter_learning.run.results_dir}/results + exp_dir: null + name: megatron_llama_adapter + create_wandb_logger: True + wandb_logger_kwargs: + project: nemo_llama_adapter + name: ${adapter_learning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + save_nemo_on_train_end: False + filename: "megatron_llama_adapter_learn--{val_loss:.3f}-{step}" + model_parallel_size: ${adapter_learning.model.model_parallel_size} + save_best_model: True + +model: + seed: 1234 + nemo_path: ${adapter_learning.run.results_dir}/results/megatron_gpt_adapter.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: 'no-prompts' # adapter tuning requires no virtual prompts + encoder_seq_length: 2048 + gradient_as_bucket_view: false + tensor_model_parallel_size: 4 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + global_batch_size: 64 + micro_batch_size: 8 + + restore_path: null # Path to an existing adapter .nemo model you wish to add new tasks to or run inference with + language_model_path: ${adapter_learning.run.convert_dir}/results/megatron_llama.nemo # Path to the GPT language model .nemo file, always required + existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given + new_tasks: ["squad"] # List of new tasknames to be prompt-tuned + + task_templates: # Add more/replace tasks as needed, these are just examples + - taskname: "squad" + prompt_template: "context: {context} question: {question} answer: {answer}" + total_virtual_tokens: 0 + virtual_token_splits: [] + truncate_field: null + answer_only_loss: True + answer_field: "answer" + + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 16 + adapter_dropout: 0.1 + norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. 
+ column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + + data: + train_ds: + - ${data_dir}/prompt_data/v1.1/squad_train.jsonl # multiple prompt dataset can be given at the same time + validation_ds: + - ${data_dir}/prompt_data/v1.1/squad_val.jsonl + add_eos: True + shuffle: True + num_workers: 4 + pin_memory: True + + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning + monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 6ca1791c0b..9ee398f347 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -2,13 +2,14 @@ defaults: - _self_ - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. - data_preparation: llama/download_llama_pile - - training: llama/30b + - training: llama/7b - conversion: llama/convert_llama - fine_tuning: null - - prompt_learning: null - - adapter_learning: null - - ia3_learning: null - - evaluation: gpt3/evaluate_all + - prompt_learning: llama/squad + - adapter_learning: llama/squad + - ia3_learning: llama/squad + #- evaluation: llama/evaluate_all + - evaluation: prompt_llama/squad - export: gpt3/export_gpt3 - override hydra/job_logging: stdout @@ -20,15 +21,19 @@ hydra: debug: False stages: - #- data_preparation - #- training + #- data_preparation + #- training - conversion - #- evaluation - #- export + #- prompt_learning + #- adapter_learning + #- ia3_learning + #- evaluation + #- export cluster_type: bcm # bcm or bcp. If bcm, it must match - cluster above. launcher_scripts_path: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/launcher_scripts # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts data_dir: ${launcher_scripts_path}/data # Location to store and read the data. +nemo_dir: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/nemo_repo/internal/NeMo base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. - null diff --git a/launcher_scripts/conf/conversion/llama/convert_llama.yaml b/launcher_scripts/conf/conversion/llama/convert_llama.yaml index ba8018ee85..451d916b20 100755 --- a/launcher_scripts/conf/conversion/llama/convert_llama.yaml +++ b/launcher_scripts/conf/conversion/llama/convert_llama.yaml @@ -8,13 +8,13 @@ run: model_train_name: llama_7b train_dir: ${base_results_dir}/${.model_train_name} results_dir: ${.train_dir}/${.convert_name} - nemo_file_name: megatron_llama.nemo # name of nemo checkpoint; must be .nemo file + nemo_file_name: megatron_llama_prompt.nemo # name of nemo checkpoint; must be .nemo file model: model_type: gpt # gpt or t5, use t5 for mt5 as well - checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints + checkpoint_folder: ${conversion.run.train_dir}/prompt_learning_squad/results/checkpoints checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_llama-*last.ckpt) - hparams_file: ${conversion.run.train_dir}/results/hparams.yaml + hparams_file: ${conversion.run.train_dir}/prompt_learning_squad/results/hparams.yaml tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml new file mode 100755 index 0000000000..ca4d9b7456 --- /dev/null +++ b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml @@ -0,0 +1,24 @@ +run: + name: ${.eval_name}_${.model_train_name} + time_limit: "01:00:00" + dependency: "singleton" + nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node + ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} + eval_name: eval_all + model_train_name: llama_7b + train_dir: ${base_results_dir}/${.model_train_name} + tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks + results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} + +model: + model_type: nemo-llama + nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints + checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints + checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) + hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + precision: bf16 # must match training precision - 32, 16 or bf16 + eval_batch_size: 4 + tokenizer_model: ${data_dir}/llama/llama_tokenizer.model diff --git a/launcher_scripts/conf/evaluation/prompt_llama/squad.yaml b/launcher_scripts/conf/evaluation/prompt_llama/squad.yaml new file mode 100755 index 0000000000..7890e97eab --- /dev/null +++ b/launcher_scripts/conf/evaluation/prompt_llama/squad.yaml @@ -0,0 +1,21 @@ +run: + name: ${.eval_name}_${.model_train_name} + time_limit: "1:00:00" + nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node + ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} + eval_name: eval_prompt_squad + model_train_name: llama_7b + tasks: "prompt" # general prompt task + prompt_learning_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_squad # assume prompt learning was on squad task + results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} + +model: + model_type: nemo-llama-prompt + nemo_model: ${evaluation.run.prompt_learning_dir}/results/megatron_llama_prompt.nemo + tensor_model_parallel_size: 2 #1 for 126m, 2 for 5b, 8 for 20b + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + precision: bf16 # must match training precision - 32, 16 or bf16 + eval_batch_size: 4 + prompt_dataset_paths: ${data_dir}/prompt_data/v1.1/squad_val.jsonl + disable_special_tokens: False # Whether to disable virtual tokens in prompt model evaluation. This is equivalent to evaluate without prompt-/p-tuning. 
diff --git a/launcher_scripts/conf/ia3_learning/llama/squad.yaml b/launcher_scripts/conf/ia3_learning/llama/squad.yaml new file mode 100755 index 0000000000..01c22b6f02 --- /dev/null +++ b/launcher_scripts/conf/ia3_learning/llama/squad.yaml @@ -0,0 +1,98 @@ +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama_7b + convert_dir: ${base_results_dir}/${ia3_learning.run.model_train_name}/${ia3_learning.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/ia3_learning_${.task_name} + +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 4 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: 0.1 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + + +exp_manager: + explicit_log_dir: ${ia3_learning.run.results_dir}/results + exp_dir: null + name: megatron_llama_ia3 + create_wandb_logger: True + wandb_logger_kwargs: + project: nemo_llama_ia3 + name: ${ia3_learning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + save_nemo_on_train_end: False + filename: "megatron_gpt_ia3_learn--{val_loss:.3f}-{step}" + model_parallel_size: ${ia3_learning.model.model_parallel_size} + save_best_model: True + +model: + seed: 1234 + nemo_path: ${ia3_learning.run.results_dir}/results/megatron_llama_ia3.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: 'no-prompts' # ia3 tuning requires no virtual prompts + encoder_seq_length: 2048 + gradient_as_bucket_view: false + tensor_model_parallel_size: 2 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + global_batch_size: 64 + micro_batch_size: 8 + + restore_path: null # Path to an existing ia3 .nemo model you wish to add new tasks to or run inference with + language_model_path: ${ia3_learning.run.convert_dir}/results/megatron_llama.nemo # Path to the GPT language model .nemo file, always required + existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given + new_tasks: ["squad"] # List of new tasknames to be prompt-tuned + + task_templates: # Add more/replace tasks as needed, these are just examples + - taskname: "squad" + prompt_template: "context: {context} question: {question} answer: {answer}" + total_virtual_tokens: 0 + virtual_token_splits: [] + truncate_field: null + answer_only_loss: True + answer_field: "answer" + + data: + train_ds: + - ${data_dir}/prompt_data/v1.1/squad_train.jsonl # multiple prompt dataset can be given at the same time + validation_ds: + - ${data_dir}/prompt_data/v1.1/squad_val.jsonl + add_eos: True + shuffle: True + num_workers: 4 + pin_memory: True + + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning + 
monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/conf/prompt_learning/llama/squad.yaml b/launcher_scripts/conf/prompt_learning/llama/squad.yaml new file mode 100755 index 0000000000..51104ba17d --- /dev/null +++ b/launcher_scripts/conf/prompt_learning/llama/squad.yaml @@ -0,0 +1,111 @@ +run: + name: ${.task_name}_${.model_train_name} + time_limit: "01:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama_7b + convert_dir: ${base_results_dir}/${prompt_learning.run.model_train_name}/${prompt_learning.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_${.task_name} + +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 4 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: 200 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + +exp_manager: + explicit_log_dir: ${prompt_learning.run.results_dir}/results + exp_dir: null + name: megatron_llama_prompt + create_wandb_logger: True + wandb_logger_kwargs: + project: nemo_llama_prompt + name: ${prompt_learning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + save_nemo_on_train_end: False + filename: "megatron_llama_prompt_learn--{val_loss:.3f}-{step}" + model_parallel_size: ${prompt_learning.model.model_parallel_size} + save_best_model: True + +model: + seed: 1234 + nemo_path: ${prompt_learning.run.results_dir}/results/megatron_gpt_prompt.nemo # the place to save prompt learning nemo checkpoint + virtual_prompt_style: 'p-tuning' # One of 'p-tuning', 'prompt-tuning', or 'inference'. We recommend 'p-tuning' over 'prompt-tuning'. + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + encoder_seq_length: 2048 + global_batch_size: 64 + micro_batch_size: 8 + + restore_path: null # used to restore from a prompt tuned checkpoint and add new tasks + language_model_path: ${prompt_learning.run.convert_dir}/results/megatron_llama.nemo # Restore lanugage model from pre-trained .nemo checkpoint + existing_tasks: [] # if restore from a prompt tuned checkpoint and add new tasks, existing task names should be included here. + new_tasks: ["squad"] # multiple tasks can be tuned at the same time + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + task_templates: # task_templates for all existing_tasks and new_tasks are required. + - taskname: "squad" # The task name + prompt_template: "<|VIRTUAL_PROMPT_0|>Context: {context} Question: {question} Answer: {answer}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|> + total_virtual_tokens: 10 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time. 
+ virtual_token_splits: [10] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens + truncate_field: "context" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped. + answer_field: "answer" # Answer/Target field + answer_only_loss: True # If true, the loss will only be calculated with answer_field text vs. ground truth. If false, the loss will be calculated over entire sentence. + + prompt_learning: # Prompt tunin specific params + new_prompt_init_methods: null # e.g ['text'], List of 'text' or 'random', should correspond to tasks listed in new tasks + new_prompt_init_text: null # e.g ['some init text goes here'], some init text if init method is text, or None if init method is random + + p_tuning: # P-tuning specific params + encoder_type: "tpmlp" # ['tpmlp', 'lstm', 'biglstm', 'mlp'] + dropout: 0.0 + num_layers: 2 # number of layers for MLP or LSTM layers. Note, it has no effect for tpmlp currently as it always assumes it is two layers. + encoder_hidden: 2048 # encoder hidden for biglstm and tpmlp + init_std: 0.023 # init std for tpmlp layers + + data: + train_ds: + - ${data_dir}/prompt_data/v1.1/squad_train.jsonl # multiple prompt dataset can be given at the same time + validation_ds: + - ${data_dir}/prompt_data/v1.1/squad_val.jsonl + add_eos: True + shuffle: True + num_workers: 4 + pin_memory: True + + optim: + name: fused_adam + lr: 2.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + constant_steps: 10 + min_lr: 0.0 # has to be zero + monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/conf/training/llama/7b.yaml b/launcher_scripts/conf/training/llama/7b.yaml index fcc4aa58da..96cb4790c0 100755 --- a/launcher_scripts/conf/training/llama/7b.yaml +++ b/launcher_scripts/conf/training/llama/7b.yaml @@ -1,7 +1,7 @@ run: name: llama_7b results_dir: ${base_results_dir}/${.name} - time_limit: "0-02:00:00" + time_limit: "0-04:00:00" dependency: "singleton" trainer: num_nodes: 4 diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 25d5a016ca..986ec8357d 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -43,7 +43,7 @@ "conversion": Conversion, "export": Export, "evaluation": { - EvalHarnessEvaluation: ["gpt3", "prompt_gpt3"], + EvalHarnessEvaluation: ["gpt3", "prompt_gpt3", "llama", "prompt_llama"], NeMoEvaluation: ["t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", "adapter_gpt3", "ia3_t5", "ia3_gpt3"], }, "data_preparation": { diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/download.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/download.py index de679a6fb9..0ce8d12382 100644 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/download.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/download.py @@ -23,7 +23,7 @@ import os from argparse import ArgumentParser -default_link = "https://github.com/databrickslabs/dolly/raw/master/data/databricks-dolly-15k.jsonl" +default_link = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl" def get_file_name(link): file_name = link.split('/')[-1] diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py b/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py index 
f6a664ef8e..df6c20f27c 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py @@ -85,7 +85,7 @@ def parse_args(parser_main): parser.add_argument("--model", required=True) parser.add_argument( - "--nemo_model", type=str, default=None, required=False, help="Pass path to model's .nemo file", + "--nemo_model", default=None, required=False, help="Pass path to model's .nemo file", ) parser.add_argument( "--checkpoint_folder", @@ -120,6 +120,7 @@ def parse_args(parser_main): parser.add_argument("--vocab_file", default=None) parser.add_argument("--merge_file", default=None) + parser.add_argument("--tokenizer_model", default=None) parser.add_argument( "--prompt_dataset_paths", @@ -292,9 +293,10 @@ def main(): pipeline_model_parallel_size = args.pipeline_model_parallel_size vocab_file = args.vocab_file merge_file = args.merge_file + tokenizer_model = args.tokenizer_model hparams_override_file = None - if args.nemo_model is None: # Not loading from .nemo checkpoint + if args.nemo_model is None or args.nemo_model == "None": # Not loading from .nemo checkpoint # Checkpoint search if checkpoint_name == "latest": checkpoints = os.path.join(checkpoint_folder, "*.ckpt") @@ -322,6 +324,8 @@ def main(): conf.cfg.tokenizer.vocab_file = vocab_file if merge_file is not None: conf.cfg.tokenizer.merge_file = merge_file + if tokenizer_model is not None: + conf.cfg.tokenizer.model = tokenizer_model if "activations_checkpoint_granularity" in conf.cfg: conf.cfg.activations_checkpoint_granularity = None if "activations_checkpoint_method" in conf.cfg: diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py index 8c9a1b5ed2..1b18dc64e5 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py @@ -14,11 +14,13 @@ from lm_eval.base import LM -from . import dummy, nemo_gpt3, nemo_gpt3_prompt +from . import dummy, nemo_gpt3, nemo_gpt3_prompt, nemo_llama, nemo_llama_prompt MODEL_REGISTRY = { "nemo-gpt3": nemo_gpt3.NeMo_GPT3LM_TP_PP, + "nemo-llama": nemo_llama.NeMo_LLAMALM_TP_PP, "nemo-gpt3-prompt": nemo_gpt3_prompt.NeMo_GPT3_PROMPTLM, + "nemo-llama-prompt": nemo_llama_prompt.NeMo_LLAMA_PROMPTLM, "dummy": dummy.DummyLM, } diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py new file mode 100755 index 0000000000..462d28f549 --- /dev/null +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py @@ -0,0 +1,218 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from omegaconf import OmegaConf, open_dict + +import torch +import tqdm +from megatron.core import parallel_state +from lm_eval import utils +from lm_eval.base import LM +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.modules.common.text_generation_utils import generate, get_computeprob_response +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.get_rank import is_global_rank_zero +from nemo.utils.model_utils import inject_model_parallel_rank +from pytorch_lightning.trainer.trainer import Trainer +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate + +from .nemo_gpt3 import RequestDataset, setup_trainer_and_model, DDP_initialize + +class NeMo_LLAMALM_TP_PP(LM): + def __init__(self, args, truncate=False, batch_size=1): + super().__init__() + + # get nemo megatron + logging.info(f"**** Building LLaMA model ...") + self.trainer, self.model = setup_trainer_and_model(args) + self.tokenizer = self.model.tokenizer + self.model.eval() + + self.max_length = self.model.cfg.get("max_position_embeddings") + assert self.tokenizer.text_to_ids("hello\n\nhello") == [ + 22172, + 13, + 13, + 12199, + ], "Tokenizer text_to_ids is not working as expected." + + self.truncate = truncate + self.batch_size = batch_size + + # initialize DDP and move model to GPU + DDP_initialize(self.model) + self.model = self.model.cuda() + + @classmethod + def create_from_arg_string(cls, arg_string, additional_config={}): + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(args, **args2) + + def loglikelihood(self, requests): + return self._loglikelihood(requests) + + """ + request: (context, continuation) + how this all works: + CTX CONT + inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + gpt2 \ \ + logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice + cont_toks 4 5 6 7 8 9 + when too long to fit in context, truncate from the left + """ + + def _loglikelihood(self, requests): + def pad_collate(batch, eos_id=2): + tokens = [item[0] for item in batch] + conti_lens = [item[1] for item in batch] + lens = [len(token) - 1 for token in tokens] # fake delete last token by reducing input len + max_len = max(lens) + extra_pad_len = 0 + if max_len % 8 != 0: + extra_pad_len = 8 - (max_len % 8) + max_len += extra_pad_len + # extra_pad_len = 2048 - max_len + # max_len += extra_pad_len + + tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=eos_id) + if extra_pad_len > 0: + extra_pad = torch.ones(extra_pad_len, len(batch)) * eos_id + extra_pad = extra_pad.type_as(tokens_pad) + tokens_pad = torch.vstack((tokens_pad, extra_pad)) + # Add padding to all samples to adapt nemo generate api + + new_batch = [] + for token, lenn, conti_len in zip(tokens_pad.T, lens, conti_lens): + # (token, lenn, tokens_to_generate, compute_logprobs) + new_batch.append((token, max_len, lenn, conti_len)) + + new_batch = default_collate(new_batch) + return new_batch + + def _collate(x): # used to reorder request and remove duplications + """ + the negative sign on len(toks) 
sorts descending - this has a few advantages: + - time estimates will always be over not underestimates, which is more useful for planning + - to know the size of a batch when going through the list, you know the first one is always the batch padded context length. + this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement + - any OOMs will happen right away rather than near the end + """ + toks = x[0] + x[1] + return -len(toks), tuple(toks) + + reord = utils.Reorderer(requests, _collate) + request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer) + request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False) + + def logits_to_results(batch, response): + input_token_ids_batch, _, lens, conti_lens = batch + batch_size = len(lens) + assert len(response['token_ids']) == batch_size, "Response's length not equal to batch size." + + batch_res = [] + for index in range(batch_size): + inp_len = lens[index] + conti_len = conti_lens[index] + + inp_token_ids = input_token_ids_batch[index].tolist()[: inp_len + 1] # recover fake deleted token + response_token_ids = response['token_ids'][index][:inp_len] + + assert response_token_ids == inp_token_ids[:-1], f"Mismatch in input tokens." + + log_probs = response['full_logprob'][index][:inp_len] # torch.tensor + log_probs = log_probs[-conti_len:] + + greedy_tokens = log_probs.argmax(dim=-1) + greedy_tokens = self.tokenizer.ids_to_tokens(greedy_tokens.cpu().numpy().tolist()) + + conti_token_ids = inp_token_ids[-conti_len:] + conti_tokens = self.tokenizer.ids_to_tokens(conti_token_ids) + + max_equal = greedy_tokens == conti_tokens + log_probs = log_probs.cpu().to(torch.float32) + conti_enc = torch.tensor(self.tokenizer.tokens_to_ids(conti_tokens)) + conti_probs = torch.gather(log_probs, 1, conti_enc.unsqueeze(-1)).squeeze(-1) + + batch_res.append((float(conti_probs.sum()), bool(max_equal), greedy_tokens, conti_tokens)) + return batch_res + + res = [] + for batch in tqdm.tqdm(request_dl): + # inputs = (token_ids, conti_lens) + inputs = (batch[0].cuda(), batch[1].cuda()) + response = generate( + model=self.model, + inputs=inputs, + tokens_to_generate=1, + all_probs=True, + temperature=1.0, + add_BOS=False, + top_k=0, + top_p=0.9, + greedy=True, + repetition_penalty=1.0, + min_tokens_to_generate=0, + ) + response = get_computeprob_response(self.tokenizer, response, inputs) + + if is_global_rank_zero(): + res.extend(logits_to_results(batch, response)) + + del inputs, response + + return reord.get_original(res) if self.can_access_output() else None + + def loglikelihood_rolling(self, requests): + loglikelihoods = [] + len_rolling_token_windows = [0] + all_rolling_token_windows = [] + + for (string,) in requests: + rolling_token_windows = list( + map( + utils.make_disjoint_window, + utils.get_rolling_token_windows( + token_list=self.tokenizer.text_to_ids(string), + prefix_token=2, + max_seq_len=self.max_length, + context_len=1, + ), + ) + ) + + len_rolling_token_windows.append(len(rolling_token_windows) + len_rolling_token_windows[-1]) + all_rolling_token_windows.extend(rolling_token_windows) + + string_nll = self._loglikelihood(all_rolling_token_windows) + if self.can_access_output(): + string_nll = [x[0] for x in string_nll] + # discard is_greedy + for i in range(len(len_rolling_token_windows) - 1): + loglikelihoods.append(sum(string_nll[len_rolling_token_windows[i] : len_rolling_token_windows[i + 1]])) + + return loglikelihoods + + 
def greedy_until(self, requests): + raise NotImplementedError + + def can_access_output(self): + return is_global_rank_zero() diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py new file mode 100755 index 0000000000..59be96c5e7 --- /dev/null +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import tqdm +from lm_eval import utils +from lm_eval.base import LM +from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( + MegatronGPTPromptLearningModel, +) +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.modules.common.text_generation_utils import generate, get_computeprob_response +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.get_rank import is_global_rank_zero +from pytorch_lightning.trainer.trainer import Trainer +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate + +from .nemo_gpt3_prompt import PromptRequestDataset, setup_trainer_and_model, DDP_initialize + +class NeMo_LLAMA_PROMPTLM(LM): + def __init__(self, args, truncate=False, batch_size=1): + super().__init__() + + # get nemo megatron + logging.info(f"**** Building LLaMA Prompt model ...") + self.trainer, self.model = setup_trainer_and_model(args) + self.tokenizer = self.model.tokenizer + self.model.eval() + + self.max_length = self.model.cfg.get("max_position_embeddings") + assert self.tokenizer.text_to_ids("hello\n\nhello") == [ + 22172, + 13, + 13, + 12199, + ], "Tokenizer text_to_ids is not working as expected." 
+ + self.truncate = truncate + self.batch_size = batch_size + + # initialize DDP and move model to GPU + DDP_initialize(self.model) + self.model = self.model.cuda() + + @classmethod + def create_from_arg_string(cls, arg_string, additional_config={}): + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(args, **args2) + + def loglikelihood(self, requests): + return self._loglikelihood(requests) + + """ + request: (context, continuation) + how this all works: + CTX CONT + inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + gpt2 \ \ + logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice + cont_toks 4 5 6 7 8 9 + when too long to fit in context, truncate from the left + """ + + def _loglikelihood(self, requests): + def pad_collate(batch, eos_id=2): + tokens, conti_lens, task_ids, *_ = map(list, zip(*batch)) + lens = [len(token) - 1 for token in tokens] # fake delete last token by reducing input len + max_len = max(lens) + + tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=eos_id) + # Add padding to all samples to adapt nemo generate api + # tokens_pad = torch.cat((tokens_pad, torch.ones((1, len(tokens)), dtype=torch.int) * eos_id), 0) + + new_batch = [] + for token, lenn, conti_len, task_id in zip(tokens_pad.T, lens, conti_lens, task_ids): + new_batch.append((token, max_len, task_id, lenn, conti_len)) + + new_batch = default_collate(new_batch) + return new_batch + + def _collate(x): # used to reorder request and remove duplications + """ + the negative sign on len(toks) sorts descending - this has a few advantages: + - time estimates will always be over not underestimates, which is more useful for planning + - to know the size of a batch when going through the list, you know the first one is always the batch padded context length. + this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement + - any OOMs will happen right away rather than near the end + """ + toks = x[0] + x[1] + return -len(toks), tuple(toks) + + reord = utils.Reorderer(requests, _collate) + request_ds = PromptRequestDataset(reord.get_reordered(), self.model.tokenizer) + request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False) + + def logits_to_results(batch, response): + input_token_ids_batch, _, _, lens, conti_lens = batch + batch_size = len(lens) + assert len(response["token_ids"]) == batch_size, "Response's length not equal to batch size." 
+ + batch_res = [] + for index in range(batch_size): + inp_len = lens[index] + conti_len = conti_lens[index] + + inp_token_ids = input_token_ids_batch[index].tolist()[: inp_len + 1] # recover fake deleted token + + log_probs = response["full_logprob"][index][:inp_len] # torch.tensor + log_probs = log_probs[-conti_len:] + + greedy_tokens = log_probs.argmax(dim=-1) + greedy_tokens = self.tokenizer.ids_to_tokens(greedy_tokens.cpu().numpy().tolist()) + + conti_token_ids = inp_token_ids[-conti_len:] + conti_tokens = self.tokenizer.ids_to_tokens(conti_token_ids) + + max_equal = greedy_tokens == conti_tokens + log_probs = log_probs.cpu().to(torch.float32) + conti_enc = torch.tensor(self.tokenizer.tokens_to_ids(conti_tokens)) + conti_probs = torch.gather(log_probs, 1, conti_enc.unsqueeze(-1)).squeeze(-1) + + batch_res.append((float(conti_probs.sum()), bool(max_equal), greedy_tokens, conti_tokens)) + return batch_res + + res = [] + for batch in tqdm.tqdm(request_dl): + # inputs = (token_ids, conti_lens) + inputs = (batch[0].cuda(), batch[1].cuda()) + task_ids = torch.zeros((self.batch_size, 1), device='cuda') + response = generate( + model=self.model, + inputs=inputs, + task_ids=task_ids, + tokens_to_generate=1, + all_probs=True, + temperature=1.0, + add_BOS=False, + top_k=0, + top_p=0.9, + greedy=True, + repetition_penalty=1.0, + min_tokens_to_generate=0, + ) + response = get_computeprob_response(self.tokenizer, response, inputs) + + if is_global_rank_zero(): + res.extend(logits_to_results(batch, response)) + + return reord.get_original(res) if self.can_access_output() else None + + def loglikelihood_rolling(self, requests): + raise NotImplementedError + + def greedy_until(self, requests): + raise NotImplementedError + + def can_access_output(self): + return is_global_rank_zero() diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index b0319ae7ff..3277996eb1 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -213,8 +213,9 @@ def add_container_mounts(container_mounts): cfg = self.cfg data_dir = cfg.get("data_dir") + nemo_dir = cfg.get("nemo_dir") base_results_dir = cfg.get("base_results_dir") - mounts_string = f"{self._launcher_scripts_path}:{self._launcher_scripts_path},{data_dir}:{data_dir},{base_results_dir}:{base_results_dir}" + mounts_string = f"{self._launcher_scripts_path}:{self._launcher_scripts_path},{data_dir}:{data_dir},{base_results_dir}:{base_results_dir},{nemo_dir}:{nemo_dir}" container_mounts = cfg.get("container_mounts") mounts_string += add_container_mounts(container_mounts) @@ -382,7 +383,7 @@ def _launcher_scripts_path(self) -> Path: @property def _nemo_code_path(self) -> Path: - return Path("/opt/NeMo") + return Path(self.cfg.get("nemo_dir", "/opt/NeMo")) @property def _data_dir(self) -> Path: @@ -976,7 +977,8 @@ class EvalHarnessEvaluation(NemoMegatronStage): def __init__(self, cfg): super().__init__(cfg) choice_model_type, choice_name = self.get_stage_config_choice() - self.prompt_evaluation = choice_model_type == "prompt_gpt3" + #self.prompt_evaluation = choice_model_type == "prompt_gpt3" + self.prompt_evaluation = True if "prompt" in choice_model_type else False def setup_stage_vars(self, cfg): """Setup the stage vars, i.e. 
stage name and stage cfg""" @@ -1053,6 +1055,7 @@ def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: nemo_model=model_cfg.get("nemo_model"), checkpoint_folder=model_cfg.get("checkpoint_folder"), checkpoint_name=model_cfg.get("checkpoint_name"), + tokenizer_model=model_cfg.get("tokenizer_model"), hparams_file=model_cfg.get("hparams_file"), ) From e0a2d64deaccd782d11d39f30207aaa6243cadcc Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Thu, 27 Jul 2023 12:32:28 -0700 Subject: [PATCH 05/62] First commit of quality filtering stage --- launcher_scripts/main.py | 19 +- .../core/data_curation_stages.py | 162 ++++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 42 ++--- 3 files changed, 198 insertions(+), 25 deletions(-) create mode 100644 launcher_scripts/nemo_launcher/core/data_curation_stages.py diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 4053328f2c..61bec11eb6 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -18,6 +18,7 @@ import hydra import omegaconf from nemo_launcher.core.data_stages import CustomDataPreparation, MC4DataPreparation, PileDataPreparation +from nemo_launcher.core.data_curation_stages import QualityFiltering from nemo_launcher.core.export_stages import Export from nemo_launcher.core.stages import ( AdapterLearning, @@ -30,9 +31,15 @@ Training, ) -omegaconf.OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True) -omegaconf.OmegaConf.register_new_resolver("divide_ceil", lambda x, y: int(math.ceil(x / y)), replace=True) -omegaconf.OmegaConf.register_new_resolver("divide_floor", lambda x, y: int(math.floor(x / y)), replace=True) +omegaconf.OmegaConf.register_new_resolver("multiply", + lambda x, y: x * y, + replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_ceil", + lambda x, y: int(math.ceil(x / y)), + replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_floor", + lambda x, y: int(math.floor(x / y)), + replace=True) STR2STAGECLASS = { "training": Training, @@ -44,13 +51,17 @@ "export": Export, "evaluation": { EvalHarnessEvaluation: ["gpt3", "prompt_gpt3"], - NeMoEvaluation: ["t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", "adapter_gpt3", "ia3_t5", "ia3_gpt3"], + NeMoEvaluation: [ + "t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", + "adapter_gpt3", "ia3_t5", "ia3_gpt3" + ], }, "data_preparation": { PileDataPreparation: ["gpt3", "t5", "bert"], MC4DataPreparation: ["mt5"], CustomDataPreparation: ["generic"], }, + "quality_filtering": QualityFiltering, } diff --git a/launcher_scripts/nemo_launcher/core/data_curation_stages.py b/launcher_scripts/nemo_launcher/core/data_curation_stages.py new file mode 100644 index 0000000000..c743f16569 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/data_curation_stages.py @@ -0,0 +1,162 @@ +import copy +import shlex +import omegaconf +from typing import Dict, List +from pathlib import Path + +from nemo_launcher.core.stages import ( + NemoMegatronStage, + create_args_list, + clean_command_groups, +) +from nemo_launcher.core.launchers import AutoLauncher + + +class DataCurationStage(NemoMegatronStage): + def __init__(self, cfg): + super().__init__(cfg) + self.log_folder = Path() + self.conf_folder = Path() + + def setup_folder_and_data(self): + job_path = self.get_job_path() + job_path.folder.mkdir(parents=True, exist_ok=True) + # make the results dir + results_folder = job_path.results_folder + results_folder.mkdir(parents=True, exist_ok=True) + # make the log dir + self.log_folder = 
Path(job_path.folder, 'log') + self.log_folder.mkdir(parents=True, exist_ok=True) + # Make the conf dir + self.conf_folder = Path(job_path.folder, 'config') + self.conf_folder.mkdir(parents=True, exist_ok=True) + + def _make_cluster_parameters( + self, + cluster: str, + ) -> Dict: + """ + Make a cluster-specific parameters for jobs on different clusters. + Current clusters include bcm(slurm), bcp and interactive. + For example for bcm, it will return slurm parameters: + {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...} + + :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc. + :param Optional sub_stage: current sub_stage name + :return: a dictionary of cluster parameters, e.g. `ntasks_per_node` + :rtype: Dict + """ + cfg = self.cfg + stage_cfg = self.stage_cfg + + run_cfg = stage_cfg.get("run") + job_name = run_cfg.get("name") + time_limit = run_cfg.get("time_limit") + nodes = run_cfg.get('nodes') + # Allow for updating the partition as we might run + # on CPU only nodes + partition = run_cfg.get('partition') + + container_image = cfg.get("container") + container_mounts = self._make_container_mounts_string() + + shared_parameters = { + "job_name": job_name, + "time": time_limit, + } + if cluster == "bcm": + cluster_cfg = cfg.get("cluster") + slurm_cfg = {**copy.deepcopy(cluster_cfg)} + job_name_prefix = slurm_cfg.pop("job_name_prefix") + cluster_params = { + **slurm_cfg, + } + cluster_params.update({ + **shared_parameters, + "container_image": container_image, + "container_mounts": container_mounts, + }) + cluster_params[ + "job_name"] = job_name_prefix + cluster_params["job_name"] + cluster_params['nodes'] = nodes + cluster_params['partition'] = partition + + return cluster_params + + def run(self): + self.setup_folder_and_data() + job_path = self.get_job_path() + + cluster_parameters = self._make_cluster_parameters(self.cluster) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config( + self.stage_cfg, + job_path, + ) + + command_groups = self.make_stage_command_groups(stage_cfg_path) + + launcher = AutoLauncher( + folder=self.get_job_path().folder, + cluster=self.cluster, + **cluster_parameters, + ) + + job_id = launcher.launch(command_groups) + + return job_id + + +class QualityFiltering(DataCurationStage): + def __init__(self, cfg): + super().__init__(cfg) + + def setup_stage_vars(self, cfg): + self.stage_name = "quality_filtering" + self.stage_cfg = cfg.get("quality_filtering") + + def make_stage_command_groups( + self, + stage_cfg_path: Path, + ) -> List[List[str]]: + + stage_cfg = self.stage_cfg + job_path = self.get_job_path() + + # Write out the filter configuration as a separate config file + filter_cfg = Path(self.conf_folder, "heuristic_filter.yaml") + omegaconf.OmegaConf.save(stage_cfg.get('filter'), filter_cfg) + + command_groups = [[]] + + optional_args = { + "output_removed_document_dir": + stage_cfg.get('output_removed_document_dir'), + "output_document_score_dir": + stage_cfg.get('output_document_score_dir'), + } + + # Remove any arguments that are not specified + optional_args = { + arg: optional_args[arg] + for arg in optional_args if optional_args[arg] + } + + args = create_args_list( + replace_underscore=False, + log_dir=self.log_folder, + res_dir=job_path.results_folder, + conf_dir=self.conf_folder, + input_dir=stage_cfg.get("input_dir"), + filter_config_file=f"{self.conf_folder}/{filter_cfg}", + output_retained_document_dir=stage_cfg.get( + "output_retained_document_dir"), + **optional_args, + ) + + core_command = ["filter_documents", 
*args] + + core_command_string = " \\\n ".join(core_command) + command_groups[-1] += [core_command_string] + command_groups = clean_command_groups(command_groups) + + return command_groups diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 4acd81daa7..6aeafda39d 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -240,7 +240,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: dependency = run_cfg.get("dependency") if nodes is None: nodes = stage_cfg.get("trainer").get("num_nodes") - + ntasks_per_node = run_cfg.get("ntasks_per_node") if ntasks_per_node is None: ntasks_per_node = stage_cfg.get("trainer").get("devices") @@ -287,7 +287,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: cluster_parameters.update(shared_parameters) return cluster_parameters - + def _find_optimal_nodes(self, cfg, gpus) -> None: nodes_scheduler_path = f"{cfg.get('training').get('run').get('results_dir')}/nodes_scheduler.json" @@ -320,7 +320,7 @@ def _find_optimal_nodes(self, cfg, gpus) -> None: optimal_lst.append(nodes) self.nodes_scheduler[str(b)] = max(optimal_lst) - + sched_rbs = [int(i) for i in self.nodes_scheduler.keys()] assert rbs[::-1] == sched_rbs, ( "please, make sure you enter the correct combination of" @@ -329,7 +329,7 @@ def _find_optimal_nodes(self, cfg, gpus) -> None: with open(nodes_scheduler_path, 'w') as nodes_scheduler: nodes_scheduler.write(json.dumps(self.nodes_scheduler)) - + def _get_current_gbs(self, cfg): start_bs = cfg.get('training').get('model').get('rampup_batch_size')[0] results_dir = cfg.get('training').get('run').get('results_dir') @@ -340,16 +340,16 @@ def _get_current_gbs(self, cfg): for file in glob.glob("*.out"): file = file.split('_')[-1].split('.')[0] job_numbers.append(int(file)) - + job_number = max(job_numbers) last_job = glob.glob(f"*{job_number}.out")[0] with open(last_job, 'r') as logs: logs = logs.read() - + current_gbs = re.findall(r'global_batch_size=(\d+)', logs)[-1] except: current_gbs = start_bs - + return current_gbs def get_env_vars(self) -> Dict: @@ -528,11 +528,11 @@ def _make_nemo_call_string(self, stage_cfg_path: Path) -> str: def _make_hydra_override(self) -> List: """ Override some existing hydra configurations if necessary. - + Example use cases are: 1. For bcp cluster, `+rank=\${RANK}` is required running some NeMo scripts. Existing hydra config doesn't have `rank` field, so we overwrite on the fly. - 2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as + 2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as `model.data.data_prefix=\$({auto_blend_command})`. Existing `model.data.data_prefix` could be None in cfg, so we overwrite it in this function. """ @@ -578,7 +578,7 @@ def _make_hydra_override(self) -> List: Example use cases are: 1. For bcp cluster, `+rank=\${RANK}` is required running some NeMo scripts. Existing hydra config doesn't have `rank` field, so we overwrite on the fly. - 2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as + 2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as `model.data.data_prefix=\$({auto_blend_command})`. Existing `model.data.data_prefix` could be None in cfg, so we overwrite it in this function. @@ -613,7 +613,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. 
:param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ model_type_to_code_path = { @@ -678,9 +678,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ if model_type == "gpt3": @@ -718,9 +718,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ model_type_to_code_path = { @@ -741,9 +741,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ model_type_to_code_path = { @@ -763,9 +763,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ model_type_to_code_path = { @@ -945,9 +945,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. 
- :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ if model_type in ["gpt3", "prompt_gpt3"]: From ba9539e696999fa3deb90c355a7c8c406e9d070b Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Thu, 27 Jul 2023 12:53:09 -0700 Subject: [PATCH 06/62] Add config for sub stage --- .../quality_filtering/heuristic/english.yaml | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 launcher_scripts/conf/quality_filtering/heuristic/english.yaml diff --git a/launcher_scripts/conf/quality_filtering/heuristic/english.yaml b/launcher_scripts/conf/quality_filtering/heuristic/english.yaml new file mode 100644 index 0000000000..9e7d6fedb0 --- /dev/null +++ b/launcher_scripts/conf/quality_filtering/heuristic/english.yaml @@ -0,0 +1,126 @@ +run: + name: 'heuristic-filter-en' + results_dir: ${base_results_dir}/${.name} + time_limit: "08:00:00" + dependency: "singleton" + nodes: 1 + partition: + cpus_per_node: 48 + +# Provide the downloader, data loader and extraction modules that +# define how the dataset will be built from the URLs +filter: + filter_module: ndc.filter.heuristics.filter.CascadedHeuristicFilter + params: + # The filters below define a chain of heuristic filters to be applied to each document in a corpus. + # This particular cascade of filters is intended to filter English language data. + # The filter listed at the top will be applied first, and the following filters will be applied in + # the order they appear in this file. Each filter can be removed and re-ordered as desired. + # New filters can be added as described in docs/1_document_filtering.rst + filters: + - name: ndc.filter.heuristics.filter.NonAlphaNumericFilter + params: + max_non_alpha_numeric_to_text_ratio: 0.25 + - name: ndc.filter.heuristics.filter.SymbolsToWordsFilter + params: + max_symbol_to_word_ratio: 0.1 + - name: ndc.filter.heuristics.filter.NumbersFilter + params: + max_number_to_text_ratio: 0.15 + - name: ndc.filter.heuristics.filter.UrlsFilter + params: + max_url_to_text_ratio: 0.2 + - name: ndc.filter.heuristics.filter.WhiteSpaceFilter + params: + max_white_space_ratio: 0.25 + - name: ndc.filter.heuristics.filter.ParenthesesFilter + params: + max_parentheses_ratio: 0.1 + - name: ndc.filter.heuristics.filter.BoilerPlateStringFilter + params: + remove_if_at_top_or_bottom: True + max_boilerplate_string_ratio: 0.4 + - name: ndc.filter.heuristics.filter.RepeatedLinesFilter + params: + max_repeated_line_fraction: 0.7 + - name: ndc.filter.heuristics.filter.RepeatedParagraphsFilter + params: + max_repeated_paragraphs_ratio: 0.7 + - name: ndc.filter.heuristics.filter.RepeatedLinesByCharFilter + params: + max_repeated_lines_char_ratio: 0.8 + - name: ndc.filter.heuristics.filter.RepeatedParagraphsByCharFilter + params: + max_repeated_paragraphs_char_ratio: 0.8 + - name: ndc.filter.heuristics.filter.WordCountFilter + params: + min_words: 50 + max_words: 100000 + - name: ndc.filter.heuristics.filter.PunctuationFilter + params: + max_num_sentences_without_endmark_ratio: 0.85 + - name: ndc.filter.heuristics.filter.WordsWithoutAlphabetsFilter + params: + max_words_without_alphabets: 0.8 + - name: ndc.filter.heuristics.filter.CommonEnglishWordsFilter + params: + min_num_common_words: 2 + stop_at_false: True + - name: ndc.filter.heuristics.filter.MeanWordLengthFilter + params: + max_mean_word_length: 10 + min_mean_word_length: 3 + - name: ndc.filter.heuristics.filter.LongWordFilter + params: + max_word_length: 1000 + - name: 
ndc.filter.heuristics.filter.EllipsisFilter + params: + max_num_lines_ending_with_ellipsis_ratio: 0.3 + # Top N-Gram filters for N-grams 2, 3, and 4 + - name: ndc.filter.heuristics.filter.RepeatingTopNGramsFilter + params: + n: 2 + max_repeating_ngram_ratio: 0.2 + - name: ndc.filter.heuristics.filter.RepeatingTopNGramsFilter + params: + n: 3 + max_repeating_ngram_ratio: 0.18 + - name: ndc.filter.heuristics.filter.RepeatingTopNGramsFilter + params: + n: 4 + max_repeating_ngram_ratio: 0.16 + # Duplicate N-gram filters for N-grams 5, 6, 7, 8, 9, and 10 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 5 + max_repeating_duplicate_ngram_ratio: 0.15 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 6 + max_repeating_duplicate_ngram_ratio: 0.14 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 7 + max_repeating_duplicate_ngram_ratio: 0.13 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 8 + max_repeating_duplicate_ngram_ratio: 0.12 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 9 + max_repeating_duplicate_ngram_ratio: 0.11 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 10 + max_repeating_duplicate_ngram_ratio: 0.10 + - name: ndc.filter.heuristics.filter.BulletsFilter + params: + max_bullet_lines_ratio: 0.9 + # If True, the chained operation defined by the filters above + # will stop at first filter that is triggered during the above defined pipeline + stop_at_true: True + +input_dir: ${data_dir}/json/original +# Output directory to where filtered documents will be written +output_retained_document_dir: ${data_dir}/json/filtered/high_quality From fe11fab098f2c303548e436152c9ad8be5bc64c0 Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Thu, 27 Jul 2023 18:27:36 -0700 Subject: [PATCH 07/62] Fix some path errors --- .../nemo_launcher/core/data_curation_stages.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/data_curation_stages.py b/launcher_scripts/nemo_launcher/core/data_curation_stages.py index c743f16569..f4872c562b 100644 --- a/launcher_scripts/nemo_launcher/core/data_curation_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_curation_stages.py @@ -120,7 +120,6 @@ def make_stage_command_groups( ) -> List[List[str]]: stage_cfg = self.stage_cfg - job_path = self.get_job_path() # Write out the filter configuration as a separate config file filter_cfg = Path(self.conf_folder, "heuristic_filter.yaml") @@ -142,12 +141,10 @@ def make_stage_command_groups( } args = create_args_list( - replace_underscore=False, + replace_underscore=True, log_dir=self.log_folder, - res_dir=job_path.results_folder, - conf_dir=self.conf_folder, - input_dir=stage_cfg.get("input_dir"), - filter_config_file=f"{self.conf_folder}/{filter_cfg}", + input_data_dir=stage_cfg.get("input_dir"), + filter_config_file=f"{filter_cfg}", output_retained_document_dir=stage_cfg.get( "output_retained_document_dir"), **optional_args, From fd30fde3c52a03346c71a295c3a76572470093f3 Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Fri, 28 Jul 2023 08:47:25 -0700 Subject: [PATCH 08/62] Change formatting to be consistent with current code. 
Add additional comments --- launcher_scripts/main.py | 17 ++--- .../core/data_curation_stages.py | 62 +++++++++++++------ 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 61bec11eb6..744f6cefa5 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -31,15 +31,9 @@ Training, ) -omegaconf.OmegaConf.register_new_resolver("multiply", - lambda x, y: x * y, - replace=True) -omegaconf.OmegaConf.register_new_resolver("divide_ceil", - lambda x, y: int(math.ceil(x / y)), - replace=True) -omegaconf.OmegaConf.register_new_resolver("divide_floor", - lambda x, y: int(math.floor(x / y)), - replace=True) +omegaconf.OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_ceil", lambda x, y: int(math.ceil(x / y)), replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_floor", lambda x, y: int(math.floor(x / y)), replace=True) STR2STAGECLASS = { "training": Training, @@ -51,10 +45,7 @@ "export": Export, "evaluation": { EvalHarnessEvaluation: ["gpt3", "prompt_gpt3"], - NeMoEvaluation: [ - "t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", - "adapter_gpt3", "ia3_t5", "ia3_gpt3" - ], + NeMoEvaluation: ["t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", "adapter_gpt3", "ia3_t5", "ia3_gpt3"], }, "data_preparation": { PileDataPreparation: ["gpt3", "t5", "bert"], diff --git a/launcher_scripts/nemo_launcher/core/data_curation_stages.py b/launcher_scripts/nemo_launcher/core/data_curation_stages.py index f4872c562b..902132b992 100644 --- a/launcher_scripts/nemo_launcher/core/data_curation_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_curation_stages.py @@ -13,12 +13,24 @@ class DataCurationStage(NemoMegatronStage): + """ + DataCurationStage is a base class for data curation stages. + It can hold multiple sub-stages. For example, preparing data from + Common Crawl requires download, extraction, deduplication and filtering. + They have dependencies on each other and will be launched one by one. + """ + def __init__(self, cfg): super().__init__(cfg) self.log_folder = Path() self.conf_folder = Path() def setup_folder_and_data(self): + """ + Each job in the data curation pipeline creates a directory + for writing logs (log_folder), writing and reading intermediate + results (results_folder) and for reading configs (conf_folder) + """ job_path = self.get_job_path() job_path.folder.mkdir(parents=True, exist_ok=True) # make the results dir @@ -31,21 +43,18 @@ def setup_folder_and_data(self): self.conf_folder = Path(job_path.folder, 'config') self.conf_folder.mkdir(parents=True, exist_ok=True) - def _make_cluster_parameters( - self, - cluster: str, - ) -> Dict: + def _make_cluster_parameters(self, cluster: str) -> Dict: + """ + Make a cluster-specific parameters for jobs on different clusters. + Current clusters include bcm(slurm), bcp and interactive. + For example for bcm, it will return slurm parameters: + {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...} + + :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc. + :param Optional sub_stage: current sub_stage name + :return: a dictionary of cluster parameters, e.g. `ntasks_per_node` + :rtype: Dict """ - Make a cluster-specific parameters for jobs on different clusters. - Current clusters include bcm(slurm), bcp and interactive. 
- For example for bcm, it will return slurm parameters: - {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...} - - :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc. - :param Optional sub_stage: current sub_stage name - :return: a dictionary of cluster parameters, e.g. `ntasks_per_node` - :rtype: Dict - """ cfg = self.cfg stage_cfg = self.stage_cfg @@ -83,42 +92,53 @@ def _make_cluster_parameters( return cluster_params - def run(self): + def run(self) -> str: + """ + Run current stage including all of the substages, returns job id on slurm based system otherwise empty string + + :return: job id on slurm based system otherwise empty string + :rtype: str + """ + # Create the job folders self.setup_folder_and_data() job_path = self.get_job_path() + # Make cluster configuration parameters cluster_parameters = self._make_cluster_parameters(self.cluster) stage_cfg_path = NemoMegatronStage.save_stage_hydra_config( self.stage_cfg, job_path, ) + # Build commands to launch on cluster command_groups = self.make_stage_command_groups(stage_cfg_path) + # Create the launcher for the cluster launcher = AutoLauncher( folder=self.get_job_path().folder, cluster=self.cluster, **cluster_parameters, ) + # Launch the job on the cluster job_id = launcher.launch(command_groups) return job_id class QualityFiltering(DataCurationStage): + """ DataCurationStage for performing quality filtering on documents """ + def __init__(self, cfg): super().__init__(cfg) def setup_stage_vars(self, cfg): + """Setup the stage vars, i.e. stage name and stage cfg""" self.stage_name = "quality_filtering" self.stage_cfg = cfg.get("quality_filtering") - def make_stage_command_groups( - self, - stage_cfg_path: Path, - ) -> List[List[str]]: - + def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: + """ Builds the command groups for the current stage """ stage_cfg = self.stage_cfg # Write out the filter configuration as a separate config file @@ -127,6 +147,7 @@ def make_stage_command_groups( command_groups = [[]] + # If certain arguments are not specified, we remove them from the list optional_args = { "output_removed_document_dir": stage_cfg.get('output_removed_document_dir'), @@ -140,6 +161,7 @@ def make_stage_command_groups( for arg in optional_args if optional_args[arg] } + # Create the list of arguments for the filter_documents command args = create_args_list( replace_underscore=True, log_dir=self.log_folder, From a3a521a19b90b847b70d1955a921b6e809e848d0 Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Fri, 28 Jul 2023 08:52:14 -0700 Subject: [PATCH 09/62] Add quality filtering to base config --- launcher_scripts/conf/config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 50ada47bdc..0b72956ac4 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -2,6 +2,7 @@ defaults: - _self_ - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. - data_preparation: gpt3/download_gpt3_pile + - quality_filtering: heuristic/english - training: gpt3/5b - conversion: gpt3/convert_gpt3 - fine_tuning: null @@ -57,6 +58,7 @@ numa_mapping: # Do not modify below, use the values above instead. 
data_preparation_config: ${hydra:runtime.choices.data_preparation} +quality_filtering_config: ${hydra:runtime.choices.quality_filtering} training_config: ${hydra:runtime.choices.training} fine_tuning_config: ${hydra:runtime.choices.fine_tuning} prompt_learning_config: ${hydra:runtime.choices.prompt_learning} From 911b22d0881cf52ccc687e1862ac370ea1ee2fb7 Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Thu, 3 Aug 2023 06:35:32 -0700 Subject: [PATCH 10/62] Add documentation relating to task deduplication --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ed1870da84..4478dfd0c7 100755 --- a/README.md +++ b/README.md @@ -5163,6 +5163,8 @@ Currently, within the NeMo Data Curator, we support the following data-curation - Fuzzy deduplication. Our implementation of fuzzy deduplication builds off of the following existing libraries: - For computing MinHash signatures we use a modified version of the MinHasher class provided in [pyLSH](https://github.com/mattilyra/LSH) - For the locality sensitive hashing, we extended the Redis-based implementation found in [datasketch](https://github.com/ekzhu/datasketch) beyond a single Redis server to a Redis Cluster. This enables this module to efficiently deduplicate large datasets that do not fit in memory of a single node (e.g., several TB of text) + - Multilingual downstream-task decontamination + - Our implementation follows the approach of [OpenAI GPT3](https://arxiv.org/pdf/2005.14165.pdf) and [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990) The modules are implemented in a scalable manner using [Message Passing Interface (MPI) for Python (mpi4py)](https://mpi4py.readthedocs.io/en/stable/) and we use [Dask](https://dask.org) for creating balanced input jsonl files. With the scalable modules within the NeMo Data Curator, we have been able to fully process a [Common Crawl Snapshot](https://commoncrawl.org/2020/12/nov-dec-2020-crawl-archive-now-available/) (consisting of 60 TB of compressed WARC files) in approximately two days using 30 CPU nodes (with hardware similar to the `c5.24xlarge` [Amazon AWS C5 instance](https://aws.amazon.com/ec2/instance-types/c5/)). Please note that the core functions used within the NeMo Data Curator (e.g., html extraction, text cleaning, heuristic filtering, etc.) have not been fully optimized. The main goal of the NeMo Data Curator is to provide users the capability to apply these functions to their large datasets using many compute nodes.
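Note on the downstream-task decontamination mentioned in the README hunk above: the general idea in the cited GPT-3-style approach is to collect N-grams from downstream evaluation/test data and filter training documents that overlap with them. The sketch below is illustrative only; it is not the NeMo Data Curator code or API, and the function names, the default N, and the drop-whole-document policy are assumptions made for this example.

    # Illustrative sketch only -- not the NeMo Data Curator implementation or its API.
    # Assumption: whole documents are dropped on any N-gram overlap with task data.
    from typing import Iterable, List, Set


    def ngrams(tokens: List[str], n: int) -> Iterable[tuple]:
        """Yield all overlapping n-grams of a token list."""
        for i in range(len(tokens) - n + 1):
            yield tuple(tokens[i:i + n])


    def build_task_ngrams(task_texts: Iterable[str], n: int = 13) -> Set[tuple]:
        """Collect the n-grams appearing in the downstream-task (test) data."""
        grams: Set[tuple] = set()
        for text in task_texts:
            grams.update(ngrams(text.lower().split(), n))
        return grams


    def decontaminate(docs: Iterable[str], task_ngrams: Set[tuple], n: int = 13) -> List[str]:
        """Keep only documents that share no n-gram with the task data."""
        kept = []
        for doc in docs:
            doc_grams = set(ngrams(doc.lower().split(), n))
            if doc_grams.isdisjoint(task_ngrams):
                kept.append(doc)
        return kept


    if __name__ == "__main__":
        test_set = ["what is the capital of france paris"]
        corpus = ["the capital of france is paris", "an unrelated training document"]
        # With n=3 the first document overlaps the test question and is removed.
        print(decontaminate(corpus, build_task_ngrams(test_set, n=3), n=3))

Real pipelines typically fix N (for example, 13-grams in the GPT-3 paper's contamination analysis) and may remove only the contaminated spans rather than entire documents; treat those details as configuration choices rather than facts about this repository.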
From 056a90edf9140a32bfd39652509244e7db055651 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar Date: Mon, 7 Aug 2023 12:51:40 -0700 Subject: [PATCH 11/62] ptl bug fix Signed-off-by: Dmytro Pykhtar --- auto_configurator/base_configs/bert.yaml | 2 +- auto_configurator/base_configs/gpt3.yaml | 2 +- auto_configurator/base_configs/mt5.yaml | 2 +- auto_configurator/base_configs/t5.yaml | 2 +- .../tests/base_configs_tests/test_base_configs.py | 8 ++++---- launcher_scripts/conf/adapter_learning/gpt3/squad.yaml | 2 +- launcher_scripts/conf/adapter_learning/t5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml | 2 +- launcher_scripts/conf/evaluation/adapter_t5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml | 2 +- launcher_scripts/conf/evaluation/ia3_t5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/mt5/custom_task.yaml | 2 +- launcher_scripts/conf/evaluation/mt5/xquad.yaml | 2 +- launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/prompt_t5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/t5/custom_task.yaml | 2 +- launcher_scripts/conf/evaluation/t5/squad.yaml | 2 +- launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml | 2 +- launcher_scripts/conf/fine_tuning/gpt3/squad.yaml | 2 +- launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml | 2 +- launcher_scripts/conf/fine_tuning/mt5/xquad.yaml | 2 +- launcher_scripts/conf/fine_tuning/t5/custom_task.yaml | 2 +- launcher_scripts/conf/fine_tuning/t5/squad.yaml | 2 +- launcher_scripts/conf/ia3_learning/gpt3/squad.yaml | 2 +- launcher_scripts/conf/ia3_learning/t5/squad.yaml | 2 +- launcher_scripts/conf/prompt_learning/gpt3/squad.yaml | 2 +- launcher_scripts/conf/prompt_learning/mt5/squad.yaml | 2 +- launcher_scripts/conf/prompt_learning/t5/squad.yaml | 2 +- launcher_scripts/conf/training/bert/100b.yaml | 2 +- launcher_scripts/conf/training/bert/110m.yaml | 2 +- launcher_scripts/conf/training/bert/20b.yaml | 2 +- launcher_scripts/conf/training/bert/4b.yaml | 2 +- launcher_scripts/conf/training/gpt3/126m.yaml | 2 +- launcher_scripts/conf/training/gpt3/175b.yaml | 2 +- launcher_scripts/conf/training/gpt3/175b_performance.yaml | 2 +- launcher_scripts/conf/training/gpt3/1b_improved.yaml | 2 +- launcher_scripts/conf/training/gpt3/20b.yaml | 2 +- launcher_scripts/conf/training/gpt3/400m_improved.yaml | 2 +- launcher_scripts/conf/training/gpt3/40b.yaml | 2 +- launcher_scripts/conf/training/gpt3/40b_improved.yaml | 2 +- launcher_scripts/conf/training/gpt3/5b.yaml | 2 +- launcher_scripts/conf/training/gpt3/7b_improved.yaml | 2 +- launcher_scripts/conf/training/mt5/11b.yaml | 2 +- launcher_scripts/conf/training/mt5/170m.yaml | 2 +- launcher_scripts/conf/training/mt5/23b.yaml | 2 +- launcher_scripts/conf/training/mt5/390m.yaml | 2 +- launcher_scripts/conf/training/mt5/3b.yaml | 2 +- launcher_scripts/conf/training/t5/11b.yaml | 2 +- launcher_scripts/conf/training/t5/220m.yaml | 2 +- launcher_scripts/conf/training/t5/23b.yaml | 2 +- launcher_scripts/conf/training/t5/3b.yaml | 2 +- launcher_scripts/conf/training/t5/41b.yaml | 2 +- 52 files changed, 55 insertions(+), 55 deletions(-) diff --git a/auto_configurator/base_configs/bert.yaml b/auto_configurator/base_configs/bert.yaml index 305040666e..01e3be140e 100644 --- a/auto_configurator/base_configs/bert.yaml +++ b/auto_configurator/base_configs/bert.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: 
False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "00:23:30:00" diff --git a/auto_configurator/base_configs/gpt3.yaml b/auto_configurator/base_configs/gpt3.yaml index 4eeaf79ce2..a69ba139eb 100644 --- a/auto_configurator/base_configs/gpt3.yaml +++ b/auto_configurator/base_configs/gpt3.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 600000 # consumed_samples = global_step * global_batch_size max_time: "00:23:30:00" # days:hours:minutes:seconds diff --git a/auto_configurator/base_configs/mt5.yaml b/auto_configurator/base_configs/mt5.yaml index 96053b9ac4..a0f3d70d8a 100644 --- a/auto_configurator/base_configs/mt5.yaml +++ b/auto_configurator/base_configs/mt5.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/auto_configurator/base_configs/t5.yaml b/auto_configurator/base_configs/t5.yaml index cd1ef0ac87..06c6016f78 100644 --- a/auto_configurator/base_configs/t5.yaml +++ b/auto_configurator/base_configs/t5.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/auto_configurator/tests/base_configs_tests/test_base_configs.py b/auto_configurator/tests/base_configs_tests/test_base_configs.py index 0919ee65ab..4fb155628d 100644 --- a/auto_configurator/tests/base_configs_tests/test_base_configs.py +++ b/auto_configurator/tests/base_configs_tests/test_base_configs.py @@ -18,7 +18,7 @@ def test_gpt3_base_config(self): precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 600000 max_time: "00:23:30:00" @@ -196,7 +196,7 @@ def test_t5_base_config(self): precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" @@ -421,7 +421,7 @@ def test_mt5_base_config(self): precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" @@ -642,7 +642,7 @@ def test_bert_base_config(self): precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "00:23:30:00" diff --git a/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml b/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml index 5abbbc9cb3..fe2ceea017 100755 --- a/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml +++ b/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/adapter_learning/t5/squad.yaml b/launcher_scripts/conf/adapter_learning/t5/squad.yaml index f82940d489..a5fc08f7a0 100755 --- a/launcher_scripts/conf/adapter_learning/t5/squad.yaml +++ b/launcher_scripts/conf/adapter_learning/t5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml b/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml index 75c9774e14..a7dbd31065 100755 --- a/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml +++ b/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 inference: diff --git a/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml b/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml index d18cc08856..91d2cec798 100755 --- a/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml +++ b/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 inference: diff --git a/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml b/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml index d109a98557..046d7c9ae0 100755 --- a/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml +++ b/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 inference: diff --git a/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml b/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml index 48480c074a..40b9594f68 100755 --- a/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml +++ b/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 inference: diff --git a/launcher_scripts/conf/evaluation/mt5/custom_task.yaml b/launcher_scripts/conf/evaluation/mt5/custom_task.yaml index ce3523d3e7..128937204b 100755 --- a/launcher_scripts/conf/evaluation/mt5/custom_task.yaml +++ b/launcher_scripts/conf/evaluation/mt5/custom_task.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 diff --git 
a/launcher_scripts/conf/evaluation/mt5/xquad.yaml b/launcher_scripts/conf/evaluation/mt5/xquad.yaml index 6d733fec7f..89771d546b 100755 --- a/launcher_scripts/conf/evaluation/mt5/xquad.yaml +++ b/launcher_scripts/conf/evaluation/mt5/xquad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 exp_manager: diff --git a/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml b/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml index 01278be854..a223289ffc 100755 --- a/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml +++ b/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 data: diff --git a/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml b/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml index 7b549fedf7..c1fb88caed 100755 --- a/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml +++ b/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 data: diff --git a/launcher_scripts/conf/evaluation/t5/custom_task.yaml b/launcher_scripts/conf/evaluation/t5/custom_task.yaml index 90e0ebb38d..2959469ccd 100755 --- a/launcher_scripts/conf/evaluation/t5/custom_task.yaml +++ b/launcher_scripts/conf/evaluation/t5/custom_task.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 diff --git a/launcher_scripts/conf/evaluation/t5/squad.yaml b/launcher_scripts/conf/evaluation/t5/squad.yaml index f50843d82f..39c954a943 100755 --- a/launcher_scripts/conf/evaluation/t5/squad.yaml +++ b/launcher_scripts/conf/evaluation/t5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 diff --git a/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml b/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml index 9d9ebabd1b..e55341fb9b 100644 --- a/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml +++ b/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 1 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 # frequency with which training steps are logged diff --git a/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml b/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml index 4730f2f1ae..17dfb0fdc7 100644 --- a/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml +++ b/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 13000 # consumed_samples = global_step * 
micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 # frequency with which training steps are logged diff --git a/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml b/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml index bea1aacee8..abd3c2565c 100755 --- a/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml +++ b/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 5 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 diff --git a/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml b/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml index f8d677fba5..8190e47aa5 100755 --- a/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml +++ b/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 5 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 diff --git a/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml b/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml index a3b2960f9c..54c3166405 100755 --- a/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml +++ b/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 5 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 diff --git a/launcher_scripts/conf/fine_tuning/t5/squad.yaml b/launcher_scripts/conf/fine_tuning/t5/squad.yaml index da5cc2c252..d608fd28ec 100755 --- a/launcher_scripts/conf/fine_tuning/t5/squad.yaml +++ b/launcher_scripts/conf/fine_tuning/t5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 5 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 diff --git a/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml b/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml index 2e7ed23f3e..b5d643c94c 100755 --- a/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml +++ b/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/ia3_learning/t5/squad.yaml b/launcher_scripts/conf/ia3_learning/t5/squad.yaml index 840fce46b2..3e0900b058 100755 --- a/launcher_scripts/conf/ia3_learning/t5/squad.yaml +++ b/launcher_scripts/conf/ia3_learning/t5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml 
b/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml index ea42f3c4ba..32fda8389c 100755 --- a/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml +++ b/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/prompt_learning/mt5/squad.yaml b/launcher_scripts/conf/prompt_learning/mt5/squad.yaml index 19bf9c7447..99c9871ca8 100755 --- a/launcher_scripts/conf/prompt_learning/mt5/squad.yaml +++ b/launcher_scripts/conf/prompt_learning/mt5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 10 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/prompt_learning/t5/squad.yaml b/launcher_scripts/conf/prompt_learning/t5/squad.yaml index 755323e938..27d54627c6 100755 --- a/launcher_scripts/conf/prompt_learning/t5/squad.yaml +++ b/launcher_scripts/conf/prompt_learning/t5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 10 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/training/bert/100b.yaml b/launcher_scripts/conf/training/bert/100b.yaml index d63a844756..84d7170dae 100755 --- a/launcher_scripts/conf/training/bert/100b.yaml +++ b/launcher_scripts/conf/training/bert/100b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 860000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "81:23:30:00" diff --git a/launcher_scripts/conf/training/bert/110m.yaml b/launcher_scripts/conf/training/bert/110m.yaml index 47b2e95839..8d72872eb2 100755 --- a/launcher_scripts/conf/training/bert/110m.yaml +++ b/launcher_scripts/conf/training/bert/110m.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 13800000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "7:23:30:00" diff --git a/launcher_scripts/conf/training/bert/20b.yaml b/launcher_scripts/conf/training/bert/20b.yaml index 79312130cf..729b8e0ef7 100755 --- a/launcher_scripts/conf/training/bert/20b.yaml +++ b/launcher_scripts/conf/training/bert/20b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
max_steps: 860000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "90:23:30:00" diff --git a/launcher_scripts/conf/training/bert/4b.yaml b/launcher_scripts/conf/training/bert/4b.yaml index 5e435c48a2..e925f5621a 100755 --- a/launcher_scripts/conf/training/bert/4b.yaml +++ b/launcher_scripts/conf/training/bert/4b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 1720000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "26:23:30:00" diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 3e921b5bb0..affee0765e 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 600000 # consumed_samples = global_step * global_batch_size max_time: "00:23:30:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index c37e35e01e..493d24d516 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples = global_step * global_batch_size max_time: "25:23:00:00" diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml index a83d4f956a..976deda501 100755 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_performance.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples = global_step * global_batch_size max_time: "25:23:00:00" diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index 25cbabb00e..1ff6b3dbf0 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 300000 # consumed_samples = global_step * global_batch_size max_time: "02:23:30:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index b8b08c3a32..e48788e197 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples 
= global_step * global_batch_size max_time: "06:23:30:00" diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index 039b9bdf01..5b1e6b915f 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 600000 # consumed_samples = global_step * global_batch_size max_time: "01:23:30:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index e2748f9b8b..84c1802bc9 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples = global_step * global_batch_size max_time: "6:11:00:00" diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index dad3f3e639..8686a171be 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 100000 # consumed_samples = global_step * global_batch_size max_time: "6:11:00:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index 564fb4503a..ae99d3e063 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples = global_step * global_batch_size max_time: "05:23:30:00" diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index 40f8d9ab88..0eec1b43ba 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 300000 # consumed_samples = global_step * global_batch_size max_time: "05:23:30:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/mt5/11b.yaml b/launcher_scripts/conf/training/mt5/11b.yaml index f6d6a67fc1..3111159db4 100755 --- a/launcher_scripts/conf/training/mt5/11b.yaml +++ b/launcher_scripts/conf/training/mt5/11b.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "44:23:30:00" diff --git a/launcher_scripts/conf/training/mt5/170m.yaml 
b/launcher_scripts/conf/training/mt5/170m.yaml index 49a04fc2a2..b166c26496 100755 --- a/launcher_scripts/conf/training/mt5/170m.yaml +++ b/launcher_scripts/conf/training/mt5/170m.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/launcher_scripts/conf/training/mt5/23b.yaml b/launcher_scripts/conf/training/mt5/23b.yaml index d38ea399cf..dab9d9504e 100755 --- a/launcher_scripts/conf/training/mt5/23b.yaml +++ b/launcher_scripts/conf/training/mt5/23b.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "54:23:30:00" diff --git a/launcher_scripts/conf/training/mt5/390m.yaml b/launcher_scripts/conf/training/mt5/390m.yaml index 479b533b3f..c03436bb8b 100755 --- a/launcher_scripts/conf/training/mt5/390m.yaml +++ b/launcher_scripts/conf/training/mt5/390m.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/launcher_scripts/conf/training/mt5/3b.yaml b/launcher_scripts/conf/training/mt5/3b.yaml index 3a0df27e4c..96b2c367bb 100755 --- a/launcher_scripts/conf/training/mt5/3b.yaml +++ b/launcher_scripts/conf/training/mt5/3b.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "17:23:30:00" diff --git a/launcher_scripts/conf/training/t5/11b.yaml b/launcher_scripts/conf/training/t5/11b.yaml index 9ee9b3288d..0f47b6e5e7 100755 --- a/launcher_scripts/conf/training/t5/11b.yaml +++ b/launcher_scripts/conf/training/t5/11b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "44:23:30:00" diff --git a/launcher_scripts/conf/training/t5/220m.yaml b/launcher_scripts/conf/training/t5/220m.yaml index 2b1549dc8c..73f56344a5 100755 --- a/launcher_scripts/conf/training/t5/220m.yaml +++ b/launcher_scripts/conf/training/t5/220m.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/launcher_scripts/conf/training/t5/23b.yaml b/launcher_scripts/conf/training/t5/23b.yaml index 30ae8d6037..1050285cc7 100755 --- a/launcher_scripts/conf/training/t5/23b.yaml +++ b/launcher_scripts/conf/training/t5/23b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False 
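Every hunk in this patch makes the same one-line substitution inside a config's `trainer` block; a representative result (field values differ per model) is sketched below. The key rename tracks PyTorch Lightning 2.x, which replaced the `replace_sampler_ddp` Trainer argument with `use_distributed_sampler` — offered here as background, since the commit message itself only says "ptl bug fix".

```yaml
trainer:
  precision: bf16
  logger: False                    # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False   # previously: replace_sampler_ddp: False
  max_epochs: null
  max_steps: 75000                 # consumed_samples = global_step * global_batch_size
```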
max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "54:23:30:00" diff --git a/launcher_scripts/conf/training/t5/3b.yaml b/launcher_scripts/conf/training/t5/3b.yaml index a2f4c99e59..02c51654fc 100755 --- a/launcher_scripts/conf/training/t5/3b.yaml +++ b/launcher_scripts/conf/training/t5/3b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "14:23:30:00" diff --git a/launcher_scripts/conf/training/t5/41b.yaml b/launcher_scripts/conf/training/t5/41b.yaml index 6d23f6e670..599e389f16 100755 --- a/launcher_scripts/conf/training/t5/41b.yaml +++ b/launcher_scripts/conf/training/t5/41b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "99:23:30:00" From a3393e3f078c1f7693abe98539827c873af494f9 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 7 Aug 2023 16:39:22 -0700 Subject: [PATCH 12/62] Update main.py to add PEFT stage Add PEFT stage --- launcher_scripts/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 4053328f2c..e1685e2a27 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -24,6 +24,7 @@ Conversion, EvalHarnessEvaluation, FineTuning, + PEFT, IA3Learning, NeMoEvaluation, PromptLearning, @@ -37,6 +38,7 @@ STR2STAGECLASS = { "training": Training, "fine_tuning": FineTuning, + "peft": PEFT, "prompt_learning": PromptLearning, "adapter_learning": AdapterLearning, "ia3_learning": IA3Learning, From c29586224ec8c4c65404511bcea3c72a0d8c01ac Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 7 Aug 2023 16:41:16 -0700 Subject: [PATCH 13/62] Update stages.py Add PEFT stage, which uses unified NeMo PEFT tuning script examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py --- launcher_scripts/nemo_launcher/core/stages.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 6196a94425..ae093d0844 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -691,6 +691,45 @@ def _get_nemo_code_path(self, model_type: str) -> Path: } return model_type_to_code_path[model_type] +class PEFT(NeMoStage): + """Stage class of PEFT with NeMo scripts""" + + def setup_stage_vars(self, cfg): + """Setup the stage vars, i.e. 
stage name and stage cfg""" + self.stage_name = "peft" + self.stage_cfg = cfg.get("peft") + + def setup_folder_and_data(self) -> None: + """Setup job/data folders and fine-tuning/prompt-learning dataset""" + # Setup folders + super().setup_folder_and_data() + + # Prepare prompt learning dataset + data_dir = self.cfg.get("data_dir") + task_name = self.stage_cfg.run.get("task_name") + + # Prepare dataset for squad + if task_name in ["squad", "xquad"]: + prepare_squad_for_fine_tuning(data_dir=os.path.join(data_dir, "squad_data")) + + + def _get_nemo_code_path(self, model_type: str) -> Path: + """ + Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. + For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. + + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. + :return: path current stage's essential nemo scripts code + :rtype: Path + """ + if model_type == "t5": + raise NotImplementedError("PEFT is not supported in NeMo Megatron t5 models.") + if model_type == "mt5": + raise NotImplementedError("PEFT is not supported in NeMo Megatron mt5 models.") + model_type_to_code_path = { + "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", + } + return model_type_to_code_path[model_type] class PromptLearning(NeMoStage): """Stage class of prompt-learning with NeMo scripts""" From 75de3e66cf8b45a2741920a8761dfbb85c717836 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 7 Aug 2023 17:14:43 -0700 Subject: [PATCH 14/62] Update config.yaml Add PEFT stage --- launcher_scripts/conf/config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index e5e1da9e0a..d27c89c934 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -5,6 +5,7 @@ defaults: - training: gpt3/5b - conversion: gpt3/convert_gpt3 - fine_tuning: null + - peft: null - prompt_learning: null - adapter_learning: null - ia3_learning: null @@ -59,6 +60,7 @@ numa_mapping: data_preparation_config: ${hydra:runtime.choices.data_preparation} training_config: ${hydra:runtime.choices.training} fine_tuning_config: ${hydra:runtime.choices.fine_tuning} +peft_config: ${hydra:runtime.choices.peft} prompt_learning_config: ${hydra:runtime.choices.prompt_learning} adapter_learning_config: ${hydra:runtime.choices.adapter_learning} ia3_learning_config: ${hydra:runtime.choices.ia3_learning} From 1ba30f759813e4d447bb4415d8aa351e153d320c Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 7 Aug 2023 17:19:56 -0700 Subject: [PATCH 15/62] Create squad.yaml Have peft_scheme in the file. Available options: adapter, ia3, ptuning, adapter_and_ptuning and lora. 
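Before the command example that follows, a sketch of how the new `peft` config group is expected to resolve — assuming Hydra's standard config-group behavior together with the `- peft: null` defaults entry added in the previous commit:

```yaml
# conf/config.yaml (after the previous commit)
defaults:
  - peft: null   # no PEFT config is loaded unless one is selected on the command line

# Overriding `peft=gpt3/squad stages=[peft]` loads conf/peft/gpt3/squad.yaml
# (the file created by this commit) and runs only the PEFT stage.
```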
PEFT command example for BCP: python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py peft=gpt3/squad stages=[peft] cluster_type=bcp launcher_scripts_path=/opt/NeMo-Megatron-Launcher/launcher_scripts peft.model.peft.peft_scheme=ptuning --- launcher_scripts/conf/peft/gpt3/squad.yaml | 225 +++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 launcher_scripts/conf/peft/gpt3/squad.yaml diff --git a/launcher_scripts/conf/peft/gpt3/squad.yaml b/launcher_scripts/conf/peft/gpt3/squad.yaml new file mode 100644 index 0000000000..e3da77bba7 --- /dev/null +++ b/launcher_scripts/conf/peft/gpt3/squad.yaml @@ -0,0 +1,225 @@ +name: megatron_gpt_peft_tuning-${peft.model.peft.peft_scheme} + +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: gpt3_5b + convert_dir: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name} + +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 4 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: 200 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${peft.name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${peft.model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${peft.model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ${peft.run.convert_dir}/results/megatron_gpt.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
+ sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + answer_only_loss: True + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: "ptuning" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + data: + chat: False # whether use chatbot data or not + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: False + separate_prompt_and_response_with_newline: True + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. 
Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${peft.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
+ num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false From 4f1c768889e82e746d6cd966f5545144439a7326 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 09:59:45 -0700 Subject: [PATCH 16/62] Update README.md Update with PEFT Framework section --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3df4288d54..8affd0ba3f 100755 --- a/README.md +++ b/README.md @@ -207,6 +207,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.16.2.6 PPO Hyper-parameters](#51626-ppo-hyper-parameters) + [5.16.3. Future Work](#5163-future-work) * [5.17 Curating pretraining datasets with the NeMo Data Curator](#517-curating-pretraining-datasets-with-the-nemo-data-curator) + * [5.18 Parameter-Efficient Fine-Tuning (PEFT) Framework with unified PEFT methods](#518-parameter-efficient-fine-tuning-(peft)-framework-with-unified-peft-methods) - [6. Deploying the NeMo Megatron Model](#6-deploying-the-nemo-megatron-model) * [6.1. Run NVIDIA Triton Server with Generated Model Repository](#61-run-nvidia-triton-server-with-generated-model-repository) - [6.2. GPT Text Generation with Ensemble](#62-gpt-text-generation-with-ensemble) From a30a02228c7e73b65da69b16dbb7b37731267457 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 10:35:28 -0700 Subject: [PATCH 17/62] Update README.md Update PEFT Framework Training with Launcher --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8affd0ba3f..e9c2be6e40 100755 --- a/README.md +++ b/README.md @@ -145,8 +145,10 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co * [5.12 LoRA Model and Generalized PEFT Framework](#512-lora-model-and-generalized-peft-framework) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) + - [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcer) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) * [5.13. Model Evaluation](#513-model-evaluation) + [5.13.1. GPT Evaluation](#5131-gpt-evaluation) - [5.13.1.1. Common](#51311-common) @@ -207,7 +209,6 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.16.2.6 PPO Hyper-parameters](#51626-ppo-hyper-parameters) + [5.16.3. Future Work](#5163-future-work) * [5.17 Curating pretraining datasets with the NeMo Data Curator](#517-curating-pretraining-datasets-with-the-nemo-data-curator) - * [5.18 Parameter-Efficient Fine-Tuning (PEFT) Framework with unified PEFT methods](#518-parameter-efficient-fine-tuning-(peft)-framework-with-unified-peft-methods) - [6. Deploying the NeMo Megatron Model](#6-deploying-the-nemo-megatron-model) * [6.1. 
Run NVIDIA Triton Server with Generated Model Repository](#61-run-nvidia-triton-server-with-generated-model-repository) - [6.2. GPT Text Generation with Ensemble](#62-gpt-text-generation-with-ensemble) @@ -3777,6 +3778,13 @@ inference.outfile_path= ``` Additionally, NeMo has a notebook which walks through the steps (which these scripts encapsulate) to train and run inference for PEFT models: https://github.com/NVIDIA/NeMo/blob/main/tutorials/nlp/lora.ipynb +##### 5.12.1.2 PEFT Training with NeMo Megatron Launcher +PEFT stage could launch PEFT methods including PTuning, LoRA, Adapters and IA3 in a single stage, by setting different peft scheme. +It is implemented via adapter_mixins framework with a unify style. +mix-n-match PEFT scheme like adapter_and_ptuning can be easily extended to do ia3_and_ptuning or lora_and_ptuning + +PTuning does not need to flexibility to insert prompt tokens anywhere in the input. This feature has been removed for simplicity. + ##### 5.12.2 PEFT Training and Inference for mT5/T5-style Models We offer training and inference scripts in NeMo for parameter efficient tuning of mT5/T5-style models. You can train a LoRA, P-tuning, Adapter, or IA3 model using its corresponding training and inference script. From a43a0c3c28a6122b2e37eec5b160adbcc766342a Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 10:44:01 -0700 Subject: [PATCH 18/62] edit readme for PEFT framework methods --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e9c2be6e40..db6cc3e5c5 100755 --- a/README.md +++ b/README.md @@ -145,7 +145,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co * [5.12 LoRA Model and Generalized PEFT Framework](#512-lora-model-and-generalized-peft-framework) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) - - [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcer) + - [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) From aa517f3d6191d57eb680f305b48b4d9b03227074 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:02:51 -0700 Subject: [PATCH 19/62] Update README.md Update PEFT Framework methods with example script --- README.md | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index db6cc3e5c5..b9d9c49893 100755 --- a/README.md +++ b/README.md @@ -145,7 +145,8 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co * [5.12 LoRA Model and Generalized PEFT Framework](#512-lora-model-and-generalized-peft-framework) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) - - [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher) + + [5.12.1.2 PEFT Training with NeMo Megatron 
Launcher](#51212-peft-training-with-nemo-megatron-launcher) + - [5.12.1.2.1 Base Command Platform](#512121-base-command-platform) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) @@ -3785,6 +3786,72 @@ mix-n-match PEFT scheme like adapter_and_ptuning can be easily extended to do ia PTuning does not need to flexibility to insert prompt tokens anywhere in the input. This feature has been removed for simplicity. +##### 5.12.1.2.1 Base Command Platform + +In order to run the ptuning learning script on Base Command Platform, set the +`cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden +from the command line, using hydra. + +```bash +export HYDRA_FULL_ERROR=1 +export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO + +TRAIN="[/mount/workspace/databricks-dolly-15k-train.jsonl]" +VALID="[/mount/workspace/databricks-dolly-15k-val.jsonl]" +VALID_NAMES="[peft-squad]" +CONCAT_SAMPLING_PROBS="[1]" + +PEFT_SCHEME="ptuning" +PEFT_EXP_DIR="/results/nemo_launcher/ptuning" +LOG_DIR="/results/nemo_launcher/ptuning_log" + +TP_SIZE=2 + +PP_SIZE=1 + +python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py \ + peft=gpt3/squad \ + stages=[peft] \ + cluster_type=interactive \ + launcher_scripts_path=/opt/NeMo-Megatron-Launcher/launcher_scripts \ + peft.model.peft.peft_scheme=${PEFT_SCHEME} \ + peft.trainer.precision=bf16 \ + peft.trainer.max_steps=100 \ + peft.trainer.devices=2 \ + peft.trainer.val_check_interval=10 \ + peft.model.megatron_amp_O2=False \ + peft.model.restore_from_path=/mount/workspace/nemo_gpt1.3B_fp16.nemo \ + peft.model.tensor_model_parallel_size=${TP_SIZE} \ + peft.model.pipeline_model_parallel_size=${PP_SIZE} \ + peft.model.optim.lr=5e-6 \ + peft.model.answer_only_loss=True \ + peft.model.data.train_ds.file_names=${TRAIN} \ + peft.model.data.train_ds.micro_batch_size=1 \ + peft.model.data.train_ds.global_batch_size=32 \ + peft.model.data.train_ds.concat_sampling_probabilities=${CONCAT_SAMPLING_PROBS} \ + peft.model.data.validation_ds.micro_batch_size=1 \ + peft.model.data.validation_ds.global_batch_size=32 \ + peft.model.data.validation_ds.file_names=${VALID} \ + peft.model.data.validation_ds.names=${VALID_NAMES} \ + peft.model.data.test_ds.micro_batch_size=1 \ + peft.model.data.test_ds.global_batch_size=128 \ + peft.model.data.train_ds.num_workers=0 \ + peft.model.data.validation_ds.num_workers=0 \ + peft.model.data.test_ds.num_workers=0 \ + peft.model.data.validation_ds.metric.name=loss \ + peft.model.data.test_ds.metric.name=loss \ + peft.exp_manager.exp_dir=${PEFT_EXP_DIR} \ + peft.exp_manager.explicit_log_dir=${LOG_DIR} \ + peft.exp_manager.resume_if_exists=True \ + peft.exp_manager.resume_ignore_no_checkpoint=True \ + peft.exp_manager.create_checkpoint_callback=True \ + peft.exp_manager.checkpoint_callback_params.monitor=validation_loss +``` + +The command above assumes you mounted the data workspace in `/mount/workspace/` (e.g. the example script uses databricks-dolly-15k dataset), and the results workspace in `/mount/results`. The command needs set different peft.exp_manager.exp_dir for different PEFT jobs. +The stdout and stderr outputs will also be redirected to the `/results/nemo_launcher/ptuning_log`, to be able to download the logs from NGC. 
+Any other parameter can also be added to the command to modify its behavior. + ##### 5.12.2 PEFT Training and Inference for mT5/T5-style Models We offer training and inference scripts in NeMo for parameter efficient tuning of mT5/T5-style models. You can train a LoRA, P-tuning, Adapter, or IA3 model using its corresponding training and inference script. From e437c6c2edee1439ea8d0e165d095eafbe92c69f Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:07:24 -0700 Subject: [PATCH 20/62] Update README.md PEFT --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b9d9c49893..3ac121e110 100755 --- a/README.md +++ b/README.md @@ -3792,6 +3792,7 @@ In order to run the ptuning learning script on Base Command Platform, set the `cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden from the command line, using hydra. +To run the ptuning pipeline to nemo-megatron-gpt-1.3B model converted checkpoint, run: ```bash export HYDRA_FULL_ERROR=1 export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO From 7034b27fedf3bdc07f9ef4e2e24af06087dfbfad Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:18:03 -0700 Subject: [PATCH 21/62] Update README.md Update PEFT --- README.md | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3ac121e110..a5ada22cf6 100755 --- a/README.md +++ b/README.md @@ -146,7 +146,8 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) + [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher) - - [5.12.1.2.1 Base Command Platform](#512121-base-command-platform) + - [5.12.1.2.1 Slurm](#512121-slurm) + - [5.12.1.2.2 Base Command Platform](#512122-base-command-platform) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) @@ -3786,7 +3787,44 @@ mix-n-match PEFT scheme like adapter_and_ptuning can be easily extended to do ia PTuning does not need to flexibility to insert prompt tokens anywhere in the input. This feature has been removed for simplicity. 
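The mix-and-match schemes mentioned above are selected the same way as the single schemes; a minimal invocation, assuming the `gpt3/squad` PEFT config added earlier and `adapter_and_ptuning` (listed as an available option in that config's commit message), might look like:

```bash
python3 main.py \
    peft=gpt3/squad \
    stages=[peft] \
    peft.model.peft.peft_scheme=adapter_and_ptuning \
    peft.model.restore_from_path=${LANGUAGE_MODEL_PATH}
```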
-##### 5.12.1.2.1 Base Command Platform +##### 5.12.1.2.1 Slurm + + +Set configuration for a Slurm cluster in the `conf/cluster/bcm.yaml` file: + +```yaml +partition: null +account: null +exclusive: True +gpus_per_task: null +gpus_per_node: 8 +mem: 0 +overcommit: False +job_name_prefix: "nemo-megatron-" +``` + +**Example:** + +To run only the evaluation pipeline and not the data preparation, training, +conversion or inference pipelines set the `conf/config.yaml` file to: + +```yaml +stages: + - peft +``` + +then run: +``` +python3 main.py \ + peft=gpt3/squad \ + stages=["peft"] \ + peft.model.peft.peft_scheme="ptuning" \ + peft.model.megatron_amp_O2=False \ + peft.model.restore_from_path=${LANGUAGE_MODEL_PATH}\ + peft.exp_manager.exp_dir=${BASE_RESULTS_DIR}/${RUN_NAME}/ptuning \ + +``` +##### 5.12.1.2.2 Base Command Platform In order to run the ptuning learning script on Base Command Platform, set the `cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden @@ -3849,7 +3887,7 @@ python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py \ peft.exp_manager.checkpoint_callback_params.monitor=validation_loss ``` -The command above assumes you mounted the data workspace in `/mount/workspace/` (e.g. the example script uses databricks-dolly-15k dataset), and the results workspace in `/mount/results`. The command needs set different peft.exp_manager.exp_dir for different PEFT jobs. +The command above assumes you mounted the data workspace in `/mount/workspace/` (e.g. the example script uses databricks-dolly-15k dataset), and the results workspace in `/results`. The command needs set different peft.exp_manager.exp_dir for different PEFT jobs. The stdout and stderr outputs will also be redirected to the `/results/nemo_launcher/ptuning_log`, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. From 720ad921b86870f956676781c3c239d5988f40f4 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:32:32 -0700 Subject: [PATCH 22/62] Update README.md Update Launcher with PEFT Framework and methods --- README.md | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a5ada22cf6..49a3b934fa 100755 --- a/README.md +++ b/README.md @@ -3787,7 +3787,31 @@ mix-n-match PEFT scheme like adapter_and_ptuning can be easily extended to do ia PTuning does not need to flexibility to insert prompt tokens anywhere in the input. This feature has been removed for simplicity. -##### 5.12.1.2.1 Slurm +##### 5.12.1.2.1. 
Common + +To specify the configuration for ptuning (LoRA, adapter or IA3 learning), +use all the `run` parameters to define the job specific config: +```yaml +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: gpt3_1.3B + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/ptuning_${.task_name} +``` + +To specify which language model checkpoint to load and its definition, use the `model` parameter: + +```yaml +model: + language_model_path: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name}/nemo_gpt1.3B_fp16.nemo + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 +``` + +##### 5.12.1.2.2 Slurm Set configuration for a Slurm cluster in the `conf/cluster/bcm.yaml` file: @@ -3824,7 +3848,7 @@ python3 main.py \ peft.exp_manager.exp_dir=${BASE_RESULTS_DIR}/${RUN_NAME}/ptuning \ ``` -##### 5.12.1.2.2 Base Command Platform +##### 5.12.1.2.3 Base Command Platform In order to run the ptuning learning script on Base Command Platform, set the `cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden From 18b76ed49b3aa2cbdbf79797acbb8fce54c1941e Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:34:52 -0700 Subject: [PATCH 23/62] Update README.md Update Launcher with PEFT Framework and methods --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 49a3b934fa..485a8fba30 100755 --- a/README.md +++ b/README.md @@ -146,8 +146,9 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) + [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher) - - [5.12.1.2.1 Slurm](#512121-slurm) - - [5.12.1.2.2 Base Command Platform](#512122-base-command-platform) + - [5.12.1.2.1 Common](#512121-common) + - [5.12.1.2.2 Slurm](#512122-slurm) + - [5.12.1.2.3 Base Command Platform](#512123-base-command-platform) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) From f9810f0a034d2e4082a06840d5099d22893b4b14 Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Fri, 19 May 2023 14:24:13 -0500 Subject: [PATCH 24/62] Add support for training GPT models on Kubernetes Adding kubernetes support for preparing datasets and training GPT-based foundation models with NeMo Framework as well as conversion to the .nemo format and model evaluation. The kubernetes support creates a Helm chart based off the cluster config settings and each task is launched as a distributed job via Helm. Currently, kubernetes support assumes the following: * Recent versions of the GPU, Network, and Kubeflow Operators are installed. * InfiniBand adapters are labeled as node resources if running multi-node jobs. * The launcher is run from a controller node with access to `kubectl` and `helm` and can launch jobs on the cluster. * The controller node has the ability to install various Python dependencies, including Hydra. 
* All data including the launcher scripts and results/checkpoints will be stored on an NFS filer attached to all nodes. A new k8s cluster setting and config file have been included to allow jobs to run on specific kubernetes cluster. Signed-Off-By: Robert Clark --- Dockerfile | 7 + README.md | 199 ++++++++++++++- launcher_scripts/conf/cluster/k8s.yaml | 6 + launcher_scripts/conf/config.yaml | 4 +- .../gpt3/download_gpt3_pile.yaml | 2 +- .../pile_dataprep/download.py | 2 +- .../dataprep_scripts/pile_dataprep/extract.py | 2 +- .../pile_dataprep/preprocess.py | 2 +- .../nemo_launcher/core/data_stages.py | 94 ++++++- .../nemo_launcher/core/export_stages.py | 2 +- .../core/k8s_templates/conversion/Chart.yaml | 5 + .../k8s_templates/conversion/conversion.yaml | 48 ++++ .../core/k8s_templates/conversion/values.yaml | 40 +++ .../k8s_templates/data_preparation/Chart.yaml | 5 + .../data_preparation/data-prep-config.yaml | 7 + .../data_preparation/data-prep.yaml | 59 +++++ .../data_preparation/values.yaml | 27 ++ .../core/k8s_templates/evaluation/Chart.yaml | 5 + .../evaluation/evaluation-config.yaml | 7 + .../k8s_templates/evaluation/evaluation.yaml | 53 ++++ .../core/k8s_templates/evaluation/values.yaml | 73 ++++++ .../core/k8s_templates/training/Chart.yaml | 5 + .../training/training-config.yaml | 7 + .../core/k8s_templates/training/training.yaml | 71 ++++++ .../core/k8s_templates/training/values.yaml | 28 +++ .../nemo_launcher/core/launchers.py | 68 ++++++ launcher_scripts/nemo_launcher/core/stages.py | 230 +++++++++++++++++- 27 files changed, 1036 insertions(+), 22 deletions(-) create mode 100644 launcher_scripts/conf/cluster/k8s.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml diff --git a/Dockerfile b/Dockerfile index e0144fa03d..b250d0dbe0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -68,6 +68,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libsndfile1 \ sox \ swig \ + openssh-server \ libb64-dev && \ rm -rf /var/lib/apt/lists/* @@ -179,6 +180,12 @@ RUN pip install --no-cache-dir wandb==0.15.3 \ # Copy FasterTransformer COPY --from=ft_builder /workspace/FasterTransformer FasterTransformer +# Setup SSH 
config to allow mpi-operator to communicate with containers in k8s +RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ + sed -i 's/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/' /etc/ssh/ssh_config && \ + mkdir -p /var/run/sshd + # Examples WORKDIR /workspace #COPY any user-facing example scripts should go in here diff --git a/README.md b/README.md index 3df4288d54..dbe58feb64 100755 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [4.1.1. Common](#411-common) + [4.1.2. OCI](#412-oci) + [4.1.3. AWS](#413-aws) + + [4.1.4. Kubernetes](#414-k8s) * [4.2. Cluster Validation](#42-cluster-validation) + [4.2.1. Validation Script Usage](#421-validation-script-usage) + [4.2.2 Running tests manually](#422-running-tests-manually) @@ -32,12 +33,14 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.1.1. Prepare Environment](#511-prepare-environment) - [5.1.1.1. Slurm](#5111-slurm) - [5.1.1.2. Base Command Platform](#5112-base-command-platform) - - [5.1.1.3. General Configuration](#5113-general-configuration) + - [5.1.1.3. Kubernetes](#5113-kubernetes) + - [5.1.1.4. General Configuration](#5114-general-configuration) + [5.1.2. Data Preparation](#512-data-preparation) - [5.1.2.1. Data Preparation for GPT Models](#5121-data-preparation-for-gpt-models) * [5.1.2.1.1. Slurm](#51211-slurm) * [5.1.2.1.2. Base Command Platform](#51212-base-command-platform) - * [5.1.2.1.3. Common](#51213-common) + * [5.1.2.1.3. Kubernetes](#51213-kubernetes) + * [5.1.2.1.4. Common](#51214-common) - [5.1.2.2. Data Preparation for T5 Models](#5122-data-preparation-for-t5-models) * [5.1.2.2.1. Slurm](#51221-slurm) * [5.1.2.2.2. Base Command Platform](#51222-base-command-platform) @@ -85,6 +88,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.6.1. GPT Training](#561-gpt-training) - [5.6.1.1. Slurm](#5611-slurm) - [5.6.1.2. Base Command Platform](#5612-base-command-platform) + - [5.6.1.3. Kubernetes](#5613-base-command-platform) + [5.6.2. T5 Training](#562-t5-training) - [5.6.2.1. Slurm](#5621-slurm) - [5.6.2.2. Base Command Platform](#5622-base-command-platform) @@ -100,6 +104,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.8.1.1. Common](#5811-common) - [5.8.1.2. Slurm](#5812-slurm) - [5.8.1.3. Base Command Platform](#5813-base-command-platform) + - [5.8.1.4. Kubernetes](#5814-kubernetes) + [5.8.2. T5 Conversion](#582-t5-conversion) - [5.8.2.1. Common](#5821-common) - [5.8.2.2. Slurm](#5822-slurm) @@ -152,7 +157,8 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.13.1.1. Common](#51311-common) - [5.13.1.2. Slurm](#51312-slurm) - [5.13.1.3. Base Command Platform](#51313-base-command-platform) - - [5.13.1.4 Interleaved Pipeline Parallelism](#51314-interleaved-pipeline-parallelism) + - [5.13.1.4. Kubernetes](#51314-kubernetes) + - [5.13.1.5 Interleaved Pipeline Parallelism](#51314-interleaved-pipeline-parallelism) + [5.13.2. T5 Evaluation](#5132-t5-evaluation) - [5.13.2.1. Common](#51321-common) - [5.13.2.2. Slurm](#51322-slurm) @@ -371,6 +377,11 @@ Figure 1: The GPT family architecture. 
The 5B variant includes 24 transformer la | HPC-X | 2.13 | | Base Command Manager | 1.0.0 | | DeepOps | 21.06 | +| Kubernetes | 1.27.4 | +| Helm | 3.12.1 | +| GPU Operator | 23.3.2 | +| Network Operator | 23.1.0 | +| KubeFlow Operator | 1.6.0 | ## 4. Cloud Service Providers @@ -421,6 +432,23 @@ On the scheduler node: container: /path/to/nemo_megatron_launcher/nemo_megatron_training.sqsh ``` +#### 4.1.4. Kubernetes + +Data preparation and training GPT models is currently supported on vanilla kubernetes (k8s) clusters. +The launcher scripts will generate a Helm chart for each task based on the config files and launch the job using the chart. + +The following is required for running jobs on Kubernetes: + * One or more DGX A100s/H100s as worker nodes + * An NFS filesystem where the data and launcher scripts will be stored which is accessible on all worker and controller nodes + * A head/controller node which has access to the worker nodes and can run `kubectl` and `helm` to launch jobs and can install Python dependencies + * Recent versions of the GPU, Network, and KubeFlow Operators installed + +A secret key needs to be configured to allow kubernetes to pull from the private registry. For example, if pulling the container directly +from NGC, a secret needs to be created to authenticate with the private NGC registry, such as the following: +``` +kubectl create secret docker-registry ngc-registry --docker-server=nvcr.io --docker-username=\$oauthtoken --docker-password= +``` + ### 4.2. Cluster Validation @@ -604,7 +632,22 @@ creating these workspaces (e.g. `nemo_megatron_data_ws` and `nemo_megatron_resul the Base Command Platform User Guide for how to create and work with Base Command Platform workspaces. -##### 5.1.1.3. General Configuration +##### 5.1.1.3. Kubernetes + + +The launcher scripts need to be downloaded to the NFS filesystem that is +connected to the worker nodes. This can either be copied at +`/opt/NeMo-Megatron-Launcher` from inside the training container or by cloning +this repository. + +Install the NeMo Framework scripts dependencies on the head node/controller of +the cluster where jobs will be launched: + +``` +pip install -r requirements.txt +``` + +##### 5.1.1.4. General Configuration The first parameter that must be set is the `launcher_scripts_path` parameter inside the @@ -852,8 +895,36 @@ The command above assumes you want to prepare the entire dataset (files 0-29), a workspace in `/mount/data`, and the results workspace in `/mount/results`. Stdout and stderr are redirected to the `/results/data_gpt3_log.txt` file, so it can be downloaded from NGC. Any other parameter can also be added to the command to modify its behavior. -###### 5.1.2.1.3. Common - +###### 5.1.2.1.3. Kubernetes + + +To run data preparation on a kubernetes cluster, set both the `cluster` and +`cluster_type` parameters to `k8s` in `conf/config.yaml`. Additionally, set the +`launcher_scripts_path` parameter to the location where the launcher scripts +are located on the NFS filesystem. This must be the same path on all nodes in +the cluster. Ensure the `stages` parameter is set to `data_preparation` and +`data_preparation` in the `defaults` section points to the intended data +preparation script. + +The `conf/config/k8s.yaml` file also needs to be updated with the +kubernetes container registry secret if created earlier (`pull_secret`), the +`shm_size` to determine how much local memory to put in each pod, and the NFS +server and path to where the launcher scripts are saved. 
These can all be +overridden from the command line using hydra as well. + +Once all of the config files are updated, the data preparation can be launched +from the controller node with: + +``` +python main.py +``` + +This will generate and launch a job via Helm in the default namespace which +can be viewed with `helm show` or `kubectl get pods`. The logs can be followed +with `kubectl logs `. + +###### 5.1.2.1.4. Common + Set the configuration for the data preparation job for GPT models in the YAML file: ```yaml @@ -2462,6 +2533,89 @@ Select the cluster related configuration following the NGC documentation. Then, use the `python3 main.py` command to launch the job and override the desired parameters from the training job parameters. +##### 5.6.1.3. Kubernetes + + +Set configuration for your Kubernetes cluster in the `conf/cluster/k8s.yaml` file: + +```yaml +pull_secret: null +shm_size: 512Gi +nfs_server: null +nfs_path: null +ib_resource_name: "nvidia.com/hostdev" +ib_count: "8" +``` + +The settings are as follows: + * `pull_secret`: The name of the sercret key created with `kubectl` that will + be used to authenticate with private registries for pulling the training + container. + * `shm_size`: The amount of shared memory to include in the Pods. It is + recommended to use a large value here. + * `nfs_server`: The IP address or hostname of the NFS server that the worker + nodes will read and write data to/from. + * `nfs_path`: The absolute path on the NFS server that should be mounted + inside the Pods. + * `ib_resource_name`: The name of the IB interconnect to attach to Pods for + multi-node training. This is the name that Kubernetes assigns to the NICs as + allocatable resources. + * `ib_count`: The number of IB interconnects to include per node in each pod. + This will likely equal the total number of active/usable compute NICs per + node. + +And set the training job specific parameters in the `conf/training/(model_type)/(model_size).yaml` file, +using the run section: +```yaml +run: + name: gpt3_126m + results_dir: ${base_results_dir}/${.name} + time_limit: "1-12:00:00" + dependency: "singleton" +``` + +To run only the training pipeline and not the data preparation, evaluation or +inference pipelines, set the `conf/config.yaml` file to: + +```yaml +stages: + - training +``` + +Also set the `cluster` and `cluster_type` values to `k8s` in the +`conf/config.yaml` file. + +And then run: +``` +python3 main.py +``` + +Once the launcher is run, it will display the path to the Helm chart that was +generated based on the updated config files. The Helm chart will be located in +the job results directory by default. The chart will be run automatically and +Pods will be started by Kubernetes once resources become available. The status +of the Helm chart can be checked with: + +``` +$ helm list +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gpt-7b-improved default 1 2023-07-17 14:10:11.794541205 -0700 PDT deployed nemo-framework-training-1.0.0 1.0 +``` + +Once allocated, this will spin up N pods for N number of nodes requested. To +view training progress follow the log of the first pod, typically named +`nlp-training-worker-0`. + +Once a job is finished, it will be marked as complete via Helm and can be +uninstalled with (note - replace `` with the name of the Helm chart +as shown in the previous example): + +``` +$ helm uninstall +``` + +The uninstallation will not affect the completed job - it will only mark the +resources as free for Kubernetes to use them for future tasks. 
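+
+For reference, a minimal monitoring sequence could look like the following
+(the worker pod names assume the default `nlp-training` job created by the
+chart; replace `<chart_name>` with the release name reported by `helm list`):
+
+```
+kubectl get pods                       # list the training worker pods
+kubectl logs -f nlp-training-worker-0  # follow the log of the first worker
+helm status <chart_name>               # check the overall status of the release
+```
+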
#### 5.6.2. T5 Training @@ -2749,6 +2903,22 @@ The command above assumes you mounted the data workspace in `/mount/data`, and t The stdout and stderr outputs will also be redirected to the `/results/convert_gpt3_log.txt` file, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. +##### 5.8.1.4. Kubernetes + +To convert a model to the `.nemo` format on a Kubernetes cluster, set both the +`cluster` and `cluster_type` parameters to `k8s` in `conf/config.yaml`. Update +the `conf/conversion/gpt3/convert_gpt3.yaml` config file to point to the model +you would like to convert. + +Once the configs are ready, run: + +``` +python3 main.py +``` + +This will launch a Helm chart that will spawn a job that runs on one of the +compute nodes to convert the requested model to the `.nemo` format. + #### 5.8.2. T5 Conversion @@ -3928,7 +4098,22 @@ The command above assumes you mounted the data workspace in `/mount/data`, and t The stdout and stderr outputs will also be redirected to the `/results/eval_gpt3_log.txt` file, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. -##### 5.13.1.4 Interleaved Pipeline Parallelism +##### 5.13.1.4. Kubernetes + +To evaluate base models on Kubernetes clusters, set the `cluster` and +`cluster_type` parameters to `k8s` in `conf/config.yaml`. Update either the +`conf/evaluation/gpt3/evaluate_all.yaml` or `conf/evaluation/gpt3/evaluate_lambada.yaml` +file based on your cluster and desired evaluation tasks. Once the configurations +are updated, launch an evaluation job with: + +``` +python3 main.py +``` + +This will launch a Helm chart based on the evaluation configurations which will +download all task files and run evaluation against the specified model. + +##### 5.13.1.5 Interleaved Pipeline Parallelism If your model was trained with interleaved pipeline parallelism, then the model must converted to a non-interleaved model. In order to check if your model used interleaved, inspect the training config and verify that diff --git a/launcher_scripts/conf/cluster/k8s.yaml b/launcher_scripts/conf/cluster/k8s.yaml new file mode 100644 index 0000000000..d609fb3901 --- /dev/null +++ b/launcher_scripts/conf/cluster/k8s.yaml @@ -0,0 +1,6 @@ +pull_secret: null # Kubernetes secret for the container registry to pull private containers. +shm_size: 512Gi # Amount of system memory to allocate in Pods. Should end in "Gi" for gigabytes. +nfs_server: null # Hostname or IP address for the NFS server where data is stored. +nfs_path: null # Path to store data in the NFS server. +ib_resource_name: "nvidia.com/hostdev" # Specify the resource name for IB devices according to kubernetes, such as "nvidia.com/hostdev" for Mellanox IB adapters. +ib_count: "8" # Specify the number of IB devices to include per node in each pod. diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index e5e1da9e0a..6d9fd9356d 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -1,6 +1,6 @@ defaults: - _self_ - - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. + - cluster: bcm # Set to bcm for BCM and BCP clusters. Set to k8s for a k8s cluster. - data_preparation: gpt3/download_gpt3_pile - training: gpt3/5b - conversion: gpt3/convert_gpt3 @@ -25,7 +25,7 @@ stages: - evaluation - export -cluster_type: bcm # bcm or bcp. If bcm, it must match - cluster above. +cluster_type: bcm # bcm, bcp, or k8s. 
If bcm or k8s, it must match - cluster above. launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts data_dir: ${launcher_scripts_path}/data # Location to store and read the data. base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. diff --git a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml index 632ccdadd2..ab6614480a 100755 --- a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml +++ b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml @@ -9,7 +9,7 @@ run: dataset: pile download_the_pile: True # Whether to download the pile dataset from the internet. -the_pile_url: "https://mystic.the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. +the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py index 917961a51a..80831b3960 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py @@ -35,7 +35,7 @@ def main(cfg): url = f"{pile_url_train}{file_number:02d}.jsonl.zst" output_file = f"{file_number:02d}.jsonl.zst" downloaded_path = utils.download_single_file(url, data_dir, output_file) - if cfg.get("cluster_type") == "bcp": + if cfg.get("cluster_type") in ["bcp", "k8s"]: file_numbers = cfg["file_numbers"] # Downloading the files files_list = utils.convert_file_numbers(file_numbers) diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py index 5093543528..16fef5ef28 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py @@ -35,7 +35,7 @@ def main(cfg) -> None: downloaded_path = os.path.join(data_dir, f"{file_number:02d}.jsonl.zst") output_file = f"{file_number:02d}.jsonl" utils.extract_single_zst_file(downloaded_path, data_dir, output_file, rm_downloaded) - elif cfg.get("cluster_type") == "bcp": + elif cfg.get("cluster_type") in ["bcp", "k8s"]: file_numbers = cfg.get("file_numbers") # Downloading the files files_list = utils.convert_file_numbers(file_numbers) diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py index 61a9e36560..2117a27d5c 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py @@ -91,7 +91,7 @@ def main(cfg): os.system(runcmd) if rm_extracted: os.remove(extracted_path) - elif cfg.get("cluster_type") == "bcp": + elif cfg.get("cluster_type") in ["bcp", 
"k8s"]: file_numbers = cfg.get("file_numbers") files_list = utils.convert_file_numbers(file_numbers) # Assumes launched via mpirun: diff --git a/launcher_scripts/nemo_launcher/core/data_stages.py b/launcher_scripts/nemo_launcher/core/data_stages.py index c3713786e5..b33ece406b 100755 --- a/launcher_scripts/nemo_launcher/core/data_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_stages.py @@ -16,11 +16,13 @@ import os from pathlib import Path from typing import Dict, List, Optional - import omegaconf +import shutil + from nemo_launcher.core.launchers import AutoLauncher from nemo_launcher.core.stages import NemoMegatronStage, clean_command_groups, create_args_list from nemo_launcher.utils.file_utils import download_single_file +from nemo_launcher.utils.job_utils import JobPaths class DataStage(NemoMegatronStage): @@ -55,7 +57,7 @@ def run(self) -> str: job_path = self.get_job_path(sub_stage) job_path.folder.mkdir(parents=True, exist_ok=True) - stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg) if job_id: dependency = f"aftercorr:{job_id}" self.stage_cfg["run"]["dependency"] = dependency @@ -65,9 +67,24 @@ def run(self) -> str: # Make command groups command_groups = self.make_stage_command_groups(stage_cfg_path, sub_stage) + + # Prepare Helm chart for k8s + if self.cluster == 'k8s': + template_root = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'k8s_templates/data_preparation') + self._make_k8s_helm_chart(template_root, cluster_parameters, job_path, sub_stage) + # Create launcher launcher = AutoLauncher(folder=job_path.folder, cluster=self.cluster, **cluster_parameters,) - job_id = launcher.launch(command_groups=command_groups) + + if self.cluster == 'k8s': + # For k8s clusters, only launch on the final stage (preprocess) as + # the Helm chart contains all stages in a single chart. + if sub_stage == sub_stages[-1]: + job_id = launcher.launch(command_groups=command_groups) + else: + job_id = '' + else: + job_id = launcher.launch(command_groups=command_groups) return job_id @@ -97,11 +114,11 @@ def _make_private_cluster_parameters(self, cluster, sub_stage): def _make_cluster_parameters(self, cluster: str, sub_stage: Optional = None,) -> Dict: """ Make a cluster-specific parameters for jobs on different clusters. - Current clusters include bcm(slurm), bcp and interactive. + Current clusters include bcm(slurm), bcp, k8s, and interactive. For example for bcm, it will return slurm parameters: {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...} - :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc. + :param str cluster: i.e. `bcm`, `bcp`, `interactive`, `k8s`, etc. :param Optional sub_stage: current sub_stage name :return: a dictionary of cluster parameters, e.g. 
`ntasks_per_node` :rtype: Dict @@ -142,11 +159,78 @@ def _make_cluster_parameters(self, cluster: str, sub_stage: Optional = None,) -> cluster_parameters.update( {**shared_parameters, **private_parameters,} ) + elif cluster == "k8s": + cluster_cfg = cfg.get("cluster") + container_image = cfg.get("container") + k8s_cfg = {**copy.deepcopy(cluster_cfg)} + + cluster_parameters = {**k8s_cfg} + + cluster_parameters.update( + { + **shared_parameters, + **private_parameters, + "container_image": container_image,} + ) elif cluster == "interactive": raise ValueError("Data preparation is not supported in interactive mode.") return cluster_parameters + def _make_k8s_helm_chart(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths, sub_stage: str): + """ + Create a Helm chart for data preparation. + The Helm chart uses a base template which is extended with user-defined + cluster settings as specified in the config files. The generated Hydra + config file needs to be copied to the Helm chart as this will be used + for launching the job. + + :param str template_root: the path to where the k8s template files are located. + :param dict cluster_parameters: additional parameters specific to the cluster config. + :param JobPaths job_path: the path to the job results directory. + :param str sub_stage: the current stage. + """ + with open(os.path.join(template_root, 'values.yaml')) as value_file: + values_template = omegaconf.OmegaConf.load(value_file) + + procs_per_node = self.stage_cfg.run.bcp_preproc_npernode if sub_stage == "preprocess" else 1 + total_processes = procs_per_node * self.stage_cfg.run.node_array_size + + # Update the Helm chart template with the user-specified settings + values_template.image.trainingImage = cluster_parameters['container_image'] + values_template.image.pullSecret = cluster_parameters['pull_secret'] + values_template.image.nodes = self.stage_cfg.run.node_array_size + values_template.dataPrepConfig.shmSize = cluster_parameters['shm_size'] + values_template.dataPrepConfig.NFSServer = cluster_parameters['nfs_server'] + values_template.dataPrepConfig.NFSPath = cluster_parameters['nfs_path'] + values_template.dataPrepConfig.totalProcesses = total_processes + values_template.dataPrepConfig.procsPerNode = procs_per_node + values_template.dataPrepConfig.stage = sub_stage + + k8s_template_path = job_path.folder + k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml') + k8s_template_file.parent.mkdir(parents=True, exist_ok=True) + + conf = omegaconf.OmegaConf.create(values_template) + omegaconf.OmegaConf.save(conf, k8s_template_file) + + # Copy the data prep spec files to the Helm chart + template_file = os.path.join(template_root, 'data-prep.yaml') + chart_file = os.path.join(template_root, 'Chart.yaml') + data_prep_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'data-prep.yaml') + data_prep_path.parent.mkdir(parents=True, exist_ok=True) + config_path = Path(job_path.folder / 'k8s_template' / 'config') + config_path.mkdir(parents=True, exist_ok=True) + chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml') + data_prep_config_file = os.path.join(template_root, 'data-prep-config.yaml') + data_prep_config_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'data-prep-config.yaml') + hydra_config_path = Path(job_path.folder / 'k8s_template' / 'config') + + shutil.copy2(template_file, data_prep_path) + shutil.copy2(chart_file, chart_path) + shutil.copy2(data_prep_config_file, data_prep_config_path) + 
shutil.copy2(job_path.config_file, hydra_config_path) + class PileDataPreparation(DataStage): """DataStage for preparing the Pile dataset for gpt3 and t5""" diff --git a/launcher_scripts/nemo_launcher/core/export_stages.py b/launcher_scripts/nemo_launcher/core/export_stages.py index dea2296db6..171e7d2c29 100755 --- a/launcher_scripts/nemo_launcher/core/export_stages.py +++ b/launcher_scripts/nemo_launcher/core/export_stages.py @@ -108,7 +108,7 @@ def run(self) -> str: job_path = self.get_job_path(sub_stage) job_path.folder.mkdir(parents=True, exist_ok=True) - stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg) if job_id: dependency = f"aftercorr:{job_id}" self.stage_cfg["run"]["dependency"] = dependency diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml new file mode 100644 index 0000000000..bbf3651743 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: NeMo Framework Base Model Conversion +name: nemo-framework-conversion +version: 1.0.0 diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml new file mode 100644 index 0000000000..214e14df69 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml @@ -0,0 +1,48 @@ +{{ $config := .Values.trainingConfig }} + +apiVersion: batch/v1 +kind: Job +metadata: + name: nlp-conversion + labels: + app: nlp-conversion +spec: + template: + spec: + containers: + - name: nlp-conversion + image: {{ .Values.image.trainingImage }} + env: + - name: NCCL_AVOID_RECORD_STREAMS + value: "1" + command: ["/bin/bash", "-c"] + args: + - 'export CKPT_NAME=$(python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/checkpoint_search.py checkpoint_folder={{ $config.trainingDirectory }}/results/checkpoints checkpoint_name=latest tensor_model_parallel_size=1 pipeline_model_parallel_size=1) && + echo ${CKPT_NAME} && + python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/hparams_override.py hparams_file={{ $config.trainingDirectory }}/results/hparams.yaml output_path={{ $config.resultsDirectory }}/results vocab_file={{ $config.vocabPath }} merge_file={{ $config.mergesPath }} tokenizer_model=None && + python3 /opt/NeMo/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py --gpus_per_node=1 --model_type=gpt --checkpoint_folder={{ $config.trainingDirectory }}/results/checkpoints --checkpoint_name=${CKPT_NAME} --hparams_file={{ $config.resultsDirectory }}/results/hparams_override.yaml --nemo_file_path={{ $config.resultsDirectory }}/megatron_gpt.nemo --tensor_model_parallel_size={{ $config.tensorParallelism }} --pipeline_model_parallel_size={{ $config.pipelineParallelism }}' + imagePullPolicy: Always + resources: + requests: + nvidia.com/gpu: {{ .Values.image.gpuNum }} + limits: + nvidia.com/gpu: {{ .Values.image.gpuNum }} + volumeMounts: + - mountPath: {{ $config.NFSPath }} + name: workspace + - mountPath: /dev/shm + name: dshm + restartPolicy: Never + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + + volumes: + - name: workspace + nfs: + server: {{ $config.NFSServer }} + path: {{ $config.NFSPath }} + + - name: dshm + emptyDir: + medium: Memory + 
sizeLimit: {{ $config.shmSize }} diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml new file mode 100644 index 0000000000..21df8fd095 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml @@ -0,0 +1,40 @@ +image: + trainingImage: cfg.container + pullPolicy: IfNotPresent + + # Insert the name of your container registry pull secret # + pullSecret: nvcr.io + + # Insert number of GPUs # + gpuNum: + +trainingConfig: + # Specify the amount of shared memory to attach to the Pods # + shmSize: 512Gi + + # Insert the address for the NFS server if using NFS for model storage # + NFSServer: + + # Insert the path to save data on the NFS server # + NFSPath: + + # Insert the path to the vocab file # + vocabPath: + + # Insert the path to the merges file # + mergesPath: + + # Insert the path to the results directory # + resultsDirectory: + + # Insert the path to the training directory # + trainingDirectory: + + # Insert the path to the launcher_scripts directory # + launcherScriptsPath: + + # Insert the TP size # + tensorParallelism: + + # Insert the PP size # + pipelineParallelism: diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml new file mode 100644 index 0000000000..d2337c69ac --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: NeMo Framework Data Preparation +name: nemo-framework-data-prep +version: 1.0.0 diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml new file mode 100644 index 0000000000..338acfb9a5 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: data-prep-config +data: + config.yaml: |- + {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml new file mode 100644 index 0000000000..8ab7a76207 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml @@ -0,0 +1,59 @@ +{{ $config := .Values.dataPrepConfig }} + +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: nlp-data-prep + labels: + app: nlp-data-prep +spec: + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - name: nlp-data-prep + image: {{ .Values.image.trainingImage }} + command: ["bash", "-c"] + args: + - '{{- range tuple "download" "extract" "preprocess" }} mpirun --allow-run-as-root -np {{ $config.totalProcesses }} -npernode {{ $config.procsPerNode }} -bind-to none -map-by slot --oversubscribe -x PYTHONPATH -mca pml ob1 -mca btl ^openib python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/{{ . 
}}.py --config-path=/config --config-name=config.yaml && {{- end}} echo Data preparation complete' + imagePullPolicy: Always + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + Worker: + replicas: {{ .Values.image.nodes }} + template: + spec: + containers: + - name: nlp-data-prep + image: {{ .Values.image.trainingImage }} + command: ["/usr/sbin/sshd"] + args: + - "-De" + volumeMounts: + - mountPath: {{ $config.NFSPath }} + name: workspace + - mountPath: /dev/shm + name: dshm + - mountPath: /config + name: data-prep-config + imagePullPolicy: Always + restartPolicy: Never + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + + volumes: + - name: workspace + nfs: + server: {{ $config.NFSServer }} + path: {{ $config.NFSPath }} + + - name: dshm + emptyDir: + medium: Memory + sizeLimit: {{ $config.shmSize }} + + - configMap: + name: data-prep-config + name: data-prep-config diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml new file mode 100644 index 0000000000..e5a8bc7987 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml @@ -0,0 +1,27 @@ +image: + trainingImage: cfg.container + pullPolicy: IfNotPresent + + # Insert the name of your container registry pull secret # + pullSecret: nvcr.io + + nodes: training.trainer.num_nodes + +dataPrepConfig: + # Specify the amount of shared memory to attach to the Pods # + shmSize: 512Gi + + # Insert the address for the NFS server if using NFS for model storage # + NFSServer: + + # Insert the path to save data on the NFS server # + NFSPath: + + # Insert the total number of processes to spawn on the cluster # + totalProcesses: + + # Insert the number of processes to spawn per node # + procsPerNode: + + # Insert the data preparation stage, such as download, extract, or preprocess # + stage: diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml new file mode 100644 index 0000000000..4c291917f1 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: NeMo Framework Evaluation +name: nemo-framework-evaluation +version: 1.0.0 diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml new file mode 100644 index 0000000000..080bbcc6b3 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evaluation-config +data: + hparams.yaml: |- + {{ (.Files.Glob "config/hparams.yaml").AsConfig | indent 4 }} diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml new file mode 100644 index 0000000000..7278d1385e --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml @@ -0,0 +1,53 @@ +{{ $config := .Values.trainingConfig }} + +apiVersion: batch/v1 +kind: Job +metadata: + name: nlp-evaluation + labels: + app: nlp-evaluation +spec: + template: + spec: + containers: + - name: nlp-evaluation + image: {{ .Values.image.trainingImage }} + env: + - name: NCCL_AVOID_RECORD_STREAMS + value: "1" + command: 
["/bin/bash", "-c"] + args: + - 'python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/eval_harness/download.py --tasks=all_tasks --cache-dir={{ $config.cacheDir }} && + mkdir -p {{ $config.outputPath }} && + python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/eval_harness/evaluate.py --name={{ $config.name }} --model={{ $config.model }} --tasks={{ $config.tasks }} --cache_dir={{ $config.cacheDir }} --output_path={{ $config.outputPath }} --batch_size={{ $config.batchSize }} --tensor_model_parallel_size={{ $config.tensorParallelism }} --pipeline_model_parallel_size={{ $config.pipelineParallelism }} --precision={{ $config.precision }} --vocab_file={{ $config.vocabPath }} --merge_file={{ $config.mergesPath }} {{- if $config.nemoModel }} --nemo_model={{ $config.nemoModel }}{{ end }} --checkpoint_folder={{ $config.checkpointFolder }} --checkpoint_name={{ $config.checkpointName }} --hparams_file=/config/hparams.yaml' + imagePullPolicy: Always + resources: + requests: + nvidia.com/gpu: {{ .Values.image.gpuNum }} + limits: + nvidia.com/gpu: {{ .Values.image.gpuNum }} + volumeMounts: + - mountPath: {{ $config.NFSPath }} + name: workspace + - mountPath: /dev/shm + name: dshm + - mountPath: /config + name: evaluation-config + restartPolicy: Never + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + + volumes: + - name: workspace + nfs: + server: {{ $config.NFSServer }} + path: {{ $config.NFSPath }} + + - name: dshm + emptyDir: + medium: Memory + sizeLimit: {{ $config.shmSize }} + + - configMap: + name: evaluation-config + name: evaluation-config diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml new file mode 100644 index 0000000000..0fcfe4c835 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml @@ -0,0 +1,73 @@ +image: + trainingImage: cfg.container + pullPolicy: IfNotPresent + + # Insert the name of your container registry pull secret # + pullSecret: nvcr.io + + # Insert number of GPUs # + gpuNum: 1 + +trainingConfig: + # Specify the amount of shared memory to attach to the Pods # + shmSize: 512Gi + + # Insert the address for the NFS server if using NFS for model storage # + NFSServer: + + # Insert the path to save data on the NFS server # + NFSPath: + + # Insert the path to the vocab file # + vocabPath: + + # Insert the path to the merges file # + mergesPath: + + # Insert the path to the results directory # + resultsDirectory: + + # Insert the path to the training directory # + trainingDirectory: + + # Insert the path to the launcher_scripts directory # + launcherScriptsPath: + + # Insert the TP size # + tensorParallelism: + + # Insert the PP size # + pipelineParallelism: + + # Insert evaluation task name # + name: + + # Insert name of model to evaluate # + model: + + # Insert which tasks to evaluate # + tasks: + + # Insert path to store downloaded eval data # + cacheDir: + + # Insert path to save evaluation results # + outputPath: + + # Insert batch size for evaluation # + batchSize: + + # Insert evaluation precision # + precision: + + # Specify the path to the .nemo model if used # + nemoModel: + + # Insert path the the training checkpoint directory # + checkpointFolder: + + # Insert name of checkpoint or "latest" # + checkpointName: + + # Insert path to the hparams file from the training job # + hparamsFile: diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml 
b/launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml new file mode 100644 index 0000000000..e2314f8ec3 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: NeMo Framework Base Model Training +name: nemo-framework-training +version: 1.0.0 diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml new file mode 100644 index 0000000000..ce3095184c --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: training-config +data: + config.yaml: |- + {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml new file mode 100644 index 0000000000..37f37a1317 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml @@ -0,0 +1,71 @@ +{{ $config := .Values.trainingConfig }} + +apiVersion: kubeflow.org/v1 +kind: PyTorchJob +metadata: + name: nlp-training + labels: + app: nlp-training +spec: + pytorchReplicaSpecs: + Worker: + replicas: {{ .Values.image.nodes }} + template: + spec: + containers: + - name: pytorch + image: {{ .Values.image.trainingImage }} + env: + - name: NCCL_AVOID_RECORD_STREAMS + value: "1" + {{ if eq $config.wandbKey "nil" }} + command: ["torchrun"] + args: + - "--nnodes={{ .Values.image.nodes }}" + - "--rdzv-backend=c10d" + - "--rdzv-endpoint=nlp-training-worker-0" + - "--nproc_per_node={{ .Values.image.numGPUs }}" + - "/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py" + - "--config-path=/config" + - "--config-name=config.yaml" + {{ else }} + command: ["bash", "-c"] + args: + - "wandb login {{ $config.wandbKey }} && torchrun --nnodes={{ .Values.image.nodes }} --rdzv-backend=c10d --rdzv-endpoint=nlp-training-worker-0 --nproc_per_node={{ .Values.image.numGPUs }} /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py --config-path=/config --config-name=config.yaml" + {{ end }} + imagePullPolicy: Always + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + requests: + nvidia.com/gpu: {{ .Values.image.numGPUs }} + {{ $config.ibResourceName }}: {{ $config.ibCount }} + limits: + nvidia.com/gpu: {{ .Values.image.numGPUs }} + {{ $config.ibResourceName }}: {{ $config.ibCount }} + volumeMounts: + - mountPath: {{ $config.NFSPath }} + name: workspace + - mountPath: /dev/shm + name: dshm + - mountPath: /config + name: training-config + restartPolicy: Never + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + + volumes: + - name: workspace + nfs: + server: {{ $config.NFSServer }} + path: {{ $config.NFSPath }} + + - name: dshm + emptyDir: + medium: Memory + sizeLimit: {{ $config.shmSize }} + + - configMap: + name: training-config + name: training-config diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml new file mode 100644 index 0000000000..553be55b19 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml @@ -0,0 +1,28 @@ +image: + trainingImage: cfg.container + pullPolicy: IfNotPresent + + # Insert the name of your container registry pull secret # + 
pullSecret: nvcr.io + + numGPUs: training.trainer.devices + nodes: training.trainer.num_nodes + +trainingConfig: + # Specify the amount of shared memory to attach to the Pods # + shmSize: 512Gi + + # Insert the address for the NFS server if using NFS for model storage # + NFSServer: + + # Insert the path to save data on the NFS server # + NFSPath: + + # Specify the k8s resource name for IB devices # + ibResourceName: nvidia.com/hostdev + + # Specity the number of IB devices to include in pods # + ibCount: "8" + + # Specify the WandB API key if using WandB for logging # + wandbKey: "nil" diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index f31fb7bba8..6e6d497337 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -21,6 +21,8 @@ import shlex import shutil import warnings +from omegaconf import OmegaConf, DictConfig +import yaml from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Set, Union @@ -70,6 +72,7 @@ def get_launchers(): "bcm": SlurmLauncher, "bcp": BCPLauncher, "interactive": InteractiveLauncher, + "k8s": K8SLauncher, } @@ -114,6 +117,7 @@ def _make_submission_file(self, command_groups: List[List[str]]) -> Path: on interactive cluster, it's a bash file, trigger with bash. on slurm cluster, it's a slurm script file, trigger with sbatch. on BCP cluster, it's a BCP script file, trigger with bash. + on k8s cluster, it's a Helm chart, triggered with helm. :param List[List[str]] command_groups: Command groups to launch with :return: job id on slurm based system otherwise empty string @@ -431,6 +435,70 @@ def _get_job_id_from_submission_command(string: Union[bytes, str]) -> str: return output.group("id") +class K8SLauncher(Launcher): + """ + K8s job launcher + This class is used to hold the parameters to run a job on kubernetes. + In practice, it will create a Helm chart in the specified directory for the job + and trigger the job with `bash` command. + + :param Union[Path, str] folder: folder for storing job submission/output and logs. + :param str job_name: Name of the job, used as job folder name + :param Any **kwargs: Parse other cluster parameters required for k8s running, + including `nodes`, `ntasks_pernode`, `bcp_launcher`, etc. + """ + + def __init__(self, folder: Union[Path, str], job_name: str, **kwargs: Any) -> None: + super().__init__(folder, job_name) + self.parameters = kwargs + self.parameters = self._convert_parameters(self.parameters) + + @classmethod + def _equivalence_dict(cls): + return { + "name": "job_name", + "nodes": "nnodes", + "tasks_per_node": "npernode", + "ntasks_per_node": "npernode", + } + + def _convert_parameters(self, params: Dict[str, Any]) -> Dict[str, Any]: + """translate k8s parameter names""" + # replace type in some cases + eq_dict = self._equivalence_dict() + if eq_dict is not None: + params = {eq_dict.get(k, k): v for k, v in params.items()} + return params + + def _submit_command(self, submission_file_path: Path) -> str: + """Launch the submission command""" + command_list = self._make_submission_command(submission_file_path) + # run + job_utils.CommandFunction(command_list, ret_stdout=False, verbose=False)() # explicit errors + return "" + + @staticmethod + def _make_submission_command(submission_file_path: Path) -> List[str]: + """Make a command to trigger submission script. 
On a k8s cluster, the script is triggerred with Helm""" + return ["bash", str(submission_file_path)] + + def _make_submission_file_text(self, command_groups: List[List[str]]) -> str: + """ + Generate the script to launch the Helm chart. + A very simple bash script is generated which runs `helm install` for the + Helm chart that was generated. + + :param List[List[str]] command_groups: Command groups to launch with + :return: submission script file's text + :rtype: str + """ + paths = job_utils.JobPaths(folder=self.folder, job_name=self.job_name) + helm_charts = paths.folder / 'k8s_template' + job_name = self.job_name.replace('_', '-') + + return f'#!/bin/bash\nhelm install {job_name} {helm_charts}\n' + + @functools.lru_cache() def _get_default_parameters() -> Dict[str, Any]: """Parameters that can be set through update_parameters""" diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 56404956e1..bcc5cf3f26 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -19,6 +19,7 @@ import json import re from pathlib import Path +import shutil from typing import Any, Dict, List, Optional import omegaconf @@ -28,7 +29,7 @@ prepare_squad_for_prompt_learning, ) from nemo_launcher.utils.job_utils import JobPaths -from omegaconf import OmegaConf +from omegaconf import OmegaConf, DictConfig class NemoMegatronStage: @@ -73,9 +74,14 @@ def run(self) -> str: self.cfg['training']["trainer"]["num_nodes"] = nodes logging.info(f"global batch size and number of nodes will change following this schedule:\n {self.nodes_scheduler}") - stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg) # Make cluster parameters cluster_parameters = self._make_cluster_parameters(self.cluster) + # Make k8s config file if necessary + if self.cluster == 'k8s': + template_root = os.path.join(os.path.abspath(os.path.dirname(__file__)), f'k8s_templates/{self.stage_name}') + self._make_k8s_spec_file(template_root, cluster_parameters, job_path) + self._copy_k8s_helm_chart(template_root, job_path) # Make command groups command_groups = self.make_stage_command_groups(stage_cfg_path) # Create launcher @@ -92,15 +98,30 @@ def setup_folder_and_data(self) -> None: results_folder.mkdir(parents=True, exist_ok=True) @staticmethod - def save_stage_hydra_config(stage_cfg: OmegaConf, job_path: JobPaths) -> Path: + def save_stage_hydra_config(stage_cfg: OmegaConf, job_path: JobPaths, cfg: OmegaConf) -> Path: """ Interpolate and save hydra config file for current stage :param OmegaConf stage_cfg: current stage's hydra configuration :param JobPaths job_path: JobPaths object + :param OmegaConf cfg: base config for job :return: path current stage's essential nemo scripts code :rtype: Path """ + # Since k8s uses a Helm chart that launches a job based on the Hydra config + # file, the Hydra config file that is generated needs to contain all of the + # required keys for each stage. + if cfg.cluster_type == "k8s": + # OmegaConf doesn't allow adding new keys. Temporarily create a dictionary + # representation and add the new keys before converting back to an + # OmegaConf object. 
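+            # For illustration only (paths below are hypothetical), the saved stage
+            # config gains additional top-level keys along the lines of:
+            #   data_dir: /nfs/launcher_scripts/data
+            #   cluster_type: k8s
+            #   launcher_scripts_path: /nfs/launcher_scripts
+            #   data_config: download_gpt3_pile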
+ temp_config = OmegaConf.to_object(stage_cfg) + temp_config['data_dir'] = cfg.data_dir + temp_config['cluster_type'] = cfg.cluster_type + temp_config['launcher_scripts_path'] = cfg.launcher_scripts_path + temp_config['data_config'] = stage_cfg.run.name + stage_cfg = OmegaConf.create(temp_config) + _hydra_interpolation(stage_cfg) cfg_save_path = job_path.config_file @@ -139,6 +160,10 @@ def _make_nemo_path_command(self) -> List[str]: f'export PYTHONPATH={self._nemo_code_path}:\${{PYTHONPATH}}', ] + def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths): + """Create a yaml spec file for kubernetes jobs""" + raise NotImplementedError + # def _make_numa_mapping_command(self) -> List[str]: # """Make a command of numa mapping call""" # cfg = self.cfg @@ -285,6 +310,17 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: ) elif cluster == "interactive": cluster_parameters.update(shared_parameters) + elif cluster == "k8s": + cluster_cfg = cfg.get("cluster") + k8s_cfg = {**copy.deepcopy(cluster_cfg)} + + cluster_parameters = {**k8s_cfg} + cluster_parameters.update( + { + **shared_parameters, + "container_image": container_image, + } + ) return cluster_parameters @@ -540,6 +576,72 @@ def _make_hydra_override(self) -> List: if self.cluster == "bcp": hydra_override += ["+rank=\${RANK}"] return hydra_override + + def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths): + """ + Copy the k8s Helm charts to the results directory. + + :param str template_root: path to where the k8s template files are located + :param JobPaths job_path: JobPaths object + """ + template_file = os.path.join(template_root, 'training.yaml') + chart_file = os.path.join(template_root, 'Chart.yaml') + training_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'training.yaml') + training_path.parent.mkdir(parents=True, exist_ok=True) + config_path = Path(job_path.folder / 'k8s_template' / 'config') + config_path.mkdir(parents=True, exist_ok=True) + chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml') + training_config_file = os.path.join(template_root, 'training-config.yaml') + training_config_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'training-config.yaml') + hydra_config_path = Path(job_path.folder / 'k8s_template' / 'config') + + shutil.copy2(template_file, training_path) + shutil.copy2(chart_file, chart_path) + shutil.copy2(training_config_file, training_config_path) + shutil.copy2(job_path.config_file, hydra_config_path) + + def _add_wandb_key_to_chart(self) -> str: + """ + Read the WandB API key file and return it to be placed in the Helm chart. + + :return: a string of the WandB API key. + :rtype: str + """ + with open(self.cfg.wandb_api_key_file, "r") as f: + wandb_api_key = f.readline().rstrip() + return wandb_api_key + + def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths): + """ + Create a spec file for a Kubernetes training job. + The spec file is generated based on the parameters in the cluster and training config files. 
+ + :param str template_root: path to where the k8s template files are located + :param Dict cluster_parameters: settings specific to the cluster that is being used + :param JobPaths job_path: JobPaths object + """ + with open(os.path.join(template_root, 'values.yaml')) as value_file: + values_template = OmegaConf.load(value_file) + + values_template.image.trainingImage = cluster_parameters['container_image'] + values_template.image.pullSecret = cluster_parameters['pull_secret'] + values_template.image.numGPUs = self.stage_cfg.trainer.devices + values_template.image.nodes = self.stage_cfg.trainer.num_nodes + values_template.trainingConfig.shmSize = cluster_parameters['shm_size'] + values_template.trainingConfig.NFSServer = cluster_parameters['nfs_server'] + values_template.trainingConfig.NFSPath = cluster_parameters['nfs_path'] + values_template.trainingConfig.ibResourceName = cluster_parameters['ib_resource_name'] + values_template.trainingConfig.ibCount = cluster_parameters['ib_count'] + + if self.cfg.wandb_api_key_file is not None: + values_template.trainingConfig.wandbKey = self._add_wandb_key_to_chart() + + k8s_template_path = job_path.folder + k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml') + k8s_template_file.parent.mkdir(parents=True, exist_ok=True) + + conf = OmegaConf.create(values_template) + OmegaConf.save(conf, k8s_template_file) def get_env_vars(self) -> Dict: """ @@ -814,6 +916,57 @@ def _make_checkpoint_search_command(self, **kwargs: Any) -> str: f"{' '.join(checkpoint_override)}" ) + def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths): + """ + Create a spec file for a Kubernetes conversion job. + The spec file is generated based on the parameters in the cluster and conversion config files. 
+ + :param str template_root: path to where the k8s template files are located + :param Dict cluster_parameters: settings specific to the cluster that is being used + :param JobPaths job_path: JobPaths object + """ + with open(os.path.join(template_root, 'values.yaml')) as value_file: + values_template = OmegaConf.load(value_file) + + num_gpus = self.cfg.conversion.model.pipeline_model_parallel_size * self.cfg.conversion.model.tensor_model_parallel_size + + values_template.image.trainingImage = cluster_parameters['container_image'] + values_template.image.pullSecret = cluster_parameters['pull_secret'] + values_template.image.gpuNum = num_gpus + values_template.trainingConfig.shmSize = cluster_parameters['shm_size'] + values_template.trainingConfig.NFSServer = cluster_parameters['nfs_server'] + values_template.trainingConfig.NFSPath = cluster_parameters['nfs_path'] + values_template.trainingConfig.vocabPath = self.cfg.conversion.model.vocab_file + values_template.trainingConfig.mergesPath = self.cfg.conversion.model.merge_file + values_template.trainingConfig.resultsDirectory = str(job_path.folder) + values_template.trainingConfig.trainingDirectory = self.cfg.conversion.run.train_dir + values_template.trainingConfig.launcherScriptsPath = self.cfg.launcher_scripts_path + values_template.trainingConfig.tensorParallelism = self.cfg.conversion.model.tensor_model_parallel_size + values_template.trainingConfig.pipelineParallelism = self.cfg.conversion.model.pipeline_model_parallel_size + + k8s_template_path = job_path.folder + k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml') + k8s_template_file.parent.mkdir(parents=True, exist_ok=True) + + conf = OmegaConf.create(values_template) + OmegaConf.save(conf, k8s_template_file) + + def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths): + """ + Copy the k8s Helm charts to the results directory. + + :param str template_root: path to where the k8s template files are located + :param JobPaths job_path: JobPaths object + """ + template_file = os.path.join(template_root, 'conversion.yaml') + chart_file = os.path.join(template_root, 'Chart.yaml') + conversion_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'conversion.yaml') + conversion_path.parent.mkdir(parents=True, exist_ok=True) + chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml') + + shutil.copy2(template_file, conversion_path) + shutil.copy2(chart_file, chart_path) + def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: """ Make the command groups for current stage @@ -987,6 +1140,77 @@ def _make_download_command_string(self) -> str: download_command_string = " \\\n ".join(download_command) return download_command_string + def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths): + """ + Create a spec file for a Kubernetes conversion job. + The spec file is generated based on the parameters in the cluster and conversion config files. 
+ + :param str template_root: path to where the k8s template files are located + :param Dict cluster_parameters: settings specific to the cluster that is being used + :param JobPaths job_path: JobPaths object + """ + with open(os.path.join(template_root, 'values.yaml')) as value_file: + values_template = OmegaConf.load(value_file) + + num_gpus = self.cfg.evaluation.model.pipeline_model_parallel_size * self.cfg.evaluation.model.tensor_model_parallel_size + + values_template.image.trainingImage = cluster_parameters['container_image'] + values_template.image.pullSecret = cluster_parameters['pull_secret'] + values_template.image.gpuNum = num_gpus + values_template.trainingConfig.shmSize = cluster_parameters['shm_size'] + values_template.trainingConfig.NFSServer = cluster_parameters['nfs_server'] + values_template.trainingConfig.NFSPath = cluster_parameters['nfs_path'] + values_template.trainingConfig.vocabPath = self.cfg.evaluation.model.vocab_file + values_template.trainingConfig.mergesPath = self.cfg.evaluation.model.merge_file + values_template.trainingConfig.resultsDirectory = str(job_path.folder) + values_template.trainingConfig.trainingDirectory = self.cfg.evaluation.run.train_dir + values_template.trainingConfig.launcherScriptsPath = self.cfg.launcher_scripts_path + values_template.trainingConfig.tensorParallelism = self.cfg.evaluation.model.tensor_model_parallel_size + values_template.trainingConfig.pipelineParallelism = self.cfg.evaluation.model.pipeline_model_parallel_size + values_template.trainingConfig.name = self.cfg.evaluation.run.name + values_template.trainingConfig.model = self.cfg.evaluation.model.model_type + values_template.trainingConfig.cacheDir = os.path.join(self.cfg.data_dir, 'eval_harness_data') + values_template.trainingConfig.outputPath = os.path.join(self.cfg.evaluation.run.results_dir, + self.cfg.evaluation.run.eval_name, + 'results') + values_template.trainingConfig.batchSize = self.cfg.evaluation.model.eval_batch_size + values_template.trainingConfig.precision = self.cfg.evaluation.model.precision + values_template.trainingConfig.nemoModel = self.cfg.evaluation.model.nemo_model + values_template.trainingConfig.checkpointFolder = self.cfg.evaluation.model.checkpoint_folder + values_template.trainingConfig.checkpointName = self.cfg.evaluation.model.checkpoint_name + values_template.trainingConfig.hparamsFile = self.cfg.evaluation.model.hparams_file + values_template.trainingConfig.tasks = self.cfg.evaluation.run.tasks + + k8s_template_path = job_path.folder + k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml') + k8s_template_file.parent.mkdir(parents=True, exist_ok=True) + + conf = OmegaConf.create(values_template) + OmegaConf.save(conf, k8s_template_file) + + def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths): + """ + Copy the k8s Helm charts to the results directory. 
+ + :param str template_root: path to where the k8s template files are located + :param JobPaths job_path: JobPaths object + """ + template_file = os.path.join(template_root, 'evaluation.yaml') + chart_file = os.path.join(template_root, 'Chart.yaml') + evaluation_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'evaluation.yaml') + evaluation_path.parent.mkdir(parents=True, exist_ok=True) + config_path = Path(job_path.folder / 'k8s_template' / 'config') + config_path.mkdir(parents=True, exist_ok=True) + chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml') + evaluation_config_file = os.path.join(template_root, 'evaluation-config.yaml') + evaluation_config_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'evaluation-config.yaml') + hparams_config_path = Path(job_path.folder / 'k8s_template' / 'config') + + shutil.copy2(template_file, evaluation_path) + shutil.copy2(chart_file, chart_path) + shutil.copy2(evaluation_config_file, evaluation_config_path) + shutil.copy2(os.path.join(self.cfg.evaluation.run.train_dir, 'results', 'hparams.yaml'), hparams_config_path) + def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: """ Make the command groups for current stage From 91742c108d844bcece515e9177ed4ee30e2c2732 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 02:49:42 -0700 Subject: [PATCH 25/62] add llama support for auto configurator Signed-off-by: Hongbin Liu --- .../autoconfig/scripts/compare_throughput.py | 19 ++++++++-------- auto_configurator/autoconfig/search_config.py | 2 +- .../autoconfig/training_config.py | 22 ++++++++++++------- auto_configurator/autoconfig/utils.py | 14 ++++++------ 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/auto_configurator/autoconfig/scripts/compare_throughput.py b/auto_configurator/autoconfig/scripts/compare_throughput.py index c6c30a031b..d3b5b3e6c2 100644 --- a/auto_configurator/autoconfig/scripts/compare_throughput.py +++ b/auto_configurator/autoconfig/scripts/compare_throughput.py @@ -16,7 +16,8 @@ def main(cfg): settings_cfg = cfg.search_config.train_settings model_size = settings_cfg.model_size_in_b output_top_n = settings_cfg.output_top_n - nodes = cfg.get("nodes") + nodes = settings_cfg.num_nodes + #nodes = cfg.get("nodes") training_logs = os.path.join(settings_cfg.get("logs"), "training_logs") candidate_configs = os.path.join(settings_cfg.get("logs"), "candidate_configs") @@ -77,11 +78,11 @@ def main(cfg): model_name = candidate_cfg.get("run").get("name").split("_")[0] gbs = model_cfg.get("global_batch_size") enc_seq_len = ( - model_cfg.get("encoder_seq_length") if model_name in ("gpt3", "bert") else model_cfg.get("seq_length") + model_cfg.get("encoder_seq_length") if model_name in ("gpt3", "bert", "llama") else model_cfg.get("seq_length") ) dec_seq_len = data_cfg.get("seq_length_dec") - if model_name in ("gpt3", "bert"): + if model_name in ("gpt3", "bert", "llama"): hs = model_cfg.get("hidden_size") ffn_hs = None layers = model_cfg.get("num_layers") @@ -138,9 +139,9 @@ def main(cfg): ea.Reload() try: timing_list = ea.Scalars("train_step_timing") - if len(timing_list) <= 6: - continue - timing_list = [x.value for x in timing_list[5:]] + #if len(timing_list) <= 6: + # continue + timing_list = [x.value for x in timing_list[0:]] avg_global_step_time = round(sum(timing_list) / len(timing_list), 4) samples_per_s = round(gbs / avg_global_step_time, 2) m_tflops, m_tflops_gpu = calculate_tflops( @@ -184,14 +185,14 @@ def main(cfg): finally: continue - 
result.sort(key=lambda x: x[14]) + result.sort(key=lambda x: x[15]) print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:") for i, res in enumerate(result): print(f"Config #{i+1}: {res[-1]} with {res[14]:.4f}s per global step.") if i + 1 == output_top_n: break - top_config = f"{model_name}_{model_size}b_{nodes}nodes_tp_{result[0][2]}_pp_{result[0][3]}_mbs_{result[0][4]}_act_ckpt_{result[0][5]}_num_mbs_act_{result[0][6]}_act_per_pipe_{result[0][7]}" + top_config = f"{model_name}_{model_size}b_{nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_mbs_{result[0][5]}_act_ckpt_{result[0][6]}_num_mbs_act_{result[0][7]}_act_per_pipe_{result[0][8]}" print("\n==================================================") print(f"Optimal config: {top_config} with {result[0][14]:.4f}s per global step.") print(f"Saving config to {final_result_logs}/optimal_config_{model_size}b_{nodes}nodes.yaml.") @@ -223,7 +224,7 @@ def calculate_tflops( Bert Formula: Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL)) """ - if model_name == "gpt3": + if model_name in ["gpt3", "llama"]: # Model FLOPS calculation model_flops = ( (24 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs) * (3 * layers) diff --git a/auto_configurator/autoconfig/search_config.py b/auto_configurator/autoconfig/search_config.py index 6870359ede..1f50a6d707 100644 --- a/auto_configurator/autoconfig/search_config.py +++ b/auto_configurator/autoconfig/search_config.py @@ -20,7 +20,7 @@ from autoconfig.inference_sweep import search_inference_config from autoconfig.training_config import search_training_config -SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert"] +SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert", "llama"] def search_config(cfg: omegaconf.dictconfig.DictConfig, hydra_args: Optional[str] = None): diff --git a/auto_configurator/autoconfig/training_config.py b/auto_configurator/autoconfig/training_config.py index 71f01f20e8..7940aecac1 100644 --- a/auto_configurator/autoconfig/training_config.py +++ b/auto_configurator/autoconfig/training_config.py @@ -69,12 +69,12 @@ def generate_grid_search_configs( act_layers = train_cfg.get("act_ckpt_layers") # 2 * num_layers is needed because of encoder/decoder architecture. - multiplier = 1 if model_name in ["gpt3", "bert"] else 2 + multiplier = 1 if model_name in ["gpt3", "bert", "llama"] else 2 seq_length = base_cfg["model"]["data"]["seq_length"] num_layers = ( base_cfg["model"]["num_layers"] - if model_name in ["gpt3", "bert"] + if model_name in ["gpt3", "bert", "llama"] else base_cfg["model"]["encoder"]["num_layers"] ) @@ -96,7 +96,7 @@ def generate_grid_search_configs( for mbs in mbs_list: num_gpus = base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"] gbs = base_cfg["model"]["global_batch_size"] - if model_name in ["gpt3", "bert"]: + if model_name in ["gpt3", "bert", "llama"]: att_heads = base_cfg["model"]["num_attention_heads"] num_layers = base_cfg["model"]["num_layers"] else: @@ -175,7 +175,7 @@ def _set_activations_checkpoint_params(tp, pp, num_layers, act_method, multiplie min_layers_per_pipe = 0 max_layers_per_pipe = num_layers interval_layers_per_pipe = act_multiple - if model_name in ["gpt3", "bert"] and pp > 2: # Interleaved pipeline scheduling. + if model_name in ["gpt3", "bert", "llama"] and pp > 2: # Interleaved pipeline scheduling. virtual_pipelines = num_layers // pp # TODO: verify that this is the best value. 
act_multiple = 1 max_micro_b = pp * (virtual_pipelines - 1) + (pp - 1) * 2 + 1 @@ -190,7 +190,7 @@ def _set_activations_checkpoint_params(tp, pp, num_layers, act_method, multiplie else: act_ckpt_layers = range(0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple) - if pp > 1 and model_name in ["gpt3", "bert"]: + if pp > 1 and model_name in ["gpt3", "bert", "llama"]: # Num micro batches with partial act ckpt num_micro_batches_partial_act_ckpt = list(range(min_micro_b, max_micro_b + 1, interval_micro_b)) if num_micro_batches_partial_act_ckpt[0] == 0: @@ -304,6 +304,12 @@ def _tp_pp_mbs_grid_gpt3_80gb(model_size_in_b: float, valid_pp: List[int], seq_l mbs = [1, 2] min_model_parallel = 8 max_model_parallel = 32 + elif model_size_in_b <= 95: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 8] + mbs = [1, 2] + min_model_parallel = 8 + max_model_parallel = 64 elif seq_length == 8192: if model_size_in_b <= 1.0: tp = [1, 2] @@ -738,13 +744,13 @@ def _calculate_tp_pp_mbs_grid( mbs_sizes = train_cfg.get("micro_batch_sizes") gpu_memory_gb = train_cfg.get("gpu_memory_gb") - multiplier = 1 if model_name in ["gpt3", "bert"] else 2 - init_pp = [] if model_name == "gpt3" else [1] + multiplier = 1 if model_name in ["gpt3", "bert", "llama"] else 2 + init_pp = [] if model_name in ["gpt3", "llama"] else [1] valid_pp = init_pp + [ multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 ] # Only divisors of num_layers are possible. - if model_name == "gpt3": + if model_name in ["gpt3", "llama"]: if gpu_memory_gb == 80: tp, pp, mbs, min_model_parallel, max_model_parallel = _tp_pp_mbs_grid_gpt3_80gb( model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length diff --git a/auto_configurator/autoconfig/utils.py b/auto_configurator/autoconfig/utils.py index 7e9b59460d..7a3125e0a8 100644 --- a/auto_configurator/autoconfig/utils.py +++ b/auto_configurator/autoconfig/utils.py @@ -45,7 +45,7 @@ def _calculate_model_size( :rtype: float :raises NotImplementedError: if the model name is not valid. """ - if model_name == "gpt3": + if model_name in ["gpt3", "llama"]: model_size = ( 12 * num_layers @@ -96,7 +96,7 @@ def calculate_model_size_params( :raises NotImplementedError: if the model name is not supported. """ ffn, kv = None, None # Only needed for some models. 
- if model_name == "gpt3": + if model_name in ["gpt3", "llama"]: if model_size_in_b < 0.25: hs, att_h, lr = 768, 12, 6e-4 elif model_size_in_b < 0.5: @@ -350,26 +350,26 @@ def modify_cfg( """ new_cfg = copy.deepcopy(base_cfg) if act is not None: - if model_name in ["gpt3", "bert"]: + if model_name in ["gpt3", "bert", "llama"]: new_cfg["model"]["activations_checkpoint_num_layers"] = act else: new_cfg["model"]["encoder"]["activations_checkpoint_num_layers"] = act // 2 new_cfg["model"]["decoder"]["activations_checkpoint_num_layers"] = act // 2 - if num_mbs_act is not None and model_name in ["gpt3", "bert"]: + if num_mbs_act is not None and model_name in ["gpt3", "bert", "llama"]: new_cfg["model"]["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act - if act_per_pipe is not None and model_name in ["gpt3", "bert"]: + if act_per_pipe is not None and model_name in ["gpt3", "bert", "llama"]: new_cfg["model"]["activations_checkpoint_layers_per_pipeline"] = act_per_pipe - if virtual_pipelines is not None and model_name in ["gpt3", "bert"]: + if virtual_pipelines is not None and model_name in ["gpt3", "bert", "llama"]: new_cfg["model"]["virtual_pipeline_model_parallel_size"] = virtual_pipelines new_cfg["model"]["tensor_model_parallel_size"] = tp new_cfg["model"]["pipeline_model_parallel_size"] = pp new_cfg["model"]["micro_batch_size"] = mbs - if model_name in ["gpt3", "bert"]: + if model_name in ["gpt3", "bert", "llama"]: att_heads = new_cfg["model"]["num_attention_heads"] num_layers = new_cfg["model"]["num_layers"] else: From 389ba7a5ed06e3db9a2a354c4f8cd87b7e7f1811 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:19:04 -0700 Subject: [PATCH 26/62] add llama2 training config Signed-off-by: Hongbin Liu --- launcher_scripts/conf/training/llama/13b.yaml | 77 +++++- launcher_scripts/conf/training/llama/30b.yaml | 78 +++++- launcher_scripts/conf/training/llama/65b.yaml | 78 +++++- launcher_scripts/conf/training/llama/7b.yaml | 138 +++++------ .../conf/training/llama/llama2_13b.yaml | 220 +++++++++++++++++ .../conf/training/llama/llama2_70b.yaml | 225 ++++++++++++++++++ .../conf/training/llama/llama2_7b.yaml | 223 +++++++++++++++++ 7 files changed, 948 insertions(+), 91 deletions(-) mode change 100755 => 100644 launcher_scripts/conf/training/llama/7b.yaml create mode 100644 launcher_scripts/conf/training/llama/llama2_13b.yaml create mode 100644 launcher_scripts/conf/training/llama/llama2_70b.yaml create mode 100644 launcher_scripts/conf/training/llama/llama2_7b.yaml diff --git a/launcher_scripts/conf/training/llama/13b.yaml b/launcher_scripts/conf/training/llama/13b.yaml index cf6f8ec8cc..e06835be27 100644 --- a/launcher_scripts/conf/training/llama/13b.yaml +++ b/launcher_scripts/conf/training/llama/13b.yaml @@ -10,16 +10,17 @@ trainer: precision: bf16 logger: false enable_checkpointing: false - replace_sampler_ddp: false + use_distributed_sampler: false max_epochs: null max_steps: 300000 max_time: '5:23:30:00' log_every_n_steps: 10 val_check_interval: 2000 - limit_val_batches: 50 + limit_val_batches: 32 limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 + num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -30,7 +31,7 @@ exp_manager: name: ${training.run.name} resume_if_exists: true resume_ignore_no_checkpoint: true - create_checkpoint_callback: false + create_checkpoint_callback: true checkpoint_callback_params: monitor: val_loss save_top_k: 10 @@ -45,8 +46,9 @@ exp_manager: sync_cuda: 
true buffer_size: 5 model: + mcore_gpt: true micro_batch_size: 2 - global_batch_size: 2048 + global_batch_size: 128 rampup_batch_size: null tensor_model_parallel_size: 4 pipeline_model_parallel_size: 1 @@ -114,7 +116,7 @@ model: num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 sequence_parallel: false - transformer_engine: false + transformer_engine: true fp8: false fp8_e4m3: false fp8_hybrid: false @@ -151,7 +153,68 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - 0.5 + - .0333 - ${data_dir}/my-llama_00_text_document - - 0.5 + - .0333 - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/launcher_scripts/conf/training/llama/30b.yaml b/launcher_scripts/conf/training/llama/30b.yaml index 33ed5054c8..ebdfa06f02 100644 --- a/launcher_scripts/conf/training/llama/30b.yaml +++ b/launcher_scripts/conf/training/llama/30b.yaml @@ -10,13 +10,13 @@ trainer: precision: bf16 logger: false enable_checkpointing: false - replace_sampler_ddp: false + use_distributed_sampler: false max_epochs: null max_steps: 300000 max_time: '19:23:30:00' log_every_n_steps: 10 val_check_interval: 2000 - limit_val_batches: 50 + limit_val_batches: 32 limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 @@ -44,6 +44,7 @@ exp_manager: sync_cuda: true buffer_size: 5 model: + mcore_gpt: true micro_batch_size: 1 global_batch_size: 2048 rampup_batch_size: null @@ -113,7 +114,7 @@ model: num_micro_batches_with_partial_activation_checkpoints: 2 activations_checkpoint_layers_per_pipeline: 32 sequence_parallel: false - transformer_engine: false + transformer_engine: true fp8: false fp8_e4m3: false fp8_hybrid: false @@ -125,14 +126,14 @@ model: ub_tp_comm_overlap: false use_flash_attention: false optim: - name: fused_adam + name: distributed_fused_adam lr: 0.0001 weight_decay: 0.1 betas: - 0.9 - 0.95 - #bucket_cap_mb: 125 - #overlap_grad_sync: false + bucket_cap_mb: 125 + overlap_grad_sync: false sched: name: 
CosineAnnealing warmup_steps: 107 @@ -150,7 +151,68 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - .5 + - .0333 - ${data_dir}/my-llama_00_text_document - - .5 + - .0333 - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/launcher_scripts/conf/training/llama/65b.yaml b/launcher_scripts/conf/training/llama/65b.yaml index 464af39c09..0f65d40071 100644 --- a/launcher_scripts/conf/training/llama/65b.yaml +++ b/launcher_scripts/conf/training/llama/65b.yaml @@ -10,16 +10,17 @@ trainer: precision: bf16 logger: false enable_checkpointing: false - replace_sampler_ddp: false + use_distributed_sampler: false max_epochs: null max_steps: 300000 max_time: '19:23:30:00' log_every_n_steps: 10 val_check_interval: 2000 - limit_val_batches: 50 + limit_val_batches: 32 limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 + num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -44,6 +45,7 @@ exp_manager: sync_cuda: true buffer_size: 5 model: + mcore_gpt: true micro_batch_size: 1 global_batch_size: 2048 rampup_batch_size: null @@ -113,7 +115,7 @@ model: num_micro_batches_with_partial_activation_checkpoints: 80 activations_checkpoint_layers_per_pipeline: 0 sequence_parallel: false - transformer_engine: false + transformer_engine: true fp8: false fp8_e4m3: false fp8_hybrid: false @@ -125,14 +127,14 @@ model: ub_tp_comm_overlap: false use_flash_attention: false optim: - name: fused_adam + name: distributed_fused_adam lr: 0.0001 weight_decay: 0.1 betas: - 0.9 - 0.95 - #bucket_cap_mb: 125 - #overlap_grad_sync: false + bucket_cap_mb: 125 + overlap_grad_sync: false sched: name: CosineAnnealing warmup_steps: 107 @@ -150,8 +152,68 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - .5 + - .0333 - ${data_dir}/my-llama_00_text_document - - .5 + - .0333 - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - 
${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document diff --git a/launcher_scripts/conf/training/llama/7b.yaml b/launcher_scripts/conf/training/llama/7b.yaml old mode 100755 new mode 100644 index 96cb4790c0..cc1bb32c15 --- a/launcher_scripts/conf/training/llama/7b.yaml +++ b/launcher_scripts/conf/training/llama/7b.yaml @@ -4,22 +4,23 @@ run: time_limit: "0-04:00:00" dependency: "singleton" trainer: - num_nodes: 4 + num_nodes: 2 devices: 8 accelerator: gpu precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 300000 # consumed_samples = global_step * global_batch_size max_time: "05:23:30:00" # days:hours:minutes:seconds log_every_n_steps: 10 val_check_interval: 2000 - limit_val_batches: 50 + limit_val_batches: 32 limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 + num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -45,8 +46,9 @@ exp_manager: buffer_size: 5 model: + mcore_gpt: true micro_batch_size: 2 - global_batch_size: 2048 + global_batch_size: 128 rampup_batch_size: null tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 @@ -117,7 +119,7 @@ model: ## Transformer Engine # fp8 training is currently not supported in the improved models - transformer_engine: False + transformer_engine: True fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID @@ -127,7 +129,7 @@ model: fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history use_emha: False ub_tp_comm_overlap: False - use_flash_attention: false + use_flash_attention: True optim: name: distributed_fused_adam lr: 1e-4 @@ -154,68 +156,68 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - .5 + - .0333 - ${data_dir}/my-llama_00_text_document - - .5 + - .0333 - ${data_dir}/my-llama_01_text_document - # - .0333 - # - ${data_dir}/my-gpt3_00_text_document - # - .0333 - # - ${data_dir}/my-gpt3_01_text_document - # - .0333 - # - ${data_dir}/my-gpt3_02_text_document - # - .0333 - # - ${data_dir}/my-gpt3_03_text_document - # - .0333 - # - ${data_dir}/my-gpt3_04_text_document - # - .0333 - # - ${data_dir}/my-gpt3_05_text_document - # - .0333 - # - ${data_dir}/my-gpt3_06_text_document - # - .0333 - # - ${data_dir}/my-gpt3_07_text_document - # - .0333 - # - ${data_dir}/my-gpt3_08_text_document - # - .0333 - # - ${data_dir}/my-gpt3_09_text_document - # - .0333 - # - ${data_dir}/my-gpt3_10_text_document - # - .0333 - # - ${data_dir}/my-gpt3_11_text_document - # - .0333 - # - ${data_dir}/my-gpt3_12_text_document - # - .0333 - # - ${data_dir}/my-gpt3_13_text_document - # - .0333 - # - ${data_dir}/my-gpt3_14_text_document - # - .0333 - # - ${data_dir}/my-gpt3_15_text_document - # - .0333 - # - ${data_dir}/my-gpt3_16_text_document - # - .0333 - # - ${data_dir}/my-gpt3_17_text_document - # - .0333 - # - ${data_dir}/my-gpt3_18_text_document - # - .0333 - # - ${data_dir}/my-gpt3_19_text_document - # - .0333 - # - ${data_dir}/my-gpt3_20_text_document - # - .0333 - # - ${data_dir}/my-gpt3_21_text_document - # - .0333 - # - ${data_dir}/my-gpt3_22_text_document - # - .0333 - # - ${data_dir}/my-gpt3_23_text_document - # - .0333 - # - ${data_dir}/my-gpt3_24_text_document - # - .0333 - # - ${data_dir}/my-gpt3_25_text_document - # - .0333 - # - ${data_dir}/my-gpt3_26_text_document - # - .0333 - # - ${data_dir}/my-gpt3_27_text_document - # - .0333 - # - ${data_dir}/my-gpt3_28_text_document - # - .0334 - # - ${data_dir}/my-gpt3_29_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 
+ - ${data_dir}/my-llama_29_text_document diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml new file mode 100644 index 0000000000..a77584a33d --- /dev/null +++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml @@ -0,0 +1,220 @@ +run: + name: llama2_13b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 300000 + max_time: '5:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, + ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 40 + hidden_size: 5120 + ffn_hidden_size: 13824 + num_attention_heads: 40 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + 
sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml new file mode 100644 index 0000000000..7697b36e0f --- /dev/null +++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml @@ -0,0 +1,225 @@ +run: + name: llama2_70b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 8 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + 
checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 28672 + num_attention_heads: 64 + num_query_groups: 8 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 1 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: true + overlap_p2p_comm: false + batch_p2p_comm: true + gc_interval: 100 + optim: + name: distributed_fused_adam + lr: 0.00015 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - 
${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml new file mode 100644 index 0000000000..dcc2887bcf --- /dev/null +++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml @@ -0,0 +1,223 @@ +run: + name: llama2_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 2 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 
11008 + num_attention_heads: 32 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'sentencepiece' + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true # does not support sequence parallel + + ## Transformer Engine + # fp8 training is currently not supported in the improved models + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: False + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + From fd414f930c890d1766f4aae89d0b685559cba05a Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:43:59 -0700 Subject: [PATCH 27/62] update prompt learning for llama2 Signed-off-by: Hongbin Liu --- .../conf/prompt_learning/llama/squad.yaml | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/launcher_scripts/conf/prompt_learning/llama/squad.yaml b/launcher_scripts/conf/prompt_learning/llama/squad.yaml index 51104ba17d..4336a568bc 100755 --- a/launcher_scripts/conf/prompt_learning/llama/squad.yaml +++ b/launcher_scripts/conf/prompt_learning/llama/squad.yaml @@ -3,26 +3,25 @@ run: time_limit: "01:00:00" dependency: "singleton" convert_name: convert_nemo - model_train_name: llama_7b + model_train_name: llama2_7b convert_dir: ${base_results_dir}/${prompt_learning.run.model_train_name}/${prompt_learning.run.convert_name} task_name: "squad" results_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_${.task_name} trainer: devices: 8 - num_nodes: 1 + num_nodes: 4 accelerator: gpu precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 val_check_interval: 200 accumulate_grad_batches: 1 
gradient_clip_val: 1.0 - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. exp_manager: explicit_log_dir: ${prompt_learning.run.results_dir}/results @@ -32,7 +31,7 @@ exp_manager: wandb_logger_kwargs: project: nemo_llama_prompt name: ${prompt_learning.run.name} - resume_if_exists: True + resume_if_exists: False resume_ignore_no_checkpoint: True create_checkpoint_callback: True checkpoint_callback_params: @@ -46,14 +45,14 @@ exp_manager: model: seed: 1234 - nemo_path: ${prompt_learning.run.results_dir}/results/megatron_gpt_prompt.nemo # the place to save prompt learning nemo checkpoint + nemo_path: ${prompt_learning.run.results_dir}/results/megatron_llama_prompt.nemo # the place to save prompt learning nemo checkpoint virtual_prompt_style: 'p-tuning' # One of 'p-tuning', 'prompt-tuning', or 'inference'. We recommend 'p-tuning' over 'prompt-tuning'. tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} - encoder_seq_length: 2048 - global_batch_size: 64 - micro_batch_size: 8 + encoder_seq_length: 4096 + global_batch_size: 8 + micro_batch_size: 1 restore_path: null # used to restore from a prompt tuned checkpoint and add new tasks language_model_path: ${prompt_learning.run.convert_dir}/results/megatron_llama.nemo # Restore lanugage model from pre-trained .nemo checkpoint @@ -64,6 +63,9 @@ model: # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. sequence_parallel: False + activations_checkpoint_granularity: selective + activations_checkpoint_num_layers: 1 + activations_checkpoint_method: block task_templates: # task_templates for all existing_tasks and new_tasks are required. - taskname: "squad" # The task name From b38b964ee3be5251599efb1a7881af0580edc4b3 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:46:41 -0700 Subject: [PATCH 28/62] update conversion for llama2 Signed-off-by: Hongbin Liu --- launcher_scripts/conf/conversion/llama/convert_llama.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/launcher_scripts/conf/conversion/llama/convert_llama.yaml b/launcher_scripts/conf/conversion/llama/convert_llama.yaml index 451d916b20..9dfb362cc2 100755 --- a/launcher_scripts/conf/conversion/llama/convert_llama.yaml +++ b/launcher_scripts/conf/conversion/llama/convert_llama.yaml @@ -5,16 +5,16 @@ run: dependency: "singleton" ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} convert_name: convert_nemo - model_train_name: llama_7b + model_train_name: llama2_7b train_dir: ${base_results_dir}/${.model_train_name} results_dir: ${.train_dir}/${.convert_name} - nemo_file_name: megatron_llama_prompt.nemo # name of nemo checkpoint; must be .nemo file + nemo_file_name: megatron_llama.nemo # name of nemo checkpoint; must be .nemo file model: model_type: gpt # gpt or t5, use t5 for mt5 as well - checkpoint_folder: ${conversion.run.train_dir}/prompt_learning_squad/results/checkpoints + checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_llama-*last.ckpt) - hparams_file: ${conversion.run.train_dir}/prompt_learning_squad/results/hparams.yaml + hparams_file: ${conversion.run.train_dir}/results/hparams.yaml tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} From 633367e9967b3881446b2900a359632864ee2edf Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:53:41 -0700 Subject: [PATCH 29/62] update evaluation script for llama2 Signed-off-by: Hongbin Liu --- .../conf/evaluation/llama/evaluate_all.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml index ca4d9b7456..047f72b866 100755 --- a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml +++ b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml @@ -1,11 +1,11 @@ run: name: ${.eval_name}_${.model_train_name} - time_limit: "01:00:00" + time_limit: "02:00:00" dependency: "singleton" nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} eval_name: eval_all - model_train_name: llama_7b + model_train_name: llama2_7b train_dir: ${base_results_dir}/${.model_train_name} tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} @@ -13,12 +13,12 @@ run: model: model_type: nemo-llama nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints - checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints - checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) - hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml + #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints + #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) + #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} precision: bf16 # must match training precision - 32, 16 or bf16 eval_batch_size: 4 - tokenizer_model: ${data_dir}/llama/llama_tokenizer.model + #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model From f5cee87301a843699d468c39d3136138e14d22e6 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:55:18 -0700 Subject: [PATCH 30/62] update evaluation scripts for llama2 Signed-off-by: Hongbin Liu --- .../eval_harness/lm_eval/models/nemo_gpt3.py | 31 ++++++++++++++----- .../eval_harness/lm_eval/models/nemo_llama.py | 3 +- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py index d20eb89c69..e79f5808cc 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py @@ -35,11 +35,11 @@ class RequestDataset(Dataset): - def __init__(self, requests, tokenizer) -> None: + def __init__(self, requests, tokenizer, max_length) -> None: super().__init__() self.requests = requests self.tokenizer = tokenizer - self.max_length = 2048 + self.max_length = max_length def __len__(self): return len(self.requests) @@ -148,12 +148,29 @@ def dummy(): logging.info(f'Setting up transformer engine modules for tensor parallelism.') if model.cfg.get('megatron_amp_O2', 'False'): # when using O2 additional module key is added that casts the weights - for layer in model.model.module.language_model.encoder.layers: - layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) + if model.cfg.get('mcore_gpt', False): + for layer in model.model.module.decoder.layers: + layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) + else: + for layer in model.model.module.language_model.encoder.layers: + layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) else: - for layer in model.model.language_model.encoder.layers: - layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) + if model.cfg.get('mcore_gpt', False): + for module in model.get_gpt_module_list(): + """Set TP group + Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 + """ + # Deep iterate but skip self to avoid infinite recursion. 
+ for index, child in enumerate(module.modules()): + if index == 0: + continue + if hasattr(child, "set_tensor_parallel_group"): + tp_group = parallel_state.get_tensor_model_parallel_group() + child.set_tensor_parallel_group(tp_group) + else: + for layer in model.model.language_model.encoder.layers: + layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) class NeMo_GPT3LM_TP_PP(LM): @@ -241,7 +258,7 @@ def _collate(x): # used to reorder request and remove duplications return -len(toks), tuple(toks) reord = utils.Reorderer(requests, _collate) - request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer) + request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer, self.max_length) request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False) def logits_to_results(batch, response): diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py index 462d28f549..975bdf4b2e 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py @@ -120,7 +120,7 @@ def _collate(x): # used to reorder request and remove duplications return -len(toks), tuple(toks) reord = utils.Reorderer(requests, _collate) - request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer) + request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer, self.max_length) request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False) def logits_to_results(batch, response): @@ -171,6 +171,7 @@ def logits_to_results(batch, response): greedy=True, repetition_penalty=1.0, min_tokens_to_generate=0, + compute_logprob=True, ) response = get_computeprob_response(self.tokenizer, response, inputs) From 4f3421b8ae8fe6f3511c5f6be7166441196c87a9 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:56:17 -0700 Subject: [PATCH 31/62] revert adding nemo_dir Signed-off-by: Hongbin Liu --- launcher_scripts/nemo_launcher/core/stages.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 3277996eb1..4410bd3adf 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -213,9 +213,8 @@ def add_container_mounts(container_mounts): cfg = self.cfg data_dir = cfg.get("data_dir") - nemo_dir = cfg.get("nemo_dir") base_results_dir = cfg.get("base_results_dir") - mounts_string = f"{self._launcher_scripts_path}:{self._launcher_scripts_path},{data_dir}:{data_dir},{base_results_dir}:{base_results_dir},{nemo_dir}:{nemo_dir}" + mounts_string = f"{self._launcher_scripts_path}:{self._launcher_scripts_path},{data_dir}:{data_dir},{base_results_dir}:{base_results_dir}" container_mounts = cfg.get("container_mounts") mounts_string += add_container_mounts(container_mounts) @@ -425,9 +424,10 @@ def get_job_path(self, sub_stage: Optional = None) -> JobPaths: @property def _set_ln_sm_margin(self) -> str: """ Set LayerNorm SM margin when using P2P communication overlap to support the overlap with LayerNorm kernel """ + vpp = self.cfg.training.model.get("virtual_pipeline_model_parallel_size") if (self.cfg.training.model.get("overlap_p2p_comm", False) and 
self.cfg.training.model.get("pipeline_model_parallel_size") > 1 and - self.cfg.training.model.get("virtual_pipeline_model_parallel_size") > 1): + vpp is not None and vpp > 1): get_ln_sm_margin_command = ( f"python3 {self._launcher_scripts_path / 'nemo_launcher/collections/conditional_cfgs.py'} " f"name=get_ln_sm_margin" From 50ba136a5e8ca8187db6afac628851f1ff8d737f Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 06:00:54 -0700 Subject: [PATCH 32/62] add llama2 config for auto-configurator Signed-off-by: Hongbin Liu --- .../base_configs/llama2_13b.yaml | 220 +++++++++++++++++ .../base_configs/llama2_70b.yaml | 225 ++++++++++++++++++ auto_configurator/base_configs/llama2_7b.yaml | 223 +++++++++++++++++ .../conf/search_config/llama/13b.yaml | 40 ++++ .../conf/search_config/llama/70b.yaml | 40 ++++ .../conf/search_config/llama/7b.yaml | 40 ++++ 6 files changed, 788 insertions(+) create mode 100644 auto_configurator/base_configs/llama2_13b.yaml create mode 100644 auto_configurator/base_configs/llama2_70b.yaml create mode 100755 auto_configurator/base_configs/llama2_7b.yaml create mode 100644 auto_configurator/conf/search_config/llama/13b.yaml create mode 100644 auto_configurator/conf/search_config/llama/70b.yaml create mode 100644 auto_configurator/conf/search_config/llama/7b.yaml diff --git a/auto_configurator/base_configs/llama2_13b.yaml b/auto_configurator/base_configs/llama2_13b.yaml new file mode 100644 index 0000000000..a77584a33d --- /dev/null +++ b/auto_configurator/base_configs/llama2_13b.yaml @@ -0,0 +1,220 @@ +run: + name: llama2_13b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 300000 + max_time: '5:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, + ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 40 + hidden_size: 5120 + ffn_hidden_size: 13824 + num_attention_heads: 40 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + 
activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - 
${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/auto_configurator/base_configs/llama2_70b.yaml b/auto_configurator/base_configs/llama2_70b.yaml new file mode 100644 index 0000000000..7697b36e0f --- /dev/null +++ b/auto_configurator/base_configs/llama2_70b.yaml @@ -0,0 +1,225 @@ +run: + name: llama2_70b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 8 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 28672 + num_attention_heads: 64 + num_query_groups: 8 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 1 + 
num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: true + overlap_p2p_comm: false + batch_p2p_comm: true + gc_interval: 100 + optim: + name: distributed_fused_adam + lr: 0.00015 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/auto_configurator/base_configs/llama2_7b.yaml b/auto_configurator/base_configs/llama2_7b.yaml new file mode 100755 index 0000000000..39222af385 --- /dev/null +++ b/auto_configurator/base_configs/llama2_7b.yaml @@ -0,0 +1,223 @@ +run: + name: llama2_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 2 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 
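The `data_prefix` blends in the llama2 base configs above interleave a sampling weight with a preprocessed shard path, one pair per Pile file (the hand-written configs use .0333 per shard and .0334 on the last one). A small sketch of how such a flat blend list can be built; the helper and paths are hypothetical, not part of the launcher.

```python
# Hypothetical helper mirroring the flat [weight, path, weight, path, ...]
# layout of the data_prefix lists above.
def make_data_prefix(data_dir: str, num_shards: int = 30):
    weight = round(1.0 / num_shards, 4)  # ~0.0333 for 30 Pile shards
    blend = []
    for i in range(num_shards):
        blend += [weight, f"{data_dir}/my-llama_{i:02d}_text_document"]
    return blend

print(make_data_prefix("/data")[:4])
# [0.0333, '/data/my-llama_00_text_document', 0.0333, '/data/my-llama_01_text_document']
```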
+exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 64 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 + num_attention_heads: 32 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'sentencepiece' + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true # does not support sequence parallel + + ## Transformer Engine + # fp8 training is currently not supported in the improved models + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: False + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .5 + - ${data_dir}/my-llama_00_text_document + - .5 + - ${data_dir}/my-llama_01_text_document + # - .0333 + # - ${data_dir}/my-gpt3_00_text_document + # - .0333 + # - ${data_dir}/my-gpt3_01_text_document + # - .0333 + # - ${data_dir}/my-gpt3_02_text_document + # - .0333 + # - ${data_dir}/my-gpt3_03_text_document + # - .0333 + # - ${data_dir}/my-gpt3_04_text_document + # - .0333 + # - ${data_dir}/my-gpt3_05_text_document + # - .0333 + # - ${data_dir}/my-gpt3_06_text_document + # - .0333 + # - ${data_dir}/my-gpt3_07_text_document + # - .0333 + # - ${data_dir}/my-gpt3_08_text_document + # - .0333 + # - ${data_dir}/my-gpt3_09_text_document + # - .0333 + # - ${data_dir}/my-gpt3_10_text_document + # - .0333 + # - ${data_dir}/my-gpt3_11_text_document + # - .0333 + # - ${data_dir}/my-gpt3_12_text_document + # - .0333 + # - ${data_dir}/my-gpt3_13_text_document + # - .0333 + # - ${data_dir}/my-gpt3_14_text_document + # - .0333 + # - ${data_dir}/my-gpt3_15_text_document + # - .0333 + # - ${data_dir}/my-gpt3_16_text_document + # - .0333 + # - ${data_dir}/my-gpt3_17_text_document + # - .0333 + # - ${data_dir}/my-gpt3_18_text_document + # - .0333 + # - ${data_dir}/my-gpt3_19_text_document + # - .0333 + # - ${data_dir}/my-gpt3_20_text_document + # - .0333 + # - ${data_dir}/my-gpt3_21_text_document + # - .0333 + # - ${data_dir}/my-gpt3_22_text_document + # - .0333 + # - ${data_dir}/my-gpt3_23_text_document + # - .0333 + # - ${data_dir}/my-gpt3_24_text_document + # - .0333 + # - ${data_dir}/my-gpt3_25_text_document + # - .0333 + # - ${data_dir}/my-gpt3_26_text_document + # - .0333 + # - ${data_dir}/my-gpt3_27_text_document + # - .0333 + # - ${data_dir}/my-gpt3_28_text_document + # - .0334 + # - ${data_dir}/my-gpt3_29_text_document + diff --git a/auto_configurator/conf/search_config/llama/13b.yaml b/auto_configurator/conf/search_config/llama/13b.yaml new file mode 100644 index 0000000000..e18a5f242c --- /dev/null +++ b/auto_configurator/conf/search_config/llama/13b.yaml @@ -0,0 +1,40 @@ +train_settings: + model_size_in_b: 13 # unit in billion parameters + num_nodes: 4 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 1 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 50 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 32000 + seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_13b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: [2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: [1,2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: [1,2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + +inference_settings: + run: + model_type: gpt3 + model_train_name: gpt3_5b + gpus_per_node: 8 + data_type: "fp16" # fp32|fp16|bf16 + time_limit: 0:30:00 + results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb + tensor_parallel_sizes: [1,2,4] + pipeline_parallel_sizes: [1,2] + benchmark: + input_len: 60 + output_len: 20 + batch_sizes: [4,8,16,32,64,128,256] + beam_width: 1 + topk: 4 + topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/70b.yaml b/auto_configurator/conf/search_config/llama/70b.yaml new file mode 100644 index 0000000000..eb2d089064 --- /dev/null +++ b/auto_configurator/conf/search_config/llama/70b.yaml @@ -0,0 +1,40 @@ +train_settings: + model_size_in_b: 70 # unit in billion parameters + num_nodes: 8 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 50 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 32000 + seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_70b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + +inference_settings: + run: + model_type: gpt3 + model_train_name: gpt3_5b + gpus_per_node: 8 + data_type: "fp16" # fp32|fp16|bf16 + time_limit: 0:30:00 + results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb + tensor_parallel_sizes: [1,2,4] + pipeline_parallel_sizes: [1,2] + benchmark: + input_len: 60 + output_len: 20 + batch_sizes: [4,8,16,32,64,128,256] + beam_width: 1 + topk: 4 + topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/7b.yaml b/auto_configurator/conf/search_config/llama/7b.yaml new file mode 100644 index 0000000000..148f12ff6c --- /dev/null +++ b/auto_configurator/conf/search_config/llama/7b.yaml @@ -0,0 +1,40 @@ +train_settings: + model_size_in_b: 7 # unit in billion parameters + num_nodes: 2 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 1 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 50 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 32000 + seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + +inference_settings: + run: + model_type: gpt3 + model_train_name: gpt3_5b + gpus_per_node: 8 + data_type: "fp16" # fp32|fp16|bf16 + time_limit: 0:30:00 + results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb + tensor_parallel_sizes: [1,2,4] + pipeline_parallel_sizes: [1,2] + benchmark: + input_len: 60 + output_len: 20 + batch_sizes: [4,8,16,32,64,128,256] + beam_width: 1 + topk: 4 + topp: 0.0 From 91865fb1ea85838d1ca3f43dd5cb9747f23d37d0 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 06:14:21 -0700 Subject: [PATCH 33/62] minor fix for PR Signed-off-by: Hongbin Liu --- .../autoconfig/scripts/compare_throughput.py | 7 +++---- launcher_scripts/conf/cluster/bcm.yaml | 12 ++++++------ launcher_scripts/conf/config.yaml | 5 ++--- launcher_scripts/nemo_launcher/core/stages.py | 4 ++-- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/auto_configurator/autoconfig/scripts/compare_throughput.py b/auto_configurator/autoconfig/scripts/compare_throughput.py index d3b5b3e6c2..dcb56fa833 100644 --- a/auto_configurator/autoconfig/scripts/compare_throughput.py +++ b/auto_configurator/autoconfig/scripts/compare_throughput.py @@ -17,7 +17,6 @@ def main(cfg): model_size = settings_cfg.model_size_in_b output_top_n = settings_cfg.output_top_n nodes = settings_cfg.num_nodes - #nodes = cfg.get("nodes") training_logs = os.path.join(settings_cfg.get("logs"), "training_logs") candidate_configs = os.path.join(settings_cfg.get("logs"), "candidate_configs") @@ -139,9 +138,9 @@ def main(cfg): ea.Reload() try: timing_list = ea.Scalars("train_step_timing") - #if len(timing_list) <= 6: - # continue - timing_list = [x.value for x in timing_list[0:]] + if len(timing_list) <= 6: + continue + timing_list = [x.value for x in timing_list[5:]] avg_global_step_time = round(sum(timing_list) / len(timing_list), 4) samples_per_s = round(gbs / avg_global_step_time, 2) m_tflops, m_tflops_gpu = calculate_tflops( diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index 8ff05b1fe3..ba8f2ebbb0 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,9 +1,9 @@ -partition: luna -account: devtech -exclusive: true +partition: null +account: null +exclusive: True gpus_per_task: null -gpus_per_node: null +gpus_per_node: 8 mem: 0 -job_name_prefix: 'devtech-gpt:' +job_name_prefix: 'nemo-megatron-' srun_args: - - 
--no-container-mount-home + - "--no-container-mount-home" diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 9729c4901d..979644f9b1 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -31,15 +31,14 @@ stages: #- export cluster_type: bcm # bcm or bcp. If bcm, it must match - cluster above. -launcher_scripts_path: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/launcher_scripts # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts +launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts data_dir: ${launcher_scripts_path}/data # Location to store and read the data. -nemo_dir: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/nemo_repo/internal/NeMo base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. - null container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 -wandb_api_key_file: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/NeMo-Megatron-Launcher/wandb_api_key # File where the w&B api key is stored. Key must be on the first line. +wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. env_vars: NCCL_TOPO_FILE: null # Should be a path to an XML file describing the topology diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 9864331805..aa6ac9c0bf 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -382,7 +382,7 @@ def _launcher_scripts_path(self) -> Path: @property def _nemo_code_path(self) -> Path: - return Path(self.cfg.get("nemo_dir", "/opt/NeMo")) + return Path("/opt/NeMo") @property def _data_dir(self) -> Path: @@ -675,6 +675,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: :return: path current stage's essential nemo scripts code :rtype: Path """ + model_type_to_code_path = { "gpt3" : self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", @@ -966,7 +967,6 @@ class EvalHarnessEvaluation(NemoMegatronStage): def __init__(self, cfg): super().__init__(cfg) choice_model_type, choice_name = self.get_stage_config_choice() - #self.prompt_evaluation = choice_model_type == "prompt_gpt3" self.prompt_evaluation = True if "prompt" in choice_model_type else False def setup_stage_vars(self, cfg): From b78aa12f85fdb3a569dbbf3fa25815519d1a65ed Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Fri, 25 Aug 2023 16:33:17 -0500 Subject: [PATCH 34/62] Remove README documentation updates A new user guide will replace the existing README moving forward and the k8s documentation here will no longer be necessary. For legacy purposes, the README updates will be included with the original kubernetes commit for standalone documentation of the kubernetes support. Signed-Off-By: Robert Clark --- README.md | 199 ++---------------------------------------------------- 1 file changed, 7 insertions(+), 192 deletions(-) diff --git a/README.md b/README.md index dbe58feb64..3df4288d54 100755 --- a/README.md +++ b/README.md @@ -19,7 +19,6 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [4.1.1. Common](#411-common) + [4.1.2. 
OCI](#412-oci) + [4.1.3. AWS](#413-aws) - + [4.1.4. Kubernetes](#414-k8s) * [4.2. Cluster Validation](#42-cluster-validation) + [4.2.1. Validation Script Usage](#421-validation-script-usage) + [4.2.2 Running tests manually](#422-running-tests-manually) @@ -33,14 +32,12 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.1.1. Prepare Environment](#511-prepare-environment) - [5.1.1.1. Slurm](#5111-slurm) - [5.1.1.2. Base Command Platform](#5112-base-command-platform) - - [5.1.1.3. Kubernetes](#5113-kubernetes) - - [5.1.1.4. General Configuration](#5114-general-configuration) + - [5.1.1.3. General Configuration](#5113-general-configuration) + [5.1.2. Data Preparation](#512-data-preparation) - [5.1.2.1. Data Preparation for GPT Models](#5121-data-preparation-for-gpt-models) * [5.1.2.1.1. Slurm](#51211-slurm) * [5.1.2.1.2. Base Command Platform](#51212-base-command-platform) - * [5.1.2.1.3. Kubernetes](#51213-kubernetes) - * [5.1.2.1.4. Common](#51214-common) + * [5.1.2.1.3. Common](#51213-common) - [5.1.2.2. Data Preparation for T5 Models](#5122-data-preparation-for-t5-models) * [5.1.2.2.1. Slurm](#51221-slurm) * [5.1.2.2.2. Base Command Platform](#51222-base-command-platform) @@ -88,7 +85,6 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.6.1. GPT Training](#561-gpt-training) - [5.6.1.1. Slurm](#5611-slurm) - [5.6.1.2. Base Command Platform](#5612-base-command-platform) - - [5.6.1.3. Kubernetes](#5613-base-command-platform) + [5.6.2. T5 Training](#562-t5-training) - [5.6.2.1. Slurm](#5621-slurm) - [5.6.2.2. Base Command Platform](#5622-base-command-platform) @@ -104,7 +100,6 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.8.1.1. Common](#5811-common) - [5.8.1.2. Slurm](#5812-slurm) - [5.8.1.3. Base Command Platform](#5813-base-command-platform) - - [5.8.1.4. Kubernetes](#5814-kubernetes) + [5.8.2. T5 Conversion](#582-t5-conversion) - [5.8.2.1. Common](#5821-common) - [5.8.2.2. Slurm](#5822-slurm) @@ -157,8 +152,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.13.1.1. Common](#51311-common) - [5.13.1.2. Slurm](#51312-slurm) - [5.13.1.3. Base Command Platform](#51313-base-command-platform) - - [5.13.1.4. Kubernetes](#51314-kubernetes) - - [5.13.1.5 Interleaved Pipeline Parallelism](#51314-interleaved-pipeline-parallelism) + - [5.13.1.4 Interleaved Pipeline Parallelism](#51314-interleaved-pipeline-parallelism) + [5.13.2. T5 Evaluation](#5132-t5-evaluation) - [5.13.2.1. Common](#51321-common) - [5.13.2.2. Slurm](#51322-slurm) @@ -377,11 +371,6 @@ Figure 1: The GPT family architecture. The 5B variant includes 24 transformer la | HPC-X | 2.13 | | Base Command Manager | 1.0.0 | | DeepOps | 21.06 | -| Kubernetes | 1.27.4 | -| Helm | 3.12.1 | -| GPU Operator | 23.3.2 | -| Network Operator | 23.1.0 | -| KubeFlow Operator | 1.6.0 | ## 4. Cloud Service Providers @@ -432,23 +421,6 @@ On the scheduler node: container: /path/to/nemo_megatron_launcher/nemo_megatron_training.sqsh ``` -#### 4.1.4. Kubernetes - -Data preparation and training GPT models is currently supported on vanilla kubernetes (k8s) clusters. -The launcher scripts will generate a Helm chart for each task based on the config files and launch the job using the chart. 
- -The following is required for running jobs on Kubernetes: - * One or more DGX A100s/H100s as worker nodes - * An NFS filesystem where the data and launcher scripts will be stored which is accessible on all worker and controller nodes - * A head/controller node which has access to the worker nodes and can run `kubectl` and `helm` to launch jobs and can install Python dependencies - * Recent versions of the GPU, Network, and KubeFlow Operators installed - -A secret key needs to be configured to allow kubernetes to pull from the private registry. For example, if pulling the container directly -from NGC, a secret needs to be created to authenticate with the private NGC registry, such as the following: -``` -kubectl create secret docker-registry ngc-registry --docker-server=nvcr.io --docker-username=\$oauthtoken --docker-password= -``` - ### 4.2. Cluster Validation @@ -632,22 +604,7 @@ creating these workspaces (e.g. `nemo_megatron_data_ws` and `nemo_megatron_resul the Base Command Platform User Guide for how to create and work with Base Command Platform workspaces. -##### 5.1.1.3. Kubernetes - - -The launcher scripts need to be downloaded to the NFS filesystem that is -connected to the worker nodes. This can either be copied at -`/opt/NeMo-Megatron-Launcher` from inside the training container or by cloning -this repository. - -Install the NeMo Framework scripts dependencies on the head node/controller of -the cluster where jobs will be launched: - -``` -pip install -r requirements.txt -``` - -##### 5.1.1.4. General Configuration +##### 5.1.1.3. General Configuration The first parameter that must be set is the `launcher_scripts_path` parameter inside the @@ -895,36 +852,8 @@ The command above assumes you want to prepare the entire dataset (files 0-29), a workspace in `/mount/data`, and the results workspace in `/mount/results`. Stdout and stderr are redirected to the `/results/data_gpt3_log.txt` file, so it can be downloaded from NGC. Any other parameter can also be added to the command to modify its behavior. -###### 5.1.2.1.3. Kubernetes - - -To run data preparation on a kubernetes cluster, set both the `cluster` and -`cluster_type` parameters to `k8s` in `conf/config.yaml`. Additionally, set the -`launcher_scripts_path` parameter to the location where the launcher scripts -are located on the NFS filesystem. This must be the same path on all nodes in -the cluster. Ensure the `stages` parameter is set to `data_preparation` and -`data_preparation` in the `defaults` section points to the intended data -preparation script. - -The `conf/config/k8s.yaml` file also needs to be updated with the -kubernetes container registry secret if created earlier (`pull_secret`), the -`shm_size` to determine how much local memory to put in each pod, and the NFS -server and path to where the launcher scripts are saved. These can all be -overridden from the command line using hydra as well. - -Once all of the config files are updated, the data preparation can be launched -from the controller node with: - -``` -python main.py -``` - -This will generate and launch a job via Helm in the default namespace which -can be viewed with `helm show` or `kubectl get pods`. The logs can be followed -with `kubectl logs `. - -###### 5.1.2.1.4. Common - +###### 5.1.2.1.3. Common + Set the configuration for the data preparation job for GPT models in the YAML file: ```yaml @@ -2533,89 +2462,6 @@ Select the cluster related configuration following the NGC documentation. 
Then, use the `python3 main.py` command to launch the job and override the desired parameters from the training job parameters. -##### 5.6.1.3. Kubernetes - - -Set configuration for your Kubernetes cluster in the `conf/cluster/k8s.yaml` file: - -```yaml -pull_secret: null -shm_size: 512Gi -nfs_server: null -nfs_path: null -ib_resource_name: "nvidia.com/hostdev" -ib_count: "8" -``` - -The settings are as follows: - * `pull_secret`: The name of the sercret key created with `kubectl` that will - be used to authenticate with private registries for pulling the training - container. - * `shm_size`: The amount of shared memory to include in the Pods. It is - recommended to use a large value here. - * `nfs_server`: The IP address or hostname of the NFS server that the worker - nodes will read and write data to/from. - * `nfs_path`: The absolute path on the NFS server that should be mounted - inside the Pods. - * `ib_resource_name`: The name of the IB interconnect to attach to Pods for - multi-node training. This is the name that Kubernetes assigns to the NICs as - allocatable resources. - * `ib_count`: The number of IB interconnects to include per node in each pod. - This will likely equal the total number of active/usable compute NICs per - node. - -And set the training job specific parameters in the `conf/training/(model_type)/(model_size).yaml` file, -using the run section: -```yaml -run: - name: gpt3_126m - results_dir: ${base_results_dir}/${.name} - time_limit: "1-12:00:00" - dependency: "singleton" -``` - -To run only the training pipeline and not the data preparation, evaluation or -inference pipelines, set the `conf/config.yaml` file to: - -```yaml -stages: - - training -``` - -Also set the `cluster` and `cluster_type` values to `k8s` in the -`conf/config.yaml` file. - -And then run: -``` -python3 main.py -``` - -Once the launcher is run, it will display the path to the Helm chart that was -generated based on the updated config files. The Helm chart will be located in -the job results directory by default. The chart will be run automatically and -Pods will be started by Kubernetes once resources become available. The status -of the Helm chart can be checked with: - -``` -$ helm list -NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION -gpt-7b-improved default 1 2023-07-17 14:10:11.794541205 -0700 PDT deployed nemo-framework-training-1.0.0 1.0 -``` - -Once allocated, this will spin up N pods for N number of nodes requested. To -view training progress follow the log of the first pod, typically named -`nlp-training-worker-0`. - -Once a job is finished, it will be marked as complete via Helm and can be -uninstalled with (note - replace `` with the name of the Helm chart -as shown in the previous example): - -``` -$ helm uninstall -``` - -The uninstallation will not affect the completed job - it will only mark the -resources as free for Kubernetes to use them for future tasks. #### 5.6.2. T5 Training @@ -2903,22 +2749,6 @@ The command above assumes you mounted the data workspace in `/mount/data`, and t The stdout and stderr outputs will also be redirected to the `/results/convert_gpt3_log.txt` file, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. -##### 5.8.1.4. Kubernetes - -To convert a model to the `.nemo` format on a Kubernetes cluster, set both the -`cluster` and `cluster_type` parameters to `k8s` in `conf/config.yaml`. 
Update -the `conf/conversion/gpt3/convert_gpt3.yaml` config file to point to the model -you would like to convert. - -Once the configs are ready, run: - -``` -python3 main.py -``` - -This will launch a Helm chart that will spawn a job that runs on one of the -compute nodes to convert the requested model to the `.nemo` format. - #### 5.8.2. T5 Conversion @@ -4098,22 +3928,7 @@ The command above assumes you mounted the data workspace in `/mount/data`, and t The stdout and stderr outputs will also be redirected to the `/results/eval_gpt3_log.txt` file, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. -##### 5.13.1.4. Kubernetes - -To evaluate base models on Kubernetes clusters, set the `cluster` and -`cluster_type` parameters to `k8s` in `conf/config.yaml`. Update either the -`conf/evaluation/gpt3/evaluate_all.yaml` or `conf/evaluation/gpt3/evaluate_lambada.yaml` -file based on your cluster and desired evaluation tasks. Once the configurations -are updated, launch an evaluation job with: - -``` -python3 main.py -``` - -This will launch a Helm chart based on the evaluation configurations which will -download all task files and run evaluation against the specified model. - -##### 5.13.1.5 Interleaved Pipeline Parallelism +##### 5.13.1.4 Interleaved Pipeline Parallelism If your model was trained with interleaved pipeline parallelism, then the model must converted to a non-interleaved model. In order to check if your model used interleaved, inspect the training config and verify that From eef8c20972403b53e8615e8f9caafe02115ed265 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Fri, 25 Aug 2023 21:30:30 -0700 Subject: [PATCH 35/62] Update squad.yaml Update squad.yaml based on update NeMo toolkit PEFT yaml file https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml. NeMo Launcher ptuning, lora and adapter tests could PASS on BCP. --- launcher_scripts/conf/peft/gpt3/squad.yaml | 123 +++++++++++---------- 1 file changed, 64 insertions(+), 59 deletions(-) diff --git a/launcher_scripts/conf/peft/gpt3/squad.yaml b/launcher_scripts/conf/peft/gpt3/squad.yaml index e3da77bba7..fec6a0e8ad 100644 --- a/launcher_scripts/conf/peft/gpt3/squad.yaml +++ b/launcher_scripts/conf/peft/gpt3/squad.yaml @@ -11,20 +11,18 @@ run: results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name} trainer: - devices: 8 - num_nodes: 1 + devices: 1 accelerator: gpu - precision: bf16 - logger: False + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False - max_epochs: 4 - max_steps: -1 - log_every_n_steps: 10 - val_check_interval: 200 - accumulate_grad_batches: 1 + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch gradient_clip_val: 1.0 - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
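The comment on `max_steps` in the trainer block above defines how consumed samples relate to optimizer steps. A quick back-of-the-envelope check of that formula; the batch and parallelism values below are purely illustrative, not taken from this config.

```python
# consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
global_step = 20000           # max_steps above
micro_batch_size = 4
data_parallel_size = 8        # e.g. 8 GPUs with TP=1 and PP=1
accumulate_grad_batches = 1
consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
print(consumed_samples)       # 640000 sequences seen over the run
```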
exp_manager: explicit_log_dir: null @@ -62,7 +60,7 @@ model: global_batch_size: 128 micro_batch_size: 4 - restore_from_path: ${peft.run.convert_dir}/results/megatron_gpt.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. sync_batch_comm: False @@ -80,6 +78,7 @@ model: # of each chunk at the specified granularity # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null answer_only_loss: True gradient_as_bucket_view: False @@ -88,7 +87,7 @@ model: ffn_dropout: 0.0 peft: - peft_scheme: "ptuning" # can be either adapter,ia3, or ptuning + peft_scheme: "adapter" # can be either adapter,ia3, or ptuning restore_from_path: null # Used for adapter peft training @@ -96,16 +95,22 @@ model: type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' adapter_dim: 32 adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True lora_tuning: adapter_dim: 32 adapter_dropout: 0.0 column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True # Used for p-tuning peft training p_tuning: @@ -113,9 +118,11 @@ model: bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck embedding_dim: 1024 # the size of the prompt encoder embeddings init_std: 0.023 - + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + data: - chat: False # whether use chatbot data or not train_ds: # Example of how to specify paths to multiple datasets # file_names: @@ -124,12 +131,12 @@ model: # - /path/to/boolq.jsonl # Example of how each dataset is formatted # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - file_names: - - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. global_batch_size: ${peft.model.global_batch_size} micro_batch_size: ${peft.model.micro_batch_size} shuffle: True - num_workers: 4 + num_workers: 0 + memmap_workers: 2 pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -139,59 +146,29 @@ model: # - 0.5 # - 0.25 # - 0.25 - concat_sampling_probabilities: - - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' context_key: 'input' label_key: 'output' add_eos: True add_sep: False add_bos: False - separate_prompt_and_response_with_newline: True + separate_prompt_and_response_with_newline: False truncation_field: "context" # Options: ['context', 'answer'] index_mapping_dir: null # Path to a directory to write index mapping files. prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" validation_ds: - file_names: - - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: - - ${peft.run.task_name} # Names of the corresponding datasets used to log metrics. - global_batch_size: ${peft.model.global_batch_size} - micro_batch_size: ${peft.model.micro_batch_size} - shuffle: True - num_workers: 4 - pin_memory: True - max_seq_length: ${peft.model.data.train_ds.max_seq_length} - min_seq_length: ${peft.model.data.train_ds.min_seq_length} - drop_last: True - context_key: 'input' - label_key: 'output' - add_eos: ${peft.model.data.train_ds.add_eos} - add_sep: ${peft.model.data.train_ds.add_sep} - add_bos: ${peft.model.data.train_ds.add_bos} - separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - truncation_field: "context" # Options: ['context', 'answer'] - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - test_ds: file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. names: null # Names of the corresponding datasets used to log metrics. 
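Each PEFT training record is a single JSONL object with `input` and `output` fields, and `prompt_template` controls how the two are stitched into the training string. A minimal illustration (the record below is made up):

```python
import json

record = {"input": "Q: What year did the SQuAD dataset first appear?",
          "output": "2016"}
prompt_template = "{input} {output}"      # train_ds.prompt_template above
print(json.dumps(record))                 # one line of the JSONL file
print(prompt_template.format(**record))   # text the model is trained on
```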
global_batch_size: ${peft.model.global_batch_size} micro_batch_size: ${peft.model.micro_batch_size} - shuffle: True - num_workers: 4 + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} pin_memory: True - max_seq_length: ${peft.model.data.train_ds.max_seq_length} - min_seq_length: ${peft.model.data.train_ds.min_seq_length} - drop_last: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False context_key: 'input' label_key: 'output' add_eos: ${peft.model.data.train_ds.add_eos} @@ -203,12 +180,40 @@ model: truncation_field: "context" # Options: ['context', 'answer'] index_mapping_dir: null # Path to a directory to write index mapping files. prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics metric: name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. num_classes: null - + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
+ num_classes: null + optim: name: fused_adam lr: 1e-4 From 64cce78192157570073e83b43b362e4d3203487b Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 30 Aug 2023 19:38:07 -0700 Subject: [PATCH 36/62] add end_string for llama evaluation Signed-off-by: Hongbin Liu --- .../collections/eval_harness/lm_eval/models/nemo_llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py index 975bdf4b2e..4c7f5e56b8 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py @@ -172,6 +172,7 @@ def logits_to_results(batch, response): repetition_penalty=1.0, min_tokens_to_generate=0, compute_logprob=True, + end_strings=[''], ) response = get_computeprob_response(self.tokenizer, response, inputs) From d8b4e4d1048699a3ec877cbb2a45bda83e7f29be Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 30 Aug 2023 19:45:43 -0700 Subject: [PATCH 37/62] update evaluation configs for llama Signed-off-by: Hongbin Liu --- .../conf/evaluation/llama/evaluate_all.yaml | 2 +- .../conf/evaluation/llama/evaluate_boolq.yaml | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100755 launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml index 047f72b866..e354d6ee68 100755 --- a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml +++ b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml @@ -16,7 +16,7 @@ model: #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} precision: bf16 # must match training precision - 32, 16 or bf16 diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml new file mode 100755 index 0000000000..49ba25236c --- /dev/null +++ b/launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml @@ -0,0 +1,24 @@ +run: + name: ${.eval_name}_${.model_train_name} + time_limit: "02:00:00" + dependency: "singleton" + nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node + ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} + eval_name: eval_boolq + model_train_name: llama2_7b + train_dir: ${base_results_dir}/${.model_train_name} + tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks + results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} + +model: + model_type: nemo-llama + nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints + #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints + #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) + #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + precision: bf16 # must match training precision - 32, 16 or bf16 + eval_batch_size: 4 + #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model From a9a7de409729e0839b9d92d66461313bdfca899c Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 1 Sep 2023 13:44:17 -0700 Subject: [PATCH 38/62] config changes Signed-off-by: dimapihtar --- launcher_scripts/conf/training/gpt3/126m.yaml | 6 ++++++ launcher_scripts/conf/training/gpt3/175b.yaml | 8 ++++++-- launcher_scripts/conf/training/gpt3/175b_performance.yaml | 4 ++++ launcher_scripts/conf/training/gpt3/1b_improved.yaml | 8 +++++++- launcher_scripts/conf/training/gpt3/20b.yaml | 6 ++++++ launcher_scripts/conf/training/gpt3/400m_improved.yaml | 8 +++++++- launcher_scripts/conf/training/gpt3/40b.yaml | 6 ++++++ launcher_scripts/conf/training/gpt3/40b_improved.yaml | 8 +++++++- launcher_scripts/conf/training/gpt3/5b.yaml | 6 ++++++ launcher_scripts/conf/training/gpt3/7b_improved.yaml | 8 +++++++- 10 files changed, 62 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index affee0765e..2e8cd73053 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -64,6 +64,7 @@ model: num_attention_heads: 12 init_method_std: 0.023 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -90,6 +91,10 @@ model: ## Sequence Parallelism sequence_parallel: False + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -120,6 +125,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index 493d24d516..971bd4ccdb 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -63,6 +63,7 @@ model: num_attention_heads: 96 init_method_std: 0.006 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention kv_channels: null # Projection weights dimension in multi-head attention. 
Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -89,6 +90,10 @@ model: ## Sequence Parallelism sequence_parallel: True + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -119,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: True @@ -128,8 +134,6 @@ model: use_cpu_initialization: False # Init weights on the CPU (slow for large models) onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - overlap_p2p_comm: True # Overlap p2p communication with computes - batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations # Nsys profiling options nsys_profile: diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml index 976deda501..780e636ba8 100755 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_performance.yaml @@ -65,6 +65,7 @@ model: num_attention_heads: 96 init_method_std: 0.006 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -90,6 +91,8 @@ model: ## Sequence Parallelism sequence_parallel: True + + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. tokenizer: library: 'megatron' @@ -121,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: True diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index 1ff6b3dbf0..e6d473c840 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -66,7 +66,8 @@ model: ffn_dropout: 0.0 kv_channels: null apply_query_key_layer_scaling: true - normalization: layernorm1p + normalization: LayerNorm + layernorm_zero_centered_gamma: True layernorm_epsilon: 1.0e-05 do_layer_norm_weight_decay: false make_vocab_size_divisible_by: 128 @@ -116,6 +117,10 @@ model: activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False @@ -126,6 +131,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index e48788e197..b4185d922f 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -63,6 +63,7 @@ model: num_attention_heads: 48 init_method_std: 0.008165 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -89,6 +90,10 @@ model: ## Sequence Parallelism sequence_parallel: True + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -119,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: True diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index 5b1e6b915f..e4b4a6e31f 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -66,7 +66,8 @@ model: ffn_dropout: 0.0 kv_channels: null apply_query_key_layer_scaling: true - normalization: layernorm1p + normalization: LayerNorm + layernorm_zero_centered_gamma: True layernorm_epsilon: 1.0e-05 do_layer_norm_weight_decay: false make_vocab_size_divisible_by: 128 @@ -116,6 +117,10 @@ model: activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False @@ -126,6 +131,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index 84c1802bc9..bf660ea9b4 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -63,6 +63,7 @@ model: num_attention_heads: 64 init_method_std: 0.007 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -89,6 +90,10 @@ model: ## Sequence Parallelism sequence_parallel: True + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -119,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: True diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index 8686a171be..af5f14f2b2 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -66,7 +66,8 @@ model: ffn_dropout: 0.0 kv_channels: null apply_query_key_layer_scaling: true - normalization: layernorm1p + normalization: LayerNorm + layernorm_zero_centered_gamma: True layernorm_epsilon: 1.0e-05 do_layer_norm_weight_decay: false make_vocab_size_divisible_by: 128 @@ -116,6 +117,10 @@ model: activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False @@ -126,6 +131,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index ae99d3e063..0ba8d80b89 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -63,6 +63,7 @@ model: num_attention_heads: 32 init_method_std: 0.01 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -89,6 +90,10 @@ model: ## Sequence Parallelism sequence_parallel: False + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -119,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index 0eec1b43ba..8cd14cad59 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -66,7 +66,8 @@ model: ffn_dropout: 0.0 kv_channels: null apply_query_key_layer_scaling: true - normalization: layernorm1p + normalization: LayerNorm + layernorm_zero_centered_gamma: True layernorm_epsilon: 1.0e-05 do_layer_norm_weight_decay: false make_vocab_size_divisible_by: 128 @@ -116,6 +117,10 @@ model: activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False @@ -126,6 +131,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False From 5ee14f1730562e3b0a97b2583301a691a8231e1b Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 5 Sep 2023 22:23:08 +0300 Subject: [PATCH 39/62] Update config.yaml --- launcher_scripts/conf/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index d172a06c98..523edfff11 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -45,7 +45,7 @@ env_vars: NCCL_DEBUG: null # Logging level for NCCL. Set to "INFO" for debug information NCCL_PROTO: null # Protocol NCCL will use. 
Set to "simple" for AWS TRANSFORMERS_OFFLINE: 1 - NCCL_AVOID_RECORD_STREAMS: 1 + TORCH_NCCL_AVOID_RECORD_STREAMS: 1 # GPU Mapping numa_mapping: From a95eb7167b3a68ee99b0e49125676f5571790cf6 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 02:19:02 -0700 Subject: [PATCH 40/62] update llama training scripts Signed-off-by: Hongbin Liu --- .../llama/{13b.yaml => llama1_13b.yaml} | 15 +- .../llama/{30b.yaml => llama1_30b.yaml} | 16 +- .../llama/{65b.yaml => llama1_65b.yaml} | 17 +-- .../llama/{7b.yaml => llama1_7b.yaml} | 11 +- .../conf/training/llama/llama2_13b.yaml | 13 +- .../conf/training/llama/llama2_70b.yaml | 15 +- .../conf/training/llama/llama2_7b.yaml | 137 +++++++++--------- 7 files changed, 109 insertions(+), 115 deletions(-) rename launcher_scripts/conf/training/llama/{13b.yaml => llama1_13b.yaml} (96%) rename launcher_scripts/conf/training/llama/{30b.yaml => llama1_30b.yaml} (94%) rename launcher_scripts/conf/training/llama/{65b.yaml => llama1_65b.yaml} (94%) rename launcher_scripts/conf/training/llama/{7b.yaml => llama1_7b.yaml} (97%) diff --git a/launcher_scripts/conf/training/llama/13b.yaml b/launcher_scripts/conf/training/llama/llama1_13b.yaml similarity index 96% rename from launcher_scripts/conf/training/llama/13b.yaml rename to launcher_scripts/conf/training/llama/llama1_13b.yaml index e06835be27..2d71dadb3a 100644 --- a/launcher_scripts/conf/training/llama/13b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_13b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-02:00:00 dependency: singleton trainer: - num_nodes: 4 + num_nodes: 32 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -48,9 +47,9 @@ exp_manager: model: mcore_gpt: true micro_batch_size: 2 - global_batch_size: 128 + global_batch_size: 2048 rampup_batch_size: null - tensor_model_parallel_size: 4 + tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 2048 @@ -110,12 +109,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 - sequence_parallel: false + sequence_parallel: true transformer_engine: true fp8: false fp8_e4m3: false @@ -126,7 +125,7 @@ model: fp8_amax_compute_algo: most_recent use_emha: false ub_tp_comm_overlap: false - use_flash_attention: false + use_flash_attention: true optim: name: distributed_fused_adam lr: 0.0001 @@ -135,7 +134,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: false + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 107 diff --git a/launcher_scripts/conf/training/llama/30b.yaml b/launcher_scripts/conf/training/llama/llama1_30b.yaml similarity index 94% rename from launcher_scripts/conf/training/llama/30b.yaml rename to launcher_scripts/conf/training/llama/llama1_30b.yaml index ebdfa06f02..43cc420c46 100644 --- a/launcher_scripts/conf/training/llama/30b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_30b.yaml @@ -49,7 +49,7 @@ model: global_batch_size: 2048 rampup_batch_size: null tensor_model_parallel_size: 4 - pipeline_model_parallel_size: 4 + 
pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 2048 max_position_embeddings: 2048 @@ -84,7 +84,7 @@ model: tokenizer: library: sentencepiece type: null - model: ${data_dir}/llama/llama_tokenizer.model + model: ${data_dir}/llama/llama_tokenizer.model delimiter: null vocab_file: null merge_file: null @@ -108,12 +108,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 - num_micro_batches_with_partial_activation_checkpoints: 2 - activations_checkpoint_layers_per_pipeline: 32 - sequence_parallel: false + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true transformer_engine: true fp8: false fp8_e4m3: false @@ -124,7 +124,7 @@ model: fp8_amax_compute_algo: most_recent use_emha: false ub_tp_comm_overlap: false - use_flash_attention: false + use_flash_attention: true optim: name: distributed_fused_adam lr: 0.0001 @@ -133,7 +133,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: false + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 107 diff --git a/launcher_scripts/conf/training/llama/65b.yaml b/launcher_scripts/conf/training/llama/llama1_65b.yaml similarity index 94% rename from launcher_scripts/conf/training/llama/65b.yaml rename to launcher_scripts/conf/training/llama/llama1_65b.yaml index 0f65d40071..e61cf1ca59 100644 --- a/launcher_scripts/conf/training/llama/65b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_65b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 16 + num_nodes: 128 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -50,8 +49,8 @@ model: global_batch_size: 2048 rampup_batch_size: null tensor_model_parallel_size: 4 - pipeline_model_parallel_size: 8 - virtual_pipeline_model_parallel_size: 10 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: 20 encoder_seq_length: 2048 max_position_embeddings: 2048 num_layers: 80 @@ -109,12 +108,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 - num_micro_batches_with_partial_activation_checkpoints: 80 + num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 - sequence_parallel: false + sequence_parallel: true transformer_engine: true fp8: false fp8_e4m3: false @@ -125,7 +124,7 @@ model: fp8_amax_compute_algo: most_recent use_emha: false ub_tp_comm_overlap: false - use_flash_attention: false + use_flash_attention: true optim: name: distributed_fused_adam lr: 0.0001 @@ -134,7 +133,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: false + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 107 diff --git a/launcher_scripts/conf/training/llama/7b.yaml b/launcher_scripts/conf/training/llama/llama1_7b.yaml similarity index 97% rename from launcher_scripts/conf/training/llama/7b.yaml rename to 
launcher_scripts/conf/training/llama/llama1_7b.yaml index cc1bb32c15..090a0881a2 100644 --- a/launcher_scripts/conf/training/llama/7b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_7b.yaml @@ -4,7 +4,7 @@ run: time_limit: "0-04:00:00" dependency: "singleton" trainer: - num_nodes: 2 + num_nodes: 16 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -48,9 +47,9 @@ exp_manager: model: mcore_gpt: true micro_batch_size: 2 - global_batch_size: 128 + global_batch_size: 2048 rampup_batch_size: null - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 2048 @@ -110,7 +109,7 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: null @@ -138,7 +137,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: False + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 500 diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml index a77584a33d..1e1ff0c7d8 100644 --- a/launcher_scripts/conf/training/llama/llama2_13b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 4 + num_nodes: 32 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -47,10 +46,10 @@ exp_manager: buffer_size: 5 model: mcore_gpt: true - micro_batch_size: 2 - global_batch_size: 128 + micro_batch_size: 1 + global_batch_size: 2048 rampup_batch_size: null - tensor_model_parallel_size: 4 + tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 4096 @@ -110,7 +109,7 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 @@ -135,7 +134,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: false + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 107 diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml index 7697b36e0f..5cee07a21a 100644 --- a/launcher_scripts/conf/training/llama/llama2_70b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 8 + num_nodes: 128 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -47,11 +46,11 @@ exp_manager: model: mcore_gpt: true micro_batch_size: 1 - 
global_batch_size: 128 + global_batch_size: 2048 rampup_batch_size: null tensor_model_parallel_size: 4 pipeline_model_parallel_size: 4 - virtual_pipeline_model_parallel_size: null + virtual_pipeline_model_parallel_size: 20 encoder_seq_length: 4096 max_position_embeddings: 4096 num_layers: 80 @@ -110,9 +109,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block - activations_checkpoint_num_layers: 1 + activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 sequence_parallel: true @@ -127,8 +126,8 @@ model: use_emha: false ub_tp_comm_overlap: false use_flash_attention: true - overlap_p2p_comm: false - batch_p2p_comm: true + overlap_p2p_comm: true + batch_p2p_comm: false gc_interval: 100 optim: name: distributed_fused_adam diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml index dcc2887bcf..87bac9f902 100644 --- a/launcher_scripts/conf/training/llama/llama2_7b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml @@ -1,10 +1,10 @@ run: name: llama2_7b results_dir: ${base_results_dir}/${.name} - time_limit: "0-01:00:00" + time_limit: "0-01:30:00" dependency: "singleton" trainer: - num_nodes: 2 + num_nodes: 16 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -47,10 +46,10 @@ exp_manager: model: mcore_gpt: true - micro_batch_size: 2 - global_batch_size: 128 + micro_batch_size: 1 + global_batch_size: 2048 rampup_batch_size: null - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 4096 @@ -110,12 +109,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null - sequence_parallel: true # does not support sequence parallel + sequence_parallel: false ## Transformer Engine # fp8 training is currently not supported in the improved models @@ -138,7 +137,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: False + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 500 @@ -160,64 +159,64 @@ model: - ${data_dir}/my-llama_00_text_document - .0333 - ${data_dir}/my-llama_01_text_document - - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - - ${data_dir}/my-llama_02_text_document - - .0333 - - ${data_dir}/my-llama_03_text_document - - .0333 - - ${data_dir}/my-llama_04_text_document - - .0333 - - ${data_dir}/my-llama_05_text_document - - .0333 - - ${data_dir}/my-llama_06_text_document - - .0333 - - ${data_dir}/my-llama_07_text_document - - .0333 - - ${data_dir}/my-llama_08_text_document - - .0333 - - ${data_dir}/my-llama_09_text_document - - .0333 - - ${data_dir}/my-llama_10_text_document - - .0333 - - ${data_dir}/my-llama_11_text_document - - .0333 - - 
${data_dir}/my-llama_12_text_document - - .0333 - - ${data_dir}/my-llama_13_text_document - - .0333 - - ${data_dir}/my-llama_14_text_document - - .0333 - - ${data_dir}/my-llama_15_text_document - - .0333 - - ${data_dir}/my-llama_16_text_document - - .0333 - - ${data_dir}/my-llama_17_text_document - - .0333 - - ${data_dir}/my-llama_18_text_document - - .0333 - - ${data_dir}/my-llama_19_text_document - - .0333 - - ${data_dir}/my-llama_20_text_document - - .0333 - - ${data_dir}/my-llama_21_text_document - - .0333 - - ${data_dir}/my-llama_22_text_document - - .0333 - - ${data_dir}/my-llama_23_text_document - - .0333 - - ${data_dir}/my-llama_24_text_document - - .0333 - - ${data_dir}/my-llama_25_text_document - - .0333 - - ${data_dir}/my-llama_26_text_document - - .0333 - - ${data_dir}/my-llama_27_text_document - - .0333 - - ${data_dir}/my-llama_28_text_document - - .0334 - - ${data_dir}/my-llama_29_text_document + #- .0333 + #- ${data_dir}/my-llama_00_text_document + #- .0333 + #- ${data_dir}/my-llama_01_text_document + #- .0333 + #- ${data_dir}/my-llama_02_text_document + #- .0333 + #- ${data_dir}/my-llama_03_text_document + #- .0333 + #- ${data_dir}/my-llama_04_text_document + #- .0333 + #- ${data_dir}/my-llama_05_text_document + #- .0333 + #- ${data_dir}/my-llama_06_text_document + #- .0333 + #- ${data_dir}/my-llama_07_text_document + #- .0333 + #- ${data_dir}/my-llama_08_text_document + #- .0333 + #- ${data_dir}/my-llama_09_text_document + #- .0333 + #- ${data_dir}/my-llama_10_text_document + #- .0333 + #- ${data_dir}/my-llama_11_text_document + #- .0333 + #- ${data_dir}/my-llama_12_text_document + #- .0333 + #- ${data_dir}/my-llama_13_text_document + #- .0333 + #- ${data_dir}/my-llama_14_text_document + #- .0333 + #- ${data_dir}/my-llama_15_text_document + #- .0333 + #- ${data_dir}/my-llama_16_text_document + #- .0333 + #- ${data_dir}/my-llama_17_text_document + #- .0333 + #- ${data_dir}/my-llama_18_text_document + #- .0333 + #- ${data_dir}/my-llama_19_text_document + #- .0333 + #- ${data_dir}/my-llama_20_text_document + #- .0333 + #- ${data_dir}/my-llama_21_text_document + #- .0333 + #- ${data_dir}/my-llama_22_text_document + #- .0333 + #- ${data_dir}/my-llama_23_text_document + #- .0333 + #- ${data_dir}/my-llama_24_text_document + #- .0333 + #- ${data_dir}/my-llama_25_text_document + #- .0333 + #- ${data_dir}/my-llama_26_text_document + #- .0333 + #- ${data_dir}/my-llama_27_text_document + #- .0333 + #- ${data_dir}/my-llama_28_text_document + #- .0334 + #- ${data_dir}/my-llama_29_text_document From 45b9f41037d79a81475e106a6bd98cd0114399f6 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 02:22:00 -0700 Subject: [PATCH 41/62] update data preparation script for llama Signed-off-by: Hongbin Liu --- .../conf/data_preparation/llama/download_llama_pile.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml index 863f817661..ab317e8a8d 100755 --- a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml +++ b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml @@ -10,14 +10,9 @@ run: dataset: pile download_the_pile: True # Whether to download the pile dataset from the internet. the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. -file_numbers: "0-1" # The pile dataset consists of 30 files (0-29), choose which ones to download. 
+file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. download_tokenizer_url: "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model" -#download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. -#download_merges_url: "https://huggingface.co/gpt2/resolve/main/merges.txt" # URL to download the merges from. -#vocab_save_dir: ${data_dir}/bpe -#merges_save_dir: ${data_dir}/bpe -#tokenizer_type: GPT2BPETokenizer tokenizer_library: "sentencepiece" tokenizer_save_dir: ${data_dir}/llama tokenizer_model: ${.tokenizer_save_dir}/llama_tokenizer.model From 702ea64ee603b64b6fc97de0f2a6d5968cf2893d Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 04:03:56 -0700 Subject: [PATCH 42/62] update llama training scripts Signed-off-by: Hongbin Liu --- .../conf/training/llama/llama1_13b.yaml | 6 +- .../conf/training/llama/llama1_30b.yaml | 8 +- .../conf/training/llama/llama1_65b.yaml | 6 +- .../conf/training/llama/llama1_7b.yaml | 6 +- .../conf/training/llama/llama2_13b.yaml | 6 +- .../conf/training/llama/llama2_70b.yaml | 4 - .../conf/training/llama/llama2_7b.yaml | 118 +++++++++--------- 7 files changed, 69 insertions(+), 85 deletions(-) diff --git a/launcher_scripts/conf/training/llama/llama1_13b.yaml b/launcher_scripts/conf/training/llama/llama1_13b.yaml index 2d71dadb3a..3c2fd60daf 100644 --- a/launcher_scripts/conf/training/llama/llama1_13b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_13b.yaml @@ -135,6 +135,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 107 @@ -157,10 +159,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama1_30b.yaml b/launcher_scripts/conf/training/llama/llama1_30b.yaml index 43cc420c46..93dee04071 100644 --- a/launcher_scripts/conf/training/llama/llama1_30b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_30b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 16 + num_nodes: 32 devices: 8 accelerator: gpu precision: bf16 @@ -134,6 +134,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 107 @@ -156,10 +158,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama1_65b.yaml b/launcher_scripts/conf/training/llama/llama1_65b.yaml index e61cf1ca59..d39259caae 100644 --- a/launcher_scripts/conf/training/llama/llama1_65b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_65b.yaml @@ -134,6 +134,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 107 @@ -156,10 +158,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - 
${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama1_7b.yaml b/launcher_scripts/conf/training/llama/llama1_7b.yaml index 090a0881a2..a8acb21e7d 100644 --- a/launcher_scripts/conf/training/llama/llama1_7b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_7b.yaml @@ -138,6 +138,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 500 @@ -160,10 +162,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml index 1e1ff0c7d8..3d4dc8d0b1 100644 --- a/launcher_scripts/conf/training/llama/llama2_13b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml @@ -135,6 +135,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 107 @@ -157,10 +159,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml index 5cee07a21a..0beb5f8bca 100644 --- a/launcher_scripts/conf/training/llama/llama2_70b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml @@ -162,10 +162,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml index 87bac9f902..7df5de9940 100644 --- a/launcher_scripts/conf/training/llama/llama2_7b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml @@ -138,6 +138,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 500 @@ -159,64 +161,60 @@ model: - ${data_dir}/my-llama_00_text_document - .0333 - ${data_dir}/my-llama_01_text_document - #- .0333 - #- ${data_dir}/my-llama_00_text_document - #- .0333 - #- ${data_dir}/my-llama_01_text_document - #- .0333 - #- ${data_dir}/my-llama_02_text_document - #- .0333 - #- ${data_dir}/my-llama_03_text_document - #- .0333 - #- ${data_dir}/my-llama_04_text_document - #- .0333 - #- ${data_dir}/my-llama_05_text_document - #- .0333 - #- ${data_dir}/my-llama_06_text_document - #- .0333 - #- ${data_dir}/my-llama_07_text_document - #- .0333 - #- ${data_dir}/my-llama_08_text_document - #- .0333 - #- ${data_dir}/my-llama_09_text_document - #- .0333 - #- ${data_dir}/my-llama_10_text_document - #- .0333 - #- ${data_dir}/my-llama_11_text_document - #- .0333 - #- ${data_dir}/my-llama_12_text_document - #- .0333 - #- ${data_dir}/my-llama_13_text_document - #- .0333 - #- 
${data_dir}/my-llama_14_text_document - #- .0333 - #- ${data_dir}/my-llama_15_text_document - #- .0333 - #- ${data_dir}/my-llama_16_text_document - #- .0333 - #- ${data_dir}/my-llama_17_text_document - #- .0333 - #- ${data_dir}/my-llama_18_text_document - #- .0333 - #- ${data_dir}/my-llama_19_text_document - #- .0333 - #- ${data_dir}/my-llama_20_text_document - #- .0333 - #- ${data_dir}/my-llama_21_text_document - #- .0333 - #- ${data_dir}/my-llama_22_text_document - #- .0333 - #- ${data_dir}/my-llama_23_text_document - #- .0333 - #- ${data_dir}/my-llama_24_text_document - #- .0333 - #- ${data_dir}/my-llama_25_text_document - #- .0333 - #- ${data_dir}/my-llama_26_text_document - #- .0333 - #- ${data_dir}/my-llama_27_text_document - #- .0333 - #- ${data_dir}/my-llama_28_text_document - #- .0334 - #- ${data_dir}/my-llama_29_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document From 4a5c91e19b4447edf18fade4708533493100032d Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 04:08:54 -0700 Subject: [PATCH 43/62] update llama2 config in auto configurator Signed-off-by: Hongbin Liu --- .../base_configs/llama2_13b.yaml | 13 +- .../base_configs/llama2_70b.yaml | 11 +- auto_configurator/base_configs/llama2_7b.yaml | 130 +++++++++--------- 3 files changed, 70 insertions(+), 84 deletions(-) diff --git a/auto_configurator/base_configs/llama2_13b.yaml b/auto_configurator/base_configs/llama2_13b.yaml index a77584a33d..b3f20fd0c2 100644 --- a/auto_configurator/base_configs/llama2_13b.yaml +++ b/auto_configurator/base_configs/llama2_13b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 4 + num_nodes: 2 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -37,7 +36,7 @@ exp_manager: save_top_k: 10 mode: min always_save_nemo: false - save_nemo_on_train_end: true + save_nemo_on_train_end: false filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} 
model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} @@ -50,7 +49,7 @@ model: micro_batch_size: 2 global_batch_size: 128 rampup_batch_size: null - tensor_model_parallel_size: 4 + tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 4096 @@ -110,7 +109,7 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 @@ -158,10 +157,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/auto_configurator/base_configs/llama2_70b.yaml b/auto_configurator/base_configs/llama2_70b.yaml index 7697b36e0f..f624f5bb03 100644 --- a/auto_configurator/base_configs/llama2_70b.yaml +++ b/auto_configurator/base_configs/llama2_70b.yaml @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -37,7 +36,7 @@ exp_manager: save_top_k: 10 mode: min always_save_nemo: false - save_nemo_on_train_end: true + save_nemo_on_train_end: false filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} log_step_timing: true @@ -110,9 +109,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block - activations_checkpoint_num_layers: 1 + activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 sequence_parallel: true @@ -163,10 +162,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/auto_configurator/base_configs/llama2_7b.yaml b/auto_configurator/base_configs/llama2_7b.yaml index 39222af385..95733d1f53 100755 --- a/auto_configurator/base_configs/llama2_7b.yaml +++ b/auto_configurator/base_configs/llama2_7b.yaml @@ -4,7 +4,7 @@ run: time_limit: "0-01:00:00" dependency: "singleton" trainer: - num_nodes: 2 + num_nodes: 1 devices: 8 accelerator: gpu precision: bf16 @@ -48,9 +48,9 @@ exp_manager: model: mcore_gpt: true micro_batch_size: 2 - global_batch_size: 64 + global_batch_size: 128 rampup_batch_size: null - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 4096 @@ -110,12 +110,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 
num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null - sequence_parallel: true # does not support sequence parallel + sequence_parallel: false # does not support sequence parallel ## Transformer Engine # fp8 training is currently not supported in the improved models @@ -156,68 +156,64 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - .5 + - .0333 - ${data_dir}/my-llama_00_text_document - - .5 + - .0333 - ${data_dir}/my-llama_01_text_document - # - .0333 - # - ${data_dir}/my-gpt3_00_text_document - # - .0333 - # - ${data_dir}/my-gpt3_01_text_document - # - .0333 - # - ${data_dir}/my-gpt3_02_text_document - # - .0333 - # - ${data_dir}/my-gpt3_03_text_document - # - .0333 - # - ${data_dir}/my-gpt3_04_text_document - # - .0333 - # - ${data_dir}/my-gpt3_05_text_document - # - .0333 - # - ${data_dir}/my-gpt3_06_text_document - # - .0333 - # - ${data_dir}/my-gpt3_07_text_document - # - .0333 - # - ${data_dir}/my-gpt3_08_text_document - # - .0333 - # - ${data_dir}/my-gpt3_09_text_document - # - .0333 - # - ${data_dir}/my-gpt3_10_text_document - # - .0333 - # - ${data_dir}/my-gpt3_11_text_document - # - .0333 - # - ${data_dir}/my-gpt3_12_text_document - # - .0333 - # - ${data_dir}/my-gpt3_13_text_document - # - .0333 - # - ${data_dir}/my-gpt3_14_text_document - # - .0333 - # - ${data_dir}/my-gpt3_15_text_document - # - .0333 - # - ${data_dir}/my-gpt3_16_text_document - # - .0333 - # - ${data_dir}/my-gpt3_17_text_document - # - .0333 - # - ${data_dir}/my-gpt3_18_text_document - # - .0333 - # - ${data_dir}/my-gpt3_19_text_document - # - .0333 - # - ${data_dir}/my-gpt3_20_text_document - # - .0333 - # - ${data_dir}/my-gpt3_21_text_document - # - .0333 - # - ${data_dir}/my-gpt3_22_text_document - # - .0333 - # - ${data_dir}/my-gpt3_23_text_document - # - .0333 - # - ${data_dir}/my-gpt3_24_text_document - # - .0333 - # - ${data_dir}/my-gpt3_25_text_document - # - .0333 - # - ${data_dir}/my-gpt3_26_text_document - # - .0333 - # - ${data_dir}/my-gpt3_27_text_document - # - .0333 - # - ${data_dir}/my-gpt3_28_text_document - # - .0334 - # - ${data_dir}/my-gpt3_29_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - 
${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document From 485463966fe060b4dc70948018270f8128f76129 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 04:13:21 -0700 Subject: [PATCH 44/62] add llama config in auto configurator Signed-off-by: Hongbin Liu --- .../conf/search_config/llama/13b.yaml | 31 ++++---------- .../conf/search_config/llama/70b.yaml | 21 +--------- .../conf/search_config/llama/7b.yaml | 26 ++---------- .../conf/search_config/llama/7b_nemo.yaml | 40 +++++++++++++++++++ 4 files changed, 53 insertions(+), 65 deletions(-) create mode 100644 auto_configurator/conf/search_config/llama/7b_nemo.yaml diff --git a/auto_configurator/conf/search_config/llama/13b.yaml b/auto_configurator/conf/search_config/llama/13b.yaml index e18a5f242c..0035e650bb 100644 --- a/auto_configurator/conf/search_config/llama/13b.yaml +++ b/auto_configurator/conf/search_config/llama/13b.yaml @@ -1,40 +1,23 @@ train_settings: model_size_in_b: 13 # unit in billion parameters - num_nodes: 4 + num_nodes: 2 gpus_per_node: 8 gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. max_training_days: 5 # unit in days - limit_search_runs: 1 # Max number of runs to be launched in parallel for grid search. + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. output_top_n: 10 # The result will print the top N fastest training configs. - max_steps_per_run: 50 # Max steps per run for the grid search. + max_steps_per_run: 100 # Max steps per run for the grid search. max_minutes_per_run: 30 # minutes per run for the grid search. tflops_per_gpu: 150 # Estimated tflops per GPU. num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. vocab_size: 32000 seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] - custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_13b.yaml # path to custom .yaml model config instead of using auto-generated + custom_config: {auto_configurator_path}/base_configs/llama2_13b.yaml # path to custom .yaml model config instead of using auto-generated logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m - tensor_parallel_sizes: [2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8] - pipeline_parallel_sizes: [1,2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism - micro_batch_sizes: [1,2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] -inference_settings: - run: - model_type: gpt3 - model_train_name: gpt3_5b - gpus_per_node: 8 - data_type: "fp16" # fp32|fp16|bf16 - time_limit: 0:30:00 - results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb - tensor_parallel_sizes: [1,2,4] - 
pipeline_parallel_sizes: [1,2] - benchmark: - input_len: 60 - output_len: 20 - batch_sizes: [4,8,16,32,64,128,256] - beam_width: 1 - topk: 4 - topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/70b.yaml b/auto_configurator/conf/search_config/llama/70b.yaml index eb2d089064..ee41a9ccda 100644 --- a/auto_configurator/conf/search_config/llama/70b.yaml +++ b/auto_configurator/conf/search_config/llama/70b.yaml @@ -6,13 +6,13 @@ train_settings: max_training_days: 5 # unit in days limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. output_top_n: 10 # The result will print the top N fastest training configs. - max_steps_per_run: 50 # Max steps per run for the grid search. + max_steps_per_run: 100 # Max steps per run for the grid search. max_minutes_per_run: 30 # minutes per run for the grid search. tflops_per_gpu: 150 # Estimated tflops per GPU. num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. vocab_size: 32000 seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] - custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_70b.yaml # path to custom .yaml model config instead of using auto-generated + custom_config: {auto_configurator_path}/base_configs/llama2_70b.yaml # path to custom .yaml model config instead of using auto-generated logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] @@ -21,20 +21,3 @@ train_settings: micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] -inference_settings: - run: - model_type: gpt3 - model_train_name: gpt3_5b - gpus_per_node: 8 - data_type: "fp16" # fp32|fp16|bf16 - time_limit: 0:30:00 - results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb - tensor_parallel_sizes: [1,2,4] - pipeline_parallel_sizes: [1,2] - benchmark: - input_len: 60 - output_len: 20 - batch_sizes: [4,8,16,32,64,128,256] - beam_width: 1 - topk: 4 - topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/7b.yaml b/auto_configurator/conf/search_config/llama/7b.yaml index 148f12ff6c..bfe1756413 100644 --- a/auto_configurator/conf/search_config/llama/7b.yaml +++ b/auto_configurator/conf/search_config/llama/7b.yaml @@ -1,18 +1,18 @@ train_settings: model_size_in_b: 7 # unit in billion parameters - num_nodes: 2 + num_nodes: 1 gpus_per_node: 8 gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. max_training_days: 5 # unit in days - limit_search_runs: 1 # Max number of runs to be launched in parallel for grid search. + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. output_top_n: 10 # The result will print the top N fastest training configs. - max_steps_per_run: 50 # Max steps per run for the grid search. + max_steps_per_run: 100 # Max steps per run for the grid search. max_minutes_per_run: 30 # minutes per run for the grid search. tflops_per_gpu: 150 # Estimated tflops per GPU. num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
vocab_size: 32000 seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] - custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated + custom_config: {auto_configurator_path}/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] @@ -20,21 +20,3 @@ train_settings: max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] - -inference_settings: - run: - model_type: gpt3 - model_train_name: gpt3_5b - gpus_per_node: 8 - data_type: "fp16" # fp32|fp16|bf16 - time_limit: 0:30:00 - results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb - tensor_parallel_sizes: [1,2,4] - pipeline_parallel_sizes: [1,2] - benchmark: - input_len: 60 - output_len: 20 - batch_sizes: [4,8,16,32,64,128,256] - beam_width: 1 - topk: 4 - topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/7b_nemo.yaml b/auto_configurator/conf/search_config/llama/7b_nemo.yaml new file mode 100644 index 0000000000..aca9819929 --- /dev/null +++ b/auto_configurator/conf/search_config/llama/7b_nemo.yaml @@ -0,0 +1,40 @@ +train_settings: + model_size_in_b: 7 # unit in billion parameters + num_nodes: 2 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 10 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
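The train_settings above define the axes of the auto-configurator's grid search (tensor/pipeline parallelism and micro batch size, bounded by the model-parallel limits and limit_search_runs). A rough conceptual sketch of how such a candidate space gets enumerated and pruned; this is not the auto_configurator's actual search code, and the explicit lists and bounds are illustrative stand-ins for the "auto" defaults:

    from itertools import product

    tp_sizes, pp_sizes, micro_batches = [1, 2, 4], [1, 2], [1, 2, 4]   # example axes
    min_mp, max_mp, limit_search_runs = 1, 8, 100                      # example bounds

    candidates = []
    for tp, pp, mbs in product(tp_sizes, pp_sizes, micro_batches):
        mp = tp * pp                      # model-parallel size of this candidate
        if min_mp <= mp <= max_mp:
            candidates.append({"tp": tp, "pp": pp, "mbs": mbs})
    candidates = candidates[:limit_search_runs]   # cap how many training runs are launched
    print(f"{len(candidates)} candidate runs")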
+ vocab_size: 32000 + seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + +inference_settings: + run: + model_type: gpt3 + model_train_name: gpt3_5b + gpus_per_node: 8 + data_type: "fp16" # fp32|fp16|bf16 + time_limit: 0:30:00 + results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb + tensor_parallel_sizes: [1,2,4] + pipeline_parallel_sizes: [1,2] + benchmark: + input_len: 60 + output_len: 20 + batch_sizes: [4,8,16,32,64,128,256] + beam_width: 1 + topk: 4 + topp: 0.0 From fd3c364b25db89c2114f563afa0f1cee40cb357f Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Fri, 8 Sep 2023 04:49:08 -0700 Subject: [PATCH 45/62] add sft support for llama Signed-off-by: Hongbin Liu --- .../conf/fine_tuning/llama/squad.yaml | 187 ++++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 1 + 2 files changed, 188 insertions(+) create mode 100644 launcher_scripts/conf/fine_tuning/llama/squad.yaml diff --git a/launcher_scripts/conf/fine_tuning/llama/squad.yaml b/launcher_scripts/conf/fine_tuning/llama/squad.yaml new file mode 100644 index 0000000000..0162280247 --- /dev/null +++ b/launcher_scripts/conf/fine_tuning/llama/squad.yaml @@ -0,0 +1,187 @@ +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama_sft + convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name} + task_name: "squad" # Rename this name to be more clear + results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name} + +trainer: + devices: 8 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 13000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 300 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: ${fine_tuning.run.results_dir}/results + exp_dir: null + name: megatron_llama_${fine_tuning.run.task_name} + create_wandb_logger: False + wandb_logger_kwargs: + project: nemo_llama_${fine_tuning.run.task_name} + name: ${fine_tuning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${fine_tuning.model.data.validation_ds.metric.name} + save_top_k: 5 + mode: min + save_nemo_on_train_end: True + filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}} + save_best_model: True + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + global_batch_size: 32 + micro_batch_size: 4 + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_llama.nemo # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: True + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + answer_only_loss: True # not used right now + gradient_as_bucket_view: False + seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value + use_flash_attention: True # if not None, will match the base model's value + + hidden_dropout: 0.1 + attention_dropout: 0.1 + ffn_dropout: 0.1 + + data: + chat: False # whether use chatbot data or not + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. 
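A quick sanity check of the batch geometry this SFT config sets up, using only values that appear above (8 devices on one node, TP and PP both 1, micro batch 4, global batch 32); the data-parallel relation itself is standard Megatron/NeMo bookkeeping:

    world_size = 1 * 8                  # num_nodes * devices
    dp = world_size // (1 * 1)          # / (TP * PP) -> data-parallel size 8
    grad_acc = 32 // (4 * dp)           # global_batch_size / (micro_batch_size * DP) = 1
    samples_per_step = 4 * dp * grad_acc
    assert samples_per_step == 32       # matches the consumed_samples comment above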
+ global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + separate_prompt_and_response_with_newline: True + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${fine_tuning.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + test_ds: + file_names: ${fine_tuning.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
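The file_names above point at JSON-lines data whose records carry the 'input' and 'output' fields selected by context_key/label_key and joined at train time by prompt_template. One plausible way a file like train-v1.1_gpt.json could be produced from raw SQuAD v1.1 is sketched below; the actual preprocessing behind these paths is not part of this patch, so the prompt wording here is an assumption:

    import json

    def squad_to_sft_jsonl(squad_json: str, out_path: str) -> None:
        """Flatten SQuAD v1.1 into one {'input': ..., 'output': ...} record per line."""
        with open(squad_json) as f:
            data = json.load(f)["data"]
        with open(out_path, "w") as out:
            for article in data:
                for para in article["paragraphs"]:
                    for qa in para["qas"]:
                        record = {
                            "input": f"Context: {para['context']} Question: {qa['question']} Answer:",
                            "output": qa["answers"][0]["text"] if qa["answers"] else "",
                        }
                        out.write(json.dumps(record) + "\n")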
+ global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. + lr: 1e-6 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + monitor: validation_${fine_tuning.model.data.validation_ds.metric.name} + min_lr: 1e-8 + warmup_steps: 1000 + last_epoch: -1 + + diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index ebe0e9c210..291b2c565d 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -682,6 +682,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: model_type_to_code_path = { "gpt3" : self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py", + "llama" : self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", } From d9a30597a5a035c2a91338dec1558e098dfed2b1 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Fri, 8 Sep 2023 07:01:51 -0700 Subject: [PATCH 46/62] add peft support for llama Signed-off-by: Hongbin Liu --- .../conf/fine_tuning/llama/squad.yaml | 2 +- launcher_scripts/conf/peft/llama/squad.yaml | 234 ++++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 1 + 3 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 launcher_scripts/conf/peft/llama/squad.yaml diff --git a/launcher_scripts/conf/fine_tuning/llama/squad.yaml b/launcher_scripts/conf/fine_tuning/llama/squad.yaml index 0162280247..cc954a846f 100644 --- a/launcher_scripts/conf/fine_tuning/llama/squad.yaml +++ b/launcher_scripts/conf/fine_tuning/llama/squad.yaml @@ -3,7 +3,7 @@ run: time_limit: "04:00:00" dependency: "singleton" convert_name: convert_nemo - model_train_name: llama_sft + model_train_name: llama2_7b convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name} task_name: "squad" # Rename this name to be 
more clear results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name} diff --git a/launcher_scripts/conf/peft/llama/squad.yaml b/launcher_scripts/conf/peft/llama/squad.yaml new file mode 100644 index 0000000000..c958ba30dc --- /dev/null +++ b/launcher_scripts/conf/peft/llama/squad.yaml @@ -0,0 +1,234 @@ +name: megatron_llama_peft_tuning-${peft.model.peft.peft_scheme} + +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama2_7b + convert_dir: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name} + +trainer: + devices: 8 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${peft.name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${peft.model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${peft.model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_llama.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: True + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
+ sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + answer_only_loss: True + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: "lora" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. 
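For intuition on why the lora_tuning block above trains so few parameters: with rank adapter_dim = 32, every adapted linear layer gains two small matrices of shape (in_features × r) and (r × out_features). A back-of-the-envelope count, assuming the adapters sit on the fused attention QKV projection of a Llama-2-7B-shaped model (32 layers, hidden size 4096, full multi-head attention); the exact set of adapted layers is decided by NeMo and is not spelled out in this config:

    hidden, layers, r = 4096, 32, 32        # model shape; r = adapter_dim above
    qkv_out = 3 * hidden                    # fused query/key/value projection width
    per_layer = r * (hidden + qkv_out)      # A: hidden x r, B: r x qkv_out
    total = layers * per_layer
    print(total, f"{total / 7e9:.2%}")      # ~16.8M params, roughly 0.24% of 7B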
+ global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + separate_prompt_and_response_with_newline: False + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${fine_tuning.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
+ global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 291b2c565d..06fd8fa261 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -725,6 +725,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: raise NotImplementedError("PEFT is not supported in NeMo Megatron mt5 models.") model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", } return model_type_to_code_path[model_type] From 5d011e3b0bc2cc377b243ca9effadcf6111f4651 Mon Sep 17 00:00:00 2001 From: Abhinav Khattar Date: Fri, 8 Sep 2023 10:59:47 -0700 Subject: [PATCH 47/62] update tests Signed-off-by: Abhinav Khattar --- launcher_scripts/conf/training/bert/100b.yaml | 10 +++++----- launcher_scripts/conf/training/bert/110m.yaml | 6 +++--- launcher_scripts/conf/training/bert/20b.yaml | 6 +++--- launcher_scripts/conf/training/bert/4b.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/126m.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/175b.yaml | 6 +++--- .../conf/training/gpt3/175b_performance.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/1b_improved.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/20b.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/400m_improved.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/40b.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/40b_improved.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/5b.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/7b_improved.yaml | 6 +++--- 14 files changed, 44 insertions(+), 44 deletions(-) diff --git 
a/launcher_scripts/conf/training/bert/100b.yaml b/launcher_scripts/conf/training/bert/100b.yaml index 84d7170dae..8d26a5b7b8 100755 --- a/launcher_scripts/conf/training/bert/100b.yaml +++ b/launcher_scripts/conf/training/bert/100b.yaml @@ -97,11 +97,11 @@ model: onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_layers_per_pipeline: 1 - num_micro_batches_with_partial_activation_checkpoints: 96 - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_layers_per_pipeline: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_num_layers: null sequence_parallel: True diff --git a/launcher_scripts/conf/training/bert/110m.yaml b/launcher_scripts/conf/training/bert/110m.yaml index 8d72872eb2..2988141040 100755 --- a/launcher_scripts/conf/training/bert/110m.yaml +++ b/launcher_scripts/conf/training/bert/110m.yaml @@ -98,11 +98,11 @@ model: gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - activations_checkpoint_granularity: selective - activations_checkpoint_method: block + activations_checkpoint_granularity: null + activations_checkpoint_method: null activations_checkpoint_layers_per_pipeline: null num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_num_layers: 0 + activations_checkpoint_num_layers: null sequence_parallel: False diff --git a/launcher_scripts/conf/training/bert/20b.yaml b/launcher_scripts/conf/training/bert/20b.yaml index 729b8e0ef7..1a2d033c7e 100755 --- a/launcher_scripts/conf/training/bert/20b.yaml +++ b/launcher_scripts/conf/training/bert/20b.yaml @@ -97,11 +97,11 @@ model: onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - activations_checkpoint_granularity: selective - activations_checkpoint_method: block + activations_checkpoint_granularity: null + activations_checkpoint_method: null activations_checkpoint_layers_per_pipeline: null num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_num_layers: 1 + activations_checkpoint_num_layers: null sequence_parallel: True diff --git a/launcher_scripts/conf/training/bert/4b.yaml b/launcher_scripts/conf/training/bert/4b.yaml index e925f5621a..484f17c998 100755 --- a/launcher_scripts/conf/training/bert/4b.yaml +++ b/launcher_scripts/conf/training/bert/4b.yaml @@ -97,11 +97,11 @@ model: onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - activations_checkpoint_granularity: selective - activations_checkpoint_method: block + activations_checkpoint_granularity: null + activations_checkpoint_method: null activations_checkpoint_layers_per_pipeline: null num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_num_layers: 0 + activations_checkpoint_num_layers: null sequence_parallel: False diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 2e8cd73053..b816072f74 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -82,9 +82,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. ## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index 971bd4ccdb..37c6672ccc 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -81,9 +81,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. ## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml index 780e636ba8..1b615f3b08 100755 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_performance.yaml @@ -83,9 +83,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. 
## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index e6d473c840..23917f0b6f 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -110,9 +110,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index b4185d922f..d677565daa 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -81,9 +81,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. ## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index e4b4a6e31f..5666be015c 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -110,9 +110,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index bf660ea9b4..2350d0ed07 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -81,9 +81,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. 
## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index af5f14f2b2..d96532dbbc 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -110,9 +110,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_num_layers: 1 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index 0ba8d80b89..2981029d91 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -81,9 +81,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. ## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index 8cd14cad59..ffe7f89e55 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -110,9 +110,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_num_layers: 8 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel From 5b2d3ba3fe482055677234e57ec78de674cc324e Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Sat, 9 Sep 2023 12:54:39 -0600 Subject: [PATCH 48/62] add if Signed-off-by: Eric Harper --- launcher_scripts/nemo_launcher/collections/conditional_cfgs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py index ec6be9845f..57a2d3eae5 100644 --- a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py +++ b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py @@ -54,7 
+54,7 @@ def get_ag_overlap(cfg): if __name__ == "__main__": - elif sys.argv[1] == "name=get_ln_sm_margin": + if sys.argv[1] == "name=get_ln_sm_margin": get_ln_sm_margin() elif sys.argv[1] == "name=get_ag_overlap": get_ag_overlap() From 76436a952dcfc6fcae9aff5869dd4409e133971a Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Mon, 11 Sep 2023 17:39:12 -0700 Subject: [PATCH 49/62] remove ft from auto_configurator config --- auto_configurator/conf/config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_configurator/conf/config.yaml b/auto_configurator/conf/config.yaml index aa75cfff7a..ab712b4a83 100644 --- a/auto_configurator/conf/config.yaml +++ b/auto_configurator/conf/config.yaml @@ -15,7 +15,6 @@ run_inference_hp_search: True cluster_type: bcm # bcm or bcp auto_configurator_path: ??? # Path to the location of auto_configurator codebase. launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts -fastertransformer_path: ${auto_configurator_path}/../FasterTransformer base_results_dir: ${auto_configurator_path}/results data_dir: ${launcher_scripts_path}/data From 48dea84ea390c1b834a98b73ce2b4a970b98eabd Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Mon, 11 Sep 2023 19:39:32 -0700 Subject: [PATCH 50/62] change print --- launcher_scripts/nemo_launcher/collections/conditional_cfgs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py index 57a2d3eae5..f1fff5c18a 100644 --- a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py +++ b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py @@ -31,7 +31,7 @@ def get_ln_sm_margin(cfg): """ global cuda_capability if cuda_capability == 9: - print(4) + print(8) else: print(0) From b89407cc4218a72e5f6d62f4ad50d018616e119d Mon Sep 17 00:00:00 2001 From: David Date: Mon, 11 Sep 2023 20:33:11 -0700 Subject: [PATCH 51/62] updating GPT configs (mcore and te) (#124) Signed-off-by: David Mosallanezhad --- launcher_scripts/conf/training/gpt3/126m.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/175b.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/175b_performance.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/1b_improved.yaml | 3 +++ launcher_scripts/conf/training/gpt3/20b.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/400m_improved.yaml | 3 +++ launcher_scripts/conf/training/gpt3/40b.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/40b_improved.yaml | 3 +++ launcher_scripts/conf/training/gpt3/5b.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/7b_improved.yaml | 3 +++ 10 files changed, 36 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index a6c7028da7..bd719118af 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -118,10 +118,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. 
# The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index 187edaf3f7..df8049ba77 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml index 68dccabf9b..7c0d55b37f 100755 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_performance.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index 22b1bd6fad..173497fc55 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -125,6 +125,9 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index d759cfdbe7..d54a2dc0ed 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. 
# The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index 78cd1bc9da..6385fa7526 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -125,6 +125,9 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index 24bf2a1494..8a4a562d5a 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index 62b3f4e3f0..57c39b899a 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -125,6 +125,9 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index f42e472bca..76a79d85d1 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. 
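The fp8 knobs referenced in these hunks map onto Transformer Engine's delayed-scaling recipe (the inline comments above already note the recipe.Format.E4M3 / HYBRID correspondence). A standalone sketch of roughly what gets constructed from them when fp8 is enabled; this is not the launcher's or NeMo's actual plumbing, and the history length here is only an illustrative value:

    import torch
    import transformer_engine.pytorch as te
    from transformer_engine.common import recipe

    # Requires an fp8-capable GPU (Hopper/Ada); values mirror the config knobs above.
    fp8_recipe = recipe.DelayedScaling(
        fp8_format=recipe.Format.HYBRID,   # fp8_hybrid: True
        amax_history_len=1024,             # illustrative; exposed as fp8_amax_history_len
        amax_compute_algo="max",           # fp8_amax_compute_algo: max
    )
    layer = te.Linear(1024, 1024).cuda()
    x = torch.randn(16, 1024, device="cuda")
    with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
        y = layer(x)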
# The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index 7a2d23ad03..ecfba32d93 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -125,6 +125,9 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False From fc7575e17dbbf3d5579a69260940be2264a664b6 Mon Sep 17 00:00:00 2001 From: David Date: Mon, 11 Sep 2023 22:02:24 -0700 Subject: [PATCH 52/62] updatiing GPT conversion (#125) Signed-off-by: David Mosallanezhad --- launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml b/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml index dc8d43b44d..2b80dd31aa 100755 --- a/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml +++ b/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml @@ -13,7 +13,7 @@ run: model: model_type: gpt # gpt or t5, use t5 for mt5 as well checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints - checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) + checkpoint_name: megatron_gpt-*last # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) hparams_file: ${conversion.run.train_dir}/results/hparams.yaml tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 From 68ae39f9a65a01c2655f26e5d879a9333c795403 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 12 Sep 2023 12:03:46 -0700 Subject: [PATCH 53/62] Set ub_tp_overlap to False, change 175b_fp8 to use TP=4, MBS=2 Signed-off-by: Sangkug Lym --- launcher_scripts/conf/training/gpt3/175b.yaml | 2 +- .../conf/training/gpt3/175b_fp8.yaml | 246 ++++++++++++++++++ launcher_scripts/conf/training/gpt3/20b.yaml | 2 +- launcher_scripts/conf/training/gpt3/40b.yaml | 2 +- 4 files changed, 249 insertions(+), 3 deletions(-) create mode 100755 launcher_scripts/conf/training/gpt3/175b_fp8.yaml diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index df8049ba77..62064a4d84 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -133,7 +133,7 @@ model: fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history fp8_wgrad: True use_emha: False - ub_tp_comm_overlap: True + ub_tp_comm_overlap: False # miscellaneous seed: 1234 diff --git a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml new file mode 100755 index 0000000000..4bea128403 --- /dev/null +++ b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml @@ -0,0 +1,246 @@ +# The configurations below provide the best 175B training performance with the NeMo SW stack. +# We have confirmed the model convergence only with a limited number of tokens and the full model +# convergence (e.g., 300B tokens) is not guaranteed. +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: gpt3_175b + results_dir: ${base_results_dir}/${.name} + time_limit: "26-00:00:00" + dependency: "singleton" + +trainer: + num_nodes: 128 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 75000 # consumed_samples = global_step * global_batch_size + max_time: "25:23:00:00" + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 20 + limit_test_batches: 20 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_gpt + create_wandb_logger: False + wandb_logger_kwargs: + project: nemo_gpt3 + name: ${training.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + micro_batch_size: 2 + global_batch_size: 2048 + tensor_model_parallel_size: 8 + pipeline_model_parallel_size: 8 + virtual_pipeline_model_parallel_size: 12 # interleaved pipeline, set to maximum + resume_from_checkpoint: null # manually set the checkpoint file to load from + # model architecture + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 96 + hidden_size: 12288 + ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size. + num_attention_heads: 96 + init_method_std: 0.006 # Standard deviation of the zero mean normal distribution used for weight initialization.') + hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + layernorm_epsilon: 1e-5 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. 
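The parallelism arithmetic behind the new 175b_fp8 settings is easy to check: 128 nodes × 8 GPUs with TP=8 and PP=8 leave 16 data-parallel replicas, so a global batch of 2048 at micro-batch 2 implies 64 accumulated micro-batches per step. A minimal sketch of that check (illustrative only, not part of the launcher; the helper name is made up):

```python
# Illustrative check of the 175b_fp8 parallelism settings (not part of the launcher).
def batch_arithmetic(num_nodes=128, devices=8, tp=8, pp=8,
                     micro_batch_size=2, global_batch_size=2048):
    world_size = num_nodes * devices              # 1024 GPUs
    model_parallel = tp * pp                      # 64 GPUs per model replica
    data_parallel = world_size // model_parallel  # 16 replicas
    # The global batch must divide evenly into micro_batch_size * data_parallel.
    assert global_batch_size % (micro_batch_size * data_parallel) == 0
    grad_accumulation = global_batch_size // (micro_batch_size * data_parallel)
    return data_parallel, grad_accumulation

print(batch_arithmetic())  # -> (16, 64)
```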
+ gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs + bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + + ## Sequence Parallelism + sequence_parallel: True + + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + + tokenizer: + library: 'megatron' + type: 'GPT2BPETokenizer' + model: null + delimiter: null # only used for tabular tokenizer + vocab_file: ${data_dir}/bpe/vocab.json + merge_file: ${data_dir}/bpe/merges.txt + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters + grad_allreduce_chunk_size_mb: 125 + + ## Using Megatron Core + mcore_gpt: True + + ## Transformer Engine + # To use fp8, please set `transformer_engine=True` and `fp8=True`. + # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training + transformer_engine: False + fp8: True # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True + use_emha: False + ub_tp_comm_overlap: False + + # miscellaneous + seed: 1234 + sync_batch_comm: False + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. 
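For orientation, the fp8_* knobs above correspond roughly to a TransformerEngine delayed-scaling recipe; NeMo builds the equivalent object internally, so the snippet below is only a sketch of what each knob controls and assumes the 0.x-era `transformer_engine.common.recipe.DelayedScaling` API (the exact signature may differ between releases):

```python
# Sketch only: rough mapping of the fp8_* knobs onto a TransformerEngine
# delayed-scaling recipe (assumes TE 0.x; NeMo constructs this internally).
from transformer_engine.common import recipe

fp8_recipe = recipe.DelayedScaling(
    margin=0,                         # fp8_margin
    interval=1,                       # fp8_interval
    fp8_format=recipe.Format.HYBRID,  # fp8_hybrid: True (E4M3 fwd, E5M2 bwd)
    amax_history_len=1024,            # fp8_amax_history_len
    amax_compute_algo="max",          # fp8_amax_compute_algo
)
# Note: the recipe only takes effect when both transformer_engine and fp8 are True.
```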
+ apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + overlap_p2p_comm: True # Overlap p2p communication with computes + batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations + + # Nsys profiling options + nsys_profile: + enabled: False + trace: [nvtx,cuda] + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: distributed_fused_adam + bucket_cap_mb: 100 + overlap_grad_sync: True + overlap_param_sync: true + contiguous_grad_buffer: True + grad_sync_dtype: bf16 + lr: 0.9e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 115 + constant_steps: 12500 + min_lr: 0.9e-5 + + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 2048 + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_prefix: # Should be weight path weight path... for a blended dataset + - .0333 + - ${data_dir}/my-gpt3_00_text_document + - .0333 + - ${data_dir}/my-gpt3_01_text_document + - .0333 + - ${data_dir}/my-gpt3_02_text_document + - .0333 + - ${data_dir}/my-gpt3_03_text_document + - .0333 + - ${data_dir}/my-gpt3_04_text_document + - .0333 + - ${data_dir}/my-gpt3_05_text_document + - .0333 + - ${data_dir}/my-gpt3_06_text_document + - .0333 + - ${data_dir}/my-gpt3_07_text_document + - .0333 + - ${data_dir}/my-gpt3_08_text_document + - .0333 + - ${data_dir}/my-gpt3_09_text_document + - .0333 + - ${data_dir}/my-gpt3_10_text_document + - .0333 + - ${data_dir}/my-gpt3_11_text_document + - .0333 + - ${data_dir}/my-gpt3_12_text_document + - .0333 + - ${data_dir}/my-gpt3_13_text_document + - .0333 + - ${data_dir}/my-gpt3_14_text_document + - .0333 + - ${data_dir}/my-gpt3_15_text_document + - .0333 + - ${data_dir}/my-gpt3_16_text_document + - .0333 + - ${data_dir}/my-gpt3_17_text_document + - .0333 + - ${data_dir}/my-gpt3_18_text_document + - .0333 + - ${data_dir}/my-gpt3_19_text_document + - .0333 + - ${data_dir}/my-gpt3_20_text_document + - .0333 + - ${data_dir}/my-gpt3_21_text_document + - .0333 + - ${data_dir}/my-gpt3_22_text_document + - .0333 + - ${data_dir}/my-gpt3_23_text_document + - .0333 + - ${data_dir}/my-gpt3_24_text_document + - .0333 + - ${data_dir}/my-gpt3_25_text_document + - .0333 + - ${data_dir}/my-gpt3_26_text_document + - .0333 + - ${data_dir}/my-gpt3_27_text_document + - .0333 + - ${data_dir}/my-gpt3_28_text_document + - .0334 + - ${data_dir}/my-gpt3_29_text_document + diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index d54a2dc0ed..092c52f9ae 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -133,7 +133,7 @@ model: fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history fp8_wgrad: True use_emha: False - ub_tp_comm_overlap: True + ub_tp_comm_overlap: False # miscellaneous seed: 1234 diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index 8a4a562d5a..d4dfa3e928 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -133,7 +133,7 @@ model: fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True use_emha: False - ub_tp_comm_overlap: True + ub_tp_comm_overlap: False # miscellaneous seed: 1234 From 8f2aceddd74d7871a30a442e44dacc077419e4fe Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 12 Sep 2023 12:15:05 -0700 Subject: [PATCH 54/62] set NCCL_NVLS_ENABLE=0 for memory saving Signed-off-by: Sangkug Lym --- launcher_scripts/conf/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 30d553059a..80267d5a56 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -52,6 +52,7 @@ env_vars: NCCL_PROTO: null # Protocol NCCL will use. Set to "simple" for AWS TRANSFORMERS_OFFLINE: 1 TORCH_NCCL_AVOID_RECORD_STREAMS: 1 + NCCL_NVLS_ENABLE: 0 # GPU Mapping numa_mapping: From 4dbf28730cc1ef031f0af14190829afebc88c497 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 12 Sep 2023 17:43:37 -0700 Subject: [PATCH 55/62] support distributed checkpointing in checkpoint search Signed-off-by: Maanu Grover --- .../collections/checkpoint_search.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py index 4da89ed860..6bdd7d5101 100755 --- a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py +++ b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py @@ -51,16 +51,30 @@ def checkpoint_search(cfg): pipeline_model_parallel_size = cfg.pipeline_model_parallel_size if checkpoint_name == "latest": - checkpoints = os.path.join(checkpoint_folder, "*.ckpt") - checkpoints = _inject_model_parallel_rank( - checkpoints, tensor_model_parallel_size, pipeline_model_parallel_size - ) - checkpoint_list = glob.glob(checkpoints) + + dist_ckpt = False + # Every distributed checkpoint saves a 'common.pt' file + for result in glob.glob(os.path.join(checkpoint_folder, "*")): + if os.path.exists(os.path.join(result, 'common.pt')): + dist_ckpt = True + break + + if dist_ckpt: + checkpoint_list = [f for f in glob.glob(os.path.join(checkpoint_folder, "*")) if os.path.isdir(f)] + else: + checkpoints = os.path.join(checkpoint_folder, "*.ckpt") + + checkpoints = _inject_model_parallel_rank( + checkpoints, tensor_model_parallel_size, pipeline_model_parallel_size + ) + checkpoint_list = glob.glob(checkpoints) + latest_checkpoint = max(checkpoint_list, key=os.path.getctime) checkpoint_name = os.path.basename(latest_checkpoint) checkpoint = os.path.join(checkpoint_folder, checkpoint_name) - checkpoint = _inject_model_parallel_rank(checkpoint, tensor_model_parallel_size, pipeline_model_parallel_size) + if not dist_ckpt: + checkpoint = _inject_model_parallel_rank(checkpoint, tensor_model_parallel_size, pipeline_model_parallel_size) checkpoint_list = glob.glob(checkpoint) if len(checkpoint_list) > 1: raise ValueError("Too many checkpoints fit the checkpoint name pattern in conversion config.") From 
10e03e042e65995e00c75488d5327c644fff838f Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 12 Sep 2023 18:20:19 -0700 Subject: [PATCH 56/62] clearn up Signed-off-by: Sangkug Lym --- launcher_scripts/conf/training/gpt3/126m.yaml | 1 - launcher_scripts/conf/training/gpt3/175b.yaml | 1 - .../conf/training/gpt3/175b_fp8.yaml | 1 - .../conf/training/gpt3/175b_performance.yaml | 247 ------------------ .../conf/training/gpt3/1b_improved.yaml | 1 - launcher_scripts/conf/training/gpt3/20b.yaml | 1 - .../conf/training/gpt3/400m_improved.yaml | 1 - launcher_scripts/conf/training/gpt3/40b.yaml | 1 - .../conf/training/gpt3/40b_improved.yaml | 1 - launcher_scripts/conf/training/gpt3/5b.yaml | 1 - .../conf/training/gpt3/7b_improved.yaml | 1 - ...b_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml | 53 ---- ...b_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml | 53 ---- ...b_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml | 59 ----- ...b_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml | 59 ----- 15 files changed, 481 deletions(-) delete mode 100755 launcher_scripts/conf/training/gpt3/175b_performance.yaml delete mode 100644 launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml delete mode 100644 launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml delete mode 100644 launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml delete mode 100644 launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index bd719118af..27d3329756 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -133,7 +133,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index 62064a4d84..33125cbb9f 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml index 4bea128403..f58e86cbd9 100755 --- a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml deleted file mode 100755 index 7c0d55b37f..0000000000 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ /dev/null @@ -1,247 +0,0 @@ -# The configurations below provide the best 175B training performance with the NeMo SW stack. 
-# We have confirmed the model convergence only with a limited number of tokens and the full model -# convergence (e.g., 300B tokens) is not guaranteed. -hydra: - searchpath: - - file:///opt/NeMo/examples/nlp/language_modeling/conf - -run: - name: gpt3_175b - results_dir: ${base_results_dir}/${.name} - time_limit: "26-00:00:00" - dependency: "singleton" - -trainer: - num_nodes: 128 - devices: 8 - accelerator: gpu - precision: bf16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: null - max_steps: 75000 # consumed_samples = global_step * global_batch_size - max_time: "25:23:00:00" - log_every_n_steps: 10 - val_check_interval: 2000 - limit_val_batches: 20 - limit_test_batches: 20 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: ${training.run.results_dir}/results - exp_dir: null - name: megatron_gpt - create_wandb_logger: False - wandb_logger_kwargs: - project: nemo_gpt3 - name: ${training.run.name} - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 5 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits - filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} - log_step_timing: True - step_timing_kwargs: - sync_cuda: True - buffer_size: 5 - -model: - micro_batch_size: 1 - global_batch_size: 2048 - tensor_model_parallel_size: 4 - pipeline_model_parallel_size: 8 - virtual_pipeline_model_parallel_size: 12 # interleaved pipeline, set to maximum - resume_from_checkpoint: null # manually set the checkpoint file to load from - # model architecture - encoder_seq_length: 2048 - max_position_embeddings: 2048 - num_layers: 96 - hidden_size: 12288 - ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size. - num_attention_heads: 96 - init_method_std: 0.006 # Standard deviation of the zero mean normal distribution used for weight initialization.') - hidden_dropout: 0.1 # Dropout probability for hidden state transformer. - attention_dropout: 0.1 - kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null - apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. - layernorm_epsilon: 1e-5 - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - persist_layer_norm: True # Use of persistent fused layer norm kernel. - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - # Fusion - grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce - gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs - bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. - bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. 
- masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. - - ## Activation Checkpointing - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block' - activations_checkpoint_num_layers: null - num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_layers_per_pipeline: null - - ## Sequence Parallelism - sequence_parallel: True - - num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. - - tokenizer: - library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - delimiter: null # only used for tabular tokenizer - vocab_file: ${data_dir}/bpe/vocab.json - merge_file: ${data_dir}/bpe/merges.txt - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - hysteresis: 2 # Gradient scale hysteresis - fp32_residual_connection: False # Move residual connections to fp32 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # Megatron O2-style half-precision - megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters - grad_allreduce_chunk_size_mb: 125 - - ## Using Megatron Core - mcore_gpt: True - - ## Transformer Engine - # To use fp8, please set `transformer_engine=True` and `fp8=True`. - # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: False - fp8: False # enables fp8 in TransformerLayer forward - fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 - fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID - fp8_margin: 0 # scaling margin - fp8_interval: 1 # scaling update interval - fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor - fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history - fp8_wgrad: True - use_emha: False - ub_tp_comm_overlap: True - - # miscellaneous - seed: 1234 - sync_batch_comm: False - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. 
- apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - overlap_p2p_comm: True # Overlap p2p communication with computes - batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations - gc_interval: 100 # Interval of the host memory garbage collection - - # Nsys profiling options - nsys_profile: - enabled: False - trace: [nvtx,cuda] - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - ranks: [0] # Global rank IDs to profile - gen_shape: False # Generate model and kernel details including input shapes - - optim: - name: distributed_fused_adam - bucket_cap_mb: 100 - overlap_grad_sync: True - overlap_param_sync: true - contiguous_grad_buffer: True - grad_sync_dtype: bf16 - lr: 0.9e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - sched: - name: CosineAnnealing - warmup_steps: 115 - constant_steps: 12500 - min_lr: 0.9e-5 - - data: - data_impl: mmap - splits_string: "99990,8,2" - seq_length: 2048 - skip_warmup: True - num_workers: 2 - dataloader_type: single # cyclic - reset_position_ids: False # Reset position ids after end-of-document token - reset_attention_mask: False # Reset attention mask after end-of-document token - eod_mask_loss: False # Mask loss for the end of document tokens - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_prefix: # Should be weight path weight path... for a blended dataset - - .0333 - - ${data_dir}/my-gpt3_00_text_document - - .0333 - - ${data_dir}/my-gpt3_01_text_document - - .0333 - - ${data_dir}/my-gpt3_02_text_document - - .0333 - - ${data_dir}/my-gpt3_03_text_document - - .0333 - - ${data_dir}/my-gpt3_04_text_document - - .0333 - - ${data_dir}/my-gpt3_05_text_document - - .0333 - - ${data_dir}/my-gpt3_06_text_document - - .0333 - - ${data_dir}/my-gpt3_07_text_document - - .0333 - - ${data_dir}/my-gpt3_08_text_document - - .0333 - - ${data_dir}/my-gpt3_09_text_document - - .0333 - - ${data_dir}/my-gpt3_10_text_document - - .0333 - - ${data_dir}/my-gpt3_11_text_document - - .0333 - - ${data_dir}/my-gpt3_12_text_document - - .0333 - - ${data_dir}/my-gpt3_13_text_document - - .0333 - - ${data_dir}/my-gpt3_14_text_document - - .0333 - - ${data_dir}/my-gpt3_15_text_document - - .0333 - - ${data_dir}/my-gpt3_16_text_document - - .0333 - - ${data_dir}/my-gpt3_17_text_document - - .0333 - - ${data_dir}/my-gpt3_18_text_document - - .0333 - - ${data_dir}/my-gpt3_19_text_document - - .0333 - - ${data_dir}/my-gpt3_20_text_document - - .0333 - - ${data_dir}/my-gpt3_21_text_document - - .0333 - - ${data_dir}/my-gpt3_22_text_document - - .0333 - - ${data_dir}/my-gpt3_23_text_document - - .0333 - - ${data_dir}/my-gpt3_24_text_document - - .0333 - - ${data_dir}/my-gpt3_25_text_document - - .0333 - - ${data_dir}/my-gpt3_26_text_document - - .0333 - - ${data_dir}/my-gpt3_27_text_document - - .0333 - - ${data_dir}/my-gpt3_28_text_document - - .0334 - - ${data_dir}/my-gpt3_29_text_document - diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index 173497fc55..0b0c73421f 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -139,7 +139,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False optim: diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index 092c52f9ae..f748e32bd4 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index 6385fa7526..f6cdbdea66 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -139,7 +139,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False optim: diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index d4dfa3e928..e689fe7d4e 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index 57c39b899a..077d3cb5ee 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -139,7 +139,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False optim: diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index 76a79d85d1..388c052121 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index ecfba32d93..9c3258b195 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -139,7 +139,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False optim: diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml deleted file mode 100644 index 33bbffb7ce..0000000000 --- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# UB communicator configurations -# Model configs: A100/175B/TP4/MBS1/SeqLen2K/BF16 - -# Bulk overlap with AllGather -qkv_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 0 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 0 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 4 - num_splits: 4 - set_sm_margin: 0 - -fc2_fprop: - method: pipeline - num_sm: 4 - num_splits: 4 - set_sm_margin: 0 \ No newline at end of file diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml deleted file mode 100644 index 434e0a29f4..0000000000 --- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# UB communicator configurations -# Model configs: A100/175B/TP4/MBS2/SeqLen2K/BF16 - -# Bulk overlap with AllGather -qkv_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 0 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 0 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 8 - num_splits: 4 - set_sm_margin: 0 - -fc2_fprop: - method: pipeline - num_sm: 4 - num_splits: 4 - set_sm_margin: 0 diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml deleted file mode 100644 index 21d02f3dd2..0000000000 --- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# UB communicator configurations -# Model configs: H100/175B/TP4/MBS1/SeqLen2K/FP8 - -# Bulk overlap with AllGather / ReduceScatter -qkv_dgrad: - method: bulk - num_sm: 4 - cga_size: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 8 - cga_size: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 2 - cga_size: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 4 - cga_size: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 0 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: 
ring_exchange - aggregate: 1 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 24 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 - -fc2_fprop: - method: pipeline - num_sm: 20 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml deleted file mode 100644 index 444c8245e0..0000000000 --- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# UB communicator configurations -# Model configs: H100/175B/TP8/MBS2/SeqLen2K/FP8 - -# Bulk overlap with AllGather -qkv_dgrad: - method: bulk - num_sm: 8 - cga_size: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 16 - cga_size: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 4 - cga_size: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 16 - cga_size: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 1 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 0 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 16 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 - -fc2_fprop: - method: pipeline - num_sm: 24 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 From 093bafc66df3424b148c7698f40cf1f074344ee5 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 13 Sep 2023 09:19:58 -0700 Subject: [PATCH 57/62] move dist ckpt flag --- .../nemo_launcher/collections/checkpoint_search.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py index 6bdd7d5101..5cd080b96f 100755 --- a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py +++ b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py @@ -50,14 +50,14 @@ def checkpoint_search(cfg): tensor_model_parallel_size = cfg.tensor_model_parallel_size pipeline_model_parallel_size = cfg.pipeline_model_parallel_size - if checkpoint_name == "latest": + dist_ckpt = False + # Every distributed checkpoint saves a 'common.pt' file + for result in glob.glob(os.path.join(checkpoint_folder, "*")): + if os.path.exists(os.path.join(result, 'common.pt')): + dist_ckpt = True + break - dist_ckpt = False - # Every distributed checkpoint saves a 'common.pt' file - for result in glob.glob(os.path.join(checkpoint_folder, "*")): - if os.path.exists(os.path.join(result, 'common.pt')): - dist_ckpt = True - break + if checkpoint_name == "latest": if dist_ckpt: checkpoint_list = [f for f in glob.glob(os.path.join(checkpoint_folder, "*")) if os.path.isdir(f)] From e21358330ea14ed042c6486674357f5769de5355 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 13 Sep 2023 09:24:38 -0700 Subject: [PATCH 58/62] update config --- launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml b/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml index 2b80dd31aa..dc8d43b44d 100755 --- a/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml +++ b/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml @@ -13,7 +13,7 @@ run: 
model: model_type: gpt # gpt or t5, use t5 for mt5 as well checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints - checkpoint_name: megatron_gpt-*last # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) + checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) hparams_file: ${conversion.run.train_dir}/results/hparams.yaml tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 From 4e7d80b102144f57d0023336966b14c411100dd7 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Wed, 13 Sep 2023 14:20:55 -0700 Subject: [PATCH 59/62] Create peft t5 squad.yaml --- launcher_scripts/conf/peft/t5/squad.yaml | 230 +++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 launcher_scripts/conf/peft/t5/squad.yaml diff --git a/launcher_scripts/conf/peft/t5/squad.yaml b/launcher_scripts/conf/peft/t5/squad.yaml new file mode 100644 index 0000000000..cdd452bab3 --- /dev/null +++ b/launcher_scripts/conf/peft/t5/squad.yaml @@ -0,0 +1,230 @@ +name: megatron_t5_peft_tuning-${peft.model.peft.peft_scheme} + +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: t5 + convert_dir: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name} + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${peft.name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${peft.model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${peft.model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
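Throughout these configs, `${.key}` is an OmegaConf relative interpolation that resolves against sibling keys of the same node, while names without a leading dot (e.g. `${base_results_dir}`) resolve from the config root. A small illustrative sketch with placeholder values:

```python
# Illustrative OmegaConf sketch of the relative interpolations used in the run: block.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "base_results_dir": "/results",  # placeholder value
    "run": {
        "task_name": "squad",
        "model_train_name": "t5",
        "name": "${.task_name}_${.model_train_name}",
        "results_dir": "${base_results_dir}/${.model_train_name}/peft_${.task_name}",
    },
})
print(cfg.run.name)         # squad_t5
print(cfg.run.results_dir)  # /results/t5/peft_squad
```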
+ save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + answer_only_loss: True + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: "adapter" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: False + separate_prompt_and_response_with_newline: False + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
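The `prompt_template` is applied as a format string over the `context_key`/`label_key` fields of each JSONL record, as in this illustrative sketch (the record text is abbreviated from the example in the comment above):

```python
# Illustrative rendering of one record with prompt_template = "{input} {output}".
record = {
    "input": "Von Neumann made fundamental contributions ... "
             "Q: What did the math of artificial viscosity do?",
    "output": "smoothed the shock transition without sacrificing basic physics",
}
prompt_template = "{input} {output}"

# Context and question are followed by the target answer in a single sequence.
print(prompt_template.format(**record))
```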
+ global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false From 5057396aaeea1196897469b961266c29e642b4e9 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Wed, 13 Sep 2023 14:24:13 -0700 Subject: [PATCH 60/62] Update stages.py with PEFT t5 support --- launcher_scripts/nemo_launcher/core/stages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 4932e08717..3f317e5866 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -821,13 +821,13 @@ def _get_nemo_code_path(self, model_type: str) -> Path: :return: path current stage's essential nemo scripts code :rtype: Path """ - if model_type == "t5": - raise NotImplementedError("PEFT is not supported in NeMo Megatron t5 models.") + if model_type == "mt5": raise NotImplementedError("PEFT is not supported in NeMo Megatron mt5 models.") model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", + "t5": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_t5_peft_tuning.py", } return model_type_to_code_path[model_type] From 5d4ec27e4f3fe480e859db1a3010c15b598a77c9 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 18 Sep 2023 06:22:47 -0700 Subject: [PATCH 61/62] update support matrix Signed-off-by: dimapihtar --- README.md | 4 ++-- auto_configurator/conf/config.yaml | 2 +- auto_configurator/tests/config_tests/test_main_config.py | 2 +- launcher_scripts/conf/config.yaml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c9ae532213..fdea811332 100755 --- a/README.md +++ b/README.md @@ -1977,7 +1977,7 @@ launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts 
fastertransformer_path: ${auto_configurator_path}/../FasterTransformer base_results_dir: ${auto_configurator_path}/results data_dir: ${launcher_scripts_path}/data -training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 +training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01 container_mounts: - null wandb: # Weights and Biases (W&B) logging. @@ -5401,7 +5401,7 @@ VALID_DATA_PATH=/path/to/val_actor TEST_DATA_PATH=/path/to/test_actor NEMO_RLHF_DIR=/opt/nemo-rlhf -CONTAINER="nvcr.io/ea-bignlp/nemofw-training:23.07-py3" +CONTAINER="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01" mkdir -p $OUTPUT_DIR diff --git a/auto_configurator/conf/config.yaml b/auto_configurator/conf/config.yaml index ab712b4a83..d28e5060b6 100644 --- a/auto_configurator/conf/config.yaml +++ b/auto_configurator/conf/config.yaml @@ -18,7 +18,7 @@ launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts base_results_dir: ${auto_configurator_path}/results data_dir: ${launcher_scripts_path}/data -training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 +training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01 container_mounts: - null diff --git a/auto_configurator/tests/config_tests/test_main_config.py b/auto_configurator/tests/config_tests/test_main_config.py index 492ff0395f..b989c57cfd 100644 --- a/auto_configurator/tests/config_tests/test_main_config.py +++ b/auto_configurator/tests/config_tests/test_main_config.py @@ -26,7 +26,7 @@ def test_config(self): base_results_dir: ${auto_configurator_path}/results data_dir: ${launcher_scripts_path}/data - training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 + training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01 container_mounts: - null diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 80267d5a56..0e39bbe0af 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -39,7 +39,7 @@ data_dir: ${launcher_scripts_path}/data # Location to store and read the data. base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. - null -container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 +container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01 wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. From df55ff7d7cd9135a7502296172ab287313e36678 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 18 Sep 2023 06:53:43 -0700 Subject: [PATCH 62/62] update support matrix Signed-off-by: dimapihtar --- README.md | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index fdea811332..455c0be7d6 100755 --- a/README.md +++ b/README.md @@ -357,26 +357,25 @@ Figure 1: The GPT family architecture. The 5B variant includes 24 transformer la ### 3.1. 
Support Matrix -| Software | Version | -|-------------------------|------------------| -| NVIDIA Triton | 2.24.0 | -| FasterTransformer | v5.3+f8e42aa | -| TransformerEngine | v0.11+b172bad | -| MegatronCore | 4f8e9ac | -| PyTorch | 2.1.0a0+fe05266 | -| NeMo | 1.20.0+2baef81 | -| PyTorch Lightning | 1.9.4 | -| Hydra | 1.2.0 | -| CUDA | NVIDIA CUDA 12.1 | -| cuBLAS | 12.1.3.1 | -| cuDNN | 8.9.0.131 | -| NCCL | 2.17.1 | -| Container OS | Ubuntu 20.04 | -| rdma-core | 36.0 | -| GDRcopy | 2.3 | -| HPC-X | 2.13 | -| Base Command Manager | 1.0.0 | -| DeepOps | 21.06 | +| Software | Version | +|-------------------------|----------------------| +| NVIDIA Triton | 2.37.0.9383150 | +| TransformerEngine | 0.13.0.dev0+a03f8bc | +| MegatronCore | 0.3.0+ab0336a | +| PyTorch | 2.1.0a0+29c30b1 | +| NeMo | 1.21.0+b850d14 | +| PyTorch Lightning | 2.0.7 | +| Hydra | 1.2.0 | +| CUDA | NVIDIA CUDA 12.2 | +| cuBLAS | 12.2.5.1 | +| cuDNN | 8.9.4.25 | +| NCCL | 2.18.3 | +| Container OS | Ubuntu 22.04 | +| rdma-core | 39.0 | +| GDRcopy | 2.3 | +| HPC-X | 2.15 | +| Base Command Manager | 1.0.0 | +| DeepOps | 21.06 | ## 4. Cloud Service Providers