diff --git a/.gitignore b/.gitignore
index b254c682e8..4517784c6b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 #*.ipynb
 output
 result
+data
 *.pt
 tests/data/asr
 .DS_Store
diff --git a/Dockerfile b/Dockerfile
index e0144fa03d..b250d0dbe0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -68,6 +68,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libsndfile1 \
         sox \
         swig \
+        openssh-server \
         libb64-dev && \
     rm -rf /var/lib/apt/lists/*
 
@@ -179,6 +180,12 @@ RUN pip install --no-cache-dir wandb==0.15.3 \
 # Copy FasterTransformer
 COPY --from=ft_builder /workspace/FasterTransformer FasterTransformer
 
+# Setup SSH config to allow mpi-operator to communicate with containers in k8s
+RUN echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
+    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
+    sed -i 's/#   StrictHostKeyChecking ask/    StrictHostKeyChecking no/' /etc/ssh/ssh_config && \
+    mkdir -p /var/run/sshd
+
 # Examples
 WORKDIR /workspace
 #COPY any user-facing example scripts should go in here
diff --git a/README.md b/README.md
index ebd5429a00..455c0be7d6 100755
--- a/README.md
+++ b/README.md
@@ -145,8 +145,13 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co
   * [5.12 LoRA Model and Generalized PEFT Framework](#512-lora-model-and-generalized-peft-framework)
     + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models)
       - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference)
+      - [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher)
+        - [5.12.1.2.1 Common](#512121-common)
+        - [5.12.1.2.2 Slurm](#512122-slurm)
+        - [5.12.1.2.3 Base Command Platform](#512123-base-command-platform)
       - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models)
       - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference)
+    + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models)
   * [5.13. Model Evaluation](#513-model-evaluation)
     + [5.13.1. GPT Evaluation](#5131-gpt-evaluation)
       - [5.13.1.1. Common](#51311-common)
@@ -352,26 +357,25 @@ Figure 1: The GPT family architecture. The 5B variant includes 24 transformer la
 ### 3.1. Support Matrix
 <a id="markdown-support-matrix" name="support-matrix"></a>
 
-| Software                | Version          |
-|-------------------------|------------------|
-| NVIDIA Triton           | 2.24.0           |
-| FasterTransformer       | v5.3+f8e42aa     |
-| TransformerEngine       | v0.11+b172bad    |
-| MegatronCore            | 4f8e9ac          |
-| PyTorch                 | 2.1.0a0+fe05266  |
-| NeMo                    | 1.20.0+2baef81   |
-| PyTorch Lightning       | 1.9.4            |
-| Hydra                   | 1.2.0            |
-| CUDA                    | NVIDIA CUDA 12.1 |
-| cuBLAS                  | 12.1.3.1         |
-| cuDNN                   | 8.9.0.131        |
-| NCCL                    | 2.17.1           |
-| Container OS            | Ubuntu 20.04     |
-| rdma-core               | 36.0             |
-| GDRcopy                 | 2.3              |
-| HPC-X                   | 2.13             |
-| Base Command Manager    | 1.0.0            |
-| DeepOps                 | 21.06            |
+| Software                | Version              |
+|-------------------------|----------------------|
+| NVIDIA Triton           | 2.37.0.9383150       |
+| TransformerEngine       | 0.13.0.dev0+a03f8bc  |
+| MegatronCore            | 0.3.0+ab0336a        |
+| PyTorch                 | 2.1.0a0+29c30b1      |
+| NeMo                    | 1.21.0+b850d14       |
+| PyTorch Lightning       | 2.0.7                |
+| Hydra                   | 1.2.0                |
+| CUDA                    | NVIDIA CUDA 12.2     |
+| cuBLAS                  | 12.2.5.1             |
+| cuDNN                   | 8.9.4.25             |
+| NCCL                    | 2.18.3               |
+| Container OS            | Ubuntu 22.04         |
+| rdma-core               | 39.0                 |
+| GDRcopy                 | 2.3                  |
+| HPC-X                   | 2.15                 |
+| Base Command Manager    | 1.0.0                |
+| DeepOps                 | 21.06                |
 
 ## 4. Cloud Service Providers
 <a id="markdown-cloud-service-providers" name="cloud-service-providers"></a>
@@ -1972,7 +1976,7 @@ launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts
 fastertransformer_path: ${auto_configurator_path}/../FasterTransformer
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data
-training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
+training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01
 container_mounts:
     - null
 wandb:  # Weights and Biases (W&B) logging.
@@ -3777,6 +3781,141 @@ inference.outfile_path=<OUTPUT_FILE>
 ```
 Additionally, NeMo has a notebook which walks through the steps (which these scripts encapsulate) to train and run inference for PEFT models: https://github.com/NVIDIA/NeMo/blob/main/tutorials/nlp/lora.ipynb
 
+##### 5.12.1.2 PEFT Training with NeMo Megatron Launcher
+The PEFT stage can launch any of the supported PEFT methods (PTuning, LoRA, Adapters and IA3) from a single stage by setting the corresponding PEFT scheme.
+It is implemented via the adapter_mixins framework in a unified style.
+A mix-and-match PEFT scheme such as adapter_and_ptuning can easily be extended to ia3_and_ptuning or lora_and_ptuning.
+
+PTuning does not need the flexibility to insert prompt tokens anywhere in the input, so this feature has been removed for simplicity.
+
+##### 5.12.1.2.1. Common
+<a id="markdown-common" name="common"></a>
+To specify the configuration for ptuning (or LoRA, adapter, or IA3 learning),
+use the `run` parameters to define the job-specific config:
+```yaml
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: gpt3_1.3B
+  task_name: "squad"
+  results_dir: ${base_results_dir}/${.model_train_name}/ptuning_${.task_name}
+```
+
+To specify which language model checkpoint to load and its definition, use the `model` parameter:
+
+```yaml
+model:
+  language_model_path: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name}/nemo_gpt1.3B_fp16.nemo
+  tensor_model_parallel_size: 2
+  pipeline_model_parallel_size: 1
+```
+
+##### 5.12.1.2.2 Slurm
+<a id="markdown-slurm" name="slurm"></a>
+
+Set configuration for a Slurm cluster in the `conf/cluster/bcm.yaml` file:
+
+```yaml
+partition: null
+account: null
+exclusive: True
+gpus_per_task: null
+gpus_per_node: 8
+mem: 0
+overcommit: False
+job_name_prefix: "nemo-megatron-"
+```
+
+**Example:**
+
+To run only the PEFT pipeline and not the data preparation, training,
+conversion, evaluation or inference pipelines, set the `conf/config.yaml` file to:
+
+```yaml
+stages:
+  - peft
+```
+
+then run:
+```
+python3 main.py \
+    peft=gpt3/squad \
+    stages=["peft"] \
+    peft.model.peft.peft_scheme="ptuning" \
+    peft.model.megatron_amp_O2=False \
+    peft.model.restore_from_path=${LANGUAGE_MODEL_PATH}\
+    peft.exp_manager.exp_dir=${BASE_RESULTS_DIR}/${RUN_NAME}/ptuning \
+
+```
+##### 5.12.1.2.3 Base Command Platform
+<a id="markdown-base-command-platform" name="base-command-platform"></a>
+In order to run the ptuning learning script on Base Command Platform, set the
+`cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden
+from the command line, using hydra. 
+
+To run the ptuning pipeline on the converted nemo-megatron-gpt-1.3B model checkpoint, run:
+```bash
+export HYDRA_FULL_ERROR=1
+export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO
+  
+TRAIN="[/mount/workspace/databricks-dolly-15k-train.jsonl]"
+VALID="[/mount/workspace/databricks-dolly-15k-val.jsonl]"
+VALID_NAMES="[peft-squad]"
+CONCAT_SAMPLING_PROBS="[1]"
+ 
+PEFT_SCHEME="ptuning"
+PEFT_EXP_DIR="/results/nemo_launcher/ptuning"
+LOG_DIR="/results/nemo_launcher/ptuning_log"
+ 
+TP_SIZE=2
+ 
+PP_SIZE=1
+ 
+python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py \
+        peft=gpt3/squad \
+        stages=[peft] \
+        cluster_type=interactive \
+        launcher_scripts_path=/opt/NeMo-Megatron-Launcher/launcher_scripts \
+        peft.model.peft.peft_scheme=${PEFT_SCHEME} \
+        peft.trainer.precision=bf16 \
+        peft.trainer.max_steps=100 \
+        peft.trainer.devices=2 \
+        peft.trainer.val_check_interval=10 \
+        peft.model.megatron_amp_O2=False \
+        peft.model.restore_from_path=/mount/workspace/nemo_gpt1.3B_fp16.nemo \
+        peft.model.tensor_model_parallel_size=${TP_SIZE} \
+        peft.model.pipeline_model_parallel_size=${PP_SIZE} \
+        peft.model.optim.lr=5e-6 \
+        peft.model.answer_only_loss=True \
+        peft.model.data.train_ds.file_names=${TRAIN} \
+        peft.model.data.train_ds.micro_batch_size=1 \
+        peft.model.data.train_ds.global_batch_size=32 \
+        peft.model.data.train_ds.concat_sampling_probabilities=${CONCAT_SAMPLING_PROBS} \
+        peft.model.data.validation_ds.micro_batch_size=1 \
+        peft.model.data.validation_ds.global_batch_size=32 \
+        peft.model.data.validation_ds.file_names=${VALID} \
+        peft.model.data.validation_ds.names=${VALID_NAMES} \
+        peft.model.data.test_ds.micro_batch_size=1 \
+        peft.model.data.test_ds.global_batch_size=128 \
+        peft.model.data.train_ds.num_workers=0 \
+        peft.model.data.validation_ds.num_workers=0 \
+        peft.model.data.test_ds.num_workers=0 \
+        peft.model.data.validation_ds.metric.name=loss \
+        peft.model.data.test_ds.metric.name=loss \
+        peft.exp_manager.exp_dir=${PEFT_EXP_DIR} \
+        peft.exp_manager.explicit_log_dir=${LOG_DIR} \
+        peft.exp_manager.resume_if_exists=True \
+        peft.exp_manager.resume_ignore_no_checkpoint=True \
+        peft.exp_manager.create_checkpoint_callback=True \
+        peft.exp_manager.checkpoint_callback_params.monitor=validation_loss
+```
+
+The command above assumes you mounted the data workspace in `/mount/workspace/` (the example script uses the databricks-dolly-15k dataset), and the results workspace in `/results`. Set a different `peft.exp_manager.exp_dir` for each PEFT job.
+The stdout and stderr outputs will also be redirected to `/results/nemo_launcher/ptuning_log`, so that the logs can be downloaded from NGC.
+Any other parameter can also be added to the command to modify its behavior.
+
 ##### 5.12.2 PEFT Training and Inference for mT5/T5-style Models
 We offer training and inference scripts in NeMo for parameter efficient tuning of mT5/T5-style models. You can train a LoRA, P-tuning, Adapter, or IA3 model using its corresponding training and inference script. 
 
@@ -5261,7 +5400,7 @@ VALID_DATA_PATH=/path/to/val_actor
 TEST_DATA_PATH=/path/to/test_actor
 
 NEMO_RLHF_DIR=/opt/nemo-rlhf
-CONTAINER="nvcr.io/ea-bignlp/nemofw-training:23.07-py3"
+CONTAINER="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01"
 
 mkdir -p $OUTPUT_DIR
 
@@ -5458,6 +5597,8 @@ Currently, within the NeMo Data Curator, we support the following data-curation
    - Fuzzy deduplication. Our implementation of fuzzy deduplication builds off of the following existing libraries:
      - For computing MinHash signatures we use a modified version of the MinHasher class provided in [pyLSH](https://github.com/mattilyra/LSH)
      - For the locality sensitive hashing, we extended the Redis-based implementation found in [datasketch](https://github.com/ekzhu/datasketch) beyond a single Redis server to a Redis Cluster. This enables this module to efficiently deduplicate large datasets that do not fit in memory of a single node (e.g., several TB of text)
+ - Multilingual downstream-task decontamination
+    -  Our implementation follows the approach of [OpenAI GPT3](https://arxiv.org/pdf/2005.14165.pdf) and [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990)
 
 The modules are implemented in a scalable manner using [Message Passing Interface (MPI) for Python (mpi4py)](https://mpi4py.readthedocs.io/en/stable/) and we use [Dask](https://dask.org) for creating balanced input jsonl files. With the scalable modules within the NeMo Data Curator, we have been have been able to fully process a [Common Crawl Snapshot](https://commoncrawl.org/2020/12/nov-dec-2020-crawl-archive-now-available/) (consisting of 60 TB of compressed WARC files) in approximately two days using 30 CPU nodes (with hardware similar to the `c5.24xlarge` [Amazon AWS C5 instance](https://aws.amazon.com/ec2/instance-types/c5/)). Please note that the core functions used within the NeMo Data Curator (e.g., html extraction, text cleaning, heuristic filtering, etc.) have not been fully optimized. The main goal of the NeMo Data Curator is to provide users the capability to apply these functions to their large datasets using many compute nodes.
 
diff --git a/auto_configurator/autoconfig/scripts/compare_throughput.py b/auto_configurator/autoconfig/scripts/compare_throughput.py
index c6c30a031b..dcb56fa833 100644
--- a/auto_configurator/autoconfig/scripts/compare_throughput.py
+++ b/auto_configurator/autoconfig/scripts/compare_throughput.py
@@ -16,7 +16,7 @@ def main(cfg):
     settings_cfg = cfg.search_config.train_settings
     model_size = settings_cfg.model_size_in_b
     output_top_n = settings_cfg.output_top_n
-    nodes = cfg.get("nodes")
+    nodes = settings_cfg.num_nodes
 
     training_logs = os.path.join(settings_cfg.get("logs"), "training_logs")
     candidate_configs = os.path.join(settings_cfg.get("logs"), "candidate_configs")
@@ -77,11 +77,11 @@ def main(cfg):
         model_name = candidate_cfg.get("run").get("name").split("_")[0]
         gbs = model_cfg.get("global_batch_size")
         enc_seq_len = (
-            model_cfg.get("encoder_seq_length") if model_name in ("gpt3", "bert") else model_cfg.get("seq_length")
+            model_cfg.get("encoder_seq_length") if model_name in ("gpt3", "bert", "llama") else model_cfg.get("seq_length")
         )
         dec_seq_len = data_cfg.get("seq_length_dec")
 
-        if model_name in ("gpt3", "bert"):
+        if model_name in ("gpt3", "bert", "llama"):
             hs = model_cfg.get("hidden_size")
             ffn_hs = None
             layers = model_cfg.get("num_layers")
@@ -184,14 +184,14 @@ def main(cfg):
                 finally:
                     continue
 
-    result.sort(key=lambda x: x[14])
+    result.sort(key=lambda x: x[15])
     print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:")
     for i, res in enumerate(result):
-        print(f"Config #{i+1}: {res[-1]} with {res[14]:.4f}s per global step.")
+        print(f"Config #{i+1}: {res[-1]} with {res[15]:.4f}s per global step.")
         if i + 1 == output_top_n:
             break
 
-    top_config = f"{model_name}_{model_size}b_{nodes}nodes_tp_{result[0][2]}_pp_{result[0][3]}_mbs_{result[0][4]}_act_ckpt_{result[0][5]}_num_mbs_act_{result[0][6]}_act_per_pipe_{result[0][7]}"
+    top_config = f"{model_name}_{model_size}b_{nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_mbs_{result[0][5]}_act_ckpt_{result[0][6]}_num_mbs_act_{result[0][7]}_act_per_pipe_{result[0][8]}"
     print("\n==================================================")
-    print(f"Optimal config: {top_config} with {result[0][14]:.4f}s per global step.")
+    print(f"Optimal config: {top_config} with {result[0][15]:.4f}s per global step.")
     print(f"Saving config to {final_result_logs}/optimal_config_{model_size}b_{nodes}nodes.yaml.")
@@ -223,7 +223,7 @@ def calculate_tflops(
     Bert Formula: 
         Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL))
     """
-    if model_name == "gpt3":
+    if model_name in ["gpt3", "llama"]:
         # Model FLOPS calculation
         model_flops = (
             (24 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs) * (3 * layers)
diff --git a/auto_configurator/autoconfig/search_config.py b/auto_configurator/autoconfig/search_config.py
index 6870359ede..1f50a6d707 100644
--- a/auto_configurator/autoconfig/search_config.py
+++ b/auto_configurator/autoconfig/search_config.py
@@ -20,7 +20,7 @@
 from autoconfig.inference_sweep import search_inference_config
 from autoconfig.training_config import search_training_config
 
-SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert"]
+SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert", "llama"]
 
 
 def search_config(cfg: omegaconf.dictconfig.DictConfig, hydra_args: Optional[str] = None):
diff --git a/auto_configurator/autoconfig/training_config.py b/auto_configurator/autoconfig/training_config.py
index 71f01f20e8..7940aecac1 100644
--- a/auto_configurator/autoconfig/training_config.py
+++ b/auto_configurator/autoconfig/training_config.py
@@ -69,12 +69,12 @@ def generate_grid_search_configs(
     act_layers = train_cfg.get("act_ckpt_layers")
 
     # 2 * num_layers is needed because of encoder/decoder architecture.
-    multiplier = 1 if model_name in ["gpt3", "bert"] else 2
+    multiplier = 1 if model_name in ["gpt3", "bert", "llama"] else 2
 
     seq_length = base_cfg["model"]["data"]["seq_length"]
     num_layers = (
         base_cfg["model"]["num_layers"]
-        if model_name in ["gpt3", "bert"]
+        if model_name in ["gpt3", "bert", "llama"]
         else base_cfg["model"]["encoder"]["num_layers"]
     )
 
@@ -96,7 +96,7 @@ def generate_grid_search_configs(
             for mbs in mbs_list:
                 num_gpus = base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"]
                 gbs = base_cfg["model"]["global_batch_size"]
-                if model_name in ["gpt3", "bert"]:
+                if model_name in ["gpt3", "bert", "llama"]:
                     att_heads = base_cfg["model"]["num_attention_heads"]
                     num_layers = base_cfg["model"]["num_layers"]
                 else:
@@ -175,7 +175,7 @@ def _set_activations_checkpoint_params(tp, pp, num_layers, act_method, multiplie
     min_layers_per_pipe = 0
     max_layers_per_pipe = num_layers
     interval_layers_per_pipe = act_multiple
-    if model_name in ["gpt3", "bert"] and pp > 2:  # Interleaved pipeline scheduling.
+    if model_name in ["gpt3", "bert", "llama"] and pp > 2:  # Interleaved pipeline scheduling.
         virtual_pipelines = num_layers // pp  # TODO: verify that this is the best value.
         act_multiple = 1
         max_micro_b = pp * (virtual_pipelines - 1) + (pp - 1) * 2 + 1
@@ -190,7 +190,7 @@ def _set_activations_checkpoint_params(tp, pp, num_layers, act_method, multiplie
         else:
             act_ckpt_layers = range(0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple)
 
-        if pp > 1 and model_name in ["gpt3", "bert"]:
+        if pp > 1 and model_name in ["gpt3", "bert", "llama"]:
             # Num micro batches with partial act ckpt
             num_micro_batches_partial_act_ckpt = list(range(min_micro_b, max_micro_b + 1, interval_micro_b))
             if num_micro_batches_partial_act_ckpt[0] == 0:
@@ -304,6 +304,12 @@ def _tp_pp_mbs_grid_gpt3_80gb(model_size_in_b: float, valid_pp: List[int], seq_l
             mbs = [1, 2]
             min_model_parallel = 8
             max_model_parallel = 32
+        elif model_size_in_b <= 95:
+            tp = [4, 8]
+            pp = [x for x in valid_pp if 1 <= x <= 8]
+            mbs = [1, 2]
+            min_model_parallel = 8
+            max_model_parallel = 64
     elif seq_length == 8192:
         if model_size_in_b <= 1.0:
             tp = [1, 2]
@@ -738,13 +744,13 @@ def _calculate_tp_pp_mbs_grid(
     mbs_sizes = train_cfg.get("micro_batch_sizes")
     gpu_memory_gb = train_cfg.get("gpu_memory_gb")
 
-    multiplier = 1 if model_name in ["gpt3", "bert"] else 2
-    init_pp = [] if model_name == "gpt3" else [1]
+    multiplier = 1 if model_name in ["gpt3", "bert", "llama"] else 2
+    init_pp = [] if model_name in ["gpt3", "llama"] else [1]
     valid_pp = init_pp + [
         multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0
     ]  # Only divisors of num_layers are possible.
 
-    if model_name == "gpt3":
+    if model_name in ["gpt3", "llama"]:
         if gpu_memory_gb == 80:
             tp, pp, mbs, min_model_parallel, max_model_parallel = _tp_pp_mbs_grid_gpt3_80gb(
                 model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length
diff --git a/auto_configurator/autoconfig/utils.py b/auto_configurator/autoconfig/utils.py
index 7e9b59460d..7a3125e0a8 100644
--- a/auto_configurator/autoconfig/utils.py
+++ b/auto_configurator/autoconfig/utils.py
@@ -45,7 +45,7 @@ def _calculate_model_size(
     :rtype: float
     :raises NotImplementedError: if the model name is not valid.
     """
-    if model_name == "gpt3":
+    if model_name in ["gpt3", "llama"]:
         model_size = (
             12
             * num_layers
@@ -96,7 +96,7 @@ def calculate_model_size_params(
     :raises NotImplementedError: if the model name is not supported.
     """
     ffn, kv = None, None  # Only needed for some models.
-    if model_name == "gpt3":
+    if model_name in ["gpt3", "llama"]:
         if model_size_in_b < 0.25:
             hs, att_h, lr = 768, 12, 6e-4
         elif model_size_in_b < 0.5:
@@ -350,26 +350,26 @@ def modify_cfg(
     """
     new_cfg = copy.deepcopy(base_cfg)
     if act is not None:
-        if model_name in ["gpt3", "bert"]:
+        if model_name in ["gpt3", "bert", "llama"]:
             new_cfg["model"]["activations_checkpoint_num_layers"] = act
         else:
             new_cfg["model"]["encoder"]["activations_checkpoint_num_layers"] = act // 2
             new_cfg["model"]["decoder"]["activations_checkpoint_num_layers"] = act // 2
 
-    if num_mbs_act is not None and model_name in ["gpt3", "bert"]:
+    if num_mbs_act is not None and model_name in ["gpt3", "bert", "llama"]:
         new_cfg["model"]["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act
 
-    if act_per_pipe is not None and model_name in ["gpt3", "bert"]:
+    if act_per_pipe is not None and model_name in ["gpt3", "bert", "llama"]:
         new_cfg["model"]["activations_checkpoint_layers_per_pipeline"] = act_per_pipe
 
-    if virtual_pipelines is not None and model_name in ["gpt3", "bert"]:
+    if virtual_pipelines is not None and model_name in ["gpt3", "bert", "llama"]:
         new_cfg["model"]["virtual_pipeline_model_parallel_size"] = virtual_pipelines
 
     new_cfg["model"]["tensor_model_parallel_size"] = tp
     new_cfg["model"]["pipeline_model_parallel_size"] = pp
     new_cfg["model"]["micro_batch_size"] = mbs
 
-    if model_name in ["gpt3", "bert"]:
+    if model_name in ["gpt3", "bert", "llama"]:
         att_heads = new_cfg["model"]["num_attention_heads"]
         num_layers = new_cfg["model"]["num_layers"]
     else:
diff --git a/auto_configurator/base_configs/bert.yaml b/auto_configurator/base_configs/bert.yaml
index 305040666e..01e3be140e 100644
--- a/auto_configurator/base_configs/bert.yaml
+++ b/auto_configurator/base_configs/bert.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch.
   max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   max_time: "00:23:30:00"
diff --git a/auto_configurator/base_configs/gpt3.yaml b/auto_configurator/base_configs/gpt3.yaml
index 4eeaf79ce2..a69ba139eb 100644
--- a/auto_configurator/base_configs/gpt3.yaml
+++ b/auto_configurator/base_configs/gpt3.yaml
@@ -11,7 +11,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 600000 # consumed_samples = global_step * global_batch_size
   max_time: "00:23:30:00" # days:hours:minutes:seconds
diff --git a/auto_configurator/base_configs/llama2_13b.yaml b/auto_configurator/base_configs/llama2_13b.yaml
new file mode 100644
index 0000000000..b3f20fd0c2
--- /dev/null
+++ b/auto_configurator/base_configs/llama2_13b.yaml
@@ -0,0 +1,215 @@
+run:
+  name: llama2_13b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: 0-01:00:00
+  dependency: singleton
+trainer:
+  num_nodes: 2
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: false
+  enable_checkpointing: false
+  use_distributed_sampler: false
+  max_epochs: null
+  max_steps: 300000
+  max_time: '5:23:30:00'
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: true
+  wandb_logger_kwargs:
+    project: nemo_llama_pretrain
+    name: ${training.run.name}
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: false
+    save_nemo_on_train_end: false
+    filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size},
+      ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: true
+  step_timing_kwargs:
+    sync_cuda: true
+    buffer_size: 5
+model:
+  mcore_gpt: true
+  micro_batch_size: 2
+  global_batch_size: 128
+  rampup_batch_size: null
+  tensor_model_parallel_size: 2
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  encoder_seq_length: 4096
+  max_position_embeddings: 4096
+  num_layers: 40
+  hidden_size: 5120
+  ffn_hidden_size: 13824
+  num_attention_heads: 40
+  init_method_std: 0.01
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: sentencepiece
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: false
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: 0
+  activations_checkpoint_layers_per_pipeline: 0
+  sequence_parallel: true
+  transformer_engine: true
+  fp8: false
+  fp8_e4m3: false
+  fp8_hybrid: false
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: most_recent
+  use_emha: false
+  ub_tp_comm_overlap: false
+  use_flash_attention: true
+  optim:
+    name: distributed_fused_adam
+    lr: 0.0001
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: false
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 107
+      constant_steps: 11873
+      min_lr: 1.0e-05
+  data:
+    data_impl: mmap
+    splits_string: 99990,8,2
+    seq_length: 4096
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/auto_configurator/base_configs/llama2_70b.yaml b/auto_configurator/base_configs/llama2_70b.yaml
new file mode 100644
index 0000000000..f624f5bb03
--- /dev/null
+++ b/auto_configurator/base_configs/llama2_70b.yaml
@@ -0,0 +1,220 @@
+run:
+  name: llama2_70b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: 0-01:00:00
+  dependency: singleton
+trainer:
+  num_nodes: 8
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: false
+  enable_checkpointing: false
+  use_distributed_sampler: False
+  max_epochs: null
+  max_steps: 300000
+  max_time: '19:23:30:00'
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: true
+  wandb_logger_kwargs:
+    project: nemo_llama_pretrain
+    name: ${training.run.name}
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: false
+    save_nemo_on_train_end: false
+    filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: true
+  step_timing_kwargs:
+    sync_cuda: true
+    buffer_size: 5
+model:
+  mcore_gpt: true
+  micro_batch_size: 1
+  global_batch_size: 128
+  rampup_batch_size: null
+  tensor_model_parallel_size: 4
+  pipeline_model_parallel_size: 4
+  virtual_pipeline_model_parallel_size: null
+  encoder_seq_length: 4096
+  max_position_embeddings: 4096
+  num_layers: 80
+  hidden_size: 8192
+  ffn_hidden_size: 28672
+  num_attention_heads: 64
+  num_query_groups: 8
+  init_method_std: 0.008944
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: sentencepiece
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: false
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: 0
+  activations_checkpoint_layers_per_pipeline: 0
+  sequence_parallel: true
+  transformer_engine: true
+  fp8: false
+  fp8_e4m3: false
+  fp8_hybrid: false
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: most_recent
+  use_emha: false
+  ub_tp_comm_overlap: false
+  use_flash_attention: true
+  overlap_p2p_comm: false
+  batch_p2p_comm: true
+  gc_interval: 100
+  optim:
+    name: distributed_fused_adam
+    lr: 0.00015
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 2000
+      constant_steps: 11873
+      min_lr: 1.0e-05
+  data:
+    data_impl: mmap
+    splits_string: 99990,8,2
+    seq_length: 4096
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/auto_configurator/base_configs/llama2_7b.yaml b/auto_configurator/base_configs/llama2_7b.yaml
new file mode 100755
index 0000000000..95733d1f53
--- /dev/null
+++ b/auto_configurator/base_configs/llama2_7b.yaml
@@ -0,0 +1,219 @@
+run:
+  name: llama2_7b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: "0-01:00:00"
+  dependency: "singleton"
+trainer:
+  num_nodes: 1
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: null
+  max_steps: 300000 # consumed_samples = global_step * global_batch_size
+  max_time: "05:23:30:00" # days:hours:minutes:seconds
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+  num_sanity_val_steps: 0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: true
+  wandb_logger_kwargs:
+    project: nemo_llama_pretrain
+    name: ${training.run.name}
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
+    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
+    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: True
+  step_timing_kwargs:
+    sync_cuda: True
+    buffer_size: 5
+
+model:
+  mcore_gpt: true
+  micro_batch_size: 2
+  global_batch_size: 128
+  rampup_batch_size: null
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  encoder_seq_length: 4096
+  max_position_embeddings: 4096
+  num_layers: 32
+  hidden_size: 4096
+  ffn_hidden_size: 11008
+  num_attention_heads: 32
+  init_method_std: 0.01
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: 'sentencepiece'
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: False
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: null
+  activations_checkpoint_layers_per_pipeline: null
+  sequence_parallel: false # does not support sequence parallel
+
+  ## Transformer Engine
+  # fp8 training is currently not supported in the improved models
+  transformer_engine: true
+  fp8: False # enables fp8 in TransformerLayer forward
+  fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
+  fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
+  fp8_margin: 0 # scaling margin
+  fp8_interval: 1 # scaling update interval
+  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
+  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
+  use_emha: False
+  ub_tp_comm_overlap: False
+  use_flash_attention: true
+  optim:
+    name: distributed_fused_adam
+    lr: 1e-4
+    weight_decay: 0.1
+    betas:
+      - 0.9
+      - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: False
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 500
+      constant_steps: 0
+      min_lr: 1e-5
+  data:
+    data_impl: mmap
+    splits_string: "99990,8,2"
+    seq_length: 4096
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/auto_configurator/base_configs/mt5.yaml b/auto_configurator/base_configs/mt5.yaml
index 96053b9ac4..a0f3d70d8a 100644
--- a/auto_configurator/base_configs/mt5.yaml
+++ b/auto_configurator/base_configs/mt5.yaml
@@ -16,7 +16,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1000000 # consumed_samples = global_step * global_batch_size
   max_time: "06:23:30:00"
diff --git a/auto_configurator/base_configs/t5.yaml b/auto_configurator/base_configs/t5.yaml
index cd1ef0ac87..06c6016f78 100644
--- a/auto_configurator/base_configs/t5.yaml
+++ b/auto_configurator/base_configs/t5.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1000000 # consumed_samples = global_step * global_batch_size
   max_time: "06:23:30:00"
diff --git a/auto_configurator/conf/config.yaml b/auto_configurator/conf/config.yaml
index aa75cfff7a..d28e5060b6 100644
--- a/auto_configurator/conf/config.yaml
+++ b/auto_configurator/conf/config.yaml
@@ -15,11 +15,10 @@ run_inference_hp_search: True
 cluster_type: bcm  # bcm or bcp
 auto_configurator_path: ???  # Path to the location of auto_configurator codebase.
 launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts
-fastertransformer_path: ${auto_configurator_path}/../FasterTransformer
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data
 
-training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
+training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01
 container_mounts:
   - null
 
diff --git a/auto_configurator/conf/search_config/llama/13b.yaml b/auto_configurator/conf/search_config/llama/13b.yaml
new file mode 100644
index 0000000000..0035e650bb
--- /dev/null
+++ b/auto_configurator/conf/search_config/llama/13b.yaml
@@ -0,0 +1,23 @@
+train_settings:
+  model_size_in_b: 13 # unit in billion parameters
+  num_nodes: 2
+  gpus_per_node: 8
+  gpu_memory_gb: 80  # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported.
+  max_training_days: 5 # unit in days
+  limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search.
+  output_top_n: 10  # The result will print the top N fastest training configs.
+  max_steps_per_run: 100 # Max steps per run for the grid search.
+  max_minutes_per_run: 30 # minutes per run for the grid search.
+  tflops_per_gpu: 150  # Estimated tflops per GPU.
+  num_tokens_in_b: 300  # Unit in billions, typically 300B for GPT3 models.
+  vocab_size: 32000
+  seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768]
+  custom_config: ${auto_configurator_path}/base_configs/llama2_13b.yaml # path to custom .yaml model config instead of using auto-generated
+  logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb  # Example base_results_dir/gpt3/126m
+  tensor_parallel_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8]
+  pipeline_parallel_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10]
+  min_model_parallel_size: auto  # auto to use our recommendation, or a value for the minimum desired parallelism
+  max_model_parallel_size: auto  # auto to use our recommendation, or a value for the maximum desired parallelism
+  micro_batch_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16]
+  act_ckpt_layers: auto  # auto to use our recommendation, or a list, such as [0, 1, 2, 3]
+ 
diff --git a/auto_configurator/conf/search_config/llama/70b.yaml b/auto_configurator/conf/search_config/llama/70b.yaml
new file mode 100644
index 0000000000..ee41a9ccda
--- /dev/null
+++ b/auto_configurator/conf/search_config/llama/70b.yaml
@@ -0,0 +1,23 @@
+train_settings:
+  model_size_in_b: 70 # unit in billion parameters
+  num_nodes: 8
+  gpus_per_node: 8
+  gpu_memory_gb: 80  # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported.
+  max_training_days: 5 # unit in days
+  limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search.
+  output_top_n: 10  # The result will print the top N fastest training configs.
+  max_steps_per_run: 100 # Max steps per run for the grid search.
+  max_minutes_per_run: 30 # minutes per run for the grid search.
+  tflops_per_gpu: 150  # Estimated tflops per GPU.
+  num_tokens_in_b: 300  # Unit in billions, typically 300B for GPT3 models.
+  vocab_size: 32000
+  seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768]
+  custom_config: ${auto_configurator_path}/base_configs/llama2_70b.yaml # path to custom .yaml model config instead of using auto-generated
+  logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb  # Example base_results_dir/gpt3/126m
+  tensor_parallel_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8]
+  pipeline_parallel_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10]
+  min_model_parallel_size: auto  # auto to use our recommendation, or a value for the minimum desired parallelism
+  max_model_parallel_size: auto  # auto to use our recommendation, or a value for the maximum desired parallelism
+  micro_batch_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16]
+  act_ckpt_layers: auto  # auto to use our recommendation, or a list, such as [0, 1, 2, 3]
+ 
diff --git a/auto_configurator/conf/search_config/llama/7b.yaml b/auto_configurator/conf/search_config/llama/7b.yaml
new file mode 100644
index 0000000000..bfe1756413
--- /dev/null
+++ b/auto_configurator/conf/search_config/llama/7b.yaml
@@ -0,0 +1,22 @@
+train_settings:
+  model_size_in_b: 7 # unit in billion parameters
+  num_nodes: 1
+  gpus_per_node: 8
+  gpu_memory_gb: 80  # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported.
+  max_training_days: 5 # unit in days
+  limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search.
+  output_top_n: 10  # The result will print the top N fastest training configs.
+  max_steps_per_run: 100 # Max steps per run for the grid search.
+  max_minutes_per_run: 30 # minutes per run for the grid search.
+  tflops_per_gpu: 150  # Estimated tflops per GPU.
+  num_tokens_in_b: 300  # Unit in billions, typically 300B for GPT3 models.
+  vocab_size: 32000
+  seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768]
+  custom_config: ${auto_configurator_path}/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated
+  logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb  # Example base_results_dir/gpt3/126m
+  tensor_parallel_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8]
+  pipeline_parallel_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10]
+  min_model_parallel_size: auto  # auto to use our recommendation, or a value for the minimum desired parallelism
+  max_model_parallel_size: auto  # auto to use our recommendation, or a value for the maximum desired parallelism
+  micro_batch_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16]
+  act_ckpt_layers: auto  # auto to use our recommendation, or a list, such as [0, 1, 2, 3]
diff --git a/auto_configurator/conf/search_config/llama/7b_nemo.yaml b/auto_configurator/conf/search_config/llama/7b_nemo.yaml
new file mode 100644
index 0000000000..aca9819929
--- /dev/null
+++ b/auto_configurator/conf/search_config/llama/7b_nemo.yaml
@@ -0,0 +1,40 @@
+train_settings:
+  model_size_in_b: 7 # unit in billion parameters
+  num_nodes: 2
+  gpus_per_node: 8
+  gpu_memory_gb: 80  # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported.
+  max_training_days: 5 # unit in days
+  limit_search_runs: 10 # Max number of runs to be launched in parallel for grid search.
+  output_top_n: 10  # The result will print the top N fastest training configs.
+  max_steps_per_run: 100 # Max steps per run for the grid search.
+  max_minutes_per_run: 30 # minutes per run for the grid search.
+  tflops_per_gpu: 150  # Estimated tflops per GPU.
+  num_tokens_in_b: 300  # Unit in billions, typically 300B for GPT3 models.
+  vocab_size: 32000
+  seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768]
+  custom_config: ${auto_configurator_path}/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated
+  logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb  # Example base_results_dir/gpt3/126m
+  tensor_parallel_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8]
+  pipeline_parallel_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10]
+  min_model_parallel_size: auto  # auto to use our recommendation, or a value for the minimum desired parallelism
+  max_model_parallel_size: auto  # auto to use our recommendation, or a value for the maximum desired parallelism
+  micro_batch_sizes: auto  # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16]
+  act_ckpt_layers: auto  # auto to use our recommendation, or a list, such as [0, 1, 2, 3]
+ 
+inference_settings:
+  run:
+    model_type: gpt3
+    model_train_name: gpt3_5b
+    gpus_per_node: 8
+    data_type: "fp16" # fp32|fp16|bf16
+    time_limit: 0:30:00
+    results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb
+    tensor_parallel_sizes: [1,2,4]
+    pipeline_parallel_sizes: [1,2]
+  benchmark:
+    input_len: 60
+    output_len: 20
+    batch_sizes: [4,8,16,32,64,128,256]
+    beam_width: 1
+    topk: 4
+    topp: 0.0
diff --git a/auto_configurator/tests/base_configs_tests/test_base_configs.py b/auto_configurator/tests/base_configs_tests/test_base_configs.py
index 0919ee65ab..4fb155628d 100644
--- a/auto_configurator/tests/base_configs_tests/test_base_configs.py
+++ b/auto_configurator/tests/base_configs_tests/test_base_configs.py
@@ -18,7 +18,7 @@ def test_gpt3_base_config(self):
           precision: bf16
           logger: False
           enable_checkpointing: False
-          replace_sampler_ddp: False
+          use_distributed_sampler: False
           max_epochs: null
           max_steps: 600000
           max_time: "00:23:30:00"
@@ -196,7 +196,7 @@ def test_t5_base_config(self):
           precision: bf16
           logger: False # logger provided by exp_manager
           enable_checkpointing: False
-          replace_sampler_ddp: False
+          use_distributed_sampler: False
           max_epochs: null
           max_steps: 1000000 # consumed_samples = global_step * global_batch_size
           max_time: "06:23:30:00"
@@ -421,7 +421,7 @@ def test_mt5_base_config(self):
           precision: bf16
           logger: False # logger provided by exp_manager
           enable_checkpointing: False
-          replace_sampler_ddp: False
+          use_distributed_sampler: False
           max_epochs: null
           max_steps: 1000000 # consumed_samples = global_step * global_batch_size
           max_time: "06:23:30:00"
@@ -642,7 +642,7 @@ def test_bert_base_config(self):
           precision: bf16
           logger: False # logger provided by exp_manager
           enable_checkpointing: False
-          replace_sampler_ddp: False
+          use_distributed_sampler: False
           max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch.
           max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
           max_time: "00:23:30:00"
diff --git a/auto_configurator/tests/config_tests/test_main_config.py b/auto_configurator/tests/config_tests/test_main_config.py
index 492ff0395f..b989c57cfd 100644
--- a/auto_configurator/tests/config_tests/test_main_config.py
+++ b/auto_configurator/tests/config_tests/test_main_config.py
@@ -26,7 +26,7 @@ def test_config(self):
         base_results_dir: ${auto_configurator_path}/results
         data_dir: ${launcher_scripts_path}/data
 
-        training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
+        training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01
         container_mounts:
           - null
         
diff --git a/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml b/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml
index 5abbbc9cb3..fe2ceea017 100755
--- a/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml
+++ b/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 4
   max_steps: -1
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/adapter_learning/llama/squad.yaml b/launcher_scripts/conf/adapter_learning/llama/squad.yaml
new file mode 100755
index 0000000000..9907d52635
--- /dev/null
+++ b/launcher_scripts/conf/adapter_learning/llama/squad.yaml
@@ -0,0 +1,107 @@
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: llama_7b
+  convert_dir: ${base_results_dir}/${adapter_learning.run.model_train_name}/${adapter_learning.run.convert_name}
+  task_name: "squad"
+  results_dir: ${base_results_dir}/${.model_train_name}/adapter_learning_${.task_name}
+
+trainer:
+  devices: 8
+  num_nodes: 1
+  accelerator: gpu
+  precision: bf16
+  logger: False
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: 4
+  max_steps: -1
+  log_every_n_steps: 10
+  val_check_interval: 0.1
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+
+
+exp_manager:
+  explicit_log_dir: ${adapter_learning.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama_adapter
+  create_wandb_logger: True
+  wandb_logger_kwargs:
+    project: nemo_llama_adapter
+    name: ${adapter_learning.run.name}
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 5
+    mode: min
+    save_nemo_on_train_end: False
+    filename: "megatron_llama_adapter_learn--{val_loss:.3f}-{step}"
+    model_parallel_size: ${adapter_learning.model.model_parallel_size}
+    save_best_model: True
+
+model:
+  seed: 1234
+  nemo_path: ${adapter_learning.run.results_dir}/results/megatron_gpt_adapter.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
+  virtual_prompt_style: 'no-prompts' # adapter tuning requires no virtual prompts
+  encoder_seq_length: 2048 
+  gradient_as_bucket_view: false
+  tensor_model_parallel_size: 4 # intra-layer model parallelism
+  pipeline_model_parallel_size: 1 # inter-layer model parallelism
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  global_batch_size: 64
+  micro_batch_size: 8
+
+  restore_path: null # Path to an existing adapter .nemo model you wish to add new tasks to or run inference with
+  language_model_path: ${adapter_learning.run.convert_dir}/results/megatron_llama.nemo # Path to the GPT language model .nemo file, always required
+  existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given
+  new_tasks: ["squad"] # List of new tasknames to be prompt-tuned
+
+  task_templates: # Add more/replace tasks as needed, these are just examples
+  - taskname: "squad" 
+    prompt_template: "context: {context} question: {question} answer: {answer}" 
+    total_virtual_tokens: 0
+    virtual_token_splits: []
+    truncate_field: null
+    answer_only_loss: True
+    answer_field: "answer"
+
+  adapter_tuning:
+    type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
+    adapter_dim: 16
+    adapter_dropout: 0.1
+    norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used.
+    column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+    row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+    norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
+
+  data:
+    train_ds:
+      - ${data_dir}/prompt_data/v1.1/squad_train.jsonl # multiple prompt dataset can be given at the same time
+    validation_ds:
+      - ${data_dir}/prompt_data/v1.1/squad_val.jsonl
+    add_eos: True
+    shuffle: True
+    num_workers: 4
+    pin_memory: True
+
+
+  optim:
+    name: fused_adam
+    lr: 1e-4
+    weight_decay: 0.01 
+    betas: 
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 50
+      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
+      min_lr: 0.0 # min_lr must be 0.0 for prompt learning
+      monitor: val_loss
+      reduce_on_plateau: false
diff --git a/launcher_scripts/conf/adapter_learning/t5/squad.yaml b/launcher_scripts/conf/adapter_learning/t5/squad.yaml
index f82940d489..a5fc08f7a0 100755
--- a/launcher_scripts/conf/adapter_learning/t5/squad.yaml
+++ b/launcher_scripts/conf/adapter_learning/t5/squad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 4
   max_steps: -1
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml
index e1f7b32c6b..ba8f2ebbb0 100755
--- a/launcher_scripts/conf/cluster/bcm.yaml
+++ b/launcher_scripts/conf/cluster/bcm.yaml
@@ -4,6 +4,6 @@ exclusive: True
 gpus_per_task: null
 gpus_per_node: 8
 mem: 0
-job_name_prefix: "nemo-megatron-"
+job_name_prefix: 'nemo-megatron-'
 srun_args:
   - "--no-container-mount-home"
diff --git a/launcher_scripts/conf/cluster/k8s.yaml b/launcher_scripts/conf/cluster/k8s.yaml
new file mode 100644
index 0000000000..d609fb3901
--- /dev/null
+++ b/launcher_scripts/conf/cluster/k8s.yaml
@@ -0,0 +1,6 @@
+pull_secret: null  # Kubernetes secret for the container registry to pull private containers.
+shm_size: 512Gi  # Amount of system memory to allocate in Pods. Should end in "Gi" for gibibytes.
+nfs_server: null  # Hostname or IP address for the NFS server where data is stored.
+nfs_path: null  # Path to store data in the NFS server.
+ib_resource_name: "nvidia.com/hostdev"  # Specify the resource name for IB devices according to kubernetes, such as "nvidia.com/hostdev" for Mellanox IB adapters.
+ib_count: "8"  # Specify the number of IB devices to include per node in each pod.
diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml
index 576d7db78d..0e39bbe0af 100755
--- a/launcher_scripts/conf/config.yaml
+++ b/launcher_scripts/conf/config.yaml
@@ -1,10 +1,12 @@
 defaults:
   - _self_
-  - cluster: bcm  # Leave it as bcm even if using bcp. It will be ignored for bcp.
+  - cluster: bcm  # Set to bcm for BCM and BCP clusters. Set to k8s for a k8s cluster.
   - data_preparation: gpt3/download_gpt3_pile
+  - quality_filtering: heuristic/english
   - training: gpt3/5b
   - conversion: gpt3/convert_gpt3
   - fine_tuning: null
+  - peft: null
   - prompt_learning: null
   - adapter_learning: null
   - ia3_learning: null
@@ -22,18 +24,22 @@ hydra:
 debug: False
 
 stages:
-  - training
+  #- data_preparation
+  #- training
   - conversion
-  - evaluation
-  - export
+  #- prompt_learning
+  #- adapter_learning
+  #- ia3_learning
+  #- evaluation
+  #- export
 
-cluster_type: bcm  # bcm or bcp. If bcm, it must match - cluster above.
+cluster_type: bcm  # bcm, bcp, or k8s. If bcm or k8s, it must match - cluster above.
 launcher_scripts_path: ???  # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts
 data_dir: ${launcher_scripts_path}/data  # Location to store and read the data.
 base_results_dir: ${launcher_scripts_path}/results  # Location to store the results, checkpoints and logs.
 container_mounts: # List of additional paths to mount to container. They will be mounted to same path.
   - null
-container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
+container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01
 
 wandb_api_key_file: null  # File where the w&B api key is stored. Key must be on the first line.
 
@@ -45,7 +51,8 @@ env_vars:
   NCCL_DEBUG: null # Logging level for NCCL. Set to "INFO" for debug information
   NCCL_PROTO: null # Protocol NCCL will use. Set to "simple" for AWS
   TRANSFORMERS_OFFLINE: 1
-  NCCL_AVOID_RECORD_STREAMS: 1
+  TORCH_NCCL_AVOID_RECORD_STREAMS: 1
+  NCCL_NVLS_ENABLE: 0
 
 # GPU Mapping
 numa_mapping:
@@ -59,8 +66,10 @@ numa_mapping:
 
 # Do not modify below, use the values above instead.
 data_preparation_config: ${hydra:runtime.choices.data_preparation}
+quality_filtering_config: ${hydra:runtime.choices.quality_filtering}
 training_config: ${hydra:runtime.choices.training}
 fine_tuning_config: ${hydra:runtime.choices.fine_tuning}
+peft_config: ${hydra:runtime.choices.peft}
 prompt_learning_config: ${hydra:runtime.choices.prompt_learning}
 adapter_learning_config: ${hydra:runtime.choices.adapter_learning}
 ia3_learning_config: ${hydra:runtime.choices.ia3_learning}
diff --git a/launcher_scripts/conf/conversion/llama/convert_llama.yaml b/launcher_scripts/conf/conversion/llama/convert_llama.yaml
new file mode 100755
index 0000000000..9dfb362cc2
--- /dev/null
+++ b/launcher_scripts/conf/conversion/llama/convert_llama.yaml
@@ -0,0 +1,21 @@
+run:
+  name: convert_${conversion.run.model_train_name}
+  nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node
+  time_limit: "1:00:00"
+  dependency: "singleton"
+  ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}}
+  convert_name: convert_nemo
+  model_train_name: llama2_7b
+  train_dir: ${base_results_dir}/${.model_train_name}
+  results_dir: ${.train_dir}/${.convert_name}
+  nemo_file_name: megatron_llama.nemo # name of nemo checkpoint; must be .nemo file
+
+model:
+  model_type: gpt # gpt or t5, use t5 for mt5 as well
+  checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints
+  checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_llama-*last.ckpt)
+  hparams_file: ${conversion.run.train_dir}/results/hparams.yaml
+  tensor_model_parallel_size: 2
+  pipeline_model_parallel_size: 1
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  tokenizer_model: ${data_dir}/llama/llama_tokenizer.model
diff --git a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml
index 632ccdadd2..ab6614480a 100755
--- a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml
+++ b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml
@@ -9,7 +9,7 @@ run:
 
 dataset: pile
 download_the_pile: True  # Whether to download the pile dataset from the internet.
-the_pile_url: "https://mystic.the-eye.eu/public/AI/pile/train/"  # Source URL to download The Pile dataset from.
+the_pile_url: "https://the-eye.eu/public/AI/pile/train/"  # Source URL to download The Pile dataset from.
 file_numbers: "0-29"  # The pile dataset consists of 30 files (0-29), choose which ones to download.
 preprocess_data: True  # True to preprocess the data from a jsonl file, False otherwise.
 download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json"  # URL to download the vocab from.
diff --git a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml
new file mode 100755
index 0000000000..ab317e8a8d
--- /dev/null
+++ b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml
@@ -0,0 +1,20 @@
+run:
+  name: download_llama_pile
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: "1:00:00"
+  dependency: "singleton"
+  node_array_size: 30
+  array: ${..file_numbers}
+  bcp_preproc_npernode: 2 # 2 should be safe to use and is 2x faster.
+
+dataset: pile
+download_the_pile: True  # Whether to download the pile dataset from the internet.
+the_pile_url: "https://the-eye.eu/public/AI/pile/train/"  # Source URL to download The Pile dataset from.
+file_numbers: "0-29"  # The pile dataset consists of 30 files (0-29), choose which ones to download.
+preprocess_data: True  # True to preprocess the data from a jsonl file, False otherwise.
+download_tokenizer_url: "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
+tokenizer_library: "sentencepiece"
+tokenizer_save_dir: ${data_dir}/llama
+tokenizer_model:  ${.tokenizer_save_dir}/llama_tokenizer.model
+rm_downloaded: False # Extract script will remove downloaded zst after extraction
+rm_extracted: False # Preprocess script will remove extracted files after preproc.
diff --git a/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml b/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml
index 75c9774e14..a7dbd31065 100755
--- a/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml
+++ b/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 inference:
diff --git a/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml b/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml
index d18cc08856..91d2cec798 100755
--- a/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml
+++ b/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 inference:
diff --git a/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml b/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml
index d109a98557..046d7c9ae0 100755
--- a/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml
+++ b/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 inference:
diff --git a/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml b/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml
index 48480c074a..40b9594f68 100755
--- a/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml
+++ b/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 inference:
diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml
new file mode 100755
index 0000000000..e354d6ee68
--- /dev/null
+++ b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml
@@ -0,0 +1,24 @@
+run:
+  name: ${.eval_name}_${.model_train_name}
+  time_limit: "02:00:00"
+  dependency: "singleton"
+  nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node
+  ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}}
+  eval_name: eval_all
+  model_train_name: llama2_7b
+  train_dir: ${base_results_dir}/${.model_train_name}
+  tasks: all_tasks  # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks
+  results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name}
+
+model:
+  model_type: nemo-llama
+  nemo_model: null # run eval with a .nemo file, produced when converting interleaved checkpoints
+  #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints
+  #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt)
+  #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  precision: bf16 # must match training precision - 32, 16 or bf16
+  eval_batch_size: 4
+  #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model 
diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml
new file mode 100755
index 0000000000..49ba25236c
--- /dev/null
+++ b/launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml
@@ -0,0 +1,24 @@
+run:
+  name: ${.eval_name}_${.model_train_name}
+  time_limit: "02:00:00"
+  dependency: "singleton"
+  nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node
+  ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}}
+  eval_name: eval_boolq
+  model_train_name: llama2_7b
+  train_dir: ${base_results_dir}/${.model_train_name}
+  tasks: boolq  # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks
+  results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name}
+
+model:
+  model_type: nemo-llama
+  nemo_model: null # run eval with a .nemo file, produced when converting interleaved checkpoints
+  #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints
+  #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt)
+  #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  precision: bf16 # must match training precision - 32, 16 or bf16
+  eval_batch_size: 4
+  #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model 
diff --git a/launcher_scripts/conf/evaluation/mt5/custom_task.yaml b/launcher_scripts/conf/evaluation/mt5/custom_task.yaml
index ce3523d3e7..128937204b 100755
--- a/launcher_scripts/conf/evaluation/mt5/custom_task.yaml
+++ b/launcher_scripts/conf/evaluation/mt5/custom_task.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 
diff --git a/launcher_scripts/conf/evaluation/mt5/xquad.yaml b/launcher_scripts/conf/evaluation/mt5/xquad.yaml
index 6d733fec7f..89771d546b 100755
--- a/launcher_scripts/conf/evaluation/mt5/xquad.yaml
+++ b/launcher_scripts/conf/evaluation/mt5/xquad.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 exp_manager:
diff --git a/launcher_scripts/conf/evaluation/prompt_llama/squad.yaml b/launcher_scripts/conf/evaluation/prompt_llama/squad.yaml
new file mode 100755
index 0000000000..7890e97eab
--- /dev/null
+++ b/launcher_scripts/conf/evaluation/prompt_llama/squad.yaml
@@ -0,0 +1,21 @@
+run:
+  name: ${.eval_name}_${.model_train_name}
+  time_limit: "1:00:00"
+  nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node
+  ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}}
+  eval_name: eval_prompt_squad
+  model_train_name: llama_7b
+  tasks: "prompt" # general prompt task
+  prompt_learning_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_squad # assume prompt learning was on squad task
+  results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name}
+
+model:
+  model_type: nemo-llama-prompt
+  nemo_model: ${evaluation.run.prompt_learning_dir}/results/megatron_llama_prompt.nemo
+  tensor_model_parallel_size: 2 #1 for 126m, 2 for 5b, 8 for 20b
+  pipeline_model_parallel_size: 1
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  precision: bf16 # must match training precision - 32, 16 or bf16
+  eval_batch_size: 4
+  prompt_dataset_paths: ${data_dir}/prompt_data/v1.1/squad_val.jsonl
+  disable_special_tokens: False # Whether to disable virtual tokens in prompt model evaluation. This is equivalent to evaluating without prompt-/p-tuning.
diff --git a/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml b/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml
index 01278be854..a223289ffc 100755
--- a/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml
+++ b/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 data:
diff --git a/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml b/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml
index 7b549fedf7..c1fb88caed 100755
--- a/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml
+++ b/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 data:
diff --git a/launcher_scripts/conf/evaluation/t5/custom_task.yaml b/launcher_scripts/conf/evaluation/t5/custom_task.yaml
index 90e0ebb38d..2959469ccd 100755
--- a/launcher_scripts/conf/evaluation/t5/custom_task.yaml
+++ b/launcher_scripts/conf/evaluation/t5/custom_task.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 
diff --git a/launcher_scripts/conf/evaluation/t5/squad.yaml b/launcher_scripts/conf/evaluation/t5/squad.yaml
index f50843d82f..39c954a943 100755
--- a/launcher_scripts/conf/evaluation/t5/squad.yaml
+++ b/launcher_scripts/conf/evaluation/t5/squad.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   log_every_n_steps: 10
 
 
diff --git a/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml b/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml
index 9d9ebabd1b..e55341fb9b 100644
--- a/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml
+++ b/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 1
   max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   log_every_n_steps: 10 # frequency with which training steps are logged 
diff --git a/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml b/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml
index 4730f2f1ae..17dfb0fdc7 100644
--- a/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml
+++ b/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 13000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   log_every_n_steps: 10 # frequency with which training steps are logged 
diff --git a/launcher_scripts/conf/fine_tuning/llama/squad.yaml b/launcher_scripts/conf/fine_tuning/llama/squad.yaml
new file mode 100644
index 0000000000..cc954a846f
--- /dev/null
+++ b/launcher_scripts/conf/fine_tuning/llama/squad.yaml
@@ -0,0 +1,187 @@
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: llama2_7b
+  convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name}
+  task_name: "squad"  # Rename this to something more descriptive
+  results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name}
+
+trainer:
+  devices: 8
+  accelerator: gpu
+  num_nodes: 1
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: null
+  max_steps: 13000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 10 # frequency with which training steps are logged 
+  val_check_interval: 300 # If an int n > 1, will run val every n training steps; if a float 0.0 - 1.0, will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch
+  gradient_clip_val: 1.0
+
+exp_manager:
+  explicit_log_dir: ${fine_tuning.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama_${fine_tuning.run.task_name}
+  create_wandb_logger: False
+  wandb_logger_kwargs:
+    project: nemo_llama_${fine_tuning.run.task_name}
+    name: ${fine_tuning.run.name}
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: validation_${fine_tuning.model.data.validation_ds.metric.name}
+    save_top_k: 5
+    mode: min
+    save_nemo_on_train_end: True
+    filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}'
+    model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}}
+    save_best_model: True
+
+model:
+  seed: 1234
+  tensor_model_parallel_size: 1 # intra-layer model parallelism
+  pipeline_model_parallel_size: 1 # inter-layer model parallelism
+  global_batch_size: 32
+  micro_batch_size: 4
+  restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_llama.nemo # Path to the .nemo model to fine-tune; defaults to megatron_llama.nemo under the conversion results directory
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. 
+  sync_batch_comm: False
+  megatron_amp_O2: True 
+
+  ## Sequence Parallelism
+  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  sequence_parallel: False
+
+  ## Activation Checkpoint 
+  activations_checkpoint_granularity: null # 'selective' or 'full' 
+  activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective'
+  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+  # of each chunk at the specified granularity
+  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+  activations_checkpoint_num_layers: null # not used with 'selective'
+  answer_only_loss: True  # not used right now
+  gradient_as_bucket_view: False
+  seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value
+  use_flash_attention: True # if not None, will match the base model's value
+
+  hidden_dropout: 0.1
+  attention_dropout: 0.1
+  ffn_dropout: 0.1
+
+  data:
+    chat: False # whether use chatbot data or not
+    train_ds:
+      # Example of how to specify paths to multiple datasets
+      # file_names: 
+      #   - /path/to/squad.jsonl
+      #   - /path/to/mnli.jsonl
+      #   - /path/to/boolq.jsonl
+      # Example of how each dataset is formatted
+      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
+      file_names: 
+      - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data.
+      global_batch_size: ${fine_tuning.model.global_batch_size}
+      micro_batch_size: ${fine_tuning.model.micro_batch_size}
+      shuffle: True
+      num_workers: 4
+      pin_memory: True
+      max_seq_length: 4096
+      min_seq_length: 1
+      drop_last: True
+      # Example of how to specify concat_sampling_probabilities
+      # concat_sampling_probabilities:
+      #   - 0.5
+      #   - 0.25
+      #   - 0.25
+      concat_sampling_probabilities: 
+      - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
+      context_key: 'input'
+      label_key: 'output'
+      add_eos: True
+      add_sep: False
+      add_bos: True
+      separate_prompt_and_response_with_newline: True
+      truncation_field: "context" # Options: ['context', 'answer']
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+
+    validation_ds:
+      file_names:
+      - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json  # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
+      names: 
+      - ${fine_tuning.run.task_name} # Names of the corresponding datasets used to log metrics.
+      global_batch_size: ${fine_tuning.model.global_batch_size}
+      micro_batch_size: ${fine_tuning.model.micro_batch_size}
+      shuffle: True
+      num_workers: 4
+      pin_memory: True
+      max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length}
+      min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length}
+      drop_last: True
+      context_key: 'input'
+      label_key: 'output'
+      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
+      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
+      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
+      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
+      write_predictions_to_file: False
+      output_file_path_prefix: null # Prefix of the file to write predictions to.
+      truncation_field: "context" # Options: ['context', 'answer']
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+
+      metric:
+        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+        num_classes: null
+
+    test_ds:
+      file_names: ${fine_tuning.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
+      names: null # Names of the corresponding datasets used to log metrics.
+      global_batch_size: ${fine_tuning.model.global_batch_size}
+      micro_batch_size: ${fine_tuning.model.micro_batch_size}
+      shuffle: True
+      num_workers: 4
+      pin_memory: True
+      max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length}
+      min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length}
+      drop_last: True
+      context_key: 'input'
+      label_key: 'output'
+      add_eos: ${fine_tuning.model.data.train_ds.add_eos}
+      add_sep: ${fine_tuning.model.data.train_ds.add_sep}
+      add_bos: ${fine_tuning.model.data.train_ds.add_bos}
+      separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline}
+      write_predictions_to_file: False
+      output_file_path_prefix: null # Prefix of the file to write predictions to.
+      truncation_field: "context" # Options: ['context', 'answer']
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+
+      metric:
+        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+        num_classes: null
+
+  optim:
+    name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
+    lr: 1e-6
+    weight_decay: 0.1 
+    betas: 
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      monitor: validation_${fine_tuning.model.data.validation_ds.metric.name}
+      min_lr: 1e-8
+      warmup_steps: 1000
+      last_epoch: -1
+
+
diff --git a/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml b/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml
index bea1aacee8..abd3c2565c 100755
--- a/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml
+++ b/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 5
   max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml b/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml
index f8d677fba5..8190e47aa5 100755
--- a/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml
+++ b/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 5
   max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml b/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml
index a3b2960f9c..54c3166405 100755
--- a/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml
+++ b/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 5
   max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/fine_tuning/t5/squad.yaml b/launcher_scripts/conf/fine_tuning/t5/squad.yaml
index da5cc2c252..d608fd28ec 100755
--- a/launcher_scripts/conf/fine_tuning/t5/squad.yaml
+++ b/launcher_scripts/conf/fine_tuning/t5/squad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 5
   max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml b/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml
index 2e7ed23f3e..b5d643c94c 100755
--- a/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml
+++ b/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 4
   max_steps: -1
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/ia3_learning/llama/squad.yaml b/launcher_scripts/conf/ia3_learning/llama/squad.yaml
new file mode 100755
index 0000000000..01c22b6f02
--- /dev/null
+++ b/launcher_scripts/conf/ia3_learning/llama/squad.yaml
@@ -0,0 +1,98 @@
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: llama_7b
+  convert_dir: ${base_results_dir}/${ia3_learning.run.model_train_name}/${ia3_learning.run.convert_name}
+  task_name: "squad"
+  results_dir: ${base_results_dir}/${.model_train_name}/ia3_learning_${.task_name}
+
+trainer:
+  devices: 8
+  num_nodes: 1
+  accelerator: gpu
+  precision: bf16
+  logger: False
+  enable_checkpointing: False
+  replace_sampler_ddp: False
+  max_epochs: 4
+  max_steps: -1
+  log_every_n_steps: 10
+  val_check_interval: 0.1
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+
+
+exp_manager:
+  explicit_log_dir: ${ia3_learning.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama_ia3
+  create_wandb_logger: True
+  wandb_logger_kwargs:
+    project: nemo_llama_ia3
+    name: ${ia3_learning.run.name}
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 5
+    mode: min
+    save_nemo_on_train_end: False
+    filename: "megatron_gpt_ia3_learn--{val_loss:.3f}-{step}"
+    model_parallel_size: ${ia3_learning.model.model_parallel_size}
+    save_best_model: True
+
+model:
+  seed: 1234
+  nemo_path: ${ia3_learning.run.results_dir}/results/megatron_llama_ia3.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
+  virtual_prompt_style: 'no-prompts' # ia3 tuning requires no virtual prompts
+  encoder_seq_length: 2048 
+  gradient_as_bucket_view: false
+  tensor_model_parallel_size: 2 # intra-layer model parallelism
+  pipeline_model_parallel_size: 1 # inter-layer model parallelism
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  global_batch_size: 64
+  micro_batch_size: 8
+
+  restore_path: null # Path to an existing ia3 .nemo model you wish to add new tasks to or run inference with
+  language_model_path: ${ia3_learning.run.convert_dir}/results/megatron_llama.nemo # Path to the base (LLaMA) language model .nemo file, always required
+  existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given
+  new_tasks: ["squad"] # List of new tasknames to be prompt-tuned
+
+  task_templates: # Add more/replace tasks as needed, these are just examples
+  - taskname: "squad" 
+    prompt_template: "context: {context} question: {question} answer: {answer}" 
+    total_virtual_tokens: 0
+    virtual_token_splits: []
+    truncate_field: null
+    answer_only_loss: True
+    answer_field: "answer"
+
+  data:
+    train_ds:
+      - ${data_dir}/prompt_data/v1.1/squad_train.jsonl # multiple prompt dataset can be given at the same time
+    validation_ds:
+      - ${data_dir}/prompt_data/v1.1/squad_val.jsonl
+    add_eos: True
+    shuffle: True
+    num_workers: 4
+    pin_memory: True
+
+
+  optim:
+    name: fused_adam
+    lr: 1e-4
+    weight_decay: 0.01 
+    betas: 
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 50
+      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
+      min_lr: 0.0 # min_lr must be 0.0 for prompt learning
+      monitor: val_loss
+      reduce_on_plateau: false
diff --git a/launcher_scripts/conf/ia3_learning/t5/squad.yaml b/launcher_scripts/conf/ia3_learning/t5/squad.yaml
index 840fce46b2..3e0900b058 100755
--- a/launcher_scripts/conf/ia3_learning/t5/squad.yaml
+++ b/launcher_scripts/conf/ia3_learning/t5/squad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 4
   max_steps: -1
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/peft/gpt3/squad.yaml b/launcher_scripts/conf/peft/gpt3/squad.yaml
new file mode 100644
index 0000000000..fec6a0e8ad
--- /dev/null
+++ b/launcher_scripts/conf/peft/gpt3/squad.yaml
@@ -0,0 +1,230 @@
+name: megatron_gpt_peft_tuning-${peft.model.peft.peft_scheme}
+
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: gpt3_5b
+  convert_dir: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name}
+  task_name: "squad"
+  results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name}
+
+trainer:
+  devices: 1
+  accelerator: gpu
+  num_nodes: 1
+  precision: 16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: 9999
+  max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 10 # frequency with which training steps are logged 
+  val_check_interval: 200 # If an int n > 1, runs validation every n training steps; if a float in 0.0 - 1.0, runs validation at that fraction of an epoch, e.g. 0.25 runs validation every quarter epoch
+  gradient_clip_val: 1.0
+
+exp_manager:
+  explicit_log_dir: null
+  exp_dir: null
+  name: ${peft.name}
+  create_wandb_logger: False
+  wandb_logger_kwargs:
+    project: null
+    name: null
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: validation_${peft.model.data.validation_ds.metric.name}
+    save_top_k: 1
+    mode: min
+    save_nemo_on_train_end: True
+    filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
+    model_parallel_size: ${peft.model.tensor_model_parallel_size}
+    always_save_nemo: False
+    save_best_model: True
+  create_early_stopping_callback: True
+  early_stopping_callback_params:
+    monitor: "val_loss"
+    mode: "min"
+    min_delta: 0.001
+    patience: 10
+    verbose: True
+    strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.
+
+model:
+  seed: 1234
+  tensor_model_parallel_size: 1 # intra-layer model parallelism
+  pipeline_model_parallel_size: 1 # inter-layer model parallelism
+  
+  global_batch_size: 128
+  micro_batch_size: 4
+  restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. 
+  sync_batch_comm: False
+  megatron_amp_O2: False
+
+  ## Sequence Parallelism
+  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  sequence_parallel: False
+
+  ## Activation Checkpoint 
+  activations_checkpoint_granularity: null # 'selective' or 'full' 
+  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
+  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+  # of each chunk at the specified granularity
+  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+  activations_checkpoint_num_layers: null # not used with 'selective'
+  activations_checkpoint_layers_per_pipeline: null
+  answer_only_loss: True
+  gradient_as_bucket_view: False
+
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+
+  peft:
+    peft_scheme: "adapter"  # can be one of adapter, lora, ia3, or ptuning
+    restore_from_path: null
+    
+    # Used for adapter peft training
+    adapter_tuning:
+      type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
+      adapter_dim: 32
+      adapter_dropout: 0.0
+      norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used.
+      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
+      layer_selection: null  # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False
+      position_embedding_strategy: null # used only when weight_tying is True
+    
+    lora_tuning:
+      adapter_dim: 32
+      adapter_dropout: 0.0
+      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      layer_selection:  null  # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False
+      position_embedding_strategy: null # used only when weight_tying is True
+    
+    # Used for p-tuning peft training
+    p_tuning:
+      virtual_tokens: 10  # The number of virtual tokens the prompt encoder should add at the start of the sequence
+      bottleneck_dim: 1024  # the size of the prompt encoder mlp bottleneck
+      embedding_dim: 1024  # the size of the prompt encoder embeddings
+      init_std: 0.023
+    
+    ia3_tuning:
+      layer_selection:  null  # selects in which layers to add ia3 adapters. e.g. [1,12] will add ia3 adapters to layers 1 (lowest) and 12. null will apply adapters to all layers
+
+  data:
+    train_ds:
+      # Example of how to specify paths to multiple datasets
+      # file_names: 
+      #   - /path/to/squad.jsonl
+      #   - /path/to/mnli.jsonl
+      #   - /path/to/boolq.jsonl
+      # Example of how each dataset is formatted
+      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
+      file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data.
+      global_batch_size: ${peft.model.global_batch_size}
+      micro_batch_size: ${peft.model.micro_batch_size}
+      shuffle: True
+      num_workers: 0
+      memmap_workers: 2
+      pin_memory: True
+      max_seq_length: 2048
+      min_seq_length: 1
+      drop_last: True
+      # Example of how to specify concat_sampling_probabilities
+      # concat_sampling_probabilities:
+      #   - 0.5
+      #   - 0.25
+      #   - 0.25
+      concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
+      context_key: 'input'
+      label_key: 'output'
+      add_eos: True
+      add_sep: False
+      add_bos: False
+      separate_prompt_and_response_with_newline: False
+      truncation_field: "context" # Options: ['context', 'answer']
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+
+    validation_ds:
+      file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
+      names: null # Names of the corresponding datasets used to log metrics.
+      global_batch_size: ${peft.model.global_batch_size}
+      micro_batch_size: ${peft.model.micro_batch_size}
+      shuffle: False
+      num_workers: 0
+      memmap_workers: ${peft.model.data.train_ds.memmap_workers}
+      pin_memory: True
+      max_seq_length: 2048
+      min_seq_length: 1
+      drop_last: False
+      context_key: 'input'
+      label_key: 'output'
+      add_eos: ${peft.model.data.train_ds.add_eos}
+      add_sep: ${peft.model.data.train_ds.add_sep}
+      add_bos: ${peft.model.data.train_ds.add_bos}
+      separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline}
+      write_predictions_to_file: False
+      output_file_path_prefix: null # Prefix of the file to write predictions to.
+      truncation_field: "context" # Options: ['context', 'answer']
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+      tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+      metric:
+        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+        num_classes: null
+    test_ds:
+        file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
+        names: null # Names of the corresponding datasets used to log metrics.
+        global_batch_size: ${peft.model.global_batch_size}
+        micro_batch_size: ${peft.model.micro_batch_size}
+        shuffle: False
+        num_workers: 0
+        memmap_workers: ${peft.model.data.train_ds.memmap_workers}
+        pin_memory: True
+        max_seq_length: 2048
+        min_seq_length: 1
+        drop_last: False
+        context_key: 'input'
+        label_key: 'output'
+        add_eos: ${peft.model.data.train_ds.add_eos}
+        add_sep: ${peft.model.data.train_ds.add_sep}
+        add_bos: ${peft.model.data.train_ds.add_bos}
+        separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline}
+        write_predictions_to_file: False
+        output_file_path_prefix: null # Prefix of the file to write predictions to.
+        truncation_field: "context" # Options: ['context', 'answer']
+        index_mapping_dir: null # Path to a directory to write index mapping files.
+        prompt_template: ${peft.model.data.train_ds.prompt_template}
+        tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+        metric:
+          name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+          average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+          num_classes: null
+
+  optim:
+    name: fused_adam
+    lr: 1e-4
+    weight_decay: 0.01 
+    betas: 
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 50
+      min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
+      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
+      monitor: val_loss
+      reduce_on_plateau: false
diff --git a/launcher_scripts/conf/peft/llama/squad.yaml b/launcher_scripts/conf/peft/llama/squad.yaml
new file mode 100644
index 0000000000..c958ba30dc
--- /dev/null
+++ b/launcher_scripts/conf/peft/llama/squad.yaml
@@ -0,0 +1,234 @@
+name: megatron_llama_peft_tuning-${peft.model.peft.peft_scheme}
+
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: llama2_7b
+  convert_dir: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name}
+  task_name: "squad"
+  results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name}
+
+trainer:
+  devices: 8
+  accelerator: gpu
+  num_nodes: 1
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: 9999
+  max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 10 # frequency with which training steps are logged 
+  val_check_interval: 200 # If an int n > 1, runs validation every n training steps; if a float in 0.0 - 1.0, runs validation at that fraction of an epoch, e.g. 0.25 runs validation every quarter epoch
+  gradient_clip_val: 1.0
+
+exp_manager:
+  explicit_log_dir: null
+  exp_dir: null
+  name: ${peft.name}
+  create_wandb_logger: False
+  wandb_logger_kwargs:
+    project: null
+    name: null
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: validation_${peft.model.data.validation_ds.metric.name}
+    save_top_k: 1
+    mode: min
+    save_nemo_on_train_end: True
+    filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
+    model_parallel_size: ${peft.model.tensor_model_parallel_size}
+    always_save_nemo: False
+    save_best_model: True
+  create_early_stopping_callback: True
+  early_stopping_callback_params:
+    monitor: "val_loss"
+    mode: "min"
+    min_delta: 0.001
+    patience: 10
+    verbose: True
+    strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.
+
+model:
+  seed: 1234
+  tensor_model_parallel_size: 1 # intra-layer model parallelism
+  pipeline_model_parallel_size: 1 # inter-layer model parallelism
+  
+  global_batch_size: 128
+  micro_batch_size: 4
+  restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_llama.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. 
+  sync_batch_comm: False
+  megatron_amp_O2: True
+
+  ## Sequence Parallelism
+  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  sequence_parallel: False
+
+  ## Activation Checkpoint 
+  activations_checkpoint_granularity: null # 'selective' or 'full' 
+  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
+  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+  # of each chunk at the specified granularity
+  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+  activations_checkpoint_num_layers: null # not used with 'selective'
+  activations_checkpoint_layers_per_pipeline: null
+  answer_only_loss: True
+  gradient_as_bucket_view: False
+
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+
+  peft:
+    peft_scheme: "lora"  # can be one of adapter, lora, ia3, or ptuning
+    restore_from_path: null
+    
+    # Used for adapter peft training
+    adapter_tuning:
+      type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
+      adapter_dim: 32
+      adapter_dropout: 0.0
+      norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used.
+      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
+      layer_selection: null  # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False
+      position_embedding_strategy: null # used only when weight_tying is True
+    
+    lora_tuning:
+      adapter_dim: 32
+      adapter_dropout: 0.0
+      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      layer_selection:  null  # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False
+      position_embedding_strategy: null # used only when weight_tying is True
+    
+    # Used for p-tuning peft training
+    p_tuning:
+      virtual_tokens: 10  # The number of virtual tokens the prompt encoder should add at the start of the sequence
+      bottleneck_dim: 1024  # the size of the prompt encoder mlp bottleneck
+      embedding_dim: 1024  # the size of the prompt encoder embeddings
+      init_std: 0.023
+    
+    ia3_tuning:
+      layer_selection:  null  # selects in which layers to add ia3 adapters. e.g. [1,12] will add ia3 adapters to layers 1 (lowest) and 12. null will apply adapters to all layers
+
+  data:
+    train_ds:
+      # Example of how to specify paths to multiple datasets
+      # file_names: 
+      #   - /path/to/squad.jsonl
+      #   - /path/to/mnli.jsonl
+      #   - /path/to/boolq.jsonl
+      # Example of how each dataset is formatted
+      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
+      file_names:
+      - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data.
+      global_batch_size: ${peft.model.global_batch_size}
+      micro_batch_size: ${peft.model.micro_batch_size}
+      shuffle: True
+      num_workers: 0
+      memmap_workers: 2
+      pin_memory: True
+      max_seq_length: 4096
+      min_seq_length: 1
+      drop_last: True
+      # Example of how to specify concat_sampling_probabilities
+      # concat_sampling_probabilities:
+      #   - 0.5
+      #   - 0.25
+      #   - 0.25
+      concat_sampling_probabilities:
+      - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
+      context_key: 'input'
+      label_key: 'output'
+      add_eos: True
+      add_sep: False
+      add_bos: True
+      separate_prompt_and_response_with_newline: False
+      truncation_field: "context" # Options: ['context', 'answer']
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+
+    validation_ds:
+      file_names: 
+      - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
+      names:
+      - ${fine_tuning.run.task_name} # Names of the corresponding datasets used to log metrics.
+      global_batch_size: ${peft.model.global_batch_size}
+      micro_batch_size: ${peft.model.micro_batch_size}
+      shuffle: False
+      num_workers: 0
+      memmap_workers: ${peft.model.data.train_ds.memmap_workers}
+      pin_memory: True
+      max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length}
+      min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length}
+      drop_last: False
+      context_key: 'input'
+      label_key: 'output'
+      add_eos: ${peft.model.data.train_ds.add_eos}
+      add_sep: ${peft.model.data.train_ds.add_sep}
+      add_bos: ${peft.model.data.train_ds.add_bos}
+      separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline}
+      write_predictions_to_file: False
+      output_file_path_prefix: null # Prefix of the file to write predictions to.
+      truncation_field: "context" # Options: ['context', 'answer']
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+      tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+      metric:
+        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+        num_classes: null
+    test_ds:
+        file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
+        names: null # Names of the corresponding datasets used to log metrics.
+        global_batch_size: ${peft.model.global_batch_size}
+        micro_batch_size: ${peft.model.micro_batch_size}
+        shuffle: False
+        num_workers: 0
+        memmap_workers: ${peft.model.data.train_ds.memmap_workers}
+        pin_memory: True
+        max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length}
+        min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length}
+        drop_last: False
+        context_key: 'input'
+        label_key: 'output'
+        add_eos: ${peft.model.data.train_ds.add_eos}
+        add_sep: ${peft.model.data.train_ds.add_sep}
+        add_bos: ${peft.model.data.train_ds.add_bos}
+        separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline}
+        write_predictions_to_file: False
+        output_file_path_prefix: null # Prefix of the file to write predictions to.
+        truncation_field: "context" # Options: ['context', 'answer']
+        index_mapping_dir: null # Path to a directory to write index mapping files.
+        prompt_template: ${peft.model.data.train_ds.prompt_template}
+        tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+        metric:
+          name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+          average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+          num_classes: null
+
+  optim:
+    name: fused_adam
+    lr: 1e-4
+    weight_decay: 0.01 
+    betas: 
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 50
+      min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
+      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
+      monitor: val_loss
+      reduce_on_plateau: false
diff --git a/launcher_scripts/conf/peft/t5/squad.yaml b/launcher_scripts/conf/peft/t5/squad.yaml
new file mode 100644
index 0000000000..cdd452bab3
--- /dev/null
+++ b/launcher_scripts/conf/peft/t5/squad.yaml
@@ -0,0 +1,230 @@
+name: megatron_t5_peft_tuning-${peft.model.peft.peft_scheme}
+
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "04:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: t5
+  convert_dir: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name}
+  task_name: "squad"
+  results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name}
+
+trainer:
+  devices: 1
+  accelerator: gpu
+  num_nodes: 1
+  precision: 16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: 9999
+  max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
+  log_every_n_steps: 10 # frequency with which training steps are logged 
+  val_check_interval: 200 # If an int n > 1, runs validation every n training steps; if a float in 0.0 - 1.0, runs validation every given fraction of an epoch, e.g. 0.25 will run val every quarter epoch
+  gradient_clip_val: 1.0
+
+exp_manager:
+  explicit_log_dir: null
+  exp_dir: null
+  name: ${peft.name}
+  create_wandb_logger: False
+  wandb_logger_kwargs:
+    project: null
+    name: null
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: validation_${peft.model.data.validation_ds.metric.name}
+    save_top_k: 1
+    mode: min
+    save_nemo_on_train_end: True
+    filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
+    model_parallel_size: ${peft.model.tensor_model_parallel_size}
+    always_save_nemo: False
+    save_best_model: True
+  create_early_stopping_callback: True
+  early_stopping_callback_params:
+    monitor: "val_loss"
+    mode: "min"
+    min_delta: 0.001
+    patience: 10
+    verbose: True
+    strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.
+
+model:
+  seed: 1234
+  tensor_model_parallel_size: 1 # intra-layer model parallelism
+  pipeline_model_parallel_size: 1 # inter-layer model parallelism
+  
+  global_batch_size: 128
+  micro_batch_size: 4
+  restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. 
+  sync_batch_comm: False
+  megatron_amp_O2: False
+
+  ## Sequence Parallelism
+  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  sequence_parallel: False
+
+  ## Activation Checkpoint 
+  activations_checkpoint_granularity: null # 'selective' or 'full' 
+  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
+  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+  # of each chunk at the specified granularity
+  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+  activations_checkpoint_num_layers: null # not used with 'selective'
+  activations_checkpoint_layers_per_pipeline: null
+  answer_only_loss: True
+  gradient_as_bucket_view: False
+
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+
+  peft:
+    peft_scheme: "adapter"  # can be one of: adapter, ia3, or ptuning
+    restore_from_path: null
+    
+    # Used for adapter peft training
+    adapter_tuning:
+      type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
+      adapter_dim: 32
+      adapter_dropout: 0.0
+      norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used.
+      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
+      layer_selection: null  # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False
+      position_embedding_strategy: null # used only when weight_tying is True
+    
+    lora_tuning:
+      adapter_dim: 32
+      adapter_dropout: 0.0
+      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
+      layer_selection:  null  # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
+      weight_tying: False
+      position_embedding_strategy: null # used only when weight_tying is True
+    
+    # Used for p-tuning peft training
+    p_tuning:
+      virtual_tokens: 10  # The number of virtual tokens the prompt encoder should add at the start of the sequence
+      bottleneck_dim: 1024  # the size of the prompt encoder mlp bottleneck
+      embedding_dim: 1024  # the size of the prompt encoder embeddings
+      init_std: 0.023
+    
+    ia3_tuning:
+      layer_selection:  null  # selects in which layers to add ia3 adapters. e.g. [1,12] will add ia3 adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
+
+  data:
+    train_ds:
+      # Example of how to specify paths to multiple datasets
+      # file_names: 
+      #   - /path/to/squad.jsonl
+      #   - /path/to/mnli.jsonl
+      #   - /path/to/boolq.jsonl
+      # Example of how each dataset is formatted
+      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
+      file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data.
+      global_batch_size: ${peft.model.global_batch_size}
+      micro_batch_size: ${peft.model.micro_batch_size}
+      shuffle: True
+      num_workers: 0
+      memmap_workers: 2
+      pin_memory: True
+      max_seq_length: 2048
+      min_seq_length: 1
+      drop_last: True
+      # Example of how to specify concat_sampling_probabilities
+      # concat_sampling_probabilities:
+      #   - 0.5
+      #   - 0.25
+      #   - 0.25
+      concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
+      context_key: 'input'
+      label_key: 'output'
+      add_eos: True
+      add_sep: False
+      add_bos: False
+      separate_prompt_and_response_with_newline: False
+      truncation_field: "context" # Options: ['context', 'answer']
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+
+    validation_ds:
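+      file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. NOTE(review): this interpolation appears to reference this same key, so a concrete value must be supplied (e.g. via a command-line override) - confirm.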
+      names: null # Names of the corresponding datasets used to log metrics.
+      global_batch_size: ${peft.model.global_batch_size}
+      micro_batch_size: ${peft.model.micro_batch_size}
+      shuffle: False
+      num_workers: 0
+      memmap_workers: ${peft.model.data.train_ds.memmap_workers}
+      pin_memory: True
+      max_seq_length: 2048
+      min_seq_length: 1
+      drop_last: False
+      context_key: 'input'
+      label_key: 'output'
+      add_eos: ${peft.model.data.train_ds.add_eos}
+      add_sep: ${peft.model.data.train_ds.add_sep}
+      add_bos: ${peft.model.data.train_ds.add_bos}
+      separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline}
+      write_predictions_to_file: False
+      output_file_path_prefix: null # Prefix of the file to write predictions to.
+      truncation_field: "context" # Options: ['context', 'answer']
+      index_mapping_dir: null # Path to a directory to write index mapping files.
+      prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
+      tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+      metric:
+        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+        num_classes: null
+    test_ds:
+        file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
+        names: null # Names of the corresponding datasets used to log metrics.
+        global_batch_size: ${peft.model.global_batch_size}
+        micro_batch_size: ${peft.model.micro_batch_size}
+        shuffle: False
+        num_workers: 0
+        memmap_workers: ${peft.model.data.train_ds.memmap_workers}
+        pin_memory: True
+        max_seq_length: 2048
+        min_seq_length: 1
+        drop_last: False
+        context_key: 'input'
+        label_key: 'output'
+        add_eos: ${peft.model.data.train_ds.add_eos}
+        add_sep: ${peft.model.data.train_ds.add_sep}
+        add_bos: ${peft.model.data.train_ds.add_bos}
+        separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline}
+        write_predictions_to_file: False
+        output_file_path_prefix: null # Prefix of the file to write predictions to.
+        truncation_field: "context" # Options: ['context', 'answer']
+        index_mapping_dir: null # Path to a directory to write index mapping files.
+        prompt_template: ${peft.model.data.train_ds.prompt_template}
+        tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics
+        metric:
+          name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
+          average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
+          num_classes: null
+
+  optim:
+    name: fused_adam
+    lr: 1e-4
+    weight_decay: 0.01 
+    betas: 
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 50
+      min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
+      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
+      monitor: val_loss
+      reduce_on_plateau: false
diff --git a/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml b/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml
index ea42f3c4ba..32fda8389c 100755
--- a/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml
+++ b/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 4
   max_steps: -1
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/prompt_learning/llama/squad.yaml b/launcher_scripts/conf/prompt_learning/llama/squad.yaml
new file mode 100755
index 0000000000..4336a568bc
--- /dev/null
+++ b/launcher_scripts/conf/prompt_learning/llama/squad.yaml
@@ -0,0 +1,113 @@
+run:
+  name: ${.task_name}_${.model_train_name}
+  time_limit: "01:00:00"
+  dependency: "singleton"
+  convert_name: convert_nemo
+  model_train_name: llama2_7b
+  convert_dir: ${base_results_dir}/${prompt_learning.run.model_train_name}/${prompt_learning.run.convert_name}
+  task_name: "squad"
+  results_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_${.task_name}
+
+trainer:
+  devices: 8
+  num_nodes: 4
+  accelerator: gpu
+  precision: bf16
+  logger: False
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: 4
+  max_steps: -1
+  log_every_n_steps: 10
+  val_check_interval: 200
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+
+exp_manager:
+  explicit_log_dir: ${prompt_learning.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama_prompt
+  create_wandb_logger: True
+  wandb_logger_kwargs:
+    project: nemo_llama_prompt
+    name: ${prompt_learning.run.name}
+  resume_if_exists: False
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 5
+    mode: min
+    save_nemo_on_train_end: False
+    filename: "megatron_llama_prompt_learn--{val_loss:.3f}-{step}"
+    model_parallel_size: ${prompt_learning.model.model_parallel_size}
+    save_best_model: True
+
+model:
+  seed: 1234
+  nemo_path: ${prompt_learning.run.results_dir}/results/megatron_llama_prompt.nemo # the place to save prompt learning nemo checkpoint
+  virtual_prompt_style: 'p-tuning' # One of 'p-tuning', 'prompt-tuning', or 'inference'. We recommend 'p-tuning' over 'prompt-tuning'.
+  tensor_model_parallel_size: 2
+  pipeline_model_parallel_size: 1
+  model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}}
+  encoder_seq_length: 4096
+  global_batch_size: 8
+  micro_batch_size: 1
+
+  restore_path: null # used to restore from a prompt tuned checkpoint and add new tasks
+  language_model_path: ${prompt_learning.run.convert_dir}/results/megatron_llama.nemo # Restore language model from pre-trained .nemo checkpoint
+  existing_tasks: [] # if restore from a prompt tuned checkpoint and add new tasks, existing task names should be included here.
+  new_tasks: ["squad"] # multiple tasks can be tuned at the same time
+
+  ## Sequence Parallelism
+  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  sequence_parallel: False
+  activations_checkpoint_granularity: selective
+  activations_checkpoint_num_layers: 1
+  activations_checkpoint_method: block
+
+  task_templates: # task_templates for all existing_tasks and new_tasks are required.
+  - taskname: "squad" # The task name
+    prompt_template: "<|VIRTUAL_PROMPT_0|>Context: {context} Question: {question} Answer: {answer}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|>
+    total_virtual_tokens: 10 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time.
+    virtual_token_splits: [10] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens
+    truncate_field: "context" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped.
+    answer_field: "answer" # Answer/Target field
+    answer_only_loss: True # If true, the loss will only be calculated with answer_field text vs. ground truth. If false, the loss will be calculated over entire sentence.
+
+  prompt_learning: # Prompt tuning specific params
+    new_prompt_init_methods: null # e.g ['text'], List of 'text' or 'random', should correspond to tasks listed in new tasks
+    new_prompt_init_text: null # e.g ['some init text goes here'], some init text if init method is text, or None if init method is random
+
+  p_tuning: # P-tuning specific params
+    encoder_type: "tpmlp" # ['tpmlp', 'lstm', 'biglstm', 'mlp'] 
+    dropout: 0.0
+    num_layers: 2  # number of layers for MLP or LSTM layers. Note, it has no effect for tpmlp currently as it always assumes it is two layers.
+    encoder_hidden: 2048 # encoder hidden for biglstm and tpmlp
+    init_std: 0.023  # init std for tpmlp layers
+
+  data:
+    train_ds:
+      - ${data_dir}/prompt_data/v1.1/squad_train.jsonl # multiple prompt dataset can be given at the same time
+    validation_ds:
+      - ${data_dir}/prompt_data/v1.1/squad_val.jsonl
+    add_eos: True
+    shuffle: True
+    num_workers: 4
+    pin_memory: True
+
+  optim:
+    name: fused_adam
+    lr: 2.0e-4
+    weight_decay: 0.01
+    betas:
+    - 0.9
+    - 0.98
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 50
+      constant_steps: 10
+      min_lr: 0.0 # has to be zero
+      monitor: val_loss
+      reduce_on_plateau: false
diff --git a/launcher_scripts/conf/prompt_learning/mt5/squad.yaml b/launcher_scripts/conf/prompt_learning/mt5/squad.yaml
index 19bf9c7447..99c9871ca8 100755
--- a/launcher_scripts/conf/prompt_learning/mt5/squad.yaml
+++ b/launcher_scripts/conf/prompt_learning/mt5/squad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 10
   max_steps: -1
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/prompt_learning/t5/squad.yaml b/launcher_scripts/conf/prompt_learning/t5/squad.yaml
index 755323e938..27d54627c6 100755
--- a/launcher_scripts/conf/prompt_learning/t5/squad.yaml
+++ b/launcher_scripts/conf/prompt_learning/t5/squad.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: 10
   max_steps: -1
   log_every_n_steps: 10
diff --git a/launcher_scripts/conf/quality_filtering/heuristic/english.yaml b/launcher_scripts/conf/quality_filtering/heuristic/english.yaml
new file mode 100644
index 0000000000..9e7d6fedb0
--- /dev/null
+++ b/launcher_scripts/conf/quality_filtering/heuristic/english.yaml
@@ -0,0 +1,126 @@
+run:
+  name: 'heuristic-filter-en'
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: "08:00:00"
+  dependency: "singleton"
+  nodes: 1
+  partition:
+  cpus_per_node: 48
+
+# Provide the filter module and its parameters that
+# define how each document in the dataset will be filtered
+filter:
+  filter_module: ndc.filter.heuristics.filter.CascadedHeuristicFilter
+  params:
+    # The filters below define a chain of heuristic filters to be applied to each document in a corpus.
+    # This particular cascade of filters is intended to filter English language data. 
+    # The filter listed at the top will be applied first, and the following filters will be applied in
+    # the order they appear in this file. Each filter can be removed and re-ordered as desired.
+    # New filters can be added as described in docs/1_document_filtering.rst
+    filters:
+    - name: ndc.filter.heuristics.filter.NonAlphaNumericFilter
+      params:
+        max_non_alpha_numeric_to_text_ratio: 0.25
+    - name: ndc.filter.heuristics.filter.SymbolsToWordsFilter
+      params:
+        max_symbol_to_word_ratio: 0.1
+    - name: ndc.filter.heuristics.filter.NumbersFilter
+      params:
+        max_number_to_text_ratio: 0.15
+    - name: ndc.filter.heuristics.filter.UrlsFilter
+      params:
+        max_url_to_text_ratio: 0.2
+    - name: ndc.filter.heuristics.filter.WhiteSpaceFilter
+      params:
+        max_white_space_ratio: 0.25
+    - name: ndc.filter.heuristics.filter.ParenthesesFilter
+      params:
+        max_parentheses_ratio: 0.1
+    - name: ndc.filter.heuristics.filter.BoilerPlateStringFilter
+      params:
+        remove_if_at_top_or_bottom: True
+        max_boilerplate_string_ratio: 0.4
+    - name: ndc.filter.heuristics.filter.RepeatedLinesFilter
+      params:
+        max_repeated_line_fraction: 0.7
+    - name: ndc.filter.heuristics.filter.RepeatedParagraphsFilter
+      params:
+        max_repeated_paragraphs_ratio: 0.7
+    - name: ndc.filter.heuristics.filter.RepeatedLinesByCharFilter
+      params:
+        max_repeated_lines_char_ratio: 0.8
+    - name: ndc.filter.heuristics.filter.RepeatedParagraphsByCharFilter
+      params:
+        max_repeated_paragraphs_char_ratio: 0.8
+    - name: ndc.filter.heuristics.filter.WordCountFilter
+      params:
+        min_words: 50
+        max_words: 100000
+    - name: ndc.filter.heuristics.filter.PunctuationFilter
+      params:
+        max_num_sentences_without_endmark_ratio: 0.85
+    - name: ndc.filter.heuristics.filter.WordsWithoutAlphabetsFilter
+      params:
+        max_words_without_alphabets: 0.8
+    - name: ndc.filter.heuristics.filter.CommonEnglishWordsFilter
+      params:
+        min_num_common_words: 2
+        stop_at_false: True
+    - name: ndc.filter.heuristics.filter.MeanWordLengthFilter
+      params:
+        max_mean_word_length: 10
+        min_mean_word_length: 3
+    - name: ndc.filter.heuristics.filter.LongWordFilter
+      params:
+        max_word_length: 1000
+    - name: ndc.filter.heuristics.filter.EllipsisFilter
+      params:
+        max_num_lines_ending_with_ellipsis_ratio: 0.3
+    # Top N-Gram filters for N-grams 2, 3, and 4
+    - name: ndc.filter.heuristics.filter.RepeatingTopNGramsFilter
+      params:
+        n: 2
+        max_repeating_ngram_ratio: 0.2
+    - name: ndc.filter.heuristics.filter.RepeatingTopNGramsFilter
+      params:
+        n: 3
+        max_repeating_ngram_ratio: 0.18
+    - name: ndc.filter.heuristics.filter.RepeatingTopNGramsFilter
+      params:
+        n: 4
+        max_repeating_ngram_ratio: 0.16
+    # Duplicate N-gram filters for N-grams 5, 6, 7, 8, 9, and 10
+    - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter
+      params:
+        n: 5
+        max_repeating_duplicate_ngram_ratio: 0.15
+    - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter
+      params:
+        n: 6
+        max_repeating_duplicate_ngram_ratio: 0.14
+    - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter
+      params:
+        n: 7
+        max_repeating_duplicate_ngram_ratio: 0.13
+    - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter
+      params:
+        n: 8
+        max_repeating_duplicate_ngram_ratio: 0.12
+    - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter
+      params:
+        n: 9
+        max_repeating_duplicate_ngram_ratio: 0.11
+    - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter
+      params:
+        n: 10
+        max_repeating_duplicate_ngram_ratio: 0.10
+    - name: ndc.filter.heuristics.filter.BulletsFilter
+      params:
+        max_bullet_lines_ratio: 0.9
+    # If True, the chained operation defined by the filters above
+    # will stop at the first filter that is triggered in the pipeline defined above
+    stop_at_true: True
+
+input_dir: ${data_dir}/json/original
+# Output directory to where filtered documents will be written
+output_retained_document_dir: ${data_dir}/json/filtered/high_quality
diff --git a/launcher_scripts/conf/training/bert/100b.yaml b/launcher_scripts/conf/training/bert/100b.yaml
index d63a844756..8d26a5b7b8 100755
--- a/launcher_scripts/conf/training/bert/100b.yaml
+++ b/launcher_scripts/conf/training/bert/100b.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch.
   max_steps: 860000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   max_time: "81:23:30:00"
@@ -97,11 +97,11 @@ model:
   onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
   gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
 
-  activations_checkpoint_granularity: selective
-  activations_checkpoint_method: block
-  activations_checkpoint_layers_per_pipeline: 1
-  num_micro_batches_with_partial_activation_checkpoints: 96
-  activations_checkpoint_num_layers: 0
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
+  activations_checkpoint_layers_per_pipeline: null
+  num_micro_batches_with_partial_activation_checkpoints: null
+  activations_checkpoint_num_layers: null
 
   sequence_parallel: True 
 
diff --git a/launcher_scripts/conf/training/bert/110m.yaml b/launcher_scripts/conf/training/bert/110m.yaml
index 47b2e95839..2988141040 100755
--- a/launcher_scripts/conf/training/bert/110m.yaml
+++ b/launcher_scripts/conf/training/bert/110m.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch.
   max_steps: 13800000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   max_time: "7:23:30:00"
@@ -98,11 +98,11 @@ model:
   gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
  
  
-  activations_checkpoint_granularity: selective
-  activations_checkpoint_method: block
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
   activations_checkpoint_layers_per_pipeline: null
   num_micro_batches_with_partial_activation_checkpoints: null
-  activations_checkpoint_num_layers: 0
+  activations_checkpoint_num_layers: null
 
   sequence_parallel: False 
 
diff --git a/launcher_scripts/conf/training/bert/20b.yaml b/launcher_scripts/conf/training/bert/20b.yaml
index 79312130cf..1a2d033c7e 100755
--- a/launcher_scripts/conf/training/bert/20b.yaml
+++ b/launcher_scripts/conf/training/bert/20b.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch.
   max_steps: 860000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   max_time: "90:23:30:00"
@@ -97,11 +97,11 @@ model:
   onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
   gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
 
-  activations_checkpoint_granularity: selective
-  activations_checkpoint_method: block
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
   activations_checkpoint_layers_per_pipeline: null
   num_micro_batches_with_partial_activation_checkpoints: null
-  activations_checkpoint_num_layers: 1
+  activations_checkpoint_num_layers: null
 
   sequence_parallel: True 
 
diff --git a/launcher_scripts/conf/training/bert/4b.yaml b/launcher_scripts/conf/training/bert/4b.yaml
index 5e435c48a2..484f17c998 100755
--- a/launcher_scripts/conf/training/bert/4b.yaml
+++ b/launcher_scripts/conf/training/bert/4b.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch.
   max_steps: 1720000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
   max_time: "26:23:30:00"
@@ -97,11 +97,11 @@ model:
   onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
   gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
 
-  activations_checkpoint_granularity: selective
-  activations_checkpoint_method: block
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
   activations_checkpoint_layers_per_pipeline: null
   num_micro_batches_with_partial_activation_checkpoints: null
-  activations_checkpoint_num_layers: 0 
+  activations_checkpoint_num_layers: null 
 
   sequence_parallel: False
 
diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml
index a274f8e470..27d3329756 100755
--- a/launcher_scripts/conf/training/gpt3/126m.yaml
+++ b/launcher_scripts/conf/training/gpt3/126m.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 600000 # consumed_samples = global_step * global_batch_size
   max_time: "00:23:30:00" # days:hours:minutes:seconds
@@ -68,6 +68,7 @@ model:
   num_attention_heads: 12
   init_method_std: 0.023  # Standard deviation of the zero mean normal distribution used for weight initialization.')
   hidden_dropout: 0.1  # Dropout probability for hidden state transformer.
+  attention_dropout: 0.1 # Dropout probability for attention
   kv_channels: null  # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
   apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
   layernorm_epsilon: 1e-5
@@ -85,15 +86,19 @@ model:
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
 
   ## Activation Checkpointing
-  activations_checkpoint_granularity: selective # 'selective' or 'full'
-  activations_checkpoint_method: block # 'uniform', 'block'
-  activations_checkpoint_num_layers: 0 
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block'
+  activations_checkpoint_num_layers: null 
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
 
   ## Sequence Parallelism
   sequence_parallel: False
 
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+
   tokenizer:
     library: 'megatron'
     type: 'GPT2BPETokenizer'
@@ -113,10 +118,13 @@ model:
   megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters
   grad_allreduce_chunk_size_mb: 125
 
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # To use fp8, please set `transformer_engine=True` and `fp8=True`.
   # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training
-  transformer_engine: True
+  transformer_engine: False
   fp8: False # enables fp8 in TransformerLayer forward
   fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
   fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
@@ -124,7 +132,7 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
+  fp8_wgrad: True
   ub_tp_comm_overlap: False
 
   # miscellaneous
diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml
index 31654dc33d..33125cbb9f 100755
--- a/launcher_scripts/conf/training/gpt3/175b.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 75000 # consumed_samples = global_step * global_batch_size
   max_time: "25:23:00:00"
@@ -67,6 +67,7 @@ model:
   num_attention_heads: 96
   init_method_std: 0.006  # Standard deviation of the zero mean normal distribution used for weight initialization.')
   hidden_dropout: 0.1  # Dropout probability for hidden state transformer.
+  attention_dropout: 0.1 # Dropout probability for attention
   kv_channels: null  # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
   apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
   layernorm_epsilon: 1e-5
@@ -84,15 +85,19 @@ model:
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
 
   ## Activation Checkpointing
-  activations_checkpoint_granularity: selective # 'selective' or 'full'
-  activations_checkpoint_method: block # 'uniform', 'block'
-  activations_checkpoint_num_layers: 0
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block'
+  activations_checkpoint_num_layers: null
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
 
   ## Sequence Parallelism
   sequence_parallel: True
 
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+
   tokenizer:
     library: 'megatron'
     type: 'GPT2BPETokenizer'
@@ -112,10 +117,13 @@ model:
   megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters
   grad_allreduce_chunk_size_mb: 125
 
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # To use fp8, please set `transformer_engine=True` and `fp8=True`.
   # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training
-  transformer_engine: True
+  transformer_engine: False
   fp8: False # enables fp8 in TransformerLayer forward
   fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
   fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
@@ -123,8 +131,8 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
-  ub_tp_comm_overlap: True
+  fp8_wgrad: True
+  ub_tp_comm_overlap: False
 
   # miscellaneous
   seed: 1234
@@ -132,8 +140,6 @@ model:
   use_cpu_initialization: False # Init weights on the CPU (slow for large models)
   onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
   apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
-  overlap_p2p_comm: True # Overlap p2p communication with computes
-  batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations
 
   # Nsys profiling options
   nsys_profile:
diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml
similarity index 93%
rename from launcher_scripts/conf/training/gpt3/175b_performance.yaml
rename to launcher_scripts/conf/training/gpt3/175b_fp8.yaml
index f5f2155d53..f58e86cbd9 100755
--- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml
@@ -18,7 +18,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 75000 # consumed_samples = global_step * global_batch_size
   max_time: "25:23:00:00"
@@ -54,9 +54,9 @@ exp_manager:
     buffer_size: 5
 
 model:
-  micro_batch_size: 1
+  micro_batch_size: 2
   global_batch_size: 2048
-  tensor_model_parallel_size: 4
+  tensor_model_parallel_size: 8
   pipeline_model_parallel_size: 8
   virtual_pipeline_model_parallel_size: 12 # interleaved pipeline, set to maximum
   resume_from_checkpoint: null # manually set the checkpoint file to load from
@@ -69,6 +69,7 @@ model:
   num_attention_heads: 96
   init_method_std: 0.006  # Standard deviation of the zero mean normal distribution used for weight initialization.')
   hidden_dropout: 0.1  # Dropout probability for hidden state transformer.
+  attention_dropout: 0.1 # Dropout probability for attention
   kv_channels: null  # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
   apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
   layernorm_epsilon: 1e-5
@@ -86,14 +87,16 @@ model:
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
 
   ## Activation Checkpointing
-  activations_checkpoint_granularity: selective # 'selective' or 'full'
-  activations_checkpoint_method: block # 'uniform', 'block'
-  activations_checkpoint_num_layers: 0
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block'
+  activations_checkpoint_num_layers: null
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
 
   ## Sequence Parallelism
   sequence_parallel: True
+
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
 
   tokenizer:
     library: 'megatron'
@@ -114,19 +117,22 @@ model:
   megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters
   grad_allreduce_chunk_size_mb: 125
 
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # To use fp8, please set `transformer_engine=True` and `fp8=True`.
   # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training
-  transformer_engine: True
-  fp8: False # enables fp8 in TransformerLayer forward
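+  transformer_engine: False # NOTE(review): fp8 is True below, yet the comment above asks for transformer_engine=True to use fp8 - confirm mcore_gpt makes this flag unnecessary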
+  fp8: True # enables fp8 in TransformerLayer forward
   fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
   fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
   fp8_margin: 0 # scaling margin
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
-  ub_tp_comm_overlap: True
+  fp8_wgrad: True
+  ub_tp_comm_overlap: False
 
   # miscellaneous
   seed: 1234
@@ -136,7 +142,6 @@ model:
   apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
   overlap_p2p_comm: True # Overlap p2p communication with computes
   batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations
-  gc_interval: 100 # Interval of the host memory garbage collection
 
   # Nsys profiling options
   nsys_profile:
diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml
index 3e0cb4da5f..0b0c73421f 100644
--- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 300000 # consumed_samples = global_step * global_batch_size
   max_time: "02:23:30:00" # days:hours:minutes:seconds
@@ -70,7 +70,8 @@ model:
   ffn_dropout: 0.0
   kv_channels: null
   apply_query_key_layer_scaling: true
-  normalization: layernorm1p
+  normalization: LayerNorm
+  layernorm_zero_centered_gamma: True
   layernorm_epsilon: 1.0e-05
   do_layer_norm_weight_decay: false
   make_vocab_size_divisible_by: 128
@@ -113,13 +114,20 @@ model:
   apex_transformer_log_level: 30
   gradient_as_bucket_view: true
   sync_batch_comm: false
-  activations_checkpoint_granularity: selective
-  activations_checkpoint_method: block
-  activations_checkpoint_num_layers: 0
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
+  activations_checkpoint_num_layers: null
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false # does not support sequence parallel
 
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # fp8 training is currently not supported in the improved models
   transformer_engine: False
@@ -130,7 +138,7 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
+  fp8_wgrad: True
   ub_tp_comm_overlap: False
 
   optim:
diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml
index 390cc42f9b..f748e32bd4 100755
--- a/launcher_scripts/conf/training/gpt3/20b.yaml
+++ b/launcher_scripts/conf/training/gpt3/20b.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 75000 # consumed_samples = global_step * global_batch_size
   max_time: "06:23:30:00"
@@ -67,6 +67,7 @@ model:
   num_attention_heads: 48
   init_method_std: 0.008165  # Standard deviation of the zero mean normal distribution used for weight initialization.')
   hidden_dropout: 0.1  # Dropout probability for hidden state transformer.
+  attention_dropout: 0.1 # Dropout probability for attention
   kv_channels: null  # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
   apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
   layernorm_epsilon: 1e-5
@@ -84,15 +85,19 @@ model:
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
 
   ## Activation Checkpointing
-  activations_checkpoint_granularity: selective # 'selective' or 'full'
-  activations_checkpoint_method: block # 'uniform', 'block'
-  activations_checkpoint_num_layers: 0
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block'
+  activations_checkpoint_num_layers: null
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
 
   ## Sequence Parallelism
   sequence_parallel: True
 
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+
   tokenizer:
     library: 'megatron'
     type: 'GPT2BPETokenizer'
@@ -112,10 +117,13 @@ model:
   megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters
   grad_allreduce_chunk_size_mb: 125
 
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # To use fp8, please set `transformer_engine=True` and `fp8=True`.
   # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training
-  transformer_engine: True
+  transformer_engine: False
   fp8: False # enables fp8 in TransformerLayer forward
   fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
   fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
@@ -123,8 +131,8 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
-  ub_tp_comm_overlap: True
+  fp8_wgrad: True
+  ub_tp_comm_overlap: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml
index a1f358710a..f6cdbdea66 100644
--- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 600000 # consumed_samples = global_step * global_batch_size
   max_time: "01:23:30:00" # days:hours:minutes:seconds
@@ -70,7 +70,8 @@ model:
   ffn_dropout: 0.0
   kv_channels: null
   apply_query_key_layer_scaling: true
-  normalization: layernorm1p
+  normalization: LayerNorm
+  layernorm_zero_centered_gamma: True
   layernorm_epsilon: 1.0e-05
   do_layer_norm_weight_decay: false
   make_vocab_size_divisible_by: 128
@@ -113,13 +114,20 @@ model:
   apex_transformer_log_level: 30
   gradient_as_bucket_view: true
   sync_batch_comm: false
-  activations_checkpoint_granularity: selective
-  activations_checkpoint_method: block
-  activations_checkpoint_num_layers: 0
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
+  activations_checkpoint_num_layers: null
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false # does not support sequence parallel
 
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # fp8 training is currently not supported in the improved models
   transformer_engine: False
@@ -130,7 +138,7 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
+  fp8_wgrad: True
   ub_tp_comm_overlap: False
 
   optim:
diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml
index 17dd1cb291..e689fe7d4e 100755
--- a/launcher_scripts/conf/training/gpt3/40b.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 75000 # consumed_samples = global_step * global_batch_size
   max_time: "6:11:00:00"
@@ -67,6 +67,7 @@ model:
   num_attention_heads: 64
   init_method_std: 0.007  # Standard deviation of the zero mean normal distribution used for weight initialization.')
   hidden_dropout: 0.1  # Dropout probability for hidden state transformer.
+  attention_dropout: 0.1 # Dropout probability for attention
   kv_channels: null  # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
   apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
   layernorm_epsilon: 1e-5
@@ -84,15 +85,19 @@ model:
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
 
   ## Activation Checkpointing
-  activations_checkpoint_granularity: selective # 'selective' or 'full'
-  activations_checkpoint_method: block # 'uniform', 'block'
-  activations_checkpoint_num_layers: 0
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block'
+  activations_checkpoint_num_layers: null
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
 
   ## Sequence Parallelism
   sequence_parallel: True
 
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+
   tokenizer:
     library: 'megatron'
     type: 'GPT2BPETokenizer'
@@ -112,10 +117,13 @@ model:
   megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters
   grad_allreduce_chunk_size_mb: 125
 
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # To use fp8, please set `transformer_engine=True` and `fp8=True`.
   # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training
-  transformer_engine: True
+  transformer_engine: False
   fp8: False # enables fp8 in TransformerLayer forward
   fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
   fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
@@ -123,8 +131,8 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
-  ub_tp_comm_overlap: True
+  fp8_wgrad: True
+  ub_tp_comm_overlap: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml
index 6ab37ff8c5..077d3cb5ee 100644
--- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 100000 # consumed_samples = global_step * global_batch_size
   max_time: "6:11:00:00" # days:hours:minutes:seconds
@@ -70,7 +70,8 @@ model:
   ffn_dropout: 0.0
   kv_channels: null
   apply_query_key_layer_scaling: true
-  normalization: layernorm1p
+  normalization: LayerNorm
+  layernorm_zero_centered_gamma: True
   layernorm_epsilon: 1.0e-05
   do_layer_norm_weight_decay: false
   make_vocab_size_divisible_by: 128
@@ -113,13 +114,20 @@ model:
   apex_transformer_log_level: 30
   gradient_as_bucket_view: true
   sync_batch_comm: false
-  activations_checkpoint_granularity: selective
-  activations_checkpoint_method: block
-  activations_checkpoint_num_layers: 1
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
+  activations_checkpoint_num_layers: null
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false # does not support sequence parallel
   
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # fp8 training is currently not supported in the improved models
   transformer_engine: False
@@ -130,7 +138,7 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
+  fp8_wgrad: True
   ub_tp_comm_overlap: False
 
   optim:
diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml
index 59be251fcc..388c052121 100755
--- a/launcher_scripts/conf/training/gpt3/5b.yaml
+++ b/launcher_scripts/conf/training/gpt3/5b.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 75000 # consumed_samples = global_step * global_batch_size
   max_time: "05:23:30:00"
@@ -67,6 +67,7 @@ model:
   num_attention_heads: 32
   init_method_std: 0.01  # Standard deviation of the zero mean normal distribution used for weight initialization.')
   hidden_dropout: 0.1  # Dropout probability for hidden state transformer.
+  attention_dropout: 0.1 # Dropout probability for attention
   kv_channels: null  # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
   apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
   layernorm_epsilon: 1e-5
@@ -84,15 +85,19 @@ model:
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
 
   ## Activation Checkpointing
-  activations_checkpoint_granularity: selective # 'selective' or 'full'
-  activations_checkpoint_method: block # 'uniform', 'block'
-  activations_checkpoint_num_layers: 0 
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block'
+  activations_checkpoint_num_layers: null 
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
 
   ## Sequence Parallelism
   sequence_parallel: False
 
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+
   tokenizer:
     library: 'megatron'
     type: 'GPT2BPETokenizer'
@@ -112,10 +117,13 @@ model:
   megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters
   grad_allreduce_chunk_size_mb: 125
 
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # To use fp8, please set `transformer_engine=True` and `fp8=True`.
   # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training
-  transformer_engine: True
+  transformer_engine: False
   fp8: False # enables fp8 in TransformerLayer forward
   fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
   fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
@@ -123,7 +131,7 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
+  fp8_wgrad: True
   ub_tp_comm_overlap: False
 
   # miscellaneous
diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml
index 9e2ca1d61b..9c3258b195 100644
--- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml
@@ -15,7 +15,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 300000 # consumed_samples = global_step * global_batch_size
   max_time: "05:23:30:00" # days:hours:minutes:seconds
@@ -70,7 +70,8 @@ model:
   ffn_dropout: 0.0
   kv_channels: null
   apply_query_key_layer_scaling: true
-  normalization: layernorm1p
+  normalization: LayerNorm
+  layernorm_zero_centered_gamma: True
   layernorm_epsilon: 1.0e-05
   do_layer_norm_weight_decay: false
   make_vocab_size_divisible_by: 128
@@ -113,13 +114,20 @@ model:
   apex_transformer_log_level: 30
   gradient_as_bucket_view: true
   sync_batch_comm: false
-  activations_checkpoint_granularity: selective
-  activations_checkpoint_method: block
-  activations_checkpoint_num_layers: 8
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
+  activations_checkpoint_num_layers: null
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: false # does not support sequence parallel
 
+  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
+
+  ## Using Megatron Core
+  mcore_gpt: True
+
   ## Transformer Engine
   # fp8 training is currently not supported in the improved models
   transformer_engine: False
@@ -130,7 +138,7 @@ model:
   fp8_interval: 1 # scaling update interval
   fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
-  use_emha: False
+  fp8_wgrad: True
   ub_tp_comm_overlap: False
 
   optim:
diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml
deleted file mode 100644
index 33bbffb7ce..0000000000
--- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml
+++ /dev/null
@@ -1,53 +0,0 @@
-# UB communicator configurations
-# Model configs: A100/175B/TP4/MBS1/SeqLen2K/BF16
-    
-# Bulk overlap with AllGather
-qkv_dgrad:
-  method: bulk
-  num_sm: 2
-  set_sm_margin: 0
-
-qkv_wgrad:
-  method: bulk
-  num_sm: 2
-  set_sm_margin: 0
-
-fc1_dgrad:
-  method: bulk
-  num_sm: 2
-  set_sm_margin: 0
-
-fc1_wgrad:
-  method: bulk
-  num_sm: 2
-  set_sm_margin: 0
-
-## Ring-exchange overlap with AllGather
-qkv_fprop:
-  method: ring_exchange
-  aggregate: 0
-
-proj_dgrad:
-  method: ring_exchange
-  aggregate: 0
-
-fc1_fprop:
-  method: ring_exchange
-  aggregate: 0
-
-fc2_dgrad:
-  method: ring_exchange
-  aggregate: 0
-
-# Chunked-collective overlap with ReduceScatter
-proj_fprop:
-  method: pipeline
-  num_sm: 4
-  num_splits: 4
-  set_sm_margin: 0
-
-fc2_fprop:
-  method: pipeline
-  num_sm: 4
-  num_splits: 4
-  set_sm_margin: 0
\ No newline at end of file
diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml
deleted file mode 100644
index 434e0a29f4..0000000000
--- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml
+++ /dev/null
@@ -1,53 +0,0 @@
-# UB communicator configurations
-# Model configs: A100/175B/TP4/MBS2/SeqLen2K/BF16
-
-# Bulk overlap with AllGather
-qkv_dgrad:
-  method: bulk
-  num_sm: 2
-  set_sm_margin: 0
-
-qkv_wgrad:
-  method: bulk
-  num_sm: 2
-  set_sm_margin: 0
-
-fc1_dgrad:
-  method: bulk
-  num_sm: 2
-  set_sm_margin: 0
-
-fc1_wgrad:
-  method: bulk
-  num_sm: 2
-  set_sm_margin: 0
-
-## Ring-exchange overlap with AllGather
-qkv_fprop:
-  method: ring_exchange
-  aggregate: 0
-
-proj_dgrad:
-  method: ring_exchange
-  aggregate: 0
-
-fc1_fprop:
-  method: ring_exchange
-  aggregate: 0
-
-fc2_dgrad:
-  method: ring_exchange
-  aggregate: 0
-
-# Chunked-collective overlap with ReduceScatter
-proj_fprop:
-  method: pipeline
-  num_sm: 8
-  num_splits: 4
-  set_sm_margin: 0
-
-fc2_fprop:
-  method: pipeline
-  num_sm: 4
-  num_splits: 4
-  set_sm_margin: 0
diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml
deleted file mode 100644
index 21d02f3dd2..0000000000
--- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-# UB communicator configurations
-# Model configs: H100/175B/TP4/MBS1/SeqLen2K/FP8
-
-# Bulk overlap with AllGather / ReduceScatter
-qkv_dgrad:
-  method: bulk
-  num_sm: 4
-  cga_size: 2
-  set_sm_margin: 0
-
-qkv_wgrad:
-  method: bulk
-  num_sm: 8
-  cga_size: 2
-  set_sm_margin: 0
-
-fc1_dgrad:
-  method: bulk
-  num_sm: 2
-  cga_size: 2
-  set_sm_margin: 0
-
-fc1_wgrad:
-  method: bulk
-  num_sm: 4
-  cga_size: 2
-  set_sm_margin: 0
-
-## Ring-exchange overlap with AllGather
-qkv_fprop:
-  method: ring_exchange
-  aggregate: 0
-
-proj_dgrad:
-  method: ring_exchange
-  aggregate: 0
-
-fc1_fprop:
-  method: ring_exchange
-  aggregate: 0
-
-fc2_dgrad:
-  method: ring_exchange
-  aggregate: 1
-
-# Chunked-collective overlap with ReduceScatter
-proj_fprop:
-  method: pipeline
-  num_sm: 24
-  cga_size: 2
-  num_splits: 4
-  set_sm_margin: 1
-
-fc2_fprop:
-  method: pipeline
-  num_sm: 20
-  cga_size: 2
-  num_splits: 4
-  set_sm_margin: 1
diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml
deleted file mode 100644
index 444c8245e0..0000000000
--- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-# UB communicator configurations
-# Model configs: H100/175B/TP8/MBS2/SeqLen2K/FP8
-
-# Bulk overlap with AllGather
-qkv_dgrad:
-  method: bulk
-  num_sm: 8
-  cga_size: 2
-  set_sm_margin: 0
-
-qkv_wgrad:
-  method: bulk
-  num_sm: 16
-  cga_size: 2
-  set_sm_margin: 0
-
-fc1_dgrad:
-  method: bulk
-  num_sm: 4
-  cga_size: 2
-  set_sm_margin: 0
-
-fc1_wgrad:
-  method: bulk
-  num_sm: 16
-  cga_size: 2
-  set_sm_margin: 0
-
-## Ring-exchange overlap with AllGather
-qkv_fprop:
-  method: ring_exchange
-  aggregate: 0
-
-proj_dgrad:
-  method: ring_exchange
-  aggregate: 1
-
-fc1_fprop:
-  method: ring_exchange
-  aggregate: 0
-
-fc2_dgrad:
-  method: ring_exchange
-  aggregate: 0
-
-# Chunked-collective overlap with ReduceScatter
-proj_fprop:
-  method: pipeline
-  num_sm: 16
-  cga_size: 2
-  num_splits: 4
-  set_sm_margin: 1
-
-fc2_fprop:
-  method: pipeline
-  num_sm: 24
-  cga_size: 2
-  num_splits: 4
-  set_sm_margin: 1
diff --git a/launcher_scripts/conf/training/llama/llama1_13b.yaml b/launcher_scripts/conf/training/llama/llama1_13b.yaml
new file mode 100644
index 0000000000..3c2fd60daf
--- /dev/null
+++ b/launcher_scripts/conf/training/llama/llama1_13b.yaml
@@ -0,0 +1,217 @@
+run:
+  name: llama_13b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: 0-02:00:00
+  dependency: singleton
+trainer:
+  num_nodes: 32
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: false
+  enable_checkpointing: false
+  use_distributed_sampler: false
+  max_epochs: null
+  max_steps: 300000
+  max_time: '5:23:30:00'
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: true
+  wandb_logger_kwargs:
+    project: nemo_llama
+    name: ${training.run.name}
+  resume_if_exists: true
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: false
+    save_nemo_on_train_end: false
+    filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size},
+      ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: true
+  step_timing_kwargs:
+    sync_cuda: true
+    buffer_size: 5
+model:
+  mcore_gpt: true
+  micro_batch_size: 2
+  global_batch_size: 2048
+  rampup_batch_size: null
+  tensor_model_parallel_size: 2
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  encoder_seq_length: 2048
+  max_position_embeddings: 2048
+  num_layers: 40
+  hidden_size: 5120
+  ffn_hidden_size: 13824
+  num_attention_heads: 40
+  init_method_std: 0.01
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: sentencepiece
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: false
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: 0
+  activations_checkpoint_layers_per_pipeline: 0
+  sequence_parallel: true
+  transformer_engine: true
+  fp8: false
+  fp8_e4m3: false
+  fp8_hybrid: false
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: most_recent
+  use_emha: false
+  ub_tp_comm_overlap: false
+  use_flash_attention: true
+  optim:
+    name: distributed_fused_adam
+    lr: 0.0001
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 107
+      constant_steps: 11873
+      min_lr: 1.0e-05
+  data:
+    data_impl: mmap
+    splits_string: 99990,8,2
+    seq_length: 2048
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/launcher_scripts/conf/training/llama/llama1_30b.yaml b/launcher_scripts/conf/training/llama/llama1_30b.yaml
new file mode 100644
index 0000000000..93dee04071
--- /dev/null
+++ b/launcher_scripts/conf/training/llama/llama1_30b.yaml
@@ -0,0 +1,216 @@
+run:
+  name: llama_30b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: 0-01:00:00
+  dependency: singleton
+trainer:
+  num_nodes: 32
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: false
+  enable_checkpointing: false
+  use_distributed_sampler: false
+  max_epochs: null
+  max_steps: 300000
+  max_time: '19:23:30:00'
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: true
+  wandb_logger_kwargs:
+    project: nemo_llama
+    name: ${training.run.name}
+  resume_if_exists: true
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: false
+    save_nemo_on_train_end: false
+    filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: true
+  step_timing_kwargs:
+    sync_cuda: true
+    buffer_size: 5
+model:
+  mcore_gpt: true
+  micro_batch_size: 1
+  global_batch_size: 2048
+  rampup_batch_size: null
+  tensor_model_parallel_size: 4
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  encoder_seq_length: 2048
+  max_position_embeddings: 2048
+  num_layers: 60
+  hidden_size: 6656
+  ffn_hidden_size: 17920
+  num_attention_heads: 52
+  init_method_std: 0.008944
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: sentencepiece
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: false
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: 0
+  activations_checkpoint_layers_per_pipeline: 0
+  sequence_parallel: true
+  transformer_engine: true
+  fp8: false
+  fp8_e4m3: false
+  fp8_hybrid: false
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: most_recent
+  use_emha: false
+  ub_tp_comm_overlap: false
+  use_flash_attention: true
+  optim:
+    name: distributed_fused_adam
+    lr: 0.0001
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 107
+      constant_steps: 11873
+      min_lr: 1.0e-05
+  data:
+    data_impl: mmap
+    splits_string: 99990,8,2
+    seq_length: 2048
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/launcher_scripts/conf/training/llama/llama1_65b.yaml b/launcher_scripts/conf/training/llama/llama1_65b.yaml
new file mode 100644
index 0000000000..d39259caae
--- /dev/null
+++ b/launcher_scripts/conf/training/llama/llama1_65b.yaml
@@ -0,0 +1,216 @@
+run:
+  name: llama_65b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: 0-01:00:00
+  dependency: singleton
+trainer:
+  num_nodes: 128
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: false
+  enable_checkpointing: false
+  use_distributed_sampler: false
+  max_epochs: null
+  max_steps: 300000
+  max_time: '19:23:30:00'
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: true
+  wandb_logger_kwargs:
+    project: nemo_llama
+    name: ${training.run.name}
+  resume_if_exists: true
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: false
+    save_nemo_on_train_end: false
+    filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: true
+  step_timing_kwargs:
+    sync_cuda: true
+    buffer_size: 5
+model:
+  mcore_gpt: true
+  micro_batch_size: 1
+  global_batch_size: 2048
+  rampup_batch_size: null
+  tensor_model_parallel_size: 4
+  pipeline_model_parallel_size: 4
+  virtual_pipeline_model_parallel_size: 20
+  encoder_seq_length: 2048
+  max_position_embeddings: 2048
+  num_layers: 80
+  hidden_size: 8192
+  ffn_hidden_size: 22016
+  num_attention_heads: 64
+  init_method_std: 0.008944
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: sentencepiece
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: false
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: 0
+  activations_checkpoint_layers_per_pipeline: 0
+  sequence_parallel: true
+  transformer_engine: true
+  fp8: false
+  fp8_e4m3: false
+  fp8_hybrid: false
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: most_recent
+  use_emha: false
+  ub_tp_comm_overlap: false
+  use_flash_attention: true
+  optim:
+    name: distributed_fused_adam
+    lr: 0.0001
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 107
+      constant_steps: 11873
+      min_lr: 1.0e-05
+  data:
+    data_impl: mmap
+    splits_string: 99990,8,2
+    seq_length: 2048
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/launcher_scripts/conf/training/llama/llama1_7b.yaml b/launcher_scripts/conf/training/llama/llama1_7b.yaml
new file mode 100644
index 0000000000..a8acb21e7d
--- /dev/null
+++ b/launcher_scripts/conf/training/llama/llama1_7b.yaml
@@ -0,0 +1,220 @@
+run:
+  name: llama_7b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: "0-04:00:00"
+  dependency: "singleton"
+trainer:
+  num_nodes: 16
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: null
+  max_steps: 300000 # consumed_samples = global_step * global_batch_size
+  max_time: "05:23:30:00" # days:hours:minutes:seconds
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: True
+  wandb_logger_kwargs:
+    project: nemo_llama
+    name: ${training.run.name}
+  resume_if_exists: True
+  resume_ignore_no_checkpoint: True
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
+    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
+    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: True
+  step_timing_kwargs:
+    sync_cuda: True
+    buffer_size: 5
+
+model:
+  mcore_gpt: true
+  micro_batch_size: 2
+  global_batch_size: 2048
+  rampup_batch_size: null
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  encoder_seq_length: 2048
+  max_position_embeddings: 2048
+  num_layers: 32
+  hidden_size: 4096
+  ffn_hidden_size: 11008
+  num_attention_heads: 32
+  init_method_std: 0.01
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: 'sentencepiece'
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: False
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: null
+  activations_checkpoint_layers_per_pipeline: null
+  sequence_parallel: false # not applicable here: sequence parallelism needs tensor_model_parallel_size > 1 (it is 1 in this config)
+
+  ## Transformer Engine
+  # fp8 training is currently not supported in the improved models
+  transformer_engine: True
+  fp8: False # enables fp8 in TransformerLayer forward
+  fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
+  fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
+  fp8_margin: 0 # scaling margin
+  fp8_interval: 1 # scaling update interval
+  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
+  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
+  use_emha: False
+  ub_tp_comm_overlap: False
+  use_flash_attention: True
+  optim:
+    name: distributed_fused_adam
+    lr: 1e-4
+    weight_decay: 0.1
+    betas:
+      - 0.9
+      - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 500
+      constant_steps: 0
+      min_lr: 1e-5
+  data:
+    data_impl: mmap
+    splits_string: "99990,8,2"
+    seq_length: 2048
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml
new file mode 100644
index 0000000000..3d4dc8d0b1
--- /dev/null
+++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml
@@ -0,0 +1,217 @@
+run:
+  name: llama2_13b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: 0-01:00:00
+  dependency: singleton
+trainer:
+  num_nodes: 32
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: false
+  enable_checkpointing: false
+  use_distributed_sampler: false
+  max_epochs: null
+  max_steps: 300000
+  max_time: '5:23:30:00'
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: true
+  wandb_logger_kwargs:
+    project: nemo_llama_pretrain
+    name: ${training.run.name}
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: false
+    save_nemo_on_train_end: true
+    filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size},
+      ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: true
+  step_timing_kwargs:
+    sync_cuda: true
+    buffer_size: 5
+model:
+  mcore_gpt: true
+  micro_batch_size: 1
+  global_batch_size: 2048
+  rampup_batch_size: null
+  tensor_model_parallel_size: 2
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  encoder_seq_length: 4096
+  max_position_embeddings: 4096
+  num_layers: 40
+  hidden_size: 5120
+  ffn_hidden_size: 13824
+  num_attention_heads: 40
+  init_method_std: 0.01
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: sentencepiece
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: false
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: 0
+  activations_checkpoint_layers_per_pipeline: 0
+  sequence_parallel: true
+  transformer_engine: true
+  fp8: false
+  fp8_e4m3: false
+  fp8_hybrid: false
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: most_recent
+  use_emha: false
+  ub_tp_comm_overlap: false
+  use_flash_attention: true
+  optim:
+    name: distributed_fused_adam
+    lr: 0.0001
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 107
+      constant_steps: 11873
+      min_lr: 1.0e-05
+  data:
+    data_impl: mmap
+    splits_string: 99990,8,2
+    seq_length: 4096
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml
new file mode 100644
index 0000000000..0beb5f8bca
--- /dev/null
+++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml
@@ -0,0 +1,220 @@
+run:
+  name: llama2_70b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: 0-01:00:00
+  dependency: singleton
+trainer:
+  num_nodes: 128
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: false # logger provided by exp_manager
+  enable_checkpointing: false
+  use_distributed_sampler: False
+  max_epochs: null
+  max_steps: 300000 # consumed_samples = global_step * global_batch_size
+  max_time: '19:23:30:00' # days:hours:minutes:seconds
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: true
+  wandb_logger_kwargs:
+    project: nemo_llama_pretrain
+    name: ${training.run.name}
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: false # saves nemo file during validation, not implemented for model parallel
+    save_nemo_on_train_end: true # NOTE(review): llama2_7b.yaml sets this to False and calls it "not recommended when training large models on clusters with short time limits" - confirm true is intended here
+    filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} # tensor_model_parallel_size * pipeline_model_parallel_size
+  log_step_timing: true
+  step_timing_kwargs:
+    sync_cuda: true
+    buffer_size: 5
+model:
+  mcore_gpt: true
+  micro_batch_size: 1
+  global_batch_size: 2048
+  rampup_batch_size: null
+  tensor_model_parallel_size: 4
+  pipeline_model_parallel_size: 4
+  virtual_pipeline_model_parallel_size: 20
+  encoder_seq_length: 4096
+  max_position_embeddings: 4096
+  num_layers: 80
+  hidden_size: 8192
+  ffn_hidden_size: 28672
+  num_attention_heads: 64
+  num_query_groups: 8
+  init_method_std: 0.008944
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: sentencepiece
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: false
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: 0
+  activations_checkpoint_layers_per_pipeline: 0
+  sequence_parallel: true
+  transformer_engine: true
+  fp8: false # enables fp8 in TransformerLayer forward
+  fp8_e4m3: false # sets fp8_format = recipe.Format.E4M3
+  fp8_hybrid: false # sets fp8_format = recipe.Format.HYBRID
+  fp8_margin: 0 # scaling margin
+  fp8_interval: 1 # scaling update interval
+  fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor (NOTE(review): llama2_7b.yaml uses 1024 - confirm)
+  fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history (NOTE(review): llama2_7b.yaml uses max - confirm)
+  use_emha: false
+  ub_tp_comm_overlap: false
+  use_flash_attention: true
+  overlap_p2p_comm: true
+  batch_p2p_comm: false
+  gc_interval: 100
+  optim:
+    name: distributed_fused_adam
+    lr: 0.00015
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 2000
+      constant_steps: 11873
+      min_lr: 1.0e-05
+  data:
+    data_impl: mmap
+    splits_string: 99990,8,2
+    seq_length: 4096
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml
new file mode 100644
index 0000000000..7df5de9940
--- /dev/null
+++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml
@@ -0,0 +1,220 @@
+run:
+  name: llama2_7b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: "0-01:30:00"
+  dependency: "singleton"
+trainer:
+  num_nodes: 16
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: null
+  max_steps: 300000 # consumed_samples = global_step * global_batch_size
+  max_time: "05:23:30:00" # days:hours:minutes:seconds
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_llama
+  create_wandb_logger: true
+  wandb_logger_kwargs:
+    project: nemo_llama_pretrain
+    name: ${training.run.name}
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
+    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
+    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: True
+  step_timing_kwargs:
+    sync_cuda: True
+    buffer_size: 5
+
+model:
+  mcore_gpt: true
+  micro_batch_size: 1
+  global_batch_size: 2048
+  rampup_batch_size: null
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  encoder_seq_length: 4096
+  max_position_embeddings: 4096
+  num_layers: 32
+  hidden_size: 4096
+  ffn_hidden_size: 11008
+  num_attention_heads: 32
+  init_method_std: 0.01
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0.0
+  kv_channels: null
+  apply_query_key_layer_scaling: true
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  tokenizer:
+    library: 'sentencepiece'
+    type: null
+    model: ${data_dir}/llama/llama_tokenizer.model
+    delimiter: null
+    vocab_file: null
+    merge_file: null
+    sentencepiece_legacy: False
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: true
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: true
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: block
+  activations_checkpoint_num_layers: 0
+  num_micro_batches_with_partial_activation_checkpoints: null
+  activations_checkpoint_layers_per_pipeline: null
+  sequence_parallel: false
+
+  ## Transformer Engine
+  # fp8 training is currently not supported in the improved models
+  transformer_engine: true
+  fp8: False # enables fp8 in TransformerLayer forward
+  fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
+  fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
+  fp8_margin: 0 # scaling margin
+  fp8_interval: 1 # scaling update interval
+  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
+  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
+  use_emha: False
+  ub_tp_comm_overlap: False
+  use_flash_attention: true
+  optim:
+    name: distributed_fused_adam
+    lr: 1e-4
+    weight_decay: 0.1
+    betas:
+      - 0.9
+      - 0.95
+    bucket_cap_mb: 125
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true    
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 500
+      constant_steps: 0
+      min_lr: 1e-5
+  data:
+    data_impl: mmap
+    splits_string: "99990,8,2"
+    seq_length: 4096
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-llama_00_text_document
+    - .0333
+    - ${data_dir}/my-llama_01_text_document
+    - .0333
+    - ${data_dir}/my-llama_02_text_document
+    - .0333
+    - ${data_dir}/my-llama_03_text_document
+    - .0333
+    - ${data_dir}/my-llama_04_text_document
+    - .0333
+    - ${data_dir}/my-llama_05_text_document
+    - .0333
+    - ${data_dir}/my-llama_06_text_document
+    - .0333
+    - ${data_dir}/my-llama_07_text_document
+    - .0333
+    - ${data_dir}/my-llama_08_text_document
+    - .0333
+    - ${data_dir}/my-llama_09_text_document
+    - .0333
+    - ${data_dir}/my-llama_10_text_document
+    - .0333
+    - ${data_dir}/my-llama_11_text_document
+    - .0333
+    - ${data_dir}/my-llama_12_text_document
+    - .0333
+    - ${data_dir}/my-llama_13_text_document
+    - .0333
+    - ${data_dir}/my-llama_14_text_document
+    - .0333
+    - ${data_dir}/my-llama_15_text_document
+    - .0333
+    - ${data_dir}/my-llama_16_text_document
+    - .0333
+    - ${data_dir}/my-llama_17_text_document
+    - .0333
+    - ${data_dir}/my-llama_18_text_document
+    - .0333
+    - ${data_dir}/my-llama_19_text_document
+    - .0333
+    - ${data_dir}/my-llama_20_text_document
+    - .0333
+    - ${data_dir}/my-llama_21_text_document
+    - .0333
+    - ${data_dir}/my-llama_22_text_document
+    - .0333
+    - ${data_dir}/my-llama_23_text_document
+    - .0333
+    - ${data_dir}/my-llama_24_text_document
+    - .0333
+    - ${data_dir}/my-llama_25_text_document
+    - .0333
+    - ${data_dir}/my-llama_26_text_document
+    - .0333
+    - ${data_dir}/my-llama_27_text_document
+    - .0333
+    - ${data_dir}/my-llama_28_text_document
+    - .0334
+    - ${data_dir}/my-llama_29_text_document
+
diff --git a/launcher_scripts/conf/training/mt5/11b.yaml b/launcher_scripts/conf/training/mt5/11b.yaml
index f6d6a67fc1..3111159db4 100755
--- a/launcher_scripts/conf/training/mt5/11b.yaml
+++ b/launcher_scripts/conf/training/mt5/11b.yaml
@@ -16,7 +16,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1066667 # consumed_samples = global_step * global_batch_size
   max_time: "44:23:30:00"
diff --git a/launcher_scripts/conf/training/mt5/170m.yaml b/launcher_scripts/conf/training/mt5/170m.yaml
index 49a04fc2a2..b166c26496 100755
--- a/launcher_scripts/conf/training/mt5/170m.yaml
+++ b/launcher_scripts/conf/training/mt5/170m.yaml
@@ -16,7 +16,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1000000 # consumed_samples = global_step * global_batch_size
   max_time: "06:23:30:00"
diff --git a/launcher_scripts/conf/training/mt5/23b.yaml b/launcher_scripts/conf/training/mt5/23b.yaml
index d38ea399cf..dab9d9504e 100755
--- a/launcher_scripts/conf/training/mt5/23b.yaml
+++ b/launcher_scripts/conf/training/mt5/23b.yaml
@@ -16,7 +16,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1066667 # consumed_samples = global_step * global_batch_size
   max_time: "54:23:30:00"
diff --git a/launcher_scripts/conf/training/mt5/390m.yaml b/launcher_scripts/conf/training/mt5/390m.yaml
index 479b533b3f..c03436bb8b 100755
--- a/launcher_scripts/conf/training/mt5/390m.yaml
+++ b/launcher_scripts/conf/training/mt5/390m.yaml
@@ -16,7 +16,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1000000 # consumed_samples = global_step * global_batch_size
   max_time: "06:23:30:00"
diff --git a/launcher_scripts/conf/training/mt5/3b.yaml b/launcher_scripts/conf/training/mt5/3b.yaml
index 3a0df27e4c..96b2c367bb 100755
--- a/launcher_scripts/conf/training/mt5/3b.yaml
+++ b/launcher_scripts/conf/training/mt5/3b.yaml
@@ -16,7 +16,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1066667 # consumed_samples = global_step * global_batch_size
   max_time: "17:23:30:00"
diff --git a/launcher_scripts/conf/training/t5/11b.yaml b/launcher_scripts/conf/training/t5/11b.yaml
index 9ee9b3288d..0f47b6e5e7 100755
--- a/launcher_scripts/conf/training/t5/11b.yaml
+++ b/launcher_scripts/conf/training/t5/11b.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1066667 # consumed_samples = global_step * global_batch_size
   max_time: "44:23:30:00"
diff --git a/launcher_scripts/conf/training/t5/220m.yaml b/launcher_scripts/conf/training/t5/220m.yaml
index 2b1549dc8c..73f56344a5 100755
--- a/launcher_scripts/conf/training/t5/220m.yaml
+++ b/launcher_scripts/conf/training/t5/220m.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1000000 # consumed_samples = global_step * global_batch_size
   max_time: "06:23:30:00"
diff --git a/launcher_scripts/conf/training/t5/23b.yaml b/launcher_scripts/conf/training/t5/23b.yaml
index 30ae8d6037..1050285cc7 100755
--- a/launcher_scripts/conf/training/t5/23b.yaml
+++ b/launcher_scripts/conf/training/t5/23b.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1066667 # consumed_samples = global_step * global_batch_size
   max_time: "54:23:30:00"
diff --git a/launcher_scripts/conf/training/t5/3b.yaml b/launcher_scripts/conf/training/t5/3b.yaml
index a2f4c99e59..02c51654fc 100755
--- a/launcher_scripts/conf/training/t5/3b.yaml
+++ b/launcher_scripts/conf/training/t5/3b.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1066667 # consumed_samples = global_step * global_batch_size
   max_time: "14:23:30:00"
diff --git a/launcher_scripts/conf/training/t5/41b.yaml b/launcher_scripts/conf/training/t5/41b.yaml
index 6d23f6e670..599e389f16 100755
--- a/launcher_scripts/conf/training/t5/41b.yaml
+++ b/launcher_scripts/conf/training/t5/41b.yaml
@@ -14,7 +14,7 @@ trainer:
   precision: bf16
   logger: False # logger provided by exp_manager
   enable_checkpointing: False
-  replace_sampler_ddp: False
+  use_distributed_sampler: False
   max_epochs: null
   max_steps: 1066667 # consumed_samples = global_step * global_batch_size
   max_time: "99:23:30:00"
diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py
index 564295461f..9936dad7c5 100755
--- a/launcher_scripts/main.py
+++ b/launcher_scripts/main.py
@@ -18,12 +18,14 @@
 import hydra
 import omegaconf
 from nemo_launcher.core.data_stages import CustomDataPreparation, MC4DataPreparation, PileDataPreparation
+from nemo_launcher.core.data_curation_stages import QualityFiltering
 from nemo_launcher.core.export_stages import Export
 from nemo_launcher.core.stages import (
     AdapterLearning,
     Conversion,
     EvalHarnessEvaluation,
     FineTuning,
+    PEFT,
     IA3Learning,
     NeMoEvaluation,
     PromptLearning,
@@ -38,22 +40,24 @@
 STR2STAGECLASS = {
     "training": Training,
     "fine_tuning": FineTuning,
+    "peft": PEFT,
     "prompt_learning": PromptLearning,
     "adapter_learning": AdapterLearning,
     "ia3_learning": IA3Learning,
     "conversion": Conversion,
     "export": Export,
     "evaluation": {
-        EvalHarnessEvaluation: ["gpt3", "prompt_gpt3"],
+        EvalHarnessEvaluation: ["gpt3", "prompt_gpt3", "llama", "prompt_llama"],
         NeMoEvaluation: ["t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", "adapter_gpt3", "ia3_t5", "ia3_gpt3"],
     },
     "data_preparation": {
-        PileDataPreparation: ["gpt3", "t5", "bert"],
+        PileDataPreparation: ["gpt3", "t5", "bert", "llama"],
         MC4DataPreparation: ["mt5"],
         CustomDataPreparation: ["generic"],
     },
     "rlhf_rm": RLHFRewardModel,
     "rlhf_ppo": RLHFPPO,
+    "quality_filtering": QualityFiltering,
 }
 
 
diff --git a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py
index 4da89ed860..5cd080b96f 100755
--- a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py
+++ b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py
@@ -50,17 +50,31 @@ def checkpoint_search(cfg):
     tensor_model_parallel_size = cfg.tensor_model_parallel_size
     pipeline_model_parallel_size = cfg.pipeline_model_parallel_size
 
+    dist_ckpt = False  # becomes True when any entry of checkpoint_folder contains a 'common.pt' file
+    # Every distributed checkpoint saves a 'common.pt' file
+    for result in glob.glob(os.path.join(checkpoint_folder, "*")):
+        if os.path.exists(os.path.join(result, 'common.pt')):
+            dist_ckpt = True
+            break  # a single match classifies the whole folder; no need to scan further
+
     if checkpoint_name == "latest":
-        checkpoints = os.path.join(checkpoint_folder, "*.ckpt")
-        checkpoints = _inject_model_parallel_rank(
-            checkpoints, tensor_model_parallel_size, pipeline_model_parallel_size
-        )
-        checkpoint_list = glob.glob(checkpoints)
+
+        if dist_ckpt:
+            checkpoint_list = [f for f in glob.glob(os.path.join(checkpoint_folder, "*")) if os.path.isdir(f)]  # candidates are ALL sub-directories of checkpoint_folder, whether or not they hold 'common.pt'
+        else:
+            checkpoints = os.path.join(checkpoint_folder, "*.ckpt")  # glob pattern for regular .ckpt files
+
+            checkpoints = _inject_model_parallel_rank(
+                checkpoints, tensor_model_parallel_size, pipeline_model_parallel_size
+            )
+            checkpoint_list = glob.glob(checkpoints)  # NOTE(review): if nothing matches, the max(...) call that follows raises ValueError on the empty list
+
         latest_checkpoint = max(checkpoint_list, key=os.path.getctime)
         checkpoint_name = os.path.basename(latest_checkpoint)
 
     checkpoint = os.path.join(checkpoint_folder, checkpoint_name)
-    checkpoint = _inject_model_parallel_rank(checkpoint, tensor_model_parallel_size, pipeline_model_parallel_size)
+    if not dist_ckpt:
+        checkpoint = _inject_model_parallel_rank(checkpoint, tensor_model_parallel_size, pipeline_model_parallel_size)
     checkpoint_list = glob.glob(checkpoint)
     if len(checkpoint_list) > 1:
         raise ValueError("Too many checkpoints fit the checkpoint name pattern in conversion config.")
diff --git a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py
index ec6be9845f..f1fff5c18a 100644
--- a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py
+++ b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py
@@ -31,7 +31,7 @@ def get_ln_sm_margin(cfg):
     """
     global cuda_capability
     if cuda_capability == 9:
-        print(4)
+        print(8)
     else:
         print(0)
 
@@ -54,7 +54,7 @@ def get_ag_overlap(cfg):
 
 
 if __name__ == "__main__":
-    elif sys.argv[1] == "name=get_ln_sm_margin":
+    if sys.argv[1] == "name=get_ln_sm_margin":
         get_ln_sm_margin()
     elif sys.argv[1] == "name=get_ag_overlap":
         get_ag_overlap()
diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml
index 448dbb1dbc..14917628a8 100755
--- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml
+++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml
@@ -9,3 +9,5 @@ rm_extracted: True
 tokenizer_type: null
 vocab_save_dir: null
 merges_save_dir: null
+tokenizer_library: null
+tokenizer_model: null
diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py
index 917961a51a..80831b3960 100755
--- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py
+++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py
@@ -35,7 +35,7 @@ def main(cfg):
         url = f"{pile_url_train}{file_number:02d}.jsonl.zst"
         output_file = f"{file_number:02d}.jsonl.zst"
         downloaded_path = utils.download_single_file(url, data_dir, output_file)
-    if cfg.get("cluster_type") == "bcp":
+    if cfg.get("cluster_type") in ["bcp", "k8s"]:
         file_numbers = cfg["file_numbers"]
         # Downloading the files
         files_list = utils.convert_file_numbers(file_numbers)
diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py
index 5093543528..16fef5ef28 100755
--- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py
+++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py
@@ -35,7 +35,7 @@ def main(cfg) -> None:
         downloaded_path = os.path.join(data_dir, f"{file_number:02d}.jsonl.zst")
         output_file = f"{file_number:02d}.jsonl"
         utils.extract_single_zst_file(downloaded_path, data_dir, output_file, rm_downloaded)
-    elif cfg.get("cluster_type") == "bcp":
+    elif cfg.get("cluster_type") in ["bcp", "k8s"]:
         file_numbers = cfg.get("file_numbers")
         # Downloading the files
         files_list = utils.convert_file_numbers(file_numbers)
diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py
index 61a9e36560..e5fbbb0fbe 100755
--- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py
+++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py
@@ -28,6 +28,8 @@ def main(cfg):
     data_dir = cfg.get("data_dir")
     rm_extracted = cfg.get("rm_extracted")
     tokenizer_type = cfg.get("tokenizer_type")
+    tokenizer_library = cfg.get("tokenizer_library")
+    tokenizer_model = cfg.get("tokenizer_model")
     assert data_dir is not None, "data_dir must be a valid path"
 
     # Vocab
@@ -67,6 +69,8 @@ def main(cfg):
             model_type = 'bert'
         elif 'gpt3' in data_config:
             model_type = 'gpt3'
+        elif 'llama' in data_config:
+            model_type = 'llama'
 
         output_prefix = os.path.join(data_dir, f"my-{model_type}_{file_number:02d}")
 
@@ -77,6 +81,8 @@ def main(cfg):
             f"--dataset-impl mmap "
             f"--tokenizer-library megatron "
             f"--tokenizer-type {tokenizer_type} "
+            f"--tokenizer-library {tokenizer_library} "  # NOTE(review): "--tokenizer-library megatron" is already appended two lines above - confirm the duplicated flag is intended
+            f"--tokenizer-model {tokenizer_model} "  # interpolates as the text "None" when cfg.get("tokenizer_model") returned None
             f"--workers $SLURM_CPUS_ON_NODE "
         )
 
@@ -91,7 +97,7 @@ def main(cfg):
         os.system(runcmd)
         if rm_extracted:
             os.remove(extracted_path)
-    elif cfg.get("cluster_type") == "bcp":
+    elif cfg.get("cluster_type") in ["bcp", "k8s"]:
         file_numbers = cfg.get("file_numbers")
         files_list = utils.convert_file_numbers(file_numbers)
         # Assumes launched via mpirun:
@@ -119,6 +125,8 @@ def main(cfg):
                 model_type = 'bert'
             elif 'gpt3' in data_config:
                 model_type = 'gpt3'
+            elif 'llama' in data_config:
+                model_type = 'llama'
 
             output_prefix = os.path.join(data_dir, f"my-{model_type}_{file_number:02d}")
 
@@ -129,6 +137,8 @@ def main(cfg):
                 f"--dataset-impl mmap "
                 f"--tokenizer-library megatron "
                 f"--tokenizer-type {tokenizer_type} "
+                f"--tokenizer-library {tokenizer_library} "  # NOTE(review): "--tokenizer-library megatron" is already appended two lines above - confirm the duplicated flag is intended
+                f"--tokenizer-model {tokenizer_model} "  # interpolates as the text "None" when cfg.get("tokenizer_model") returned None
                 f"--workers {ncpus} "
             )
 
diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py b/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py
index f6a664ef8e..df6c20f27c 100755
--- a/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py
+++ b/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py
@@ -85,7 +85,7 @@ def parse_args(parser_main):
     parser.add_argument("--model", required=True)
 
     parser.add_argument(
-        "--nemo_model", type=str, default=None, required=False, help="Pass path to model's .nemo file",
+        "--nemo_model", default=None, required=False, help="Pass path to model's .nemo file",
     )
     parser.add_argument(
         "--checkpoint_folder",
@@ -120,6 +120,7 @@ def parse_args(parser_main):
 
     parser.add_argument("--vocab_file", default=None)
     parser.add_argument("--merge_file", default=None)
+    parser.add_argument("--tokenizer_model", default=None)
 
     parser.add_argument(
         "--prompt_dataset_paths",
@@ -292,9 +293,10 @@ def main():
     pipeline_model_parallel_size = args.pipeline_model_parallel_size
     vocab_file = args.vocab_file
     merge_file = args.merge_file
+    tokenizer_model = args.tokenizer_model
 
     hparams_override_file = None
-    if args.nemo_model is None:  # Not loading from .nemo checkpoint
+    if args.nemo_model is None or args.nemo_model == "None":  # Not loading from .nemo checkpoint
         # Checkpoint search
         if checkpoint_name == "latest":
             checkpoints = os.path.join(checkpoint_folder, "*.ckpt")
@@ -322,6 +324,8 @@ def main():
                 conf.cfg.tokenizer.vocab_file = vocab_file
             if merge_file is not None:
                 conf.cfg.tokenizer.merge_file = merge_file
+            if tokenizer_model is not None:
+                conf.cfg.tokenizer.model = tokenizer_model
             if "activations_checkpoint_granularity" in conf.cfg:
                 conf.cfg.activations_checkpoint_granularity = None
             if "activations_checkpoint_method" in conf.cfg:
diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py
index 8c9a1b5ed2..1b18dc64e5 100755
--- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py
+++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py
@@ -14,11 +14,13 @@
 
 from lm_eval.base import LM
 
-from . import dummy, nemo_gpt3, nemo_gpt3_prompt
+from . import dummy, nemo_gpt3, nemo_gpt3_prompt, nemo_llama, nemo_llama_prompt
 
 MODEL_REGISTRY = {
     "nemo-gpt3": nemo_gpt3.NeMo_GPT3LM_TP_PP,
+    "nemo-llama": nemo_llama.NeMo_LLAMALM_TP_PP,
     "nemo-gpt3-prompt": nemo_gpt3_prompt.NeMo_GPT3_PROMPTLM,
+    "nemo-llama-prompt": nemo_llama_prompt.NeMo_LLAMA_PROMPTLM,
     "dummy": dummy.DummyLM,
 }
 
diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py
index 09241b7d88..40c74160a0 100755
--- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py
+++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py
@@ -152,12 +152,29 @@ def dummy():
             logging.info(f'Setting up transformer engine modules for tensor parallelism.')
             if model.cfg.get('megatron_amp_O2', 'False'):
                 # when using O2 additional module key is added that casts the weights
-                for layer in model.model.module.language_model.encoder.layers:
-                    layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group())
+                if model.cfg.get('mcore_gpt', False):
+                    for layer in model.model.module.decoder.layers:
+                        layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group())
+                else:
+                    for layer in model.model.module.language_model.encoder.layers:
+                        layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group())
 
             else:
-                for layer in model.model.language_model.encoder.layers:
-                    layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group())
+                if model.cfg.get('mcore_gpt', False):
+                    for module in model.get_gpt_module_list():
+                        """Set TP group
+                        Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398
+                        """
+                        # Deep iterate but skip self to avoid infinite recursion.
+                        for index, child in enumerate(module.modules()):
+                            if index == 0:
+                                continue
+                            if hasattr(child, "set_tensor_parallel_group"):
+                                tp_group = parallel_state.get_tensor_model_parallel_group()
+                                child.set_tensor_parallel_group(tp_group)
+                else:
+                    for layer in model.model.language_model.encoder.layers:
+                        layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group())
 
 
 class NeMo_GPT3LM_TP_PP(LM):
diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py
new file mode 100755
index 0000000000..4c7f5e56b8
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py
@@ -0,0 +1,220 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from omegaconf import OmegaConf, open_dict
+
+import torch
+import tqdm
+from megatron.core import parallel_state
+from lm_eval import utils
+from lm_eval.base import LM
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
+from nemo.collections.nlp.modules.common.text_generation_utils import generate, get_computeprob_response
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
+from nemo.utils import logging
+from nemo.utils.app_state import AppState
+from nemo.utils.get_rank import is_global_rank_zero
+from nemo.utils.model_utils import inject_model_parallel_rank
+from pytorch_lightning.trainer.trainer import Trainer
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import DataLoader, Dataset
+from torch.utils.data.dataloader import default_collate
+
+from .nemo_gpt3 import RequestDataset, setup_trainer_and_model, DDP_initialize
+
+class NeMo_LLAMALM_TP_PP(LM):
+    """lm_eval ``LM`` adapter that scores requests with a NeMo Megatron LLaMA model."""
+
+    def __init__(self, args, truncate=False, batch_size=1):
+        super().__init__()
+        # get nemo megatron
+        logging.info("**** Building LLaMA model ...")
+        self.trainer, self.model = setup_trainer_and_model(args)
+        self.tokenizer = self.model.tokenizer
+        self.model.eval()
+
+        self.max_length = self.model.cfg.get("max_position_embeddings")
+        # Sanity check: the loaded tokenizer must reproduce these reference ids.
+        assert self.tokenizer.text_to_ids("hello\n\nhello") == [22172, 13, 13, 12199], (
+            "Tokenizer text_to_ids is not working as expected."
+        )
+
+        self.truncate = truncate
+        self.batch_size = batch_size
+
+        # initialize DDP and move model to GPU
+        DDP_initialize(self.model)
+        self.model = self.model.cuda()
+
+    @classmethod
+    def create_from_arg_string(cls, arg_string, additional_config=None):
+        """Build an instance from an argument string plus the non-None entries of additional_config."""
+        args = utils.simple_parse_args_string(arg_string)
+        args2 = {k: v for k, v in (additional_config or {}).items() if v is not None}
+        return cls(args, **args2)
+
+    def loglikelihood(self, requests):
+        return self._loglikelihood(requests)
+
+    # NOTE: kept as comments rather than a bare string literal, where "\ " is an invalid escape sequence.
+    # request: (context, continuation)
+    # how this all works:
+    #          CTX      CONT
+    # inp    0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
+    # gpt2    \               \
+    # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice
+    # cont_toks      4 5 6 7 8 9
+    # when too long to fit in context, truncate from the left
+
+    def _loglikelihood(self, requests):
+        """Score (context, continuation) requests; rank zero returns per-request result tuples, others None."""
+        def pad_collate(batch, eos_id=2):
+            """Pad a batch with eos_id and collate it as (tokens, padded_len, input_len, continuation_len)."""
+            tokens = [item[0] for item in batch]
+            conti_lens = [item[1] for item in batch]
+            lens = [len(token) - 1 for token in tokens]  # fake delete last token by reducing input len
+            max_len = max(lens)
+            # Round the padded length up to the next multiple of 8.
+            extra_pad_len = 0
+            if max_len % 8 != 0:
+                extra_pad_len = 8 - (max_len % 8)
+                max_len += extra_pad_len
+
+            # Add padding to all samples to adapt nemo generate api
+            tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=eos_id)
+            if extra_pad_len > 0:
+                extra_pad = torch.ones(extra_pad_len, len(batch)) * eos_id
+                extra_pad = extra_pad.type_as(tokens_pad)
+                tokens_pad = torch.vstack((tokens_pad, extra_pad))
+
+            new_batch = []
+            for token, lenn, conti_len in zip(tokens_pad.T, lens, conti_lens):
+                # (token, padded context length, input length, continuation length)
+                new_batch.append((token, max_len, lenn, conti_len))
+
+            new_batch = default_collate(new_batch)
+            return new_batch
+
+        def _collate(x):  # used to reorder request and remove duplications
+            """
+              the negative sign on len(toks) sorts descending - this has a few advantages:
+              - time estimates will always be over not underestimates, which is more useful for planning
+              - to know the size of a batch when going through the list, you know the first one is always the batch padded context length.
+                this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement
+              - any OOMs will happen right away rather than near the end
+            """
+            toks = x[0] + x[1]
+            return -len(toks), tuple(toks)
+
+        reord = utils.Reorderer(requests, _collate)
+        request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer, self.max_length)
+        request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False)
+
+        def logits_to_results(batch, response):
+            """Turn one batch's response into (logprob sum, is_greedy, greedy_tokens, continuation_tokens) tuples."""
+            input_token_ids_batch, _, lens, conti_lens = batch
+            batch_size = len(lens)
+            assert len(response['token_ids']) == batch_size, "Response's length not equal to batch size."
+
+            batch_res = []
+            for index in range(batch_size):
+                inp_len = lens[index]
+                conti_len = conti_lens[index]
+
+                inp_token_ids = input_token_ids_batch[index].tolist()[: inp_len + 1]  # recover fake deleted token
+                response_token_ids = response['token_ids'][index][:inp_len]
+                assert response_token_ids == inp_token_ids[:-1], "Mismatch in input tokens."
+
+                log_probs = response['full_logprob'][index][:inp_len]  # torch.tensor
+                log_probs = log_probs[-conti_len:]
+
+                greedy_tokens = log_probs.argmax(dim=-1)
+                greedy_tokens = self.tokenizer.ids_to_tokens(greedy_tokens.cpu().numpy().tolist())
+
+                conti_token_ids = inp_token_ids[-conti_len:]
+                conti_tokens = self.tokenizer.ids_to_tokens(conti_token_ids)
+
+                max_equal = greedy_tokens == conti_tokens
+                log_probs = log_probs.cpu().to(torch.float32)
+                conti_enc = torch.tensor(self.tokenizer.tokens_to_ids(conti_tokens))
+                conti_probs = torch.gather(log_probs, 1, conti_enc.unsqueeze(-1)).squeeze(-1)
+
+                batch_res.append((float(conti_probs.sum()), bool(max_equal), greedy_tokens, conti_tokens))
+            return batch_res
+
+        res = []
+        for batch in tqdm.tqdm(request_dl):
+            # inputs = (padded token ids, padded context length)
+            inputs = (batch[0].cuda(), batch[1].cuda())
+            response = generate(
+                model=self.model,
+                inputs=inputs,
+                tokens_to_generate=1,
+                all_probs=True,
+                temperature=1.0,
+                add_BOS=False,
+                top_k=0,
+                top_p=0.9,
+                greedy=True,
+                repetition_penalty=1.0,
+                min_tokens_to_generate=0,
+                compute_logprob=True,
+                end_strings=['</s>'],
+            )
+            response = get_computeprob_response(self.tokenizer, response, inputs)
+
+            if is_global_rank_zero():
+                res.extend(logits_to_results(batch, response))
+
+            del inputs, response
+
+        return reord.get_original(res) if self.can_access_output() else None
+
+    def loglikelihood_rolling(self, requests):
+        """Sum rolling-window log-likelihoods per request string; ranks other than zero return an empty list."""
+        loglikelihoods = []
+        len_rolling_token_windows = [0]
+        all_rolling_token_windows = []
+
+        for (string,) in requests:
+            rolling_token_windows = list(
+                map(
+                    utils.make_disjoint_window,
+                    utils.get_rolling_token_windows(
+                        token_list=self.tokenizer.text_to_ids(string),
+                        prefix_token=2,  # same id as the default eos_id used for padding in _loglikelihood
+                        max_seq_len=self.max_length,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            len_rolling_token_windows.append(len(rolling_token_windows) + len_rolling_token_windows[-1])
+            all_rolling_token_windows.extend(rolling_token_windows)
+
+        string_nll = self._loglikelihood(all_rolling_token_windows)
+        if self.can_access_output():
+            string_nll = [x[0] for x in string_nll]  # discard is_greedy
+            for i in range(len(len_rolling_token_windows) - 1):
+                loglikelihoods.append(sum(string_nll[len_rolling_token_windows[i] : len_rolling_token_windows[i + 1]]))
+
+        return loglikelihoods
+
+    def greedy_until(self, requests):
+        raise NotImplementedError
+
+    def can_access_output(self):
+        return is_global_rank_zero()
diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py
new file mode 100755
index 0000000000..59be96c5e7
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import tqdm
+from lm_eval import utils
+from lm_eval.base import LM
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import (
+    MegatronGPTPromptLearningModel,
+)
+from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
+from nemo.collections.nlp.modules.common.text_generation_utils import generate, get_computeprob_response
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
+from nemo.utils import logging
+from nemo.utils.app_state import AppState
+from nemo.utils.get_rank import is_global_rank_zero
+from pytorch_lightning.trainer.trainer import Trainer
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import DataLoader, Dataset
+from torch.utils.data.dataloader import default_collate
+
+from .nemo_gpt3_prompt import PromptRequestDataset, setup_trainer_and_model, DDP_initialize
+
+class NeMo_LLAMA_PROMPTLM(LM):
+    """lm_eval ``LM`` adapter that scores requests with a NeMo Megatron LLaMA prompt model."""
+
+    def __init__(self, args, truncate=False, batch_size=1):
+        super().__init__()
+        # get nemo megatron
+        logging.info("**** Building LLaMA Prompt model ...")
+        self.trainer, self.model = setup_trainer_and_model(args)
+        self.tokenizer = self.model.tokenizer
+        self.model.eval()
+
+        self.max_length = self.model.cfg.get("max_position_embeddings")
+        # Sanity check: the loaded tokenizer must reproduce these reference ids.
+        assert self.tokenizer.text_to_ids("hello\n\nhello") == [22172, 13, 13, 12199], (
+            "Tokenizer text_to_ids is not working as expected."
+        )
+
+        self.truncate = truncate
+        self.batch_size = batch_size
+
+        # initialize DDP and move model to GPU
+        DDP_initialize(self.model)
+        self.model = self.model.cuda()
+
+    @classmethod
+    def create_from_arg_string(cls, arg_string, additional_config=None):
+        """Build an instance from an argument string plus the non-None entries of additional_config."""
+        args = utils.simple_parse_args_string(arg_string)
+        args2 = {k: v for k, v in (additional_config or {}).items() if v is not None}
+        return cls(args, **args2)
+
+    def loglikelihood(self, requests):
+        return self._loglikelihood(requests)
+
+    # NOTE: kept as comments rather than a bare string literal, where "\ " is an invalid escape sequence.
+    # request: (context, continuation)
+    # how this all works:
+    #          CTX      CONT
+    # inp    0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
+    # gpt2    \               \
+    # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice
+    # cont_toks      4 5 6 7 8 9
+    # when too long to fit in context, truncate from the left
+
+    def _loglikelihood(self, requests):
+        """Score (context, continuation) requests; rank zero returns per-request result tuples, others None."""
+        def pad_collate(batch, eos_id=2):
+            """Pad a batch with eos_id and collate it as (tokens, max_len, task_id, input_len, continuation_len)."""
+            tokens, conti_lens, task_ids, *_ = map(list, zip(*batch))
+            lens = [len(token) - 1 for token in tokens]  # fake delete last token by reducing input len
+            max_len = max(lens)
+
+            # Add padding to all samples to adapt nemo generate api
+            tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=eos_id)
+
+            new_batch = []
+            for token, lenn, conti_len, task_id in zip(tokens_pad.T, lens, conti_lens, task_ids):
+                new_batch.append((token, max_len, task_id, lenn, conti_len))
+
+            new_batch = default_collate(new_batch)
+            return new_batch
+
+        def _collate(x):  # used to reorder request and remove duplications
+            """
+              the negative sign on len(toks) sorts descending - this has a few advantages:
+              - time estimates will always be over not underestimates, which is more useful for planning
+              - to know the size of a batch when going through the list, you know the first one is always the batch padded context length.
+                this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement
+              - any OOMs will happen right away rather than near the end
+            """
+            toks = x[0] + x[1]
+            return -len(toks), tuple(toks)
+
+        reord = utils.Reorderer(requests, _collate)
+        request_ds = PromptRequestDataset(reord.get_reordered(), self.model.tokenizer)
+        request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False)
+
+        def logits_to_results(batch, response):
+            """Turn one batch's response into (logprob sum, is_greedy, greedy_tokens, continuation_tokens) tuples."""
+            input_token_ids_batch, _, _, lens, conti_lens = batch
+            batch_size = len(lens)
+            assert len(response["token_ids"]) == batch_size, "Response's length not equal to batch size."
+
+            batch_res = []
+            for index in range(batch_size):
+                inp_len = lens[index]
+                conti_len = conti_lens[index]
+
+                inp_token_ids = input_token_ids_batch[index].tolist()[: inp_len + 1]  # recover fake deleted token
+                log_probs = response["full_logprob"][index][:inp_len]  # torch.tensor
+                log_probs = log_probs[-conti_len:]
+
+                greedy_tokens = log_probs.argmax(dim=-1)
+                greedy_tokens = self.tokenizer.ids_to_tokens(greedy_tokens.cpu().numpy().tolist())
+
+                conti_token_ids = inp_token_ids[-conti_len:]
+                conti_tokens = self.tokenizer.ids_to_tokens(conti_token_ids)
+
+                max_equal = greedy_tokens == conti_tokens
+                log_probs = log_probs.cpu().to(torch.float32)
+                conti_enc = torch.tensor(self.tokenizer.tokens_to_ids(conti_tokens))
+                conti_probs = torch.gather(log_probs, 1, conti_enc.unsqueeze(-1)).squeeze(-1)
+
+                batch_res.append((float(conti_probs.sum()), bool(max_equal), greedy_tokens, conti_tokens))
+            return batch_res
+
+        res = []
+        for batch in tqdm.tqdm(request_dl):
+            # inputs = (padded token ids, context length)
+            inputs = (batch[0].cuda(), batch[1].cuda())
+            # Size task_ids to this batch: the final batch can hold fewer than self.batch_size samples.
+            task_ids = torch.zeros((batch[0].shape[0], 1), device='cuda')
+            response = generate(
+                model=self.model,
+                inputs=inputs,
+                task_ids=task_ids,
+                tokens_to_generate=1,
+                all_probs=True,
+                temperature=1.0,
+                add_BOS=False,
+                top_k=0,
+                top_p=0.9,
+                greedy=True,
+                repetition_penalty=1.0,
+                min_tokens_to_generate=0,
+            )
+            response = get_computeprob_response(self.tokenizer, response, inputs)
+            if is_global_rank_zero():
+                res.extend(logits_to_results(batch, response))
+
+        return reord.get_original(res) if self.can_access_output() else None
+
+    def loglikelihood_rolling(self, requests):
+        raise NotImplementedError
+
+    def greedy_until(self, requests):
+        raise NotImplementedError
+
+    def can_access_output(self):
+        return is_global_rank_zero()
diff --git a/launcher_scripts/nemo_launcher/core/data_curation_stages.py b/launcher_scripts/nemo_launcher/core/data_curation_stages.py
new file mode 100644
index 0000000000..902132b992
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/data_curation_stages.py
@@ -0,0 +1,181 @@
+import copy
+import shlex
+import omegaconf
+from typing import Dict, List
+from pathlib import Path
+
+from nemo_launcher.core.stages import (
+    NemoMegatronStage,
+    create_args_list,
+    clean_command_groups,
+)
+from nemo_launcher.core.launchers import AutoLauncher
+
+
+class DataCurationStage(NemoMegatronStage):
+    """
+    DataCurationStage is a base class for data curation stages.
+    It can hold multiple sub-stages. For example, preparing data from
+    Common Crawl requires download, extraction, deduplication and filtering.
+    They have dependencies on each other and will be launched one by one.
+    """
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        self.log_folder = Path()
+        self.conf_folder = Path()
+
+    def setup_folder_and_data(self):
+        """
+        Each job in the data curation pipeline creates a directory
+        for writing logs (log_folder), writing and reading intermediate
+        results (results_folder) and for reading configs (conf_folder)
+        """
+        job_path = self.get_job_path()
+        job_path.folder.mkdir(parents=True, exist_ok=True)
+        # make the results dir
+        results_folder = job_path.results_folder
+        results_folder.mkdir(parents=True, exist_ok=True)
+        # make the log dir
+        self.log_folder = Path(job_path.folder, 'log')
+        self.log_folder.mkdir(parents=True, exist_ok=True)
+        # Make the conf dir
+        self.conf_folder = Path(job_path.folder, 'config')
+        self.conf_folder.mkdir(parents=True, exist_ok=True)
+
+    def _make_cluster_parameters(self, cluster: str) -> Dict:
+        """
+        Make cluster-specific parameters for the job.
+        Only bcm (slurm) is handled by data curation stages; for bcm the
+        slurm parameters are returned, for example:
+            {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...}
+
+        :param str cluster: cluster type; only `bcm` is supported
+        :return: a dictionary of cluster parameters, e.g. `ntasks_per_node`
+        :rtype: Dict
+        :raises ValueError: if `cluster` is not `bcm`
+        """
+        if cluster != "bcm":
+            # Without this guard `cluster_params` would be unbound at the return
+            # statement and callers would get an opaque UnboundLocalError.
+            raise ValueError(f"Data curation stages only support the `bcm` cluster, got `{cluster}`.")
+
+        cfg = self.cfg
+        stage_cfg = self.stage_cfg
+
+        run_cfg = stage_cfg.get("run")
+        job_name = run_cfg.get("name")
+        time_limit = run_cfg.get("time_limit")
+        nodes = run_cfg.get('nodes')
+        # Allow for updating the partition as we might run
+        # on CPU only nodes
+        partition = run_cfg.get('partition')
+
+        container_image = cfg.get("container")
+        container_mounts = self._make_container_mounts_string()
+
+        cluster_cfg = cfg.get("cluster")
+        slurm_cfg = {**copy.deepcopy(cluster_cfg)}
+        job_name_prefix = slurm_cfg.pop("job_name_prefix")
+        # Keys listed after **slurm_cfg override same-named cluster settings.
+        cluster_params = {
+            **slurm_cfg,
+            "job_name": job_name_prefix + job_name,
+            "time": time_limit,
+            "container_image": container_image,
+            "container_mounts": container_mounts,
+            "nodes": nodes,
+            "partition": partition,
+        }
+
+        return cluster_params
+
+    def run(self) -> str:
+        """
+        Run current stage including all of the substages, returns job id on slurm based system otherwise empty string
+
+        :return: job id on slurm based system otherwise empty string
+        :rtype: str
+        """
+        # Create the job folders
+        self.setup_folder_and_data()
+        job_path = self.get_job_path()
+
+        # Make cluster configuration parameters
+        cluster_parameters = self._make_cluster_parameters(self.cluster)
+        # Pass the full cfg too, as the data preparation stages do for this helper
+        stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(
+            self.stage_cfg,
+            job_path,
+            self.cfg,
+        )
+
+        # Build commands to launch on cluster
+        command_groups = self.make_stage_command_groups(stage_cfg_path)
+
+        # Create the launcher for the cluster
+        launcher = AutoLauncher(
+            folder=job_path.folder,
+            cluster=self.cluster,
+            **cluster_parameters,
+        )
+
+        # Launch the job on the cluster
+        job_id = launcher.launch(command_groups)
+
+        return job_id
+
+
+class QualityFiltering(DataCurationStage):
+    """Data curation stage that filters documents using a heuristic filter config."""
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+    def setup_stage_vars(self, cfg):
+        """Set the stage name and select this stage's section of the config."""
+        self.stage_cfg = cfg.get("quality_filtering")
+        self.stage_name = "quality_filtering"
+
+    def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]:
+        """
+        Assemble the `filter_documents` command for this stage.
+
+        The `filter` section of the stage config is saved into the config
+        folder as `heuristic_filter.yaml` and that path is passed to the
+        command as its filter config file.
+
+        :param Path stage_cfg_path: path of the saved stage config (not read here)
+        :return: the command groups, as returned by `clean_command_groups`
+        :rtype: List[List[str]]
+        """
+        conf = self.stage_cfg
+
+        filter_yaml = Path(self.conf_folder, "heuristic_filter.yaml")
+        omegaconf.OmegaConf.save(conf.get('filter'), filter_yaml)
+
+        # Optional output directories are only forwarded when they are set
+        extra_dirs = {}
+        for key in ("output_removed_document_dir", "output_document_score_dir"):
+            value = conf.get(key)
+            if value:
+                extra_dirs[key] = value
+
+        # Arguments of the filter_documents command
+        cli_args = create_args_list(
+            replace_underscore=True,
+            log_dir=self.log_folder,
+            input_data_dir=conf.get("input_dir"),
+            filter_config_file=str(filter_yaml),
+            output_retained_document_dir=conf.get("output_retained_document_dir"),
+            **extra_dirs,
+        )
+
+        # Put every argument on its own backslash-continued line
+        command_tokens = ["filter_documents", *cli_args]
+        command = " \\\n  ".join(command_tokens)
+
+        # A single group holding the single command
+        groups = clean_command_groups([[command]])
+
+        return groups
diff --git a/launcher_scripts/nemo_launcher/core/data_stages.py b/launcher_scripts/nemo_launcher/core/data_stages.py
index c3713786e5..c3d0eba1b0 100755
--- a/launcher_scripts/nemo_launcher/core/data_stages.py
+++ b/launcher_scripts/nemo_launcher/core/data_stages.py
@@ -16,11 +16,13 @@
 import os
 from pathlib import Path
 from typing import Dict, List, Optional
-
 import omegaconf
+import shutil
+
 from nemo_launcher.core.launchers import AutoLauncher
 from nemo_launcher.core.stages import NemoMegatronStage, clean_command_groups, create_args_list
 from nemo_launcher.utils.file_utils import download_single_file
+from nemo_launcher.utils.job_utils import JobPaths
 
 
 class DataStage(NemoMegatronStage):
@@ -55,7 +57,7 @@ def run(self) -> str:
             job_path = self.get_job_path(sub_stage)
             job_path.folder.mkdir(parents=True, exist_ok=True)
 
-            stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path)
+            stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg)
             if job_id:
                 dependency = f"aftercorr:{job_id}"
                 self.stage_cfg["run"]["dependency"] = dependency
@@ -65,9 +67,24 @@ def run(self) -> str:
 
             # Make command groups
             command_groups = self.make_stage_command_groups(stage_cfg_path, sub_stage)
+
+            # Prepare Helm chart for k8s
+            if self.cluster == 'k8s':
+                template_root = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'k8s_templates/data_preparation')
+                self._make_k8s_helm_chart(template_root, cluster_parameters, job_path, sub_stage)
+
             # Create launcher
             launcher = AutoLauncher(folder=job_path.folder, cluster=self.cluster, **cluster_parameters,)
-            job_id = launcher.launch(command_groups=command_groups)
+
+            if self.cluster == 'k8s':
+                # For k8s clusters, only launch on the final stage (preprocess) as
+                # the Helm chart contains all stages in a single chart.
+                if sub_stage == sub_stages[-1]:
+                    job_id = launcher.launch(command_groups=command_groups)
+                else:
+                    job_id = ''
+            else:
+                job_id = launcher.launch(command_groups=command_groups)
 
         return job_id
 
@@ -97,11 +114,11 @@ def _make_private_cluster_parameters(self, cluster, sub_stage):
     def _make_cluster_parameters(self, cluster: str, sub_stage: Optional = None,) -> Dict:
         """
         Make a cluster-specific parameters for jobs on different clusters.
-        Current clusters include bcm(slurm), bcp and interactive.
+        Current clusters include bcm(slurm), bcp, k8s, and interactive.
         For example for bcm, it will return slurm parameters:
             {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...}
 
-        :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc.
+        :param str cluster: i.e. `bcm`, `bcp`, `interactive`, `k8s`, etc.
         :param Optional sub_stage: current sub_stage name
         :return: a dictionary of cluster parameters, e.g. `ntasks_per_node`
         :rtype: Dict
@@ -142,11 +159,78 @@ def _make_cluster_parameters(self, cluster: str, sub_stage: Optional = None,) ->
             cluster_parameters.update(
                 {**shared_parameters, **private_parameters,}
             )
+        elif cluster == "k8s":
+            cluster_cfg = cfg.get("cluster")
+            container_image = cfg.get("container")
+            k8s_cfg = {**copy.deepcopy(cluster_cfg)}
+
+            cluster_parameters = {**k8s_cfg}
+
+            cluster_parameters.update(
+                {
+                    **shared_parameters,
+                    **private_parameters,
+                    "container_image": container_image,}
+            )
         elif cluster == "interactive":
             raise ValueError("Data preparation is not supported in interactive mode.")
 
         return cluster_parameters
 
+    def _make_k8s_helm_chart(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths, sub_stage: str):
+        """
+        Create a Helm chart for data preparation.
+        The Helm chart uses a base template which is extended with user-defined
+        cluster settings as specified in the config files. The generated Hydra
+        config file needs to be copied to the Helm chart as this will be used
+        for launching the job.
+
+        :param str template_root: the path to where the k8s template files are located.
+        :param dict cluster_parameters: additional parameters specific to the cluster config.
+        :param JobPaths job_path: the path to the job results directory.
+        :param str sub_stage: the current stage.
+        """
+        with open(os.path.join(template_root, 'values.yaml')) as value_file:
+            values_template = omegaconf.OmegaConf.load(value_file)
+
+        procs_per_node = self.stage_cfg.run.bcp_preproc_npernode if sub_stage == "preprocess" else 1
+        total_processes = procs_per_node * self.stage_cfg.run.node_array_size
+
+        # Update the Helm chart template with the user-specified settings
+        values_template.image.trainingImage = cluster_parameters['container_image']
+        values_template.image.pullSecret = cluster_parameters['pull_secret']
+        values_template.image.nodes = self.stage_cfg.run.node_array_size
+        values_template.dataPrepConfig.shmSize = cluster_parameters['shm_size']
+        values_template.dataPrepConfig.NFSServer = cluster_parameters['nfs_server']
+        values_template.dataPrepConfig.NFSPath = cluster_parameters['nfs_path']
+        values_template.dataPrepConfig.totalProcesses = total_processes
+        values_template.dataPrepConfig.procsPerNode = procs_per_node
+        values_template.dataPrepConfig.stage = sub_stage
+
+        k8s_template_path = job_path.folder
+        k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml')
+        k8s_template_file.parent.mkdir(parents=True, exist_ok=True)
+
+        conf = omegaconf.OmegaConf.create(values_template)
+        omegaconf.OmegaConf.save(conf, k8s_template_file)
+
+        # Copy the data prep spec files to the Helm chart
+        template_file = os.path.join(template_root, 'data-prep.yaml')
+        chart_file = os.path.join(template_root, 'Chart.yaml')
+        data_prep_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'data-prep.yaml')
+        data_prep_path.parent.mkdir(parents=True, exist_ok=True)
+        config_path = Path(job_path.folder / 'k8s_template' / 'config')
+        config_path.mkdir(parents=True, exist_ok=True)
+        chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml')
+        data_prep_config_file = os.path.join(template_root, 'data-prep-config.yaml')
+        data_prep_config_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'data-prep-config.yaml')
+        hydra_config_path = Path(job_path.folder / 'k8s_template' / 'config')
+
+        shutil.copy2(template_file, data_prep_path)
+        shutil.copy2(chart_file, chart_path)
+        shutil.copy2(data_prep_config_file, data_prep_config_path)
+        shutil.copy2(job_path.config_file, hydra_config_path)
+
 
 class PileDataPreparation(DataStage):
     """DataStage for preparing the Pile dataset for gpt3 and t5"""
@@ -176,6 +260,15 @@ def setup_folder_and_data(self) -> None:
         download_merges_url = data_cfg.get("download_merges_url")
         vocab_save_dir = data_cfg.get("vocab_save_dir")
         merges_save_dir = data_cfg.get("merges_save_dir")
+        download_tokenizer_url = data_cfg.get("download_tokenizer_url")
+        tokenizer_save_dir = data_cfg.get("tokenizer_save_dir")
+
+        if download_tokenizer_url is not None:
+            assert tokenizer_save_dir is not None, "tokenizer_save_dir must be a valid path."
+            download_single_file(
+                url=download_tokenizer_url, save_dir=tokenizer_save_dir, file_name="llama_tokenizer.model",
+            )
+
         # Download vocab
         if download_vocab_url is not None:
             assert vocab_save_dir is not None, "vocab_save_dir must be a valid path."
@@ -252,6 +345,8 @@ def _make_sub_stage_command(self, sub_stage: str) -> List[str]:
             rm_downloaded=self.stage_cfg.get("rm_downloaded"),
             rm_extracted=self.stage_cfg.get("rm_extracted"),
             tokenizer_type=self.stage_cfg.get("tokenizer_type"),
+            tokenizer_library=self.stage_cfg.get("tokenizer_library", "megatron"),
+            tokenizer_model=self.stage_cfg.get("tokenizer_model", None),
             vocab_save_dir=self.stage_cfg.get("vocab_save_dir"),
             merges_save_dir=self.stage_cfg.get("merges_save_dir"),
         )
diff --git a/launcher_scripts/nemo_launcher/core/export_stages.py b/launcher_scripts/nemo_launcher/core/export_stages.py
index dea2296db6..171e7d2c29 100755
--- a/launcher_scripts/nemo_launcher/core/export_stages.py
+++ b/launcher_scripts/nemo_launcher/core/export_stages.py
@@ -108,7 +108,7 @@ def run(self) -> str:
             job_path = self.get_job_path(sub_stage)
             job_path.folder.mkdir(parents=True, exist_ok=True)
 
-            stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path)
+            stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg)
             if job_id:
                 dependency = f"aftercorr:{job_id}"
                 self.stage_cfg["run"]["dependency"] = dependency
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml
new file mode 100644
index 0000000000..bbf3651743
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+appVersion: "1.0"
+description: NeMo Framework Base Model Conversion
+name: nemo-framework-conversion
+version: 1.0.0
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml
new file mode 100644
index 0000000000..214e14df69
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml
@@ -0,0 +1,48 @@
+{{ $config := .Values.trainingConfig }}
+
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: nlp-conversion
+  labels:
+    app: nlp-conversion
+spec:
+  template:
+    spec:
+      containers:
+      - name: nlp-conversion
+        image: {{ .Values.image.trainingImage }}
+        env:
+          - name: NCCL_AVOID_RECORD_STREAMS
+            value: "1"
+        command: ["/bin/bash", "-c"]
+        args:
+          - 'export CKPT_NAME=$(python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/checkpoint_search.py checkpoint_folder={{ $config.trainingDirectory }}/results/checkpoints checkpoint_name=latest tensor_model_parallel_size=1 pipeline_model_parallel_size=1) &&
+          echo ${CKPT_NAME} &&
+          python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/hparams_override.py hparams_file={{ $config.trainingDirectory }}/results/hparams.yaml output_path={{ $config.resultsDirectory }}/results vocab_file={{ $config.vocabPath }} merge_file={{ $config.mergesPath }} tokenizer_model=None &&
+          python3 /opt/NeMo/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py --gpus_per_node=1 --model_type=gpt --checkpoint_folder={{ $config.trainingDirectory }}/results/checkpoints --checkpoint_name=${CKPT_NAME} --hparams_file={{ $config.resultsDirectory }}/results/hparams_override.yaml --nemo_file_path={{ $config.resultsDirectory }}/megatron_gpt.nemo --tensor_model_parallel_size={{ $config.tensorParallelism }} --pipeline_model_parallel_size={{ $config.pipelineParallelism }}'
+        imagePullPolicy: Always
+        resources:
+          requests:
+            nvidia.com/gpu: {{ .Values.image.gpuNum }}
+          limits:
+            nvidia.com/gpu: {{ .Values.image.gpuNum }}
+        volumeMounts:
+        - mountPath: {{ $config.NFSPath }}
+          name: workspace
+        - mountPath: /dev/shm
+          name: dshm
+      restartPolicy: Never
+      imagePullSecrets:
+      - name: {{ .Values.image.pullSecret }}
+
+      volumes:
+      - name: workspace
+        nfs:
+          server: {{ $config.NFSServer }}
+          path: {{ $config.NFSPath }}
+
+      - name: dshm
+        emptyDir:
+          medium: Memory
+          sizeLimit: {{ $config.shmSize }}
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml
new file mode 100644
index 0000000000..21df8fd095
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml
@@ -0,0 +1,40 @@
+image:
+  trainingImage: cfg.container
+  pullPolicy: IfNotPresent
+
+  # Insert the name of your container registry pull secret #
+  pullSecret: nvcr.io
+
+  # Insert number of GPUs #
+  gpuNum: <Insert number of GPUs>
+
+trainingConfig:
+  # Specify the amount of shared memory to attach to the Pods #
+  shmSize: 512Gi
+
+  # Insert the address for the NFS server if using NFS for model storage #
+  NFSServer: <Insert NFS server address>
+
+  # Insert the path to save data on the NFS server #
+  NFSPath: <Insert NFS server path>
+
+  # Insert the path to the vocab file #
+  vocabPath: <Insert absolute path to vocab.json file>
+
+  # Insert the path to the merges file #
+  mergesPath: <Insert absolute path to merges.txt file>
+
+  # Insert the path to the results directory #
+  resultsDirectory: <Insert absolute path to the conversion directory>
+
+  # Insert the path to the training directory #
+  trainingDirectory: <Insert the absolute path to the training directory>
+
+  # Insert the path to the launcher_scripts directory #
+  launcherScriptsPath: <Insert the absolute path to the launcher_scripts directory>
+
+  # Insert the TP size #
+  tensorParallelism: <Insert TP size>
+
+  # Insert the PP size #
+  pipelineParallelism: <Insert PP size>
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml
new file mode 100644
index 0000000000..d2337c69ac
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+appVersion: "1.0"
+description: NeMo Framework Data Preparation
+name: nemo-framework-data-prep
+version: 1.0.0
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml
new file mode 100644
index 0000000000..338acfb9a5
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml
@@ -0,0 +1,7 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: data-prep-config
+data:
+  config.yaml: |-
+  {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }}
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml
new file mode 100644
index 0000000000..8ab7a76207
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml
@@ -0,0 +1,59 @@
+{{ $config := .Values.dataPrepConfig }}
+
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: nlp-data-prep
+  labels:
+    app: nlp-data-prep
+spec:
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          containers:
+          - name: nlp-data-prep
+            image: {{ .Values.image.trainingImage }}
+            command: ["bash", "-c"]
+            args:
+              - '{{- range tuple "download" "extract" "preprocess" }} mpirun --allow-run-as-root -np {{ $config.totalProcesses }} -npernode {{ $config.procsPerNode }} -bind-to none -map-by slot --oversubscribe -x PYTHONPATH -mca pml ob1 -mca btl ^openib python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/{{ . }}.py --config-path=/config --config-name=config.yaml && {{- end}} echo Data preparation complete'
+            imagePullPolicy: Always
+          imagePullSecrets:
+          - name: {{ .Values.image.pullSecret }}
+    Worker:
+      replicas: {{ .Values.image.nodes }}
+      template:
+        spec:
+          containers:
+          - name: nlp-data-prep
+            image: {{ .Values.image.trainingImage }}
+            command: ["/usr/sbin/sshd"]
+            args:
+              - "-De"
+            volumeMounts:
+            - mountPath: {{ $config.NFSPath }}
+              name: workspace
+            - mountPath: /dev/shm
+              name: dshm
+            - mountPath: /config
+              name: data-prep-config
+            imagePullPolicy: Always
+          restartPolicy: Never
+          imagePullSecrets:
+          - name: {{ .Values.image.pullSecret }}
+
+          volumes:
+          - name: workspace
+            nfs:
+              server: {{ $config.NFSServer }}
+              path: {{ $config.NFSPath }}
+
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: {{ $config.shmSize }}
+
+          - configMap:
+              name: data-prep-config
+            name: data-prep-config
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml
new file mode 100644
index 0000000000..e5a8bc7987
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml
@@ -0,0 +1,27 @@
+image:
+  trainingImage: cfg.container
+  pullPolicy: IfNotPresent
+
+  # Insert the name of your container registry pull secret #
+  pullSecret: nvcr.io
+
+  nodes: training.trainer.num_nodes
+
+dataPrepConfig:
+  # Specify the amount of shared memory to attach to the Pods #
+  shmSize: 512Gi
+
+  # Insert the address for the NFS server if using NFS for model storage #
+  NFSServer: <Insert NFS server address>
+
+  # Insert the path to save data on the NFS server #
+  NFSPath: <Insert NFS server path>
+
+  # Insert the total number of processes to spawn on the cluster #
+  totalProcesses: <Insert number of processes>
+
+  # Insert the number of processes to spawn per node #
+  procsPerNode: <Insert number of processes per node>
+
+  # Insert the data preparation stage, such as download, extract, or preprocess #
+  stage: <Insert the data prep stage>
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml
new file mode 100644
index 0000000000..4c291917f1
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+appVersion: "1.0"
+description: NeMo Framework Evaluation
+name: nemo-framework-evaluation
+version: 1.0.0
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml
new file mode 100644
index 0000000000..080bbcc6b3
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml
@@ -0,0 +1,7 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: evaluation-config
+data:
+  hparams.yaml: |-
+  {{ (.Files.Glob "config/hparams.yaml").AsConfig | indent 4 }}
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml
new file mode 100644
index 0000000000..7278d1385e
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml
@@ -0,0 +1,53 @@
+{{ $config := .Values.trainingConfig }}
+
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: nlp-evaluation
+  labels:
+    app: nlp-evaluation
+spec:
+  template:
+    spec:
+      containers:
+      - name: nlp-evaluation
+        image: {{ .Values.image.trainingImage }}
+        env:
+          - name: NCCL_AVOID_RECORD_STREAMS
+            value: "1"
+        command: ["/bin/bash", "-c"]
+        args:
+          - 'python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/eval_harness/download.py --tasks=all_tasks --cache-dir={{ $config.cacheDir }} &&
+          mkdir -p {{ $config.outputPath }} &&
+          python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/eval_harness/evaluate.py --name={{ $config.name }} --model={{ $config.model }} --tasks={{ $config.tasks }} --cache_dir={{ $config.cacheDir }} --output_path={{ $config.outputPath }} --batch_size={{ $config.batchSize }} --tensor_model_parallel_size={{ $config.tensorParallelism }} --pipeline_model_parallel_size={{ $config.pipelineParallelism }} --precision={{ $config.precision }} --vocab_file={{ $config.vocabPath }} --merge_file={{ $config.mergesPath }} {{- if $config.nemoModel }} --nemo_model={{ $config.nemoModel }}{{ end }} --checkpoint_folder={{ $config.checkpointFolder }} --checkpoint_name={{ $config.checkpointName }} --hparams_file=/config/hparams.yaml'
+        imagePullPolicy: Always
+        resources:
+          requests:
+            nvidia.com/gpu: {{ .Values.image.gpuNum }}
+          limits:
+            nvidia.com/gpu: {{ .Values.image.gpuNum }}
+        volumeMounts:
+        - mountPath: {{ $config.NFSPath }}
+          name: workspace
+        - mountPath: /dev/shm
+          name: dshm
+        - mountPath: /config
+          name: evaluation-config
+      restartPolicy: Never
+      imagePullSecrets:
+      - name: {{ .Values.image.pullSecret }}
+
+      volumes:
+      - name: workspace
+        nfs:
+          server: {{ $config.NFSServer }}
+          path: {{ $config.NFSPath }}
+
+      - name: dshm
+        emptyDir:
+          medium: Memory
+          sizeLimit: {{ $config.shmSize }}
+
+      - configMap:
+          name: evaluation-config
+        name: evaluation-config
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml
new file mode 100644
index 0000000000..0fcfe4c835
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml
@@ -0,0 +1,73 @@
+image:
+  trainingImage: cfg.container
+  pullPolicy: IfNotPresent
+
+  # Insert the name of your container registry pull secret #
+  pullSecret: nvcr.io
+
+  # Insert number of GPUs #
+  gpuNum: 1
+
+trainingConfig:
+  # Specify the amount of shared memory to attach to the Pods #
+  shmSize: 512Gi
+
+  # Insert the address for the NFS server if using NFS for model storage #
+  NFSServer: <Insert NFS server address>
+
+  # Insert the path to save data on the NFS server #
+  NFSPath: <Insert NFS server path>
+
+  # Insert the path to the vocab file #
+  vocabPath: <Insert absolute path to vocab.json file>
+
+  # Insert the path to the merges file #
+  mergesPath: <Insert absolute path to merges.txt file>
+
+  # Insert the path to the results directory #
+  resultsDirectory: <Insert absolute path to the conversion directory>
+
+  # Insert the path to the training directory #
+  trainingDirectory: <Insert the absolute path to the training directory>
+
+  # Insert the path to the launcher_scripts directory #
+  launcherScriptsPath: <Insert the absolute path to the launcher_scripts directory>
+
+  # Insert the TP size #
+  tensorParallelism: <Insert TP size>
+
+  # Insert the PP size #
+  pipelineParallelism: <Insert PP size>
+
+  # Insert evaluation task name #
+  name: <Insert name of evaluation task>
+
+  # Insert name of model to evaluate #
+  model: <Insert name of model to evaluate>
+
+  # Insert which tasks to evaluate #
+  tasks: <Insert tasks to evaluate>
+
+  # Insert path to store downloaded eval data #
+  cacheDir: <Insert path to cache eval data>
+
+  # Insert path to save evaluation results #
+  outputPath: <Insert path to save eval results>
+
+  # Insert batch size for evaluation #
+  batchSize: <Insert batch size>
+
+  # Insert evaluation precision #
+  precision: <Insert precision>
+
+  # Specify the path to the .nemo model if used #
+  nemoModel: <Insert path to .nemo file or "null">
+
+  # Insert path to the training checkpoint directory #
+  checkpointFolder: <Insert path to checkpoint directory>
+
+  # Insert name of checkpoint or "latest" #
+  checkpointName: <Insert checkpoint name>
+
+  # Insert path to the hparams file from the training job #
+  hparamsFile: <Insert path to hparams.yaml file>
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml
new file mode 100644
index 0000000000..e2314f8ec3
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+appVersion: "1.0"
+description: NeMo Framework Base Model Training
+name: nemo-framework-training
+version: 1.0.0
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml
new file mode 100644
index 0000000000..ce3095184c
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml
@@ -0,0 +1,7 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: training-config
+data:
+  config.yaml: |-
+  {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }}
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml
new file mode 100644
index 0000000000..37f37a1317
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml
@@ -0,0 +1,71 @@
+{{ $config := .Values.trainingConfig }}
+
+apiVersion: kubeflow.org/v1
+kind: PyTorchJob
+metadata:
+  name: nlp-training
+  labels:
+    app: nlp-training
+spec:
+  pytorchReplicaSpecs:
+    Worker:
+      replicas: {{ .Values.image.nodes }}
+      template:
+        spec:
+          containers:
+          - name: pytorch
+            image: {{ .Values.image.trainingImage }}
+            env:
+              - name: NCCL_AVOID_RECORD_STREAMS
+                value: "1"
+            {{ if eq $config.wandbKey "nil" }}
+            command: ["torchrun"]
+            args:
+              - "--nnodes={{ .Values.image.nodes }}"
+              - "--rdzv-backend=c10d"
+              - "--rdzv-endpoint=nlp-training-worker-0"
+              - "--nproc_per_node={{ .Values.image.numGPUs }}"
+              - "/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
+              - "--config-path=/config"
+              - "--config-name=config.yaml"
+            {{ else }}
+            command: ["bash", "-c"]
+            args:
+              - "wandb login {{ $config.wandbKey }} && torchrun --nnodes={{ .Values.image.nodes }} --rdzv-backend=c10d --rdzv-endpoint=nlp-training-worker-0 --nproc_per_node={{ .Values.image.numGPUs }} /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py --config-path=/config --config-name=config.yaml"
+            {{ end }}
+            imagePullPolicy: Always
+            securityContext:
+              capabilities:
+                add: [ "IPC_LOCK" ]
+            resources:
+              requests:
+                nvidia.com/gpu: {{ .Values.image.numGPUs }}
+                {{ $config.ibResourceName }}: {{ $config.ibCount }}
+              limits:
+                nvidia.com/gpu: {{ .Values.image.numGPUs }}
+                {{ $config.ibResourceName }}: {{ $config.ibCount }}
+            volumeMounts:
+            - mountPath: {{ $config.NFSPath }}
+              name: workspace
+            - mountPath: /dev/shm
+              name: dshm
+            - mountPath: /config
+              name: training-config
+          restartPolicy: Never
+          imagePullSecrets:
+          - name: {{ .Values.image.pullSecret }}
+
+          volumes:
+          - name: workspace
+            nfs:
+              server: {{ $config.NFSServer }}
+              path: {{ $config.NFSPath }}
+
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: {{ $config.shmSize }}
+
+          - configMap:
+              name: training-config
+            name: training-config
diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml
new file mode 100644
index 0000000000..553be55b19
--- /dev/null
+++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml
@@ -0,0 +1,28 @@
+image:
+  trainingImage: cfg.container
+  pullPolicy: IfNotPresent
+
+  # Insert the name of your container registry pull secret #
+  pullSecret: nvcr.io
+
+  numGPUs: training.trainer.devices
+  nodes: training.trainer.num_nodes
+
+trainingConfig:
+  # Specify the amount of shared memory to attach to the Pods #
+  shmSize: 512Gi
+
+  # Insert the address for the NFS server if using NFS for model storage #
+  NFSServer: <Insert NFS server address>
+
+  # Insert the path to save data on the NFS server #
+  NFSPath: <Insert NFS server path>
+
+  # Specify the k8s resource name for IB devices #
+  ibResourceName: nvidia.com/hostdev
+
+  # Specify the number of IB devices to include in pods #
+  ibCount: "8"
+
+  # Specify the WandB API key if using WandB for logging #
+  wandbKey: "nil"
diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index c52b676e35..9c9e90d381 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -21,6 +21,8 @@
 import shlex
 import shutil
 import warnings
+from omegaconf import OmegaConf, DictConfig
+import yaml
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Set, Union
 
@@ -70,6 +72,7 @@ def get_launchers():
             "bcm": SlurmLauncher,
             "bcp": BCPLauncher,
             "interactive": InteractiveLauncher,
+            "k8s": K8SLauncher,
         }
 
 
@@ -114,6 +117,7 @@ def _make_submission_file(self, command_groups: List[List[str]]) -> Path:
             on interactive cluster, it's a bash file, trigger with bash.
             on slurm cluster, it's a slurm script file, trigger with sbatch.
             on BCP cluster, it's a BCP script file, trigger with bash.
+            on k8s cluster, it's a Helm chart, triggered with helm.
 
         :param List[List[str]] command_groups: Command groups to launch with
         :return: job id on slurm based system otherwise empty string
@@ -431,6 +435,70 @@ def _get_job_id_from_submission_command(string: Union[bytes, str]) -> str:
         return output.group("id")
 
 
+class K8SLauncher(Launcher):
+    """
+    K8s job launcher
+    This class is used to hold the parameters to run a job on kubernetes.
+    In practice, it will create a Helm chart in the specified directory for the job
+    and trigger the job with `bash` command.
+
+    :param Union[Path, str] folder: folder for storing job submission/output and logs.
+    :param str job_name: Name of the job, used as job folder name
+    :param Any **kwargs: Parse other cluster parameters required for k8s running,
+        including `nodes`, `ntasks_pernode`, `bcp_launcher`, etc.
+    """
+
+    def __init__(self, folder: Union[Path, str], job_name: str, **kwargs: Any) -> None:
+        super().__init__(folder, job_name)
+        self.parameters = kwargs
+        self.parameters = self._convert_parameters(self.parameters)
+
+    @classmethod
+    def _equivalence_dict(cls):
+        return {
+            "name": "job_name",
+            "nodes": "nnodes",
+            "tasks_per_node": "npernode",
+            "ntasks_per_node": "npernode",
+        }
+
+    def _convert_parameters(self, params: Dict[str, Any]) -> Dict[str, Any]:
+        """translate k8s parameter names"""
+        # replace type in some cases
+        eq_dict = self._equivalence_dict()
+        if eq_dict is not None:
+            params = {eq_dict.get(k, k): v for k, v in params.items()}
+        return params
+
+    def _submit_command(self, submission_file_path: Path) -> str:
+        """Launch the submission command"""
+        command_list = self._make_submission_command(submission_file_path)
+        # run
+        job_utils.CommandFunction(command_list, ret_stdout=False, verbose=False)()  # explicit errors
+        return ""
+
+    @staticmethod
+    def _make_submission_command(submission_file_path: Path) -> List[str]:
+        """Make a command to trigger submission script. On a k8s cluster, the script is triggered with Helm"""
+        return ["bash", str(submission_file_path)]
+
+    def _make_submission_file_text(self, command_groups: List[List[str]]) -> str:
+        """
+        Generate the script to launch the Helm chart.
+        A very simple bash script is generated which runs `helm install` for the
+        Helm chart that was generated.
+
+        :param List[List[str]] command_groups: Command groups to launch with
+        :return: submission script file's text
+        :rtype: str
+        """
+        paths = job_utils.JobPaths(folder=self.folder, job_name=self.job_name)
+        helm_charts = paths.folder / 'k8s_template'
+        job_name = self.job_name.replace('_', '-')
+
+        return f'#!/bin/bash\nhelm install {job_name} {helm_charts}\n'
+
+
 @functools.lru_cache()
 def _get_default_parameters() -> Dict[str, Any]:
     """Parameters that can be set through update_parameters"""
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index 134232a2c2..3f317e5866 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -19,6 +19,7 @@
 import json
 import re
 from pathlib import Path
+import shutil
 from typing import Any, Dict, List, Optional
 
 import omegaconf
@@ -28,7 +29,7 @@
     prepare_squad_for_prompt_learning,
 )
 from nemo_launcher.utils.job_utils import JobPaths
-from omegaconf import OmegaConf
+from omegaconf import OmegaConf, DictConfig
 
 
 class NemoMegatronStage:
@@ -73,9 +74,14 @@ def run(self) -> str:
             self.cfg['training']["trainer"]["num_nodes"] = nodes
             logging.info(f"global batch size and number of nodes will change following this schedule:\n {self.nodes_scheduler}")
 
-        stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path)
+        stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg)
         # Make cluster parameters
         cluster_parameters = self._make_cluster_parameters(self.cluster)
+        # Make k8s config file if necessary
+        if self.cluster == 'k8s':
+            template_root = os.path.join(os.path.abspath(os.path.dirname(__file__)), f'k8s_templates/{self.stage_name}')
+            self._make_k8s_spec_file(template_root, cluster_parameters, job_path)
+            self._copy_k8s_helm_chart(template_root, job_path)
         # Make command groups
         command_groups = self.make_stage_command_groups(stage_cfg_path)
         # Create launcher
@@ -92,15 +98,30 @@ def setup_folder_and_data(self) -> None:
         results_folder.mkdir(parents=True, exist_ok=True)
 
     @staticmethod
-    def save_stage_hydra_config(stage_cfg: OmegaConf, job_path: JobPaths) -> Path:
+    def save_stage_hydra_config(stage_cfg: OmegaConf, job_path: JobPaths, cfg: OmegaConf) -> Path:
         """
         Interpolate and save hydra config file for current stage
 
         :param OmegaConf stage_cfg: current stage's hydra configuration
         :param JobPaths job_path: JobPaths object
+        :param OmegaConf cfg: base config for job
         :return: path current stage's essential nemo scripts code
         :rtype: Path
         """
+        # Since k8s uses a Helm chart that launches a job based on the Hydra config
+        # file, the Hydra config file that is generated needs to contain all of the
+        # required keys for each stage.
+        if cfg.cluster_type == "k8s":
+            # The Hydra-composed config may reject new keys (OmegaConf struct mode).
+            # Temporarily create a dictionary representation and add the new keys
+            # before converting back to an OmegaConf object.
+            temp_config = OmegaConf.to_object(stage_cfg)
+            temp_config['data_dir'] = cfg.data_dir
+            temp_config['cluster_type'] = cfg.cluster_type
+            temp_config['launcher_scripts_path'] = cfg.launcher_scripts_path
+            temp_config['data_config'] = stage_cfg.run.name
+            stage_cfg = OmegaConf.create(temp_config)
+
         _hydra_interpolation(stage_cfg)
 
         cfg_save_path = job_path.config_file
@@ -139,6 +160,10 @@ def _make_nemo_path_command(self) -> List[str]:
             f'export PYTHONPATH={self._nemo_code_path}:\${{PYTHONPATH}}',
         ]
 
+    def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths):
+        """Create a yaml spec file for kubernetes jobs"""
+        raise NotImplementedError
+
     # def _make_numa_mapping_command(self) -> List[str]:
     #     """Make a command of numa mapping call"""
     #     cfg = self.cfg
@@ -240,7 +265,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict:
         dependency = run_cfg.get("dependency")
         if nodes is None:
             nodes = stage_cfg.get("trainer").get("num_nodes")
-        
+
         ntasks_per_node = run_cfg.get("ntasks_per_node")
         if ntasks_per_node is None:
             ntasks_per_node = stage_cfg.get("trainer").get("devices")
@@ -285,9 +310,20 @@ def _make_cluster_parameters(self, cluster: str) -> Dict:
             )
         elif cluster == "interactive":
             cluster_parameters.update(shared_parameters)
+        elif cluster == "k8s":
+            cluster_cfg = cfg.get("cluster")
+            k8s_cfg = {**copy.deepcopy(cluster_cfg)}
+
+            cluster_parameters = {**k8s_cfg}
+            cluster_parameters.update(
+                {
+                    **shared_parameters,
+                    "container_image": container_image,
+                }
+            )
 
         return cluster_parameters
-    
+
     def _find_optimal_nodes(self, cfg, gpus) -> None:
         nodes_scheduler_path = f"{cfg.get('training').get('run').get('results_dir')}/nodes_scheduler.json"
 
@@ -320,7 +356,7 @@ def _find_optimal_nodes(self, cfg, gpus) -> None:
                         optimal_lst.append(nodes)
 
                 self.nodes_scheduler[str(b)] = max(optimal_lst)
-            
+
             sched_rbs = [int(i) for i in self.nodes_scheduler.keys()]
             assert rbs[::-1] == sched_rbs, (
                 "please, make sure you enter the correct combination of"
@@ -329,7 +365,7 @@ def _find_optimal_nodes(self, cfg, gpus) -> None:
 
             with open(nodes_scheduler_path, 'w') as nodes_scheduler:
                 nodes_scheduler.write(json.dumps(self.nodes_scheduler))
-    
+
     def _get_current_gbs(self, cfg):
         start_bs = cfg.get('training').get('model').get('rampup_batch_size')[0]
         results_dir = cfg.get('training').get('run').get('results_dir')
@@ -340,16 +376,16 @@ def _get_current_gbs(self, cfg):
             for file in glob.glob("*.out"):
                 file = file.split('_')[-1].split('.')[0]
                 job_numbers.append(int(file))
-        
+
             job_number = max(job_numbers)
             last_job = glob.glob(f"*{job_number}.out")[0]
             with open(last_job, 'r') as logs:
                 logs = logs.read()
-        
+
             current_gbs = re.findall(r'global_batch_size=(\d+)', logs)[-1]
         except:
             current_gbs =  start_bs
-    
+
         return current_gbs
 
     def get_env_vars(self) -> Dict:
@@ -428,9 +464,10 @@ def get_job_path(self, sub_stage: Optional = None) -> JobPaths:
     @property
     def _set_ln_sm_margin(self) -> str:
         """ Set LayerNorm SM margin when using P2P communication overlap to support the overlap with LayerNorm kernel """
+        vpp = self.cfg.training.model.get("virtual_pipeline_model_parallel_size")
         if (self.cfg.training.model.get("overlap_p2p_comm", False) and
             self.cfg.training.model.get("pipeline_model_parallel_size") > 1 and
-            self.cfg.training.model.get("virtual_pipeline_model_parallel_size") > 1):
+            vpp is not None and vpp > 1):
             get_ln_sm_margin_command = (
                 f"python3 {self._launcher_scripts_path / 'nemo_launcher/collections/conditional_cfgs.py'} "
                 f"name=get_ln_sm_margin"
@@ -532,11 +569,11 @@ def _make_nemo_call_string(self, stage_cfg_path: Path) -> str:
     def _make_hydra_override(self) -> List:
         """
         Override some existing hydra configurations if necessary.
-        
+
         Example use cases are:
             1. For bcp cluster, `+rank=\${RANK}` is required running some NeMo scripts.
                 Existing hydra config doesn't have `rank` field, so we overwrite on the fly.
-            2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as 
+            2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as
                 `model.data.data_prefix=\$({auto_blend_command})`. Existing `model.data.data_prefix`
                 could be None in cfg, so we overwrite it in this function.
         """
@@ -544,6 +581,72 @@ def _make_hydra_override(self) -> List:
         if self.cluster == "bcp":
             hydra_override += ["+rank=\${RANK}"]
         return hydra_override
+    
+    def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths):
+        """
+        Copy the k8s Helm charts to the results directory.
+
+        :param str template_root: path to where the k8s template files are located
+        :param JobPaths job_path: JobPaths object
+        """
+        template_file = os.path.join(template_root, 'training.yaml')
+        chart_file = os.path.join(template_root, 'Chart.yaml')
+        training_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'training.yaml')
+        training_path.parent.mkdir(parents=True, exist_ok=True)
+        config_path = Path(job_path.folder / 'k8s_template' / 'config')
+        config_path.mkdir(parents=True, exist_ok=True)
+        chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml')
+        training_config_file = os.path.join(template_root, 'training-config.yaml')
+        training_config_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'training-config.yaml')
+        hydra_config_path = Path(job_path.folder / 'k8s_template' / 'config')
+
+        shutil.copy2(template_file, training_path)
+        shutil.copy2(chart_file, chart_path)
+        shutil.copy2(training_config_file, training_config_path)
+        shutil.copy2(job_path.config_file, hydra_config_path)
+
+    def _add_wandb_key_to_chart(self) -> str:
+        """
+        Read the WandB API key file and return it to be placed in the Helm chart.
+
+        :return: a string of the WandB API key.
+        :rtype: str
+        """
+        with open(self.cfg.wandb_api_key_file, "r") as f:
+            wandb_api_key = f.readline().rstrip()
+        return wandb_api_key
+
+    def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths):
+        """
+        Create a spec file for a Kubernetes training job.
+        The spec file is generated based on the parameters in the cluster and training config files.
+
+        :param str template_root: path to where the k8s template files are located
+        :param Dict cluster_parameters: settings specific to the cluster that is being used
+        :param JobPaths job_path: JobPaths object
+        """
+        with open(os.path.join(template_root, 'values.yaml')) as value_file:
+            values_template = OmegaConf.load(value_file)
+
+        values_template.image.trainingImage = cluster_parameters['container_image']
+        values_template.image.pullSecret = cluster_parameters['pull_secret']
+        values_template.image.numGPUs = self.stage_cfg.trainer.devices
+        values_template.image.nodes = self.stage_cfg.trainer.num_nodes
+        values_template.trainingConfig.shmSize = cluster_parameters['shm_size']
+        values_template.trainingConfig.NFSServer = cluster_parameters['nfs_server']
+        values_template.trainingConfig.NFSPath = cluster_parameters['nfs_path']
+        values_template.trainingConfig.ibResourceName = cluster_parameters['ib_resource_name']
+        values_template.trainingConfig.ibCount = cluster_parameters['ib_count']
+
+        if self.cfg.wandb_api_key_file is not None:
+            values_template.trainingConfig.wandbKey = self._add_wandb_key_to_chart()
+
+        k8s_template_path = job_path.folder
+        k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml')
+        k8s_template_file.parent.mkdir(parents=True, exist_ok=True)
+
+        conf = OmegaConf.create(values_template)
+        OmegaConf.save(conf, k8s_template_file)
 
     def get_env_vars(self) -> Dict:
         """
@@ -582,7 +685,7 @@ def _make_hydra_override(self) -> List:
         Example use cases are:
             1. For bcp cluster, `+rank=\${RANK}` is required running some NeMo scripts.
                 Existing hydra config doesn't have `rank` field, so we overwrite on the fly.
-            2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as 
+            2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as
                 `model.data.data_prefix=\$({auto_blend_command})`. Existing `model.data.data_prefix`
                 could be None in cfg, so we overwrite it in this function.
 
@@ -617,13 +720,14 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
         For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3.
 
         :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc.
-        :return: path current stage's essential nemo scripts code 
+        :return: path current stage's essential nemo scripts code
         :rtype: Path
         """
         model_type_to_code_path = {
             "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_pretraining.py",
             "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_pretraining.py",
             "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
+            "llama": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
             "bert": self._nemo_code_path / "examples/nlp/language_modeling/megatron_bert_pretraining.py",
         }
         return model_type_to_code_path[model_type]
@@ -672,19 +776,60 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
         """
         Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts.
         For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3.
-        
+
         :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc.
-        :return: path current stage's essential nemo scripts code 
+        :return: path current stage's essential nemo scripts code
         :rtype: Path
         """
-        
+
         model_type_to_code_path = {
             "gpt3" : self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py",
+            "llama" : self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py",
             "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py",
             "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py",
         }
         return model_type_to_code_path[model_type]
 
+class PEFT(NeMoStage):
+    """Stage class of PEFT with NeMo scripts"""
+
+    def setup_stage_vars(self, cfg):
+        """Setup the stage vars, i.e. stage name and stage cfg"""
+        self.stage_name = "peft"
+        self.stage_cfg = cfg.get("peft")
+
+    def setup_folder_and_data(self) -> None:
+        """Setup job/data folders and fine-tuning/prompt-learning dataset"""
+        # Setup folders
+        super().setup_folder_and_data()
+
+        # Look up the data directory and task name used for dataset preparation
+        data_dir = self.cfg.get("data_dir")
+        task_name = self.stage_cfg.run.get("task_name")
+
+        # Prepare dataset for squad
+        if task_name in ["squad", "xquad"]:
+            prepare_squad_for_fine_tuning(data_dir=os.path.join(data_dir, "squad_data"))
+
+
+    def _get_nemo_code_path(self, model_type: str) -> Path:
+        """
+        Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts.
+        For example, `megatron_t5_peft_tuning.py` for t5 and `megatron_gpt_peft_tuning.py` for gpt3 and llama.
+
+        :param str model_type: e.g. `gpt3`, `llama`, `t5`; `mt5` raises NotImplementedError
+        :return: path to the current stage's essential nemo script
+        :rtype: Path
+        """
+        
+        if model_type == "mt5":
+            raise NotImplementedError("PEFT is not supported in NeMo Megatron mt5 models.")
+        model_type_to_code_path = {
+            "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py",
+            "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py",
+            "t5": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_t5_peft_tuning.py",
+        }
+        return model_type_to_code_path[model_type]
 
 class PromptLearning(NeMoStage):
     """Stage class of prompt-learning with NeMo scripts"""
@@ -712,13 +857,14 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
         """
         Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts.
         For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3.
-        
+
         :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc.
-        :return: path current stage's essential nemo scripts code 
+        :return: path current stage's essential nemo scripts code
         :rtype: Path
         """
         model_type_to_code_path = {
             "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_prompt_learning.py",
+            "llama": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_prompt_learning.py",
             "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_prompt_learning.py",
             "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_prompt_learning.py",
         }
@@ -735,13 +881,14 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
         """
         Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts.
         For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3.
-        
+
         :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc.
-        :return: path current stage's essential nemo scripts code 
+        :return: path current stage's essential nemo scripts code
         :rtype: Path
         """
         model_type_to_code_path = {
             "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py",
+            "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py",
             "t5": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py",
         }
         return model_type_to_code_path[model_type]
@@ -757,13 +904,14 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
         """
         Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts.
         For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3.
-        
+
         :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc.
-        :return: path current stage's essential nemo scripts code 
+        :return: path current stage's essential nemo scripts code
         :rtype: Path
         """
         model_type_to_code_path = {
             "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py",
+            "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py",
             "t5": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py",
         }
         return model_type_to_code_path[model_type]
@@ -818,6 +966,57 @@ def _make_checkpoint_search_command(self, **kwargs: Any) -> str:
             f"{' '.join(checkpoint_override)}"
         )
 
+    def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths):
+        """
+        Create a spec file for a Kubernetes conversion job.
+        The spec file is generated based on the parameters in the cluster and conversion config files.
+
+        :param str template_root: path to where the k8s template files are located
+        :param Dict cluster_parameters: settings specific to the cluster that is being used
+        :param JobPaths job_path: JobPaths object
+        """
+        with open(os.path.join(template_root, 'values.yaml')) as value_file:
+            values_template = OmegaConf.load(value_file)
+
+        num_gpus = self.cfg.conversion.model.pipeline_model_parallel_size * self.cfg.conversion.model.tensor_model_parallel_size
+
+        values_template.image.trainingImage = cluster_parameters['container_image']
+        values_template.image.pullSecret = cluster_parameters['pull_secret']
+        values_template.image.gpuNum = num_gpus
+        values_template.trainingConfig.shmSize = cluster_parameters['shm_size']
+        values_template.trainingConfig.NFSServer = cluster_parameters['nfs_server']
+        values_template.trainingConfig.NFSPath = cluster_parameters['nfs_path']
+        values_template.trainingConfig.vocabPath = self.cfg.conversion.model.vocab_file
+        values_template.trainingConfig.mergesPath = self.cfg.conversion.model.merge_file
+        values_template.trainingConfig.resultsDirectory = str(job_path.folder)
+        values_template.trainingConfig.trainingDirectory = self.cfg.conversion.run.train_dir
+        values_template.trainingConfig.launcherScriptsPath = self.cfg.launcher_scripts_path
+        values_template.trainingConfig.tensorParallelism = self.cfg.conversion.model.tensor_model_parallel_size
+        values_template.trainingConfig.pipelineParallelism = self.cfg.conversion.model.pipeline_model_parallel_size
+
+        k8s_template_path = job_path.folder
+        k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml')
+        k8s_template_file.parent.mkdir(parents=True, exist_ok=True)
+
+        conf = OmegaConf.create(values_template)
+        OmegaConf.save(conf, k8s_template_file)
+
+    def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths):
+        """
+        Copy the k8s Helm charts to the results directory.
+
+        :param str template_root: path to where the k8s template files are located
+        :param JobPaths job_path: JobPaths object
+        """
+        template_file = os.path.join(template_root, 'conversion.yaml')
+        chart_file = os.path.join(template_root, 'Chart.yaml')
+        conversion_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'conversion.yaml')
+        conversion_path.parent.mkdir(parents=True, exist_ok=True)
+        chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml')
+
+        shutil.copy2(template_file, conversion_path)
+        shutil.copy2(chart_file, chart_path)
+
     def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]:
         """
         Make the command groups for current stage
@@ -939,9 +1138,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
         """
         Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts.
         For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3.
-        
+
         :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc.
-        :return: path current stage's essential nemo scripts code 
+        :return: path current stage's essential nemo scripts code
         :rtype: Path
         """
         if model_type in ["gpt3", "prompt_gpt3"]:
@@ -966,7 +1165,7 @@ class EvalHarnessEvaluation(NemoMegatronStage):
     def __init__(self, cfg):
         super().__init__(cfg)
         choice_model_type, choice_name = self.get_stage_config_choice()
-        self.prompt_evaluation = choice_model_type == "prompt_gpt3"
+        self.prompt_evaluation = True if "prompt" in choice_model_type else False
 
     def setup_stage_vars(self, cfg):
         """Setup the stage vars, i.e. stage name and stage cfg"""
@@ -991,6 +1190,77 @@ def _make_download_command_string(self) -> str:
         download_command_string = " \\\n  ".join(download_command)
         return download_command_string
 
+    def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths):
+        """
+        Create a spec file for a Kubernetes evaluation job.
+        The spec file is generated based on the parameters in the cluster and evaluation config files.
+
+        :param str template_root: path to where the k8s template files are located
+        :param Dict cluster_parameters: settings specific to the cluster that is being used
+        :param JobPaths job_path: JobPaths object
+        """
+        with open(os.path.join(template_root, 'values.yaml')) as value_file:
+            values_template = OmegaConf.load(value_file)
+
+        num_gpus = self.cfg.evaluation.model.pipeline_model_parallel_size * self.cfg.evaluation.model.tensor_model_parallel_size
+
+        values_template.image.trainingImage = cluster_parameters['container_image']
+        values_template.image.pullSecret = cluster_parameters['pull_secret']
+        values_template.image.gpuNum = num_gpus
+        values_template.trainingConfig.shmSize = cluster_parameters['shm_size']
+        values_template.trainingConfig.NFSServer = cluster_parameters['nfs_server']
+        values_template.trainingConfig.NFSPath = cluster_parameters['nfs_path']
+        values_template.trainingConfig.vocabPath = self.cfg.evaluation.model.vocab_file
+        values_template.trainingConfig.mergesPath = self.cfg.evaluation.model.merge_file
+        values_template.trainingConfig.resultsDirectory = str(job_path.folder)
+        values_template.trainingConfig.trainingDirectory = self.cfg.evaluation.run.train_dir
+        values_template.trainingConfig.launcherScriptsPath = self.cfg.launcher_scripts_path
+        values_template.trainingConfig.tensorParallelism = self.cfg.evaluation.model.tensor_model_parallel_size
+        values_template.trainingConfig.pipelineParallelism = self.cfg.evaluation.model.pipeline_model_parallel_size
+        values_template.trainingConfig.name = self.cfg.evaluation.run.name
+        values_template.trainingConfig.model = self.cfg.evaluation.model.model_type
+        values_template.trainingConfig.cacheDir = os.path.join(self.cfg.data_dir, 'eval_harness_data')
+        values_template.trainingConfig.outputPath = os.path.join(self.cfg.evaluation.run.results_dir,
+                                                                 self.cfg.evaluation.run.eval_name,
+                                                                 'results')
+        values_template.trainingConfig.batchSize = self.cfg.evaluation.model.eval_batch_size
+        values_template.trainingConfig.precision = self.cfg.evaluation.model.precision
+        values_template.trainingConfig.nemoModel = self.cfg.evaluation.model.nemo_model
+        values_template.trainingConfig.checkpointFolder = self.cfg.evaluation.model.checkpoint_folder
+        values_template.trainingConfig.checkpointName = self.cfg.evaluation.model.checkpoint_name
+        values_template.trainingConfig.hparamsFile = self.cfg.evaluation.model.hparams_file
+        values_template.trainingConfig.tasks = self.cfg.evaluation.run.tasks
+
+        k8s_template_path = job_path.folder
+        k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml')
+        k8s_template_file.parent.mkdir(parents=True, exist_ok=True)
+
+        conf = OmegaConf.create(values_template)
+        OmegaConf.save(conf, k8s_template_file)
+
+    def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths):
+        """
+        Copy the k8s Helm charts to the results directory.
+
+        :param str template_root: path to where the k8s template files are located
+        :param JobPaths job_path: JobPaths object
+        """
+        template_file = os.path.join(template_root, 'evaluation.yaml')
+        chart_file = os.path.join(template_root, 'Chart.yaml')
+        evaluation_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'evaluation.yaml')
+        evaluation_path.parent.mkdir(parents=True, exist_ok=True)
+        config_path = Path(job_path.folder / 'k8s_template' / 'config')
+        config_path.mkdir(parents=True, exist_ok=True)
+        chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml')
+        evaluation_config_file = os.path.join(template_root, 'evaluation-config.yaml')
+        evaluation_config_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'evaluation-config.yaml')
+        hparams_config_path = Path(job_path.folder / 'k8s_template' / 'config')
+
+        shutil.copy2(template_file, evaluation_path)
+        shutil.copy2(chart_file, chart_path)
+        shutil.copy2(evaluation_config_file, evaluation_config_path)
+        shutil.copy2(os.path.join(self.cfg.evaluation.run.train_dir, 'results', 'hparams.yaml'), hparams_config_path)
+
     def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]:
         """
         Make the command groups for current stage
@@ -1043,6 +1313,7 @@ def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]:
                 nemo_model=model_cfg.get("nemo_model"),
                 checkpoint_folder=model_cfg.get("checkpoint_folder"),
                 checkpoint_name=model_cfg.get("checkpoint_name"),
+                tokenizer_model=model_cfg.get("tokenizer_model"),
                 hparams_file=model_cfg.get("hparams_file"),
             )