diff --git a/.github/workflows/changelog-build.yml b/.github/workflows/changelog-build.yml
index b6d836f71cec..763e6186fd1d 100644
--- a/.github/workflows/changelog-build.yml
+++ b/.github/workflows/changelog-build.yml
@@ -2,26 +2,25 @@ name: 'Changelog Build (Release)'
on:
workflow_dispatch:
- push:
- tags:
- - '*'
+ inputs:
+ last-release-tag:
+ description: Last Git tag to start from (exclusive) (e.g. `v2.0.0`)
+ type: string
+ required: true
+ release-branch:
+ description: Release branch to build changelog on (e.g. `r2.1.0`)
+ type: string
+ required: true
jobs:
changelog:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
+ - name: Checkout branch
+ uses: actions/checkout@v4
with:
- fetch-depth: 0 # Required due to the way Git works, without it this action won't be able to find any or the correct tags
-
- - name: Get Previous tag
- id: previous_tag
- # git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags ==> refs/tags/vX.Y.Z in descending order of date
- # awk 'FNR == 2 {print substr($1, 11, length($1))}') ==> Selects the 2nd tag from the list, then strips the /refs/tags/ part of the tag
- # set-output name=tag_name:: ==> Takes the clean tag vX.Y.Z and sets it to steps.previous_tag.outputs.tag_name
- run: |
- echo "::set-output name=tag_name::$(git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags | awk 'FNR == 2 {print substr($1, 11, length($1))}')"
- echo ${{ steps.previous_tag.outputs.tag_name }}
+ ref: ko3n1g/ci/fix-changelog-generator
+ fetch-depth: 0
- name: Build Changelog
id: github_tag
@@ -38,10 +37,37 @@ jobs:
repo: "NeMo"
ignorePreReleases: "false"
failOnError: "false"
- fromTag: ${{ steps.previous_tag.outputs.tag_name }}
- toTag: ${{ github.ref_name || github.sha }}
+ fromTag: ${{ inputs.last-release-tag }}
+ toTag: ${{ inputs.release-branch }}
- - name: Print Changelog
+ - name: Update changelog file
+ env:
+ RELEASE_BRANCH: ${{ inputs.release-branch }}
+ CHANGELOG: ${{ steps.github_tag.outputs.changelog }}
+ shell: bash -x -e -u -o pipefail {0}
run: |
- echo "${{steps.github_tag.outputs.changelog}}"
- echo "--- DONE ---"
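+ # Derive the version string from the release branch name (e.g. "r2.1.0" -> "2.1.0")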
+ RELEASE_VERSION=${RELEASE_BRANCH#r}
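+ # Demote every Markdown heading in the generated changelog by two levels so it nests under this release's "##" section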
+ CHANGELOG=$(echo "$CHANGELOG" | sed '/^[[:blank:]]*#/s/#/###/')
+
+ RELEASE_NOTES="## NVIDIA Neural Modules $RELEASE_VERSION
+
+ ### Detailed Changelogs:
+
+ $CHANGELOG"
+
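+ # Splice the release notes into CHANGELOG.md right below the "# Changelog" header (written via a temp file)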
+ printf "%s\n" "$RELEASE_NOTES" | sed '/^# Changelog/r /dev/stdin' CHANGELOG.md > CHANGELOG.tmp.md
+
+ mv CHANGELOG.tmp.md CHANGELOG.md
+
+ - name: Inspect new changelog file
+ run: cat CHANGELOG.md
+
+ - name: Create Pull Request
+ uses: peter-evans/create-pull-request@v7
+ with:
+ commit-message: "beep boop: Update changelog"
+ title: "Update changelog for `${{ inputs.release-branch }}`"
+ signoff: true
+ sign-commits: true
+ base: main
+ branch: bot/chore/update-changelog-into-${{ inputs.release-branch }}
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 25e0c5252100..101107dddc17 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -571,9 +571,24 @@ jobs:
prune.num_attention_heads=2 \
prune.num_query_groups=2 \
prune.hidden_size=128 \
- export.save_path=examples/nlp/language_modeling/ci_prune_width.nemo
- AFTER_SCRIPT: |
- rm -rf examples/nlp/language_modeling/ci_prune_width.nemo
+ export.save_path=/tmp/ci_prune_width.nemo
+
+ L2_Prune_Depth_Llama2:
+ needs: [cicd-test-container-setup]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Prune_Depth_Llama2') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure
+ SCRIPT: |
+ python examples/nlp/language_modeling/megatron_gpt_prune.py \
+ trainer.devices=2 \
+ trainer.num_nodes=1 \
+ trainer.precision=bf16 \
+ model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+ model.tensor_model_parallel_size=2 \
+ model.pipeline_model_parallel_size=1 \
+ 'prune.drop_layers=[1]' \
+ export.save_path=/tmp/ci_prune_depth.nemo
# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
@@ -3611,6 +3626,26 @@ jobs:
AFTER_SCRIPT: |
rm -rf nemo_experiments
+ L2_VLM_HF_Transformer_PEFT_FSDP:
+ needs: [ cicd-test-container-setup ]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_FSDP') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure
+ SCRIPT: |
+ TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt --strategy fsdp --devices 2
+
+ L2_VLM_HF_Transformer_PEFT_4bit:
+ needs: [ cicd-test-container-setup ]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_4bit') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure-gpus-1
+ SCRIPT: |
+ TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt --use-4bit
+ AFTER_SCRIPT: |
+ rm -rf nemo_experiments
+
L2_HF_Transformer_PEFT:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
@@ -3666,6 +3701,17 @@ jobs:
AFTER_SCRIPT: |
rm -rf nemo_experiments
+ L2_HF_Transformer_PT_2gpu:
+ needs: [ cicd-test-container-setup ]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure
+ SCRIPT: |
+ TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
+ AFTER_SCRIPT: |
+ rm -rf nemo_experiments
+
L2_HF_Transformer_SFT_2gpu_nemorun:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
@@ -3677,6 +3723,39 @@ jobs:
AFTER_SCRIPT: |
rm -rf nemo_experiments
+ L2_HF_Transformer_PT_2gpu_nemorun:
+ needs: [ cicd-test-container-setup ]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu_nemorun') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure
+ SCRIPT: |
+ TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
+ AFTER_SCRIPT: |
+ rm -rf nemo_experiments
+
+ L2_HF_Transformer_PT:
+ needs: [ cicd-test-container-setup ]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure-gpus-1
+ SCRIPT: |
+ TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
+ AFTER_SCRIPT: |
+ rm -rf nemo_experiments
+
+ L2_HF_Transformer_PT_nemorun:
+ needs: [ cicd-test-container-setup ]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_nemorun') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure-gpus-1
+ SCRIPT: |
+ TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
+ AFTER_SCRIPT: |
+ rm -rf nemo_experiments
+
L2_HF_Transformer_SFT:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
@@ -3698,7 +3777,7 @@ jobs:
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
-
+
L2_HF_Transformer_SFT_TE_Acceleration:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
@@ -3710,6 +3789,17 @@ jobs:
AFTER_SCRIPT: |
rm -rf nemo_experiments
+ L2_HF_Transformer_PT_TE_Acceleration:
+ needs: [ cicd-test-container-setup ]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_TE_Acceleration') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure-gpus-1
+ SCRIPT: |
+ TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --model-accelerator te --max-steps 10
+ AFTER_SCRIPT: |
+ rm -rf nemo_experiments
+
# L2: Megatron Mock Data Generation
L2_Megatron_Mock_Data_Generation_MockGPTDataset:
needs: [cicd-test-container-setup]
@@ -4705,6 +4795,18 @@ jobs:
--lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v2/ \
--output_path=/tmp/nemo2_lora_merge/${{ github.run_id }}
+ L2_NEMO_2_LoRA_Export:
+ needs: [cicd-test-container-setup]
+ uses: ./.github/workflows/_test_template.yml
+ if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NEMO_2_LoRA_Export') || needs.cicd-test-container-setup.outputs.all == 'true'
+ with:
+ RUNNER: self-hosted-azure-gpus-1
+ SCRIPT: |
+
+ python tests/collections/llm/peft/lora_export.py \
+ --lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v2/ \
+ --output_path=/tmp/nemo2_lora_merge/${{ github.run_id }}
+
L2_NEMO_2_LoRA_Inference:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4891,8 +4993,16 @@ jobs:
- L2_HF_Transformer_SFT_nemorun
- L2_HF_Transformer_SFT_2gpu
- L2_VLM_HF_Transformer_PEFT
+ - L2_VLM_HF_Transformer_PEFT_FSDP
+ - L2_VLM_HF_Transformer_PEFT_4bit
- L2_HF_Transformer_SFT_2gpu_nemorun
- L2_HF_Transformer_SFT_TE_Acceleration
+ - L2_HF_Transformer_PT
+ - L2_HF_Transformer_PT_nemorun
+ - L2_HF_Transformer_PT_2gpu
+ - L2_HF_Transformer_PT_2gpu_nemorun
+ - L2_HF_Transformer_PT_TE_Acceleration
- L2_NeMo_2_SSM_Pretraining
- L2_NeMo_2_SSM_Finetuning
- L2_NeMo_2_T5_Pretraining
@@ -4917,12 +5027,14 @@ jobs:
- L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
- L2_NEMO_2_LoRA_MERGE
+ - L2_NEMO_2_LoRA_Export
- L2_NEMO_2_LoRA_Inference
- L2_NeMo_2_Mixtral_Pretraining
- L2_PTQ_Llama2_FP8
- L2_Community_LLM_Checkpoints_tests_Llama3
- L2_Distill_Llama2
- L2_Prune_Width_Llama2
+ - L2_Prune_Depth_Llama2
- L2_Speech_to_Text_AED
- L2_Speech_Estimate_Duration_Bins
- L2_Speech_Batch_Size_OOMptimizer
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2ddad31e159e..c1cd763a0501 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -20,6 +20,10 @@ on:
description: Ref (SHA or branch name) to release
required: true
type: string
+ version-bump-branch:
+ description: Branch for version bump
+ required: true
+ type: string
dry-run:
description: Do not publish a wheel and GitHub release.
required: true
@@ -28,7 +32,7 @@ on:
jobs:
release:
- uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.17.3
+ uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.3
with:
release-ref: ${{ inputs.release-ref }}
image-name: nemo_container
@@ -41,6 +45,7 @@ jobs:
container-workdir: /workspace
library-name: Neural Modules
dry-run: ${{ inputs.dry-run }}
+ version-bump-branch: ${{ inputs.version-bump-branch }}
secrets:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7fd5cd00b352..9f4b4cd6b596 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,418 @@
# Changelog
+
+## NVIDIA Neural Modules 2.1.0
+
+### Highlights
+
+- Training
+ - Fault Tolerance
+ - Straggler Detection
+ - Auto Relaunch
+- LLM & MM
+ - MM models
+ - Llava-next
+ - Llama 3.2
+ - Sequence Model Parallel for NeVa
+ - Enable Energon
+ - SigLIP (NeMo 1.0 only)
+ - LLM 2.0 migration
+ - Starcoder2
+ - Gemma 2
+ - T5
+ - Baichuan
+ - BERT
+ - Mamba
+ - ChatGLM
+ - DoRA support
+- Export
+ - Nemo 2.0 base model export path for NIM
+ - PTQ in Nemo 2.0
+- ASR
+ - Timestamps with TDT decoder
+ - Timestamps option with .transcribe()
+
+### Detailed Changelogs:
+
+#### ASR
+
+Changelog
+
+- [Fix] Fixed sampler override and audio_key in prepare_audio_data by @anteju :: PR: #10980
+- Akoumparouli/mixtral recipe fix r2.0.0 by @akoumpa :: PR: #10994
+- TDT compute timestamps option and Extra Whitespace handling for SPE by @monica-sekoyan :: PR: #10875
+- ci: Switch to CPU only runner by @ko3n1g :: PR: #11035
+- Fix timestamps tests by @monica-sekoyan :: PR: #11053
+- ci: Pin release freeze by @ko3n1g :: PR: #11143
+- Fix RNN-T loss memory usage by @artbataev :: PR: #11144
+- Added deprecation notice by @Ssofja :: PR: #11133
+- Fixes for Canary adapters tutorial by @pzelasko :: PR: #11184
+- add ipython import guard by @nithinraok :: PR: #11191
+- Self Supervised Pre-Training tutorial Fix by @monica-sekoyan :: PR: #11206
+- update the return type by @nithinraok :: PR: #11210
+- Timestamps to transcribe by @nithinraok :: PR: #10950
+- [Doc fixes] update file names, installation instructions, bad links by @erastorgueva-nv :: PR: #11045
+- Beam search algorithm implementation for TDT models by @lilithgrigoryan :: PR: #10903
+- Update import 'pytorch_lightning' -> 'lightning.pytorch' by @maanug-nv :: PR: #11252
+- Remove pytorch-lightning by @maanug-nv :: PR: #11306
+- update hypothesis when passed through cfg by @nithinraok :: PR: #11366
+- Revert "update hypothesis when passed through cfg" by @pablo-garay :: PR: #11373
+- Fix transcribe speech by @nithinraok :: PR: #11379
+- Lhotse support for transcribe_speech_parallel by @nune-tadevosyan :: PR: #11249
+- Sortformer Diarizer 4spk v1 model PR Part 1: models, modules and dataloaders by @tango4j :: PR: #11282
+- Removing unnecessary lines by @nune-tadevosyan :: PR: #11408
+- Support for initializing lhotse shar dataloader via field: list[path] mapping by @pzelasko :: PR: #11460
+- New extended prompt format for Canary, short utterances inference fix, and training micro-optimizations by @pzelasko :: PR: #11058
+- Fixing Multi_Task_Adapters.ipynb by replacing canary2 with canary_custom by @weiqingw4ng :: PR: #11636
+
+
+
+#### TTS
+
+Changelog
+
+- [Doc fixes] update file names, installation instructions, bad links by @erastorgueva-nv :: PR: #11045
+- Add T5TTS by @blisc :: PR: #11193
+- Update import 'pytorch_lightning' -> 'lightning.pytorch' by @maanug-nv :: PR: #11252
+- Remove pytorch-lightning by @maanug-nv :: PR: #11306
+- Add nvidia/low-frame-rate-speech-codec-22khz model on docs by @Edresson :: PR: #11457
+
+
+
+#### NLP / NMT
+
+Changelog
+
+- Move collectiob.nlp imports inline for t5 by @marcromeyn :: PR: #10877
+- Use a context-manager when opening files by @akoumpa :: PR: #10895
+- Packed sequence bug fixes by @cuichenx :: PR: #10898
+- ckpt convert bug fixes by @dimapihtar :: PR: #10878
+- remove deprecated ci tests by @dimapihtar :: PR: #10922
+- Update T5 tokenizer (adding additional tokens to tokenizer config) by @huvunvidia :: PR: #10972
+- Add support and recipes for HF models via AutoModelForCausalLM by @akoumpa :: PR: #10962
+- gpt3 175b cli by @malay-nagda :: PR: #10985
+- Fix for crash with LoRA + tp_overlap_comm=false + sequence_parallel=true by @vysarge :: PR: #10920
+- Update `BaseMegatronSampler` for compatibility with PTL's `_BatchProgress` by @ashors1 :: PR: #11016
+- add deprecation note by @dimapihtar :: PR: #11024
+- Update ModelOpt Width Pruning example defaults by @kevalmorabia97 :: PR: #10902
+- switch to NeMo 2.0 recipes by @dimapihtar :: PR: #10948
+- NeMo 1.0: upcycle dense to moe by @akoumpa :: PR: #11002
+- Gemma2 in Nemo2 with Recipes by @suiyoubi :: PR: #11037
+- Add Packed Seq option to GPT based models by @suiyoubi :: PR: #11100
+- Fix MCoreGPTModel import in llm.gpt.model.base by @hemildesai :: PR: #11109
+- TP+MoE peft fix by @akoumpa :: PR: #11114
+- GPT recipes to use full te spec by @JimmyZhang12 :: PR: #11119
+- Virtual pipeline parallel support for LoRA in NLPAdapterModelMixin by @vysarge :: PR: #11128
+- update nemo args for mcore flash decode arg change by @HuiyingLi :: PR: #11138
+- Call `ckpt_to_weights_subdir` from `MegatronCheckpointIO` by @ashors1 :: PR: #10897
+- [Doc fixes] update file names, installation instructions, bad links by @erastorgueva-nv :: PR: #11045
+- fix(export): GPT models w/ bias=False convert properly by @terrykong :: PR: #11255
+- Use MegatronDataSampler in HfDatasetDataModule by @akoumpa :: PR: #11274
+- Add T5TTS by @blisc :: PR: #11193
+- ci: Exclude CPU machines from scan by @ko3n1g :: PR: #11300
+- Revert "fix(export): GPT models w/ bias=False convert properly" by @terrykong :: PR: #11301
+- remove redundant docs by @sharathts :: PR: #11302
+- Update import 'pytorch_lightning' -> 'lightning.pytorch' by @maanug-nv :: PR: #11252
+- Add `attention_bias` argument in transformer block and transformer layer modules, addressing change in MCore by @yaoyu-33 :: PR: #11289
+- Remove pytorch-lightning by @maanug-nv :: PR: #11306
+- Update T5 attention-mask shapes to be compatible with all attention-backend in new TE versions by @huvunvidia :: PR: #11059
+- Add support for restoring from 2.0 checkpoint in 1.0 by @hemildesai :: PR: #11347
+- Fix Gemma2 Attention Args by @suiyoubi :: PR: #11365
+- mlm conversion & tiktokenizer support by @dimapihtar :: PR: #11349
+- [Nemo1] Generate sharded optimizer state dicts only if needed for saving by @ananthsub :: PR: #11451
+- add hindi tn/itn coverage by @mgrafu :: PR: #11382
+- chore(beep boop 🤖): Bump `MCORE_TAG=67a50f2...` (2024-11-28) by @ko3n1g :: PR: #11427
+- Handle exception when importing RetroGPTChunkDatasets by @guyueh1 :: PR: #11415
+- Update restore from config for gpt type continual training in NeMo1 by @yaoyu-33 :: PR: #11471
+- ci: Re-enable `L2_Megatron_LM_To_NeMo_Conversion` by @ko3n1g :: PR: #11484
+- Apply packed sequence params change for fused rope compatibility by @ananthsub :: PR: #11506
+- Huvu/tiktoken tokenizer update by @huvunvidia :: PR: #11494
+
+
+
+#### Text Normalization / Inverse Text Normalization
+
+Changelog
+
+- Adding support for LightningDataModule inside Fabric-API by @marcromeyn :: PR: #10879
+- Add registry to register all needed classes with artifacts in nemo.lightning.io by @hemildesai :: PR: #10861
+- Update import 'pytorch_lightning' -> 'lightning.pytorch' by @maanug-nv :: PR: #11252
+- Remove pytorch-lightning by @maanug-nv :: PR: #11306
+- add hindi tn/itn coverage by @mgrafu :: PR: #11382
+
+
+
+#### Export
+
+Changelog
+
+- Update engine build step for TRT-LLM 0.13.0 by @janekl :: PR: #10880
+- Nemo 2.0 ckpt support in TRT-LLM export by @oyilmaz-nvidia :: PR: #10891
+- Fix TRTLLM parallel_embedding by @meatybobby :: PR: #10975
+- Export & deploy updates (part I) by @janekl :: PR: #10941
+- Add doc-strings to import & export + improve logging by @marcromeyn :: PR: #11078
+- NeMo-UX: fix nemo-ux export path by @akoumpa :: PR: #11081
+- Fix TRTLLM nemo2 activation parsing by @meatybobby :: PR: #11062
+- Support exporting Nemotron-340B for TensorRT-LLM by @jinyangyuan-nvidia :: PR: #11015
+- vLLM Hugging Face exporter by @oyilmaz-nvidia :: PR: #11124
+- Fix export of configuration parameters to Weights and Biases by @soluwalana :: PR: #10995
+- Change activation parsing in TRTLLM by @meatybobby :: PR: #11173
+- Remove builder_opt param from trtllm-build for TensorRT-LLM >= 0.14.0 by @janekl :: PR: #11259
+- fix(export): GPT models w/ bias=False convert properly by @terrykong :: PR: #11255
+- fix(export): update API for disabling device reassignment in TRTLLM for Aligner by @terrykong :: PR: #10863
+- Add openai-gelu in gated activation for TRTLLM export by @meatybobby :: PR: #11293
+- Revert "fix(export): GPT models w/ bias=False convert properly" by @terrykong :: PR: #11301
+- Adding alinger export by @shanmugamr1992 :: PR: #11269
+- Export & deploy updates (part II) by @janekl :: PR: #11344
+- Introducing TensorRT lazy export and caching option with trt_compile() by @borisfom :: PR: #11266
+- fix: export converts properly if no model_prefix by @terrykong :: PR: #11477
+
+
+
+#### Bugfixes
+
+Changelog
+
+- Change default ckpt name by @maanug-nv :: PR: #11277
+- Fix patching of NeMo tokenizers for correct Lambada evaluation by @janekl :: PR: #11326
+
+
+
+#### Uncategorized:
+
+Changelog
+
+- ci: Use Slack group by @ko3n1g :: PR: #10866
+- Bump `Dockerfile.ci` (2024-10-14) by @ko3n1g :: PR: #10871
+- Fix peft resume by @cuichenx :: PR: #10887
+- call __post_init__ after altering config values by @akoumpa :: PR: #10885
+- Late import prettytable by @maanug-nv :: PR: #10912
+- Bump `Dockerfile.ci` (2024-10-17) by @ko3n1g :: PR: #10919
+- Warning for missing FP8 checkpoint support for vLLM deployment by @janekl :: PR: #10906
+- Fix artifact saving by @hemildesai :: PR: #10914
+- Lora improvement by @cuichenx :: PR: #10918
+- Huvu/t5 nemo2.0 peft by @huvunvidia :: PR: #10916
+- perf recipes and Mcore DistOpt params by @malay-nagda :: PR: #10883
+- ci: Fix cherry pick team by @ko3n1g :: PR: #10945
+- Fix requirements for MacOS by @artbataev :: PR: #10930
+- Fix nemo 2.0 recipes by @BoxiangW :: PR: #10915
+- Akoumparouli/nemo ux fix dir or string artifact by @akoumpa :: PR: #10936
+- Fix typo in docstring by @ashors1 :: PR: #10955
+- [Nemo CICD] Remove deprecated tests by @pablo-garay :: PR: #10960
+- Restore NeMo 2.0 T5 pretraining CICD test by @huvunvidia :: PR: #10952
+- Convert perf plugin env vars to strings by @hemildesai :: PR: #10947
+- disable dynamo for ddp checker by @akoumpa :: PR: #10961
+- Bump `Dockerfile.ci` (2024-10-21) by @ko3n1g :: PR: #10965
+- respect warnings' filters by @akoumpa :: PR: #10953
+- Alit/mamba recipe by @JRD971000 :: PR: #10935
+- Long context performance doc hot fix by @youngeunkwon0405 :: PR: #10946
+- Performance mode by @malay-nagda :: PR: #10926
+- Bump `Dockerfile.ci` (2024-10-22) by @ko3n1g :: PR: #10979
+- Add more recipes by @cuichenx :: PR: #10957
+- ci: Update tests by @ko3n1g :: PR: #10987
+- Bump `Dockerfile.ci` (2024-10-23) by @ko3n1g :: PR: #11001
+- llm.generate fixes by @HuiyingLi :: PR: #10983
+- use __dict__ in check by @akoumpa :: PR: #11012
+- LoRA support for HF::AutoModelForCausalLM by @akoumpa :: PR: #10982
+- Change default for always_save_context to True by @athitten :: PR: #11014
+- Fix pip install by @marcromeyn :: PR: #11026
+- Change dist ckpt defaults by @ShriyaPalsamudram :: PR: #10913
+- Fix _strategy_lib tests by @maanug-nv :: PR: #11033
+- Basic online dynamic FP8 quantization with vLLM by @janekl :: PR: #10904
+- Expose packed seq in finetuning recipes by @cuichenx :: PR: #11006
+- PEFT Inference by @cuichenx :: PR: #11030
+- added Lhotse online augmentation tutorial for SE by @nasretdinovr :: PR: #10944
+- Bump `Dockerfile.ci` (2024-10-27) by @ko3n1g :: PR: #11051
+- ci: Send team alerts on specific keywords by @ko3n1g :: PR: #10986
+- Qwen2 Recipe by @suiyoubi :: PR: #10974
+- Bump `Dockerfile.ci` (2024-10-28) by @ko3n1g :: PR: #11054
+- Generalizing Inference pipeline in NeMo 2.0 to support encoder-decoder models by @huvunvidia :: PR: #10924
+- [Bug fix] In energon MultiModalSampleConfig use default_factory in dataclass by @guyueh1 :: PR: #11041
+- fix: Resolve mutable default issue in MultiModalSampleConfig dataclass by @michal2409 :: PR: #11061
+- SC1/SC2 Recipe by @suiyoubi :: PR: #10971
+- Wrap batch_sampler with _IndexBatchSamplerWrapper by @farhadrgh :: PR: #10934
+- Performance fine-tuning recipes for llama3 8b + 70b by @vysarge :: PR: #11046
+- Set TE spec name for NeMo to HF checkpoint converters by @kevalmorabia97 :: PR: #11036
+- ci: Re-add secrets detector by @ko3n1g :: PR: #11038
+- Adding nemo-run recipes for NeMo 2.0 T5 by @huvunvidia :: PR: #10964
+- Minor fixes for NeMo 2.0 PTQ by @Laplasjan107 :: PR: #11079
+- Add copyright check by @pablo-garay :: PR: #11048
+- Fix finalize model grad for PEFT by @cuichenx :: PR: #11065
+- ci: Less verbose infra alerts by @ko3n1g :: PR: #11080
+- Add copyright notice by @pablo-garay :: PR: #11085
+- ci: Fix cron schedule by @ko3n1g :: PR: #11076
+- ci: Use code-freeze via Nemo-FW-Templates by @ko3n1g :: PR: #11073
+- Akoumparouli/hf lit module peft ckpt bugfix by @akoumpa :: PR: #11022
+- PEFT perf and TE spec fixes by @JimmyZhang12 :: PR: #11070
+- Bump `Dockerfile.ci` (2024-10-30) by @ko3n1g :: PR: #11092
+- NeMorun for NeMo 2.0 T5 finetuning by @huvunvidia :: PR: #11040
+- fix model_checkpoint.py by @ethanhe42 :: PR: #11057
+- Update PTQ tests and ModelOpt version by @janekl :: PR: #11095
+- Fix datasets in CLI by @marcromeyn :: PR: #11097
+- Fix yaml serialization in io mixin by @hemildesai :: PR: #11106
+- disable overlap_param_gather_with_optimizer_step by @JimmyZhang12 :: PR: #11102
+- nemo1 to nemo2 checkpoint convert by @HuiyingLi :: PR: #10937
+- fix expert regex filter by @akoumpa :: PR: #11103
+- Remove stale checkpoint deletion on checkpoint saving failure by @akoumpa :: PR: #11116
+- NeMo-UX: Mistral/mixtral peft ci test by @akoumpa :: PR: #11094
+- Make nemo.collections.llm PreTrainingDataModule num samples configurable by @hemildesai :: PR: #11088
+- Fix packed seq path by @cuichenx :: PR: #11121
+- Allow arguments passed to dataset class + Gemma recipe fix by @cuichenx :: PR: #11125
+- Nemotron Recipe by @suiyoubi :: PR: #11118
+- NeMo-UX: HF PeFT fix by @akoumpa :: PR: #11096
+- Remove deprecated tests by @pablo-garay :: PR: #11134
+- Recipe Fix for NeMo CI by @suiyoubi :: PR: #11127
+- Fix freeze_model call in peft by @cuichenx :: PR: #11146
+- Bump `Dockerfile.ci` (2024-11-05) by @ko3n1g :: PR: #11159
+- NeMo-UX: Add sgd optim by @akoumpa :: PR: #11157
+- Update copyright check by @pablo-garay :: PR: #11168
+- add lora recipt for 405b by @JRD971000 :: PR: #10991
+- dit training diagrams by @zpx01 :: PR: #10873
+- ci: Switch to FW templates for build by @ko3n1g :: PR: #11077
+- Bump `Dockerfile.ci` (2024-11-06) by @ko3n1g :: PR: #11174
+- feat: Run PyLint by @ko3n1g :: PR: #11147
+- Add Alpaca Finetune Datamodule by @suiyoubi :: PR: #11185
+- Updated Diffusion Collection README by @zpx01 :: PR: #11179
+- Add support for Cosmos Tokenizers by @jojennin :: PR: #11194
+- Run formatting only if files changed. Echo message if pylint fails. by @artbataev :: PR: #11188
+- Bump `Dockerfile.ci` (2024-11-07) by @ko3n1g :: PR: #11196
+- Fix rotary_percentage parsing in nemo2 config by @meatybobby :: PR: #11197
+- ci: Update cherry pick workflow by @ko3n1g :: PR: #11202
+- ci: Build, test, publish a wheel by @ko3n1g :: PR: #11183
+- Bump `Dockerfile.ci` (2024-11-08) by @ko3n1g :: PR: #11222
+- update default pipeline_parallelism_type by @akoumpa :: PR: #11213
+- check actual value of vocab_file by @akoumpa :: PR: #11228
+- Fix VP Initialization Issue with Latest MCore by @suiyoubi :: PR: #11209
+- ci: Run Pylint strictly on new files, softly on history by @ko3n1g :: PR: #11212
+- ci: Add release workflow by @ko3n1g :: PR: #11180
+- Fix llm.generate by @hemildesai :: PR: #11217
+- Bump `Dockerfile.ci` (2024-11-11) by @ko3n1g :: PR: #11247
+- Bump `Dockerfile.ci` (2024-11-12) by @ko3n1g :: PR: #11254
+- Handling tokenizer in PTQ for Nemo 2.0 by @janekl :: PR: #11237
+- Fix finetuning datamodule resume by @cuichenx :: PR: #11187
+- ci: Move `bump mcore` to templates by @ko3n1g :: PR: #11229
+- ci: Fix secrets detector by @ko3n1g :: PR: #11205
+- chore(beep boop 🤖): Bump `MCORE_TAG=aded519...` (2024-11-12) by @ko3n1g :: PR: #11260
+- ci: Run secrets detector on `pull_request_target` by @ko3n1g :: PR: #11263
+- Advanced Diffusion Training Features by @zpx01 :: PR: #11246
+- Update pruning and distillation tutorial notebooks by @gvenkatakris :: PR: #11091
+- update nemo1->2 conversion according to changes in main by @HuiyingLi :: PR: #11253
+- Add llama 3.1 recipes by @cuichenx :: PR: #11273
+- Fix Finetune Recipe by @suiyoubi :: PR: #11267
+- Configure no restart validation loop in nl.Trainer by @hemildesai :: PR: #11029
+- Handle _io_unflatten_object when _thread_local.output_dir is not available by @hemildesai :: PR: #11199
+- Remove opencc upperbound by @thomasdhc :: PR: #10909
+- Fix head_size in NeMo to HF checkpoint converters for width pruned model support by @eagle705 :: PR: #11230
+- Fixes per comments by @gvenkatakris :: PR: #11280
+- Create phi3mini.py by @mayani-nv :: PR: #11281
+- ci: Fix release workflow by @ko3n1g :: PR: #11286
+- fix perf plugin CUDA_DEVICE_MAX_CONNECTIONS setting by @JimmyZhang12 :: PR: #11299
+- PTQ via NeMo-Run CLI by @janekl :: PR: #10984
+- PTQ memory optimization by @Laplasjan107 :: PR: #11257
+- Update README.md for collection page by @yaoyu-33 :: PR: #11223
+- Adding multimodal examples by @shanmugamr1992 :: PR: #11279
+- Add HF untrusted code toggle by @akoumpa :: PR: #11313
+- P2p chunk size setting in nemo 2.0 by @erhoo82 :: PR: #11312
+- Nemo2 batcheval by @HuiyingLi :: PR: #11158
+- DoRA by @cuichenx :: PR: #11104
+- Profiling - support Chakra & Kineto trace dumping by @lilyw97 :: PR: #11115
+- NeMo 2.0 SFT PEFT notebooks by @HuiyingLi :: PR: #10874
+- Update symlink option for save_last in ModelCheckpoint by @paul-gibbons :: PR: #11319
+- ci: Pass-through of `workflow_event` by @ko3n1g :: PR: #11322
+- Add StragglerDetection and auto-relaunch to NeMo2.0 by @ShriyaPalsamudram :: PR: #11328
+- Huvu/t5 nemo2.0 nemoci by @huvunvidia :: PR: #11291
+- TE acceleration using callbacks by @oyilmaz-nvidia :: PR: #11261
+- Leave target_module as default in PEFT Recipes by @cuichenx :: PR: #11334
+- More robust tar file loading from AIStore by @pzelasko :: PR: #11323
+- Fix CLIP transformer layer api by @yaoyu-33 :: PR: #11337
+- pass trust_remote_code to AutoTokenizer by @akoumpa :: PR: #11343
+- Fix linear layer replacement by @oyilmaz-nvidia :: PR: #11356
+- fix typo by @JRD971000 :: PR: #11351
+- Add torchrun local executor to recipes by @marcromeyn :: PR: #11342
+- Add PP support in NeVA along with few bug fixes by @yaoyu-33 :: PR: #11170
+- nemo2 peft merge by @HuiyingLi :: PR: #11017
+- Add dora recipes by @cuichenx :: PR: #11330
+- add fix to recipe by @JRD971000 :: PR: #11368
+- Add missing test to CICD needed list by @pablo-garay :: PR: #11376
+- update SquadDataModule to use run.config by @huvunvidia :: PR: #11358
+- Add llama 3.2 1b and 3b by @cuichenx :: PR: #11335
+- calculate metrics for nemo2 sftpeft notebook by @HuiyingLi :: PR: #11381
+- Enable packed dataset for validation; add a2a_experimental argument by @michal2409 :: PR: #11378
+- Fix DDP unused param error when TE is enabled in NeMo Lite by @oyilmaz-nvidia :: PR: #11364
+- Update llama32 vision (mllama) use attention bias by @yaoyu-33 :: PR: #11316
+- Fix environment variables in torchrun executor by @hemildesai :: PR: #11363
+- Add sample generate to PTQ for NeMo 2.0 by @Laplasjan107 :: PR: #11339
+- Fix selective restore by explicitly verifying keys by @hemildesai :: PR: #11377
+- Minor fix by @gvenkatakris :: PR: #11353
+- Add a fix for single-GPU nsys. by @tfogal :: PR: #11354
+- capitalize HF as HF instead of Hf by @akoumpa :: PR: #11384
+- ci: Add HF cache by @ko3n1g :: PR: #11398
+- Remove logic to skip checkpoint save if checkpoint exists by @ashors1 :: PR: #11362
+- Rewire tokenizer exception handling in model resume by @cuichenx :: PR: #11375
+- Adding LLava-Next model class by @yashaswikarnati :: PR: #11399
+- Fix vllm test issue when run_accuracy is enabled by @oyilmaz-nvidia :: PR: #11413
+- data modules for llava_next by @yashaswikarnati :: PR: #11400
+- Fix strategies saving unsharded optimizer states by @ananthsub :: PR: #11392
+- Adjust CLI support for PTQ by @janekl :: PR: #11421
+- Nemo run recipe's and example scripts for Llava Next by @yashaswikarnati :: PR: #11405
+- Huvu/t5 nemo2.0 nemoci 3b11b by @huvunvidia :: PR: #11388
+- ci: Allow dry-run of release by @ko3n1g :: PR: #11418
+- fix dtype when init HF model from config by @akoumpa :: PR: #11420
+- Handle import errors in virtual environment when running vLLM tests by @janekl :: PR: #11435
+- Fix loss mask when answer_only_loss=True by @ashors1 :: PR: #11444
+- [audio] Keep input directory structure when saving processed files by @anteju :: PR: #11403
+- Add different recipe examples to NeMo 2.0 by @BoxiangW :: PR: #11317
+- [Scripts] Remove fixed seed for adding noise by @anteju :: PR: #11401
+- Add option to provide prior NeMo 2 ckpt path to convert_nemo1_to_nemo… by @hemildesai :: PR: #11452
+- PTQ CLI and param updates by @janekl :: PR: #11459
+- Add tests for resiliency feature integration by @maanug-nv :: PR: #11406
+- ci: Disable HexHighEntropyString plugin by @ko3n1g :: PR: #11470
+- Fix broken links by @shashank3959 :: PR: #11294
+- Nemo 2.0 canonical lora by @cuichenx :: PR: #11416
+- ci: Run secrets detector on merge-commit by @ko3n1g :: PR: #11479
+- Formatting (minor) by @pablo-garay :: PR: #11485
+- Fix bug related to naming by @pablo-garay :: PR: #11487
+- Add BERT Model To NeMo2.0 by @suiyoubi :: PR: #11333
+- Update Nemo Distributed Checkpoint User Guide by @FortunaZhang :: PR: #11489
+- fix: regular torch optims (e.g., sgd) no longer error with closure spec by @terrykong :: PR: #11189
+- Add recipe configs validating by @BoxiangW :: PR: #10954
+- Fix finetuning PP by @cuichenx :: PR: #11474
+- [docs] Documentation for audio collection by @anteju :: PR: #11426
+- config hierarchy by @malay-nagda :: PR: #11145
+- Force param sync when using distributed optimizer and overlap_param_gather by @hemildesai :: PR: #11486
+- chore(beep boop 🤖): Bump `MCORE_TAG=bd677bf...` (2024-12-06) by @ko3n1g :: PR: #11492
+- Remove default mutable arguments from AbstractEmbModel constructor by @ananthsub :: PR: #11348
+- minor fix for nemo2 sftpeft readme by @HuiyingLi :: PR: #11502
+- Update Llama3 Fine-Tuning Notebook by @roclark :: PR: #11522
+- Fix CI issue on validation config by @BoxiangW :: PR: #11521
+- Freeze tags in in `r2.1.0` by @github-actions[bot] :: PR: #11556
+- Cherrypick all + R2.1.0 fix cicd by @pablo-garay :: PR: #11622
+- Cherry pick `Add fix docstring for speech commands (11638)` into `r2.1.0` by @ko3n1g :: PR: #11639
+- Cherrypick #11628 to r2.1.0 by @nasretdinovr :: PR: #11630
+- Update package_info.py by @ko3n1g :: PR: #11646
+- Cherry pick `Add fix docstring for VAD (11659)` into `r2.1.0` by @ko3n1g :: PR: #11660
+- Fix tokenizer trust_remote_code by @cuichenx :: PR: #11657
+- Cherrypick 11568 by @cuichenx :: PR: #11656
+- Cherry pick `Downgrading the 'datasets' package from 3.0.0 to 2.21.0 for Multilang_ASR.ipynb and ASR_CTC_Language_Finetuning.ipynb (11675)` into `r2.1.0` by @ko3n1g :: PR: #11677
+- r2.1.0 cherrypick by @pablo-garay :: PR: #11680
+- Cherry pick `Rename multimodal data module - EnergonMultiModalDataModule (11654)` into `r2.1.0` by @ko3n1g :: PR: #11685
+- chore: Bump to `r2.1.0rc2` by @ko3n1g :: PR: #11693
+- r2.1.0 ptl fix by @pablo-garay :: PR: #11694
+
+
+
+## NVIDIA Neural Modules 2.1.0rc2
+
+Prerelease: NVIDIA Neural Modules 2.1.0rc2 (2024-12-21)
+
+## NVIDIA Neural Modules 2.1.0rc1
+
+Prerelease: NVIDIA Neural Modules 2.1.0rc1 (2024-12-20)
+
+## NVIDIA Neural Modules 2.1.0rc0
+
+Prerelease: NVIDIA Neural Modules 2.1.0rc0 (2024-12-12)
+
## NVIDIA Neural Modules 2.0.0rc1
### Highlights
diff --git a/Dockerfile.ci b/Dockerfile.ci
index 1d4173f9689c..e93d00d03195 100644
--- a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -53,7 +53,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
-ARG MODELOPT_VERSION=0.19.0
+ARG MODELOPT_VERSION=0.21.0
ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
diff --git a/docs/source/checkpoints/dev_guide.rst b/docs/source/checkpoints/dev_guide.rst
deleted file mode 100644
index 601e69749b64..000000000000
--- a/docs/source/checkpoints/dev_guide.rst
+++ /dev/null
@@ -1,234 +0,0 @@
-Community Model Converter Development Guide
-===========================================
-
-Guideline Steps for Checkpoint Conversion
------------------------------------------
-
-1. **Understand Both Frameworks**: Familiarize yourself with the architectures and naming conventions of both HuggingFace and NeMo models.
-
-2. **Load Community Checkpoint**: For example, use HuggingFace’s ``AutoModel`` to load the pre-trained model.
-
-3. **Inspect Model and Config**: Understand the layer names, parameter shapes, and essential configs.
-
-4. **Adjust NeMo Model Configuration**: Modify the NeMo model configuration to match the HuggingFace model’s specifications.
-
-5. **Initialize NeMo Model**: Create an instance of the corresponding NeMo model.
-
-6. **Create Key Mapping**: Define a function to map HuggingFace layer names to NeMo layer names. Adjust for any structural differences.
-
-7. **Rename and Reshape Parameters**: Implement a function to rename keys in the HuggingFace state dictionary and reshape tensors if necessary. For example, QKV weights usually need some special handling from HF to NeMo.
-
-8. **Load Converted Weights into NeMo Model**: Apply the transformed state dictionary to the NeMo model.
-
-9. **Save NeMo Checkpoint**: Save the updated NeMo model as a new checkpoint.
-
-10. **Verification**: Verify the performance of the NeMo model to ensure successful conversion.
-
-11. **Add Docstrings and Comments**: Please kindly comment the expected shapes in the parameter reshaping part.
-
-12. **Add Jenkins Tests**: Please use `Llama Huggingface to NeMo converter test `_ as an example for development.
-
-Script Placement and Naming Conventions
----------------------------------------
-
-- **Script Location**: Place scripts in the ``NeMo/scripts/checkpoint_converters`` directory.
-
-- **Script Naming**: Name your script following the format ``convert_{model}_{source}_to_{target}.py``, such as ``convert_llama_hf_to_nemo.py``.
-
-- **Unified Arguments (APIs)**: User only needs to define input and output files. Configs should be automatically updated.
-
- - ``--input_name_or_path``: Specify the name or path of the model. Give one example default value.
-
- - ``--output_path``: Set the path for saving the output .nemo file. This argument is required.
-
- - ``--hparams_file``: Define the path for the configuration file needed for restoration. Set default path to an existing and working yaml file e.g. ``f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_bert_config.yaml"``. A regular user should not change it, but for advanced/internal users, this can be modified.
-
- - ``--precision``: Choose the precision for saved checkpoint weights. Options: "bf16", "16", "32". Default: "32".
-
-Code Template
--------------
-
-Below template tries to address the 11 steps in the guideline part. Please also use `Gemma Huggingface to NeMo converter `__ as an full example for development.
-
-.. code-block:: python
-
- import os
- import torch
- from omegaconf import OmegaConf
- from transformers import AutoTokenizer, AutoModel
- from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel
- from nemo.utils import logging
- from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder
-
- # Add additional imports and custom functions as required
-
- def create_rename_keys(num_hidden_layers):
- # Your implementation of create_rename_keys function
- ...
-
- def adjust_tensor_shapes(model, nemo_state_dict):
- # Your implementation of adjust_tensor_shapes function
- ...
-
- def adjust_nemo_config(model_config, ref_config):
- # Your implementation of adjust_nemo_config function
- ...
-
- def rename_model_keys(model_state_dict, rename_keys):
- """
- Rename keys in the model's state dictionary based on the provided mappings.
-
- Parameters:
- model_state_dict (dict): The state dictionary of the model.
- rename_keys (list): A list of tuples with the mapping (old_key, new_key).
-
- Returns:
- dict: A new state dictionary with updated key names.
- """
-
- # Create a new state dictionary with updated key names
- new_state_dict = {}
-
- # Track keys from the original state dict to ensure all are processed
- remaining_keys = set(model_state_dict.keys())
-
- # Iterate over the rename mappings
- for old_key, new_key in rename_keys:
- if old_key in model_state_dict:
- # Rename the key and remove it from the tracking set
- new_state_dict[new_key] = model_state_dict[old_key]
- remaining_keys.remove(old_key)
- else:
- print(f"Warning: Key '{old_key}' not found in the model state dictionary.")
-
- # Check if any keys were not converted from old to new
- for old_key in remaining_keys:
- print(f"Warning: Key '{old_key}' was not converted.")
-
- def get_args():
- # Arg names subject to change, feel free to suggest.
- parser = ArgumentParser()
- parser.add_argument("--input_name_or_path", type=str, default="intfloat/e5-large-unsupervised")
- parser.add_argument(
- "--hparams_file",
- type=str,
- default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_bert_config.yaml",
- required=False,
- help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
- )
- parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.")
- parser.add_argument(
- "--precision", type=str, default="32", choices=["bf16", "32"], help="Precision for checkpoint weights saved"
- )
-
- args = parser.parse_args()
- return args
-
- def convert(args):
- logging.info(f"Loading checkpoint from HF: `{args.name_or_path}`")
- hf_model = AutoModel.from_pretrained(args.name_or_path)
-
- nemo_config = OmegaConf.load(args.hparams_file)
- nemo_config.model = adjust_nemo_config(nemo_config.model, hf_model.config.to_dict())
-
- nemo_config.trainer["precision"] = args.precision
- trainer = MegatronTrainerBuilder(nemo_config).create_trainer()
- model = MegatronBertModel(nemo_config.model, trainer)
-
- old_state_dict = hf_model.state_dict()
- rename_keys = create_rename_keys(nemo_config.model.num_layers)
- new_state_dict = rename_model_keys(model_state_dict=old_state_dict, rename_keys=rename_keys)
- nemo_state_dict = adjust_tensor_shapes(model, new_state_dict)
- model.load_state_dict(nemo_state_dict, strict=True)
-
- # Additional verification and processing steps
- ...
-
- model.save_to(args.save_path)
- logging.info(f'NeMo model saved to: {args.save_path}')
-
- if __name__ == '__main__':
- args = get_args()
- convert(args)
-
-
-
-*Notes:* This template abstracts some functions (create_rename_keys, adjust_tensor_shapes, adjust_nemo_config) which are crucial for the conversion process. These functions need to be adapted based on specific model architectures and requirements. Ensure that the NeMo model’s configuration is properly aligned with the HuggingFace model’s configuration. It is important to thoroughly test the converted model to validate the conversion process.
-
-
-Development Tips
-----------------
-
-A Simple Guide for Model Mapping and Conversion
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-1. **Mapping between community model and NeMo model**:
-
- - Match the configurations between the community model and the NeMo model.
- - Create two text files, ``state_src.txt`` and ``state_tgt.txt``, containing the state dict weights and their shapes for easier reference and debugging.
-
- Example code to generate ``state_src.txt``:
-
- .. code-block:: python
-
- file_path = "state_src.txt"
- state = model.state_dict()
- with open(file_path, 'w') as file:
- for k, v in state.items():
- file.write(f"{k} {v.shape}\n")
-
- - Utilize language models (LMs) to assist in completing the key mapping through the ``create_rename_keys`` function. Here's an example prompt for Gemma:
-
- .. code-block:: text
-
- Map the following key names and tensor shapes from Model A to their equivalents in Model B. Here is an example mapping: Model A's 'model.layer.weight' corresponds to Model B's 'module.block.weight'.
- ============================================================
- embedder.weight torch.Size([256128, 2048])
- ...
- ============================================================
-
- Based on the results, update the following code accordingly:
-
- .. code-block:: python
-
- def create_rename_keys(num_hidden_layers):
- rename_keys = []
- for i in range(num_hidden_layers):
- # encoder layers: output projection, 2 feedforward neural networks, and 2 layernorms
- # @chatgpt to fill in layer-dependent keys above
-
- # @chatgpt fill in non-layer-dependent keys above
- rename_keys.extend(
- [
- # ...
- ]
- )
-
- return rename_keys
-
- **Note**: Also list all the keys not included in the conversion above.
-
-2. **Common issues when converting: results not matching between Community model and NeMo model**:
-
- a. Megatron Core uses a special QKV layout, which needs careful handling and reshaping from community models, especially when GQA or MQA is used. Refer to the `Gemma Huggingface to NeMo converter `__ for guidance.
-
- b. GLU Variants weights could also be a common source of error. In Megatron Core, the regular feedforward projection weights and gated forward weights are fused together, requiring careful attention to the order of these two. Refer to the `Gemma Huggingface to NeMo converter `_ for more details.
-
-3. The ``create_hf_model`` function can be used to create a model programmatically. For reproducibility, see the example provided at `GitHub `_. This function creates a randomly initialized HuggingFace model for testing purposes. The model can be specified by name or path for creating its config and tokenizer using HuggingFace transformers AutoConfig and AutoTokenizer functions.
-
-Example usage:
-
-.. code-block:: python
-
- create_hf_model(
- model_name_or_path="/home/TestData/nlp/meta-llama/Llama-2-7b-hf",
- output_dir=os.path.join(args.save_dir, "megatron_llama/llama-ci-hf"),
- config_updates={
- "hidden_size": 256,
- "num_attention_heads": 4,
- "num_hidden_layers": 2,
- "num_key_value_heads": 4
- },
- overwrite=args.overwrite,
- )
-
diff --git a/docs/source/checkpoints/user_guide.rst b/docs/source/checkpoints/user_guide.rst
deleted file mode 100644
index 451679a7e3ae..000000000000
--- a/docs/source/checkpoints/user_guide.rst
+++ /dev/null
@@ -1,94 +0,0 @@
-Community Model Converter User Guide
-====================================
-
-This guide provides instructions on how to use the conversion scripts to convert models between Community model and NVIDIA's NeMo format.
-
-Support Matrix
---------------
-
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Conversion | From | To | Github Link |
-+======================+==================+=====================+=====================================================================================================================+
-| Baichuan | Hugging Face | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Baichuan | NeMo | Hugging Face | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| BERT | Hugging Face | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| BERT | NeMo | Hugging Face | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Falcon | Hugging Face | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Falcon | NeMo | Hugging Face | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Gemma | Hugging Face | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Gemma | JAX | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Gemma | PyTorch | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| GPT/LLaMA | NeMo (Legacy) | NeMo (Megatron-Core)| `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| LLaMA | Hugging Face | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| LLaMA | NeMo | Hugging Face | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Mistral 7B | Hugging Face | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Mistral 7B | NeMo | Hugging Face | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Mixtral | Hugging Face | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Mixtral | NeMo | Hugging Face | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| MPT | Hugging Face | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-| Starcoder | Hugging Face | NeMo | `Link `__ |
-+----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
-
-
-Convert Hugging Face LLaMA Checkpoints to NeMo
-----------------------------------------------
-
-To convert a Hugging Face LLaMA checkpoint into a NeMo checkpoint, use the following command:
-
-.. code-block:: bash
-
- python convert_llama_hf_to_nemo.py \
- --input_name_or_path \
- --output_path
-
-Convert NeMo Checkpoint to Hugging Face LLaMA
----------------------------------------------
-
-To convert a NeMo checkpoint into a Hugging Face LLaMA checkpoint, you have two options:
-
-1. Generate only the Hugging Face weights:
-
-.. code-block:: bash
-
- python convert__nemo_to_hf.py \
- --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \
- --output_path /path/to/pytorch_model.bin
-
-2. Generate the full Hugging Face model folder:
-
-.. code-block:: bash
-
- python convert__nemo_to_hf.py \
- --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \
- --output_path /path/to/model_folder \
- --hf_input_path /path/to/input_hf_folder \
- --hf_output_path /path/to/output_hf_folder
-
-Replace `` with the specific model you are converting.
-
-Use the ``--cpu-only`` flag if the model cannot fit in the GPU, such as for Llama2 70b models. Note that using this option will significantly slow down the conversion process.
-
-Command-Line Arguments
-----------------------
-
-- ``--input_name_or_path``: Path to the input .nemo file or the Hugging Face model folder.
-- ``--output_path``: Path to the output file or folder, depending on the conversion direction.
-- ``--hf_input_path``: (Optional) Path to the input Hugging Face model folder.
-- ``--hf_output_path``: (Optional) Path to the output Hugging Face model folder.
diff --git a/docs/source/features/optimizations/index.rst b/docs/source/features/optimizations/index.rst
index 60f4428f9299..c9492967b8a0 100644
--- a/docs/source/features/optimizations/index.rst
+++ b/docs/source/features/optimizations/index.rst
@@ -5,7 +5,6 @@ Optimizations
:maxdepth: 1
./attention_optimizations
- ./sequence_packing
./activation_recomputation
./communication_overlap
./cpu_offloading
diff --git a/docs/source/features/optimizations/sequence_packing.rst b/docs/source/features/optimizations/sequence_packing.rst
deleted file mode 100644
index 40c04ce65350..000000000000
--- a/docs/source/features/optimizations/sequence_packing.rst
+++ /dev/null
@@ -1,136 +0,0 @@
-Sequence Packing
-================
-
-This section explains how to use the sequence packing training technique with Supervised Fine-Tuning (SFT) and Parameter-Efficient Fine-Tuning (PEFT).
-
-Sequence Packing for SFT/PEFT
------------------------------
-
-Overview
-########
-
-When fine-tuning a large language model, whether using SFT or PEFT methods, GPU underutilization often occurs due to an inefficient data pipeline. This inefficiency arises because most fine-tuning datasets have a skewed distribution of sequence lengths, with many short sequences and a few long ones, following Zipf’s Law. Since transformer models require fixed-length inputs, shorter sequences must be padded with unused tokens, leading to two main inefficiencies:
-
-- Computation performed on the pad values is eventually ignored for model output, resulting in wasted FLOPs.
-- Micro batch size is often limited by the batch which contains longer sequences, so that most other micro batches have
- underutilized GPU memory.
-
-Sequence packing is a training technique where multiple training sequences (examples) are concatenated into one long sequence (pack). This method eliminates the need for padding, allowing more tokens to be processed in each micro batch. As a result, it maximizes both GPU compute and GPU memory utilization.
-
-While sequences for pretraining can be concatenated naively, this is not the case for SFT and instruction fine-tuning
-where each input sequence should be treated individually. The conventional solution is to build an extended attention
-mask to mark the sequence id each token belongs to, and mask out attention values between sequences. However, this
-increases the complexity of attention from :math:`\sum_i {s_i}^2` to :math:`\Big({\sum_i {s_i}}\Big)^2`, where :math:`s_i` is the
-length of the ith subsequence. In practice, the conventional solution puts a limit on the length of packing.
-Instead, NeMo provides a highly optimized version of sequence packing which makes use of variable-length attention
-kernels in FlashAttention and TransformerEngine. With this approach, attention values between sequences are never calculated,
-so the complexity of attention remains at :math:`\sum_i {s_i}^2`. This allows packing sequences to arbitrary lengths so
-that GPU memory can be fully utilized.
-
-All things considered, NeMo’s implementation of sequence packing provides [#f1]_
-
-- Up to 10X performance improvement in terms of FLOPs
-- Up to 6X performance improvement in terms of training time
-- No impact on model convergence
-
-
-
-How to run SFT/PEFT with packed sequence
-########################################
-
-Prepare Dataset
-^^^^^^^^^^^^^^^
-
-We provide a convenient script to pack your SFT or PEFT dataset.
-This script assumes that you already have a prepared dataset file for SFT/PEFT training in NeMo. If you do not, please
-follow `this `_ to
-download and prepare the Dolly dataset as an example.
-You will get a file named training.jsonl. The rest of this tutorial also assumes you already have a recipe for
-training with the unpacked dataset.
-
-Two main steps are run in this script:
-
-1. The online processing code in GPTSFTDataset is run. This includes tasks such as prompt template manipulation, sequence length truncation, and tokenization. The result is an array of tokenized sequences, represented by indices.
-2. The tokenized sequences are grouped by length and a packing algorithm is run.
-
-You can read more about packing algorithms `here `_.
-Currently, two variants of ``first_fit`` are supported.
-- ``first_fit_decreasing`` sorts the sequences in decreasing order before applying the first-fit algorithm. It generates a
-more optimal packing, but it tends to keep all short sequences together, which may have an impact for convergence.
-- ``first_fit_shuffle`` runs first-fit in a random order. Packing is less optimal but it keeps the dataset order random.
-The recommendation is to run ``first_fit_shuffle`` and check the packed sequence lengths. If they are similar to the
-target length (i.e. efficient packing), then use shuffle. Otherwise try ``first_fit_decreasing``.
-
- .. code-block:: bash
-
- python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \
- model.data.train_ds.file_names=[/path/to/training.jsonl] \
- model.data.train_ds.max_seq_length=2048 \
- +tokenizer_path=/path/to/tokenizer.model \
- +output_dir=/path/to/output_folder \
- +pack_sizes=[2048,4096,8192] \
- [ +packing_algorithm=first_fit_shuffle \ ]
- [ +seed=0 ]
-
-.. note::
-
- 1. If your model or dataset requires non-default configs for conventional SFT/PEFT training in NeMo, you will need to pass in the same configs to ``model.data.train_ds`` as you would for training with an unpacked dataset.
-
- 2. ``model.data.train_ds.max_seq_length`` is the length to which each sequence is truncated before packing multiple sequences to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data and can be determined by examining the distribution of sequence lengths in the dataset.
-
- 3. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for each pack size. The output files are named ``/packed_{pack_size}_seed{seed}.npy``.
- This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length
- can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size in
- the unpacked case.
-
-
-Adjust Training Config
-^^^^^^^^^^^^^^^^^^^^^^
-
-To train with packed sequences, you need to change four items in the SFT/PEFT config file.
-
-1. Turn on the packed_sequence flag:
-
- .. code-block:: bash
-
- ++model.data.train_ds.packed_sequence=True
-
-2. Use the new dataset file instead of the original jsonl file:
-
- .. code-block:: bash
-
- model.data.train_ds.file_names=output_folder/packed_{pack_size}_seed{seed}.npy
-
-3. Specify the packed sequence length. This should be one of the ``pack_sizes`` you specified during data preparation.
-
- .. code-block:: bash
-
- model.data.train_ds.max_seq_length={pack_size}
-
-4. Adjust the batch sizes.
-
- - Micro batch size has to be set to 1 as a nominal constraint. This is because batches are now concatenated in the
- preprocessing step. You can increase the ``pack_size`` to achieve the same purpose of increasing micro batch size.
- - Global batch size has to be adjusted so that the training recipe is maintained. Because each pack contains
- multiple sequences now, global batch size needs to be reduced by the average number of sequences per pack ``n``,
- where ``n = num_sequences_in_dataset / num_packs``. This ensures that each gradient iteration sees (on
- average) the same number of tokens. The value of ``n`` is printed out when the script is run.
-
- .. code-block:: bash
-
- model.micro_batch_size=1
- model.global_batch_size=
-
-Now, you are all set to fine-tune your model with a much improved throughput!
-
-Sequence Packing for NeVA
--------------------------
-
-Sequence packing with NeVA for multimodal large language models differs from the LLM SFT/PEFT approach. For details, please refer to the documentation below.
-
-:doc:`../../multimodal/mllm/sequence_packing`
-
-.. rubric:: Footnotes
-
-.. [#f1] Experiments were performed on Llama 7B with Dolly dataset. Actual performance improvement depends on dataset
- and model.
\ No newline at end of file
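The page removed above describes the first-fit packing variants and the global-batch-size adjustment in prose only. As a rough illustration of that logic (this is not the code in `scripts/nlp_language_modeling/prepare_packed_ft_dataset.py`; the sequence lengths and batch size below are made-up numbers), a first-fit-shuffle pass and the `n = num_sequences / num_packs` adjustment look roughly like this:

```python
# Sketch only: first-fit over a shuffled list of sequence lengths, then the
# global-batch-size adjustment by the average number of sequences per pack.
import random

def first_fit_shuffle(seq_lens, pack_size, seed=0):
    """Pack sequence lengths into bins of capacity `pack_size`, first-fit on a shuffled order."""
    rng = random.Random(seed)
    order = list(seq_lens)
    rng.shuffle(order)
    packs = []  # each pack is a list of sequence lengths
    for length in order:
        for pack in packs:
            if sum(pack) + length <= pack_size:
                pack.append(length)
                break
        else:
            packs.append([length])
    return packs

seq_lens = [512, 301, 1200, 77, 48, 900, 256, 640, 1024, 33]  # illustrative lengths
packs = first_fit_shuffle(seq_lens, pack_size=2048)
n = len(seq_lens) / len(packs)                 # average sequences per pack
unpacked_global_batch_size = 128               # illustrative recipe value
packed_global_batch_size = round(unpacked_global_batch_size / n)
print(len(packs), n, packed_global_batch_size)
```

If the resulting packs are filled close to `pack_size`, `first_fit_shuffle` is a good default; otherwise `first_fit_decreasing` trades dataset-order randomness for tighter packs, as the removed page notes.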
diff --git a/examples/llm/peft/hf.py b/examples/llm/peft/hf.py
index 3137a542ae01..c0562663c2cc 100644
--- a/examples/llm/peft/hf.py
+++ b/examples/llm/peft/hf.py
@@ -107,6 +107,7 @@ def main():
use_distributed_sampler=use_dist_samp,
logger=wandb,
callbacks=callbacks,
+ precision="bf16",
),
optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(lr=1e-5)),
log=None,
diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py
index ff85180cf86b..ad22c8a733f4 100755
--- a/examples/llm/sft/hf.py
+++ b/examples/llm/sft/hf.py
@@ -124,6 +124,7 @@ def main():
use_distributed_sampler=use_dist_samp,
logger=wandb,
callbacks=callbacks,
+ precision="bf16",
),
optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(lr=1e-5)),
log=None,
diff --git a/examples/llm/sft/hf_vllm.py b/examples/llm/sft/hf_vllm.py
index 8110c0fafc4f..6b907ffbb714 100755
--- a/examples/llm/sft/hf_vllm.py
+++ b/examples/llm/sft/hf_vllm.py
@@ -42,7 +42,7 @@
triton_model_name=args.triton_model_name,
triton_model_version=1,
max_batch_size=64,
- port=8000,
+ http_port=8000,
address="0.0.0.0",
)
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
index 85e46b6a6989..16a1a89c0d2f 100644
--- a/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml
@@ -1,14 +1,14 @@
inference:
greedy: false # Whether or not to use sampling ; use greedy decoding otherwise
- top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering.
+ top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
temperature: 1.0 # sampling temperature
add_BOS: true # add the bos token at the begining of the prompt
tokens_to_generate: 30 # The minimum length of the sequence to be generated.
- all_probs: false # whether return the log prob for all the tokens in vocab
- repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
- min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
- compute_logprob: false # a flag used to compute logprob of all the input text, a very special case of running inference, default False
+ all_probs: false                   # whether to return the log prob for all the tokens in vocab
+ repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
+ min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
+ compute_logprob: false # a flag used to compute logprob of all the input text, a very special case of running inference, default False
batch_size: 1 # batch size for inference
max_context_length: 512 # max length of the context, input sequence will be truncated if it is longer than this
@@ -38,7 +38,8 @@ prune:
num_attention_heads: null # num_attention_heads in the pruned model
num_query_groups: null # num_query_groups in the pruned model
hidden_size: null # hidden_size (embedding size) in the pruned model
- num_layers: null # num_layers (depth) in the pruned model
+ num_layers: null # num_layers (depth) in the pruned model using cosine-similarity based importance
+ drop_layers: [] # drop specified layer numbers (comma separated, 1-indexed). Cannot be used with other constraints
export:
save_path: ??? # Path where the pruned model will be saved
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
index 62f0e452d3b5..ff8d8ca7c944 100644
--- a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
@@ -32,7 +32,7 @@ model:
activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
quantization:
- decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+ decoder_type: ${export.decoder_type} # gpt, llama
algorithm: fp8 # null, int8_sq, fp8, int4_awq
calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
num_calib_size: 512 # number of samples used for calibration
@@ -41,7 +41,7 @@ quantization:
enable_kv_cache: null # Enable FP8 KV cache quantization. Set to null for automatic selection.
export:
- decoder_type: llama # gptnext, gpt2, llama
+ decoder_type: llama # gpt, llama
inference_tensor_parallel: 1 # Default using 1 TP for inference
inference_pipeline_parallel: 1 # Default using 1 PP for inference
dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16
diff --git a/examples/nlp/language_modeling/megatron_gpt_drop_layers.py b/examples/nlp/language_modeling/megatron_gpt_drop_layers.py
index 4cd3fb6a8ef6..e14a75efdb42 100644
--- a/examples/nlp/language_modeling/megatron_gpt_drop_layers.py
+++ b/examples/nlp/language_modeling/megatron_gpt_drop_layers.py
@@ -13,6 +13,8 @@
# limitations under the License.
r"""
+NOTE: This script will be deprecated soon in favor of `megatron_gpt_prune.py`. Please use the new script for trimming layers.
+
Script to trim model layers.
Example to run the script with checkpoint:
python -m torch.distributed.launch --nproc_per_node= * \
@@ -112,6 +114,8 @@ def trim_layers(model, layers_to_trim):
def main(local_rank, rank, world_size, args):
+ logging.warning("This script will be deprecated soon in favor of `megatron_gpt_prune.py`.")
+
app_state = AppState()
app_state.data_parallel_rank = 0
num_nodes = world_size // args.gpus_per_node
diff --git a/examples/nlp/language_modeling/megatron_gpt_prune.py b/examples/nlp/language_modeling/megatron_gpt_prune.py
index b89d3adbb081..100f86f59aef 100644
--- a/examples/nlp/language_modeling/megatron_gpt_prune.py
+++ b/examples/nlp/language_modeling/megatron_gpt_prune.py
@@ -32,7 +32,7 @@
Please consult examples/nlp/language_modeling/conf/megatron_gpt_prune.yaml config on available pruning arguments,
models supported as well as how to set up data and inference for calibration (with defaults recommended).
-Example usage:
+Example usage to prune width automatically:
```
python examples/nlp/language_modeling/megatron_gpt_prune.py \
model.restore_from_path=llama3.1-8b.nemo \
@@ -45,9 +45,54 @@
prune.num_attention_heads=null \
prune.num_query_groups=null \
prune.hidden_size=3072 \
+ export.save_path=llama3.1-8b-width-pruned.nemo
+```
+
+Example usage to prune depth automatically using cosine-similarity based importance metric:
+```
+python examples/nlp/language_modeling/megatron_gpt_prune.py \
+ model.restore_from_path=llama3.1-8b.nemo \
+ model.tensor_model_parallel_size=1 \
+ model.pipeline_model_parallel_size=8 \
+ trainer.num_nodes=1 \
+ trainer.precision=bf16 \
+ trainer.devices=8 \
+ prune.num_layers=16 \
+ export.save_path=llama3.1-8b-depth-pruned.nemo
+```
+
+Example usage to prune width and depth automatically:
+```
+python examples/nlp/language_modeling/megatron_gpt_prune.py \
+ model.restore_from_path=llama3.1-8b.nemo \
+ model.tensor_model_parallel_size=1 \
+ model.pipeline_model_parallel_size=8 \
+ trainer.num_nodes=1 \
+ trainer.precision=bf16 \
+ trainer.devices=8 \
+ prune.ffn_hidden_size=9216 \
+ prune.num_attention_heads=null \
+ prune.num_query_groups=null \
+ prune.hidden_size=3072 \
+ prune.num_layers=16 \
+ export.save_path=llama3.1-8b-width-and-depth-pruned.nemo
+```
+
+NOTE: For the above usages, `model.tensor_model_parallel_size` and `inference.batch_size` must be 1
+due to a limitation in the current prune API.
+
+Example usage to prune depth by dropping specific model layers (1-indexed):
+```
+python examples/nlp/language_modeling/megatron_gpt_prune.py \
+ model.restore_from_path=llama3.1-8b.nemo \
+ model.tensor_model_parallel_size=8 \
+ model.pipeline_model_parallel_size=1 \
+ trainer.num_nodes=1 \
+ trainer.precision=bf16 \
+ trainer.devices=8 \
+ 'prune.drop_layers=[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]' \
export.save_path=llama3.1-8b-pruned.nemo
```
-where model.tensor_model_parallel_size and inference.batch_size must be 1 because of the current prune API limitation
"""
@@ -79,51 +124,61 @@ def main(cfg) -> None:
model_cfg.update(cfg.model)
model_cfg.name = "modelopt" # Use modelopt transformer spec for pruning
- assert cfg.model.tensor_model_parallel_size == 1, "Pruning currently only supports tensor_model_parallel_size=1"
-
trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
model = MegatronGPTModel.restore_from(
restore_path=cfg.model.restore_from_path, override_config_path=model_cfg, trainer=trainer
)
- data_iter = get_calib_data_iter(
- cfg.prune.calib_dataset,
- cfg.inference.batch_size,
- cfg.prune.num_calib_size,
- cfg.inference.max_context_length,
- )
- dataloader = [data for data in data_iter]
-
def forward_loop(model):
+ data_iter = get_calib_data_iter(
+ cfg.prune.calib_dataset,
+ cfg.inference.batch_size,
+ cfg.prune.num_calib_size,
+ cfg.inference.max_context_length,
+ )
+ dataloader = [data for data in data_iter]
+
# NOTE: Alternatively you can also use `model.forward_bwd_step(data_iter, forward_only=True)`
# if your model is setup for training.
model.set_inference_config(OmegaConf.to_container(cfg.inference))
for i, batch in enumerate(tqdm(dataloader, desc="Calibrating")):
model.predict_step(batch, i)
- model_pruned, _ = mtp.prune(
- model,
- mode="mcore_gpt_minitron",
- constraints={
- "export_config": {
- k: cfg.prune.get(k)
- for k in [
- "ffn_hidden_size",
- "num_attention_heads",
- "num_query_groups",
- "hidden_size",
- "num_layers",
- ]
- if cfg.prune.get(k) is not None
- },
- },
- dummy_input=None, # Not used
- config={"forward_loop": forward_loop},
- )
-
- model_pruned.save_to(cfg.export.save_path)
+ export_config = {
+ k: cfg.prune.get(k)
+ for k in [
+ "ffn_hidden_size",
+ "num_attention_heads",
+ "num_query_groups",
+ "hidden_size",
+ "num_layers",
+ ]
+ if cfg.prune.get(k) is not None
+ }
+
+ drop_layers = OmegaConf.to_object(cfg.prune.drop_layers) # convert to native python list
+ if drop_layers:
+ assert (
+ not export_config
+ ), f"Cannot specify `prune.drop_layers` with other prune constraints. Recieved: {cfg.prune}"
+ mtp.plugins.megatron.drop_mcore_gpt_layers(model.model, layers_to_drop=drop_layers)
+ setattr(model.cfg, "num_layers", model.model.config.num_layers)
+ else:
+ assert (
+ cfg.model.tensor_model_parallel_size == 1
+ ), "Pruning currently only supports tensor_model_parallel_size=1"
+
+ mtp.prune(
+ model,
+ mode="mcore_gpt_minitron",
+ constraints={"export_config": export_config},
+ dummy_input=None, # Not used
+ config={"forward_loop": forward_loop},
+ )
+
+ model.save_to(cfg.export.save_path)
print(f"Pruned model saved to {cfg.export.save_path}")
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml
index 09e00f8be110..35b0257b743b 100644
--- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml
+++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_qat_config.yaml
@@ -190,7 +190,7 @@ model:
reduce_on_plateau: false
quantization:
- decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+ decoder_type: ${export.decoder_type} # gpt, llama
algorithm: int4 # null, int8_sq, fp8, int4_awq, int4
num_calib_size: 512 # number of samples used for calibration
awq_block_size: 128 # block size for scaling factors (only used in AWQ algorithms)
@@ -198,7 +198,7 @@ quantization:
enable_kv_cache: false # Enable FP8 KV cache quantization. Set to null for automatic selection.
export:
- decoder_type: llama # gptnext, gpt2, llama
+ decoder_type: llama # gpt, llama
inference_tensor_parallel: 1 # Default using 1 TP for inference
inference_pipeline_parallel: 1 # Default using 1 PP for inference
dtype: ${trainer.precision} # Default precision data type
diff --git a/examples/speechlm/sft/hf.py b/examples/speechlm/sft/hf.py
new file mode 100755
index 000000000000..96e785dac97f
--- /dev/null
+++ b/examples/speechlm/sft/hf.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import fiddle as fdl
+import torch
+from lhotse.dataset.collation import collate_matrices, collate_vectors
+from omegaconf import OmegaConf
+
+from nemo import lightning as nl
+from nemo.collections import speechlm
+from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config
+from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+from nemo.collections.speechlm.models import HFAutoModelForSpeechSeq2Seq
+
+torch.set_float32_matmul_precision("medium")
+
+
+class LhotseHfNeMoDataset(torch.utils.data.Dataset):
+ def __init__(self, processor, tokenizer, decoder_mask_fill=-100):
+ super().__init__()
+ self.processor = processor
+ self.tokenizer = tokenizer
+ self.decoder_mask_fill = decoder_mask_fill
+
+ def __getitem__(self, cuts):
+ features = []
+ for cut in cuts:
+ audio = cut.load_audio()
+ features.append(
+ self.processor(
+ audio,
+ sampling_rate=cut.sampling_rate,
+ return_tensors="pt",
+ text=cut.supervisions[0].text,
+ )
+ )
+
+ input_features = collate_matrices(tensors=[f["input_features"].squeeze(0) for f in features])
+ labels = collate_vectors(tensors=[c.supervisions[0].tokens for c in cuts])
+ decoder_input_ids = labels[:, :-1]
+ decoder_input_ids = decoder_input_ids.masked_fill(
+ decoder_input_ids == self.decoder_mask_fill, self.tokenizer.pad_id
+ )
+ labels = labels[:, 1:].reshape(-1)
+
+ return {
+ "input_features": input_features,
+ "labels": labels,
+ "decoder_input_ids": decoder_input_ids,
+ }
+
+
+if __name__ == '__main__':
+ import argparse
+
+ parser = argparse.ArgumentParser()
+
+ # Models can be one of the supported ones by AutoModelForSpeechSeq2Seq such as
+ # openai/whisper-large-v3 and facebook/s2t-small-librispeech-asr
+ parser.add_argument('--model', default='openai/whisper-large-v3')
+ parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp'])
+ parser.add_argument('--devices', default=1)
+ parser.add_argument('--accelerator', default='gpu', choices=['gpu'])
+ parser.add_argument('--max-steps', type=int, default=100)
+ parser.add_argument('--model-save-path', type=str, default=None)
+ args = parser.parse_args()
+
+ model = HFAutoModelForSpeechSeq2Seq(model_name=args.model)
+ model = model.to(torch.float)
+ processor = model.processor
+ tokenizer = AutoTokenizer(args.model, include_special_tokens=True)
+
+ config = OmegaConf.create(
+ {
+ "cuts_path": "/opt/checkpoints/lhotse/libri/libri-train-5.jsonl.gz",
+ "sample_rate": 16000,
+ "shuffle": True,
+ "num_workers": 2,
+ "batch_size": 4,
+ "shuffle_buffer_size": 100,
+ }
+ )
+
+ train_dataloader = get_lhotse_dataloader_from_config(
+ config,
+ global_rank=0,
+ world_size=1,
+ dataset=LhotseHfNeMoDataset(
+ processor=processor,
+ tokenizer=tokenizer,
+ ),
+ tokenizer=tokenizer,
+ )
+
+ speechlm.api.finetune(
+ model=model,
+ data=train_dataloader,
+ trainer=nl.Trainer(
+ devices=args.devices,
+ max_steps=args.max_steps,
+ accelerator=args.accelerator,
+ strategy=args.strategy,
+ precision="bf16-mixed",
+ log_every_n_steps=1,
+ limit_val_batches=0.0,
+ num_sanity_val_steps=0,
+ accumulate_grad_batches=10,
+ gradient_clip_val=0.5,
+ use_distributed_sampler=False,
+ callbacks=[],
+ logger=None,
+ ),
+ optim=fdl.build(speechlm.adam.pytorch_adam_with_flat_lr(lr=1e-5)),
+ log=None,
+ )
+
+ if args.model_save_path is not None:
+ model.save_pretrained(args.model_save_path)
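The collation in `LhotseHfNeMoDataset.__getitem__` shifts the label sequence to build teacher-forcing decoder inputs and masks the `-100` fill value with the pad id. A minimal sketch with made-up token ids (the pad id of 0 and the example ids are assumptions, not values from the script) shows the effect:

```python
# Sketch of the label shifting done in LhotseHfNeMoDataset.__getitem__ with dummy ids.
import torch

decoder_mask_fill = -100
pad_id = 0  # assumed pad token id for illustration

labels = torch.tensor([[101, 7, 8, 9, 102],
                       [101, 5, 102, decoder_mask_fill, decoder_mask_fill]])

decoder_input_ids = labels[:, :-1]                  # teacher-forcing inputs (drop last token)
decoder_input_ids = decoder_input_ids.masked_fill(
    decoder_input_ids == decoder_mask_fill, pad_id  # replace the fill value with the pad id
)
targets = labels[:, 1:].reshape(-1)                 # next-token targets, flattened

print(decoder_input_ids)  # padded positions now hold pad_id
print(targets)            # -100 entries are typically ignored by the loss's ignore_index
```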
diff --git a/examples/vlm/hf/peft.py b/examples/vlm/hf/peft.py
index d51984677a74..01ba0fb7d5e7 100644
--- a/examples/vlm/hf/peft.py
+++ b/examples/vlm/hf/peft.py
@@ -85,6 +85,7 @@ def fmt(sample):
parser.add_argument('--accelerator', default='gpu', choices=['gpu'])
parser.add_argument('--max-steps', type=int, default=100)
parser.add_argument('--wandb-project', type=str, default=None)
+ parser.add_argument('--use-4bit', help="Load model in 4bit", action="store_true")
args = parser.parse_args()
wandb = None
@@ -103,7 +104,7 @@ def fmt(sample):
processor = vlm.HFAutoModelForImageTextToText.configure_processor(args.model)
llm.api.finetune(
- model=vlm.HFAutoModelForImageTextToText(args.model),
+ model=vlm.HFAutoModelForImageTextToText(args.model, load_in_4bit=args.use_4bit),
data=mk_hf_vlm_dataset(processor, args.mbs, args.gbs),
trainer=nl.Trainer(
devices=args.devices,
@@ -116,6 +117,7 @@ def fmt(sample):
accumulate_grad_batches=10,
gradient_clip_val=grad_clip,
use_distributed_sampler=use_dist_samp,
+ precision="bf16",
logger=wandb,
),
optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(lr=1e-5)),
@@ -123,5 +125,6 @@ def fmt(sample):
peft=llm.peft.LoRA(
target_modules=['*_proj'],
dim=16,
+ lora_dtype=torch.bfloat16 if args.use_4bit else None,
),
)
diff --git a/nemo/collections/asr/data/audio_to_diar_label.py b/nemo/collections/asr/data/audio_to_diar_label.py
index 1dbe68589c0a..817938b758ae 100644
--- a/nemo/collections/asr/data/audio_to_diar_label.py
+++ b/nemo/collections/asr/data/audio_to_diar_label.py
@@ -1237,15 +1237,11 @@ def __getitem__(self, index):
np.floor(audio_signal.shape[0] / self.featurizer.sample_rate * self.floor_decimal) / self.floor_decimal
)
audio_signal = audio_signal[: round(self.featurizer.sample_rate * session_len_sec)]
-
audio_signal_length = torch.tensor(audio_signal.shape[0]).long()
- audio_signal, audio_signal_length = audio_signal.to(self.device), audio_signal_length.to(self.device)
- target_len = self.get_segment_timestamps(duration=session_len_sec, sample_rate=self.featurizer.sample_rate).to(
- self.device
- )
+ target_len = self.get_segment_timestamps(duration=session_len_sec, sample_rate=self.featurizer.sample_rate)
targets = self.parse_rttm_for_targets_and_lens(
rttm_file=sample.rttm_file, offset=offset, duration=session_len_sec, target_len=target_len
- ).to(self.device)
+ )
return audio_signal, audio_signal_length, targets, target_len
diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py
index 1f84989c8ebe..874cde628c35 100644
--- a/nemo/collections/asr/models/ctc_bpe_models.py
+++ b/nemo/collections/asr/models/ctc_bpe_models.py
@@ -247,7 +247,6 @@ def change_vocabulary(
if not os.path.isdir(new_tokenizer_dir):
raise NotADirectoryError(
f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}'
- f"New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}"
)
if new_tokenizer_type.lower() not in ('bpe', 'wpe'):
diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py
index 1f63c617cea2..be795b6e4bc4 100644
--- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py
+++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py
@@ -155,7 +155,6 @@ def transcribe(
decoding_cfg.preserve_alignments = True
self.change_decoding_strategy(decoding_cfg, decoder_type=self.cur_decoder, verbose=False)
else:
- return_hypotheses = False
with open_dict(decoding_cfg):
decoding_cfg.compute_timestamps = False
decoding_cfg.preserve_alignments = False
diff --git a/nemo/collections/asr/models/sortformer_diar_models.py b/nemo/collections/asr/models/sortformer_diar_models.py
index 483ff5328ad0..e2ac0b09c81b 100644
--- a/nemo/collections/asr/models/sortformer_diar_models.py
+++ b/nemo/collections/asr/models/sortformer_diar_models.py
@@ -666,6 +666,7 @@ def test_batch(
audio_signal, audio_signal_length, targets, target_lens = batch
audio_signal = audio_signal.to(self.device)
audio_signal_length = audio_signal_length.to(self.device)
+ targets = targets.to(self.device)
preds = self.forward(
audio_signal=audio_signal,
audio_signal_length=audio_signal_length,
diff --git a/nemo/collections/common/tokenizers/canary_tokenizer.py b/nemo/collections/common/tokenizers/canary_tokenizer.py
index cb83fe7ddf3d..04dc6e3a68a9 100644
--- a/nemo/collections/common/tokenizers/canary_tokenizer.py
+++ b/nemo/collections/common/tokenizers/canary_tokenizer.py
@@ -68,13 +68,63 @@ def nospeech_id(self) -> int:
def pad_id(self) -> int:
return self.special_tokens[CANARY_PAD]
+ def _text_with_timestamps_to_ids(self, text_without_timestamps, time_text, lang_id) -> list[int]:
+ trans_words = text_without_timestamps.split()
+
+ # Get timestamp ids
+ time_ids = self._tokenize_special_prompt(time_text)
+
+ # Tokenize the text word by word
+ result_ids = []
+ time_index = 0
+
+ timestamp_every_n_words = 1 # Add a timestamp every N words
+ word_index = 0
+ # Interleave both start and end timestamps with the transcript words
+ for word in trans_words:
+ # Insert the first time_id once
+ if word_index == 0 and time_index < len(time_ids):
+ result_ids.append(time_ids[time_index])
+ time_index += 1
+ # Tokenize the word
+ result_ids += super().text_to_ids(word, lang_id)
+ word_index += 1
+ # Insert time ids every N words after the first one
+ if word_index % timestamp_every_n_words == 0 and word_index != 0 and time_index < len(time_ids):
+ result_ids.append(time_ids[time_index])
+ time_index += 1
+ if time_index < len(time_ids):
+ result_ids.append(time_ids[time_index])
+ time_index += 1
+ else:
+ time_index += 2
+ # Ensure the last time_id is appended at the end
+ if time_index < len(time_ids):
+ result_ids.append(time_ids[-1])
+ # Make sure the last time_id is appended only once
+ if time_index < len(time_ids) and result_ids[-1] != (time_ids[-1]):
+ result_ids.append(time_ids[-1])
+ return result_ids
+
+ def _text_to_ids_maybe_with_timestamps(self, text_no_eos, lang_id) -> list[int]:
+ time_pattern = re.compile(r"<\|\d+\|>")
+ time_text = "".join(time_pattern.findall(text_no_eos))
+ has_timestamp = bool(time_text)
+ if not has_timestamp:
+ return super().text_to_ids(text_no_eos, lang_id)
+ else:
+ text_without_timestamps = time_pattern.sub("", text_no_eos).strip()
+ return self._text_with_timestamps_to_ids(text_without_timestamps, time_text, lang_id)
+
def text_to_ids(self, text, lang_id) -> list[int]:
if lang_id == CANARY_SPECIAL_TOKENIZER:
return self._tokenize_special_prompt(text)
lang_id = _map_canary1_to_canary2_lang(lang_id, self.langs)
if text.endswith(CANARY_EOS):
- return super().text_to_ids(text[: -len(CANARY_EOS)], lang_id) + [self.eos_id]
- return super().text_to_ids(text, lang_id)
+ return self._text_to_ids_maybe_with_timestamps(text[: -len(CANARY_EOS)], lang_id) + [self.eos_id]
+ return self._text_to_ids_maybe_with_timestamps(text, lang_id)
def _tokenize_special_prompt(self, text: str) -> list[int]:
"""
diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
index 14da2d13a030..54cf95296d3d 100644
--- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
+++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
@@ -46,6 +46,7 @@ def __init__(
additional_special_tokens: Optional[List] = [],
use_fast: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
+ include_special_tokens: bool = False,
):
"""
Args:
@@ -63,6 +64,7 @@ def __init__(
unk_token: token to use for unknown tokens
additional_special_tokens: list of other tokens beside standard special tokens (bos, eos, pad, etc.). For example, sentinel tokens for T5 (, , etc.)
use_fast: whether to use fast HuggingFace tokenizer
+ include_special_tokens: when True, converting text to ids will include special tokens / prompt tokens (if any), yielding self.tokenizer(text).input_ids
"""
try:
# this logic deals with different huggingface tokenizers having different positional args
@@ -92,6 +94,7 @@ def __init__(
f'Unable to instantiate HuggingFace AUTOTOKENIZER for {pretrained_model_name}. Exception: {e}'
)
+ self.include_special_tokens = include_special_tokens
self.original_vocab_size = len(self.tokenizer)
special_tokens_dict = {}
@@ -220,6 +223,8 @@ def ids_to_tokens(self, ids):
return tokens
def text_to_ids(self, text):
+ if self.include_special_tokens:
+ return self.tokenizer(text).input_ids
tokens = self.text_to_tokens(text)
ids = self.tokens_to_ids(tokens)
return ids
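With the new `include_special_tokens` flag, `AutoTokenizer.text_to_ids` delegates to the underlying Hugging Face tokenizer so BOS/EOS and any prompt tokens are kept, which is what the speechlm example above relies on. A rough usage sketch (the model name is only an example):

```python
# Sketch of the include_special_tokens flag; the model name is an example, not a requirement.
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

tok_plain = AutoTokenizer("openai/whisper-large-v3")
tok_full = AutoTokenizer("openai/whisper-large-v3", include_special_tokens=True)

text = "hello world"
print(tok_plain.text_to_ids(text))  # plain token ids (tokens -> ids)
print(tok_full.text_to_ids(text))   # equivalent to tok_full.tokenizer(text).input_ids
```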
diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py
index 56a4b04dfe0f..45fbd4c8b328 100644
--- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py
+++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py
@@ -237,6 +237,9 @@ def add_special_tokens(self, special_tokens):
self.special_token_to_id[token] = self.vocab_size
self.id_to_special_token[self.vocab_size] = token
self.vocab_size += 1
+ elif self.tokenizer.piece_to_id(token) != self.tokenizer.unk_id():
+ self.special_token_to_id[token] = self.tokenizer.piece_to_id(token)
+
elif isinstance(special_tokens, dict):
for token_name, token in special_tokens.items():
setattr(self, token_name, token)
@@ -247,6 +250,8 @@ def add_special_tokens(self, special_tokens):
self.special_token_to_id[token] = self.vocab_size
self.id_to_special_token[self.vocab_size] = token
self.vocab_size += 1
+ else:
+ raise ValueError("Expected special_tokens to be a list or a dict " + str(type(special_tokens)))
@property
def pad_id(self):
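With this change, `add_special_tokens` maps a token that already exists in the SentencePiece vocabulary to its existing id instead of growing the vocabulary, and rejects inputs that are neither a list nor a dict. A rough sketch of the intended behaviour (the model path, the token strings, and the `legacy=True` flag are assumptions for illustration):

```python
# Sketch only: assumed model path and tokens, intended to show the new mapping behaviour.
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer

tok = SentencePieceTokenizer(model_path="/path/to/tokenizer.model", legacy=True)

before = tok.vocab_size
tok.add_special_tokens(["<extra_id_0>", "<brand_new_token>"])
# A token already present as a SentencePiece piece keeps its existing id;
# only genuinely new tokens increase vocab_size.
print(tok.vocab_size - before)

tok.add_special_tokens("not-a-list")  # now raises ValueError instead of silently doing nothing
```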
diff --git a/nemo/collections/diffusion/data/diffusion_energon_datamodule.py b/nemo/collections/diffusion/data/diffusion_energon_datamodule.py
index 07747528363a..5ad15c654555 100644
--- a/nemo/collections/diffusion/data/diffusion_energon_datamodule.py
+++ b/nemo/collections/diffusion/data/diffusion_energon_datamodule.py
@@ -19,10 +19,10 @@
from megatron.core import parallel_state
from megatron.energon import DefaultTaskEncoder, WorkerConfig, get_savable_loader, get_train_dataset
-from nemo.collections.multimodal.data.energon.base import SimpleMultiModalDataModule
+from nemo.collections.multimodal.data.energon.base import EnergonMultiModalDataModule
-class DiffusionDataModule(SimpleMultiModalDataModule):
+class DiffusionDataModule(EnergonMultiModalDataModule):
"""
A PyTorch Lightning DataModule for handling multimodal datasets with images and text.
@@ -62,7 +62,7 @@ def __init__(
max_samples_per_sequence: int | None = None,
) -> None:
"""
- Initialize the SimpleMultiModalDataModule.
+ Initialize the EnergonMultiModalDataModule.
Parameters:
path (str): Path to the dataset.
diff --git a/nemo/collections/diffusion/train.py b/nemo/collections/diffusion/train.py
index 404602084b85..0db2e8fd2326 100644
--- a/nemo/collections/diffusion/train.py
+++ b/nemo/collections/diffusion/train.py
@@ -38,7 +38,7 @@
DiTXLConfig,
ECDiTLlama1BConfig,
)
-from nemo.collections.multimodal.data.energon.base import SimpleMultiModalDataModule
+from nemo.collections.multimodal.data.energon.base import EnergonMultiModalDataModule
from nemo.lightning.pytorch.callbacks import ModelCheckpoint, PreemptionCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform
@@ -64,7 +64,7 @@ def multimodal_datamodule() -> pl.LightningDataModule:
@run.autoconvert
def simple_datamodule() -> pl.LightningDataModule:
"""Simple Datamodule Initialization"""
- data_module = SimpleMultiModalDataModule(
+ data_module = EnergonMultiModalDataModule(
seq_length=2048,
micro_batch_size=1,
global_batch_size=32,
@@ -221,6 +221,7 @@ def train_mock() -> run.Partial:
@run.cli.factory(target=llm.train)
def mock_ditllama5b_8k() -> run.Partial:
+ """DiT-5B mock Recipe"""
recipe = pretrain()
recipe.model.config = run.Config(DiTLlama5BConfig, max_frames=1)
recipe.data = multimodal_fake_datamodule()
@@ -256,6 +257,7 @@ def mock_ditllama5b_8k() -> run.Partial:
@run.cli.factory(target=llm.train)
def mock_dit7b_8k() -> run.Partial:
+ """DiT-7B mock Recipe"""
recipe = mock_ditllama5b_8k()
recipe.model.config = run.Config(DiT7BConfig, max_frames=1)
recipe.data.model_config = recipe.model.config
diff --git a/nemo/collections/diffusion/vae/autoencoder.py b/nemo/collections/diffusion/vae/autoencoder.py
index b356d74baac1..234b8052b449 100644
--- a/nemo/collections/diffusion/vae/autoencoder.py
+++ b/nemo/collections/diffusion/vae/autoencoder.py
@@ -18,11 +18,45 @@
import torch
from torch import Tensor, nn
-from nemo.collections.diffusion.vae.blocks import AttnBlock, Downsample, Normalize, ResnetBlock, Upsample, make_attn
+from nemo.collections.diffusion.vae.blocks import Downsample, Normalize, ResnetBlock, Upsample, make_attn
@dataclass
class AutoEncoderParams:
+ """Dataclass for storing autoencoder hyperparameters.
+
+ Attributes
+ ----------
+ ch_mult : list[int]
+ Channel multipliers at each resolution level.
+ attn_resolutions : list[int]
+ List of resolutions at which attention layers are applied.
+ resolution : int, optional
+ Input image resolution. Default is 256.
+ in_channels : int, optional
+ Number of input channels. Default is 3.
+ ch : int, optional
+ Base channel dimension. Default is 128.
+ out_ch : int, optional
+ Number of output channels. Default is 3.
+ num_res_blocks : int, optional
+ Number of residual blocks at each resolution. Default is 2.
+ z_channels : int, optional
+ Number of latent channels in the compressed representation. Default is 16.
+ scale_factor : float, optional
+ Scaling factor for latent representations. Default is 0.3611.
+ shift_factor : float, optional
+ Shift factor for latent representations. Default is 0.1159.
+ attn_type : str, optional
+ Type of attention to use ('vanilla', 'linear'). Default is 'vanilla'.
+ double_z : bool, optional
+ If True, produce both mean and log-variance for latent space. Default is True.
+ dropout : float, optional
+ Dropout rate. Default is 0.0.
+ ckpt : str or None, optional
+ Path to checkpoint file for loading pretrained weights. Default is None.
+ """
+
ch_mult: list[int]
attn_resolutions: list[int]
resolution: int = 256
@@ -39,12 +73,55 @@ class AutoEncoderParams:
ckpt: str = None
-def nonlinearity(x):
- # swish
+def nonlinearity(x: Tensor) -> Tensor:
+ """Applies the SiLU (Swish) nonlinearity.
+
+ Parameters
+ ----------
+ x : torch.Tensor
+ Input tensor.
+
+ Returns
+ -------
+ torch.Tensor
+ Transformed tensor after applying SiLU activation.
+ """
return torch.nn.functional.silu(x)
class Encoder(nn.Module):
+ """Encoder module that downsamples and encodes input images into a latent representation.
+
+ Parameters
+ ----------
+ ch : int
+ Base channel dimension.
+ out_ch : int
+ Number of output channels.
+ ch_mult : list[int]
+ Channel multipliers at each resolution level.
+ num_res_blocks : int
+ Number of residual blocks at each resolution level.
+ attn_resolutions : list[int]
+ List of resolutions at which attention layers are applied.
+ in_channels : int
+ Number of input image channels.
+ resolution : int
+ Input image resolution.
+ z_channels : int
+ Number of latent channels.
+ dropout : float, optional
+ Dropout rate. Default is 0.0.
+ resamp_with_conv : bool, optional
+ Whether to use convolutional resampling. Default is True.
+ double_z : bool, optional
+ If True, produce mean and log-variance channels for latent space. Default is True.
+ use_linear_attn : bool, optional
+ If True, use linear attention. Default is False.
+ attn_type : str, optional
+ Type of attention to use ('vanilla', 'linear'). Default is 'vanilla'.
+ """
+
def __init__(
self,
*,
@@ -117,7 +194,19 @@ def __init__(
block_in, 2 * z_channels if double_z else z_channels, kernel_size=3, stride=1, padding=1
)
- def forward(self, x):
+ def forward(self, x: Tensor) -> Tensor:
+ """Forward pass of the Encoder.
+
+ Parameters
+ ----------
+ x : torch.Tensor
+ Input image tensor of shape (B, C, H, W).
+
+ Returns
+ -------
+ torch.Tensor
+ Latent representation before sampling, with shape (B, 2*z_channels, H', W') if double_z=True.
+ """
# timestep embedding
temb = None
@@ -146,6 +235,40 @@ def forward(self, x):
class Decoder(nn.Module):
+ """Decoder module that upscales and decodes latent representations back into images.
+
+ Parameters
+ ----------
+ ch : int
+ Base channel dimension.
+ out_ch : int
+ Number of output channels (e.g. 3 for RGB).
+ ch_mult : list[int]
+ Channel multipliers at each resolution level.
+ num_res_blocks : int
+ Number of residual blocks at each resolution level.
+ attn_resolutions : list[int]
+ List of resolutions at which attention layers are applied.
+ in_channels : int
+ Number of input image channels.
+ resolution : int
+ Input image resolution.
+ z_channels : int
+ Number of latent channels.
+ dropout : float, optional
+ Dropout rate. Default is 0.0.
+ resamp_with_conv : bool, optional
+ Whether to use convolutional resampling. Default is True.
+ give_pre_end : bool, optional
+ If True, returns the tensor before the final normalization and convolution. Default is False.
+ tanh_out : bool, optional
+ If True, applies a tanh activation to the output. Default is False.
+ use_linear_attn : bool, optional
+ If True, use linear attention. Default is False.
+ attn_type : str, optional
+ Type of attention to use ('vanilla', 'linear'). Default is 'vanilla'.
+ """
+
def __init__(
self,
*,
@@ -224,8 +347,19 @@ def __init__(
self.norm_out = Normalize(block_in)
self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
- def forward(self, z):
- # assert z.shape[1:] == self.z_shape[1:]
+ def forward(self, z: Tensor) -> Tensor:
+ """Forward pass of the Decoder.
+
+ Parameters
+ ----------
+ z : torch.Tensor
+ Latent representation of shape (B, z_channels, H', W').
+
+ Returns
+ -------
+ torch.Tensor
+ Decoded image of shape (B, out_ch, H, W).
+ """
self.last_z_shape = z.shape
# timestep embedding
@@ -261,12 +395,35 @@ def forward(self, z):
class DiagonalGaussian(nn.Module):
+ """Module that splits an input tensor into mean and log-variance and optionally samples from the Gaussian.
+
+ Parameters
+ ----------
+ sample : bool, optional
+ If True, return a sample from the Gaussian. Otherwise, return the mean. Default is True.
+ chunk_dim : int, optional
+ Dimension along which to chunk the tensor into mean and log-variance. Default is 1.
+ """
+
def __init__(self, sample: bool = True, chunk_dim: int = 1):
super().__init__()
self.sample = sample
self.chunk_dim = chunk_dim
def forward(self, z: Tensor) -> Tensor:
+ """Forward pass of the DiagonalGaussian module.
+
+ Parameters
+ ----------
+ z : torch.Tensor
+ Input tensor of shape (..., 2*z_channels, ...).
+
+ Returns
+ -------
+ torch.Tensor
+ If sample=True, returns a sampled tensor from N(mean, var).
+ If sample=False, returns the mean.
+ """
mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
if self.sample:
std = torch.exp(0.5 * logvar)
@@ -276,6 +433,14 @@ def forward(self, z: Tensor) -> Tensor:
class AutoEncoder(nn.Module):
+ """Full AutoEncoder model combining an Encoder, Decoder, and latent Gaussian sampling.
+
+ Parameters
+ ----------
+ params : AutoEncoderParams
+ Configuration parameters for the AutoEncoder model.
+ """
+
def __init__(self, params: AutoEncoderParams):
super().__init__()
self.encoder = Encoder(
@@ -314,21 +479,65 @@ def __init__(self, params: AutoEncoderParams):
self.load_from_checkpoint(params.ckpt)
def encode(self, x: Tensor) -> Tensor:
+ """Encode an input image to its latent representation.
+
+ Parameters
+ ----------
+ x : torch.Tensor
+ Input image of shape (B, C, H, W).
+
+ Returns
+ -------
+ torch.Tensor
+ Latent representation of the input image.
+ """
z = self.reg(self.encoder(x))
z = self.scale_factor * (z - self.shift_factor)
return z
def decode(self, z: Tensor) -> Tensor:
+ """Decode a latent representation back into an image.
+
+ Parameters
+ ----------
+ z : torch.Tensor
+ Latent representation of shape (B, z_channels, H', W').
+
+ Returns
+ -------
+ torch.Tensor
+ Reconstructed image of shape (B, out_ch, H, W).
+ """
z = z / self.scale_factor + self.shift_factor
return self.decoder(z)
def forward(self, x: Tensor) -> Tensor:
+ """Forward pass that encodes and decodes the input image.
+
+ Parameters
+ ----------
+ x : torch.Tensor
+ Input image tensor.
+
+ Returns
+ -------
+ torch.Tensor
+ Reconstructed image.
+ """
return self.decode(self.encode(x))
- def load_from_checkpoint(self, ckpt_path):
+ def load_from_checkpoint(self, ckpt_path: str):
+ """Load the autoencoder weights from a checkpoint file.
+
+ Parameters
+ ----------
+ ckpt_path : str
+ Path to the checkpoint file.
+ """
from safetensors.torch import load_file as load_sft
state_dict = load_sft(ckpt_path)
missing, unexpected = self.load_state_dict(state_dict)
if len(missing) > 0:
- logger.warning(f"Following keys are missing from checkpoint loaded: {missing}")
+ # No module-level logger is configured here, so fall back to a plain print for the warning.
+ print(f"Warning: The following keys are missing from the loaded checkpoint: {missing}")
diff --git a/nemo/collections/diffusion/vae/autovae.py b/nemo/collections/diffusion/vae/autovae.py
new file mode 100644
index 000000000000..0797036f9cc0
--- /dev/null
+++ b/nemo/collections/diffusion/vae/autovae.py
@@ -0,0 +1,319 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import time
+from typing import Dict, List
+
+import torch
+import torch.profiler
+from diffusers import AutoencoderKL
+from torch import nn
+
+
+class VAEGenerator:
+ """
+ A class for generating and searching different Variational Autoencoder (VAE) configurations.
+
+ This class provides functionality to generate various VAE architecture configurations
+ given a specific input resolution and compression ratio. It allows searching through a
+ design space to find configurations that match given parameter and memory budgets.
+ """
+
+ def __init__(self, input_resolution: int = 1024, compression_ratio: int = 16) -> None:
+ if input_resolution == 1024:
+ assert compression_ratio in [8, 16]
+ elif input_resolution == 2048:
+ assert compression_ratio in [8, 16, 32]
+ else:
+ raise NotImplementedError("Higher resolution than 2028 is not implemented yet!")
+
+ self._input_resolution = input_resolution
+ self._compression_ratio = compression_ratio
+
+ def _generate_input(self):
+ """
+ Generate a random input tensor with the specified input resolution.
+
+ The tensor is placed on the GPU in half-precision (float16).
+ """
+ random_tensor = torch.rand(1, 3, self.input_resolution, self.input_resolution)
+ random_tensor = random_tensor.to(dtype=torch.float16, device="cuda")
+ return random_tensor
+
+ def _count_parameters(self, model: nn.Module = None):
+ """
+ Count the number of trainable parameters in a given model.
+
+ Args:
+ model (nn.Module): The model for which to count parameters.
+
+ Returns:
+ int: The number of trainable parameters.
+ """
+ assert model is not None, "Please provide a nn.Module to count the parameters."
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ def _load_base_json_skeleton(self):
+ """
+ Load a base configuration skeleton for the VAE.
+
+ Returns:
+ dict: A dictionary representing the base configuration JSON skeleton.
+ """
+ skeleton = {
+ "_class_name": "AutoencoderKL",
+ "_diffusers_version": "0.20.0.dev0",
+ "_name_or_path": "../sdxl-vae/",
+ "act_fn": "silu",
+ "block_out_channels": [],
+ "down_block_types": [],
+ "force_upcast": False,
+ "in_channels": 3,
+ "latent_channels": -1, # 16
+ "layers_per_block": -1, # 2
+ "norm_num_groups": 32,
+ "out_channels": 3,
+ "sample_size": 1024, # resolution size
+ "scaling_factor": 0.13025,
+ "up_block_types": [],
+ }
+ return skeleton
+
+ def _generate_all_combinations(self, attr):
+ """
+ Generates all possible combinations from a search space dictionary.
+
+ Args:
+ attr (dict): A dictionary where each key has a list of possible values.
+
+ Returns:
+ List[Dict]: A list of dictionaries, each representing a unique combination of attributes.
+ """
+ keys = list(attr.keys())
+ choices = [attr[key] for key in keys]
+ all_combinations = list(itertools.product(*choices))
+
+ combination_dicts = []
+ for combination in all_combinations:
+ combination_dict = {key: value for key, value in zip(keys, combination)}
+ combination_dicts.append(combination_dict)
+
+ return combination_dicts
+
+ def _assign_attributes(self, choice):
+ """
+ Assign a chosen set of attributes to the base VAE configuration skeleton.
+
+ Args:
+ choice (dict): A dictionary of attributes to assign to the skeleton.
+
+ Returns:
+ dict: A dictionary representing the updated VAE configuration.
+ """
+ search_space_skeleton = self._load_base_json_skeleton()
+ search_space_skeleton["down_block_types"] = choice["down_block_types"]
+ search_space_skeleton["up_block_types"] = choice["up_block_types"]
+ search_space_skeleton["block_out_channels"] = choice["block_out_channels"]
+ search_space_skeleton["layers_per_block"] = choice["layers_per_block"]
+ search_space_skeleton["latent_channels"] = choice["latent_channels"]
+ return search_space_skeleton
+
+ def _search_space_16x1024(self):
+ """
+ Define the search space for a 16x compression ratio at 1024 resolution.
+
+ Returns:
+ dict: A dictionary defining lists of possible attribute values.
+ """
+ attr = {}
+ attr["down_block_types"] = [["DownEncoderBlock2D"] * 5]
+ attr["up_block_types"] = [["UpDecoderBlock2D"] * 5]
+ attr["block_out_channels"] = [
+ [128, 256, 512, 512, 512],
+ [128, 256, 512, 512, 1024],
+ [128, 256, 512, 1024, 2048],
+ [64, 128, 256, 512, 512],
+ ]
+ attr["layers_per_block"] = [1, 2, 3]
+ attr["latent_channels"] = [4, 16, 32, 64]
+ return attr
+
+ def _search_space_8x1024(self):
+ """
+ Define the search space for an 8x compression ratio at 1024 resolution.
+
+ Returns:
+ dict: A dictionary defining lists of possible attribute values.
+ """
+ attr = {}
+ attr["down_block_types"] = [["DownEncoderBlock2D"] * 4]
+ attr["up_block_types"] = [["UpDecoderBlock2D"] * 4]
+ attr["block_out_channels"] = [[128, 256, 512, 512], [128, 256, 512, 1024], [64, 128, 256, 512]]
+ attr["layers_per_block"] = [1, 2, 3]
+ attr["latent_channels"] = [4, 16, 32, 64]
+ return attr
+
+ def _sort_data_in_place(self, data: List[Dict], mode: str) -> None:
+ """
+ Sort the list of design configurations in place based on a chosen mode.
+
+ Args:
+ data (List[Dict]): A list of dictionaries representing design configurations.
+ mode (str): The sorting criterion. Can be 'abs_param_diff', 'abs_cuda_mem_diff', or 'mse'.
+ """
+ if mode == 'abs_param_diff':
+ data.sort(key=lambda x: abs(x['param_diff']))
+ elif mode == 'abs_cuda_mem_diff':
+ data.sort(key=lambda x: abs(x['cuda_mem_diff']))
+ elif mode == 'mse':
+ data.sort(key=lambda x: (x['param_diff'] ** 2 + x['cuda_mem_diff'] ** 2) / 2)
+ else:
+ raise ValueError("Invalid mode. Choose from 'abs_param_diff', 'abs_cuda_mem_diff', 'mse'.")
+
+ def _print_table(self, data, headers, col_widths):
+ """
+ Print a formatted table of the design choices.
+
+ Args:
+ data (List[Dict]): The data to print, each entry a design configuration.
+ headers (List[str]): Column headers.
+ col_widths (List[int]): Widths for each column.
+ """
+ # Create header row
+ header_row = ""
+ for header, width in zip(headers, col_widths):
+ header_row += f"{header:<{width}}"
+ print(header_row)
+ print("-" * sum(col_widths))
+
+ # Print each data row
+ for item in data:
+ row = f"{item['param_diff']:<{col_widths[0]}}"
+ row += f"{item['cuda_mem_diff']:<{col_widths[1]}}"
+ print(row)
+
+ def search_for_target_vae(self, parameters_budget=0, cuda_max_mem=0):
+ """
+ Search through available VAE design choices to find one that best matches
+ the given parameter and memory budgets.
+
+ Args:
+ parameters_budget (float, optional): The target number of parameters (in millions).
+ cuda_max_mem (float, optional): The target maximum GPU memory usage (in MB).
+
+ Returns:
+ AutoencoderKL: The chosen VAE configuration that best matches the provided budgets.
+ """
+ if parameters_budget <= 0 and cuda_max_mem <= 0:
+ raise ValueError("Please specify a valid parameter budget or cuda max memory budget")
+
+ search_space_choices = []
+ if self.input_resolution == 1024 and self.compression_ratio == 8:
+ search_space = self._search_space_8x1024()
+ search_space_choices = self._generate_all_combinations(search_space)
+ elif self.input_resolution == 1024 and self.compression_ratio == 16:
+ search_space = self._search_space_16x1024()
+ search_space_choices = self._generate_all_combinations(search_space)
+
+ inp_tensor = self._generate_input()
+ inp_tensor = inp_tensor.to(dtype=torch.float16, device="cuda")
+ design_choices = []
+
+ for choice in search_space_choices:
+ parameters_budget_diff = 0
+ cuda_max_mem_diff = 0
+
+ curt_design_json = self._assign_attributes(choice)
+ print("-" * 20)
+ print(choice)
+ vae = AutoencoderKL.from_config(curt_design_json)
+ vae = vae.to(dtype=torch.float16, device="cuda")
+ total_params = self._count_parameters(vae)
+ total_params /= 10**6
+ # Reset peak memory statistics
+ torch.cuda.reset_peak_memory_stats()
+ torch.cuda.synchronize()
+
+ with torch.profiler.profile(
+ activities=[
+ torch.profiler.ProfilerActivity.CPU,
+ torch.profiler.ProfilerActivity.CUDA,
+ ],
+ profile_memory=True, # Enables memory profiling
+ record_shapes=True, # Records tensor shapes
+ with_stack=True, # Records stack traces
+ ) as prof:
+ # Perform forward pass
+ start_time = time.perf_counter()
+ with torch.no_grad():
+ _ = vae.encode(inp_tensor).latent_dist.sample()
+ torch.cuda.synchronize()
+ end_time = time.perf_counter()
+
+ total_execution_time_ms = (end_time - start_time) * 1000
+
+ # Get maximum memory allocated
+ max_memory_allocated = torch.cuda.max_memory_allocated()
+ max_memory_allocated = max_memory_allocated / (1024**2)
+
+ parameters_budget_diff = parameters_budget - total_params
+ cuda_max_mem_diff = cuda_max_mem - max_memory_allocated
+ design_choices.append(
+ {"param_diff": parameters_budget_diff, "cuda_mem_diff": cuda_max_mem_diff, "design": curt_design_json}
+ )
+
+ print(f" Total params: {total_params}")
+ print(f" Max GPU Memory Usage: {max_memory_allocated} MB")
+ print(f" Total Execution Time: {total_execution_time_ms:.2f} ms")
+
+ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+
+ print("-" * 20)
+ sort_mode = "abs_param_diff"
+ if parameters_budget == 0:
+ sort_mode = "abs_cuda_mem_diff"
+ elif cuda_max_mem == 0:
+ sort_mode = "abs_param_diff"
+ else:
+ sort_mode = "mse"
+
+ print("#" * 20)
+ self._sort_data_in_place(design_choices, sort_mode)
+ headers = ["param_diff (M)", "cuda_mem_diff (MB)"]
+ col_widths = [12, 15]
+ self._print_table(design_choices, headers, col_widths)
+
+ vae = AutoencoderKL.from_config(design_choices[0]["design"])
+ return vae
+
+ @property
+ def input_resolution(self) -> int:
+ """
+ Get the input resolution for the VAE.
+
+ Returns:
+ int: The input resolution.
+ """
+ return self._input_resolution
+
+ @property
+ def compression_ratio(self) -> float:
+ """
+ Get the compression ratio for the VAE.
+
+ Returns:
+ float: The compression ratio.
+ """
+ return self._compression_ratio
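A rough example of how the new `VAEGenerator` is meant to be driven (the budget numbers are arbitrary illustration values; a CUDA device and the `diffusers` package are required, since every candidate design runs a profiled forward pass):

```python
# Sketch of driving the design-space search; budgets below are made-up numbers.
from nemo.collections.diffusion.vae.autovae import VAEGenerator

generator = VAEGenerator(input_resolution=1024, compression_ratio=16)

# Search the 16x/1024 design space for the config closest to ~100M parameters
# and ~20 GB of peak encode memory.
vae = generator.search_for_target_vae(parameters_budget=100, cuda_max_mem=20000)
print(type(vae).__name__)  # AutoencoderKL built from the best-matching design
```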
diff --git a/nemo/collections/diffusion/vae/blocks.py b/nemo/collections/diffusion/vae/blocks.py
index ad38a7a463cf..d942ba1ef4b0 100644
--- a/nemo/collections/diffusion/vae/blocks.py
+++ b/nemo/collections/diffusion/vae/blocks.py
@@ -26,11 +26,49 @@
def Normalize(in_channels, num_groups=32, act=""):
+ """Creates a group normalization layer with specified activation.
+
+ Args:
+ in_channels (int): Number of channels in the input.
+ num_groups (int, optional): Number of groups for GroupNorm. Defaults to 32.
+ act (str, optional): Activation function name. Defaults to "".
+
+ Returns:
+ GroupNorm: A normalization layer with optional activation.
+ """
return GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True, act=act)
+def nonlinearity(x):
+ """Nonlinearity function used in temporal embedding projection.
+
+ Currently implemented as a SiLU (Swish) function.
+
+ Args:
+ x (Tensor): Input tensor.
+
+ Returns:
+ Tensor: Output after applying SiLU activation.
+ """
+ return x * torch.sigmoid(x)
+
+
class ResnetBlock(nn.Module):
+ """A ResNet-style block that can optionally apply a temporal embedding and shortcut projections.
+
+ This block consists of two convolutional layers, normalization, and optional temporal embedding.
+ It can adjust channel dimensions between input and output via shortcuts.
+ """
+
def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, temb_channels=0):
+ """
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int, optional): Number of output channels. Defaults to in_channels.
+ conv_shortcut (bool, optional): Whether to use a convolutional shortcut. Defaults to False.
+ dropout (float, optional): Dropout probability. Defaults to 0.0.
+ temb_channels (int, optional): Number of channels in temporal embedding. Defaults to 0.
+ """
super().__init__()
self.in_channels = in_channels
out_channels = in_channels if out_channels is None else out_channels
@@ -51,6 +89,15 @@ def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=
self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
def forward(self, x, temb):
+ """Forward pass of the ResnetBlock.
+
+ Args:
+ x (Tensor): Input feature map of shape (B, C, H, W).
+ temb (Tensor): Temporal embedding tensor of shape (B, temb_channels).
+
+ Returns:
+ Tensor: Output feature map of shape (B, out_channels, H, W).
+ """
h = x
h = self.norm1(h)
h = self.conv1(h)
@@ -72,16 +119,32 @@ def forward(self, x, temb):
class Upsample(nn.Module):
+ """Upsampling block that increases spatial resolution by a factor of 2.
+
+ Can optionally include a convolution after upsampling.
+ """
+
def __init__(self, in_channels, with_conv):
+ """
+ Args:
+ in_channels (int): Number of input channels.
+ with_conv (bool): If True, apply a convolution after upsampling.
+ """
super().__init__()
self.with_conv = with_conv
if self.with_conv:
self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
def forward(self, x):
+ """Forward pass of the Upsample block.
+
+ Args:
+ x (Tensor): Input feature map (B, C, H, W).
+
+ Returns:
+ Tensor: Upsampled feature map (B, C, 2H, 2W).
+ """
# Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16
- # TODO(yuya): Remove this cast once the issue is fixed in PyTorch
- # https://github.com/pytorch/pytorch/issues/86679
dtype = x.dtype
if dtype == torch.bfloat16:
x = x.to(torch.float32)
@@ -94,7 +157,17 @@ def forward(self, x):
class Downsample(nn.Module):
+ """Downsampling block that reduces spatial resolution by a factor of 2.
+
+ Can optionally include a convolution before downsampling.
+ """
+
def __init__(self, in_channels, with_conv):
+ """
+ Args:
+ in_channels (int): Number of input channels.
+ with_conv (bool): If True, apply a convolution before downsampling.
+ """
super().__init__()
self.with_conv = with_conv
if self.with_conv:
@@ -102,6 +175,14 @@ def __init__(self, in_channels, with_conv):
self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
def forward(self, x):
+ """Forward pass of the Downsample block.
+
+ Args:
+ x (Tensor): Input feature map (B, C, H, W).
+
+ Returns:
+ Tensor: Downsampled feature map (B, C, H/2, W/2).
+ """
if self.with_conv:
pad = (0, 1, 0, 1)
x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
@@ -112,7 +193,16 @@ def forward(self, x):
class AttnBlock(nn.Module):
+ """Self-attention block that applies scaled dot-product attention to feature maps.
+
+ Normalizes input, computes queries, keys, and values, then applies attention and a projection.
+ """
+
def __init__(self, in_channels: int):
+ """
+ Args:
+ in_channels (int): Number of input/output channels.
+ """
super().__init__()
self.in_channels = in_channels
@@ -124,6 +214,14 @@ def __init__(self, in_channels: int):
self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
def attention(self, h_: Tensor) -> Tensor:
+ """Compute the attention over the input feature maps.
+
+ Args:
+ h_ (Tensor): Normalized input feature map (B, C, H, W).
+
+ Returns:
+ Tensor: Output after applying scaled dot-product attention (B, C, H, W).
+ """
h_ = self.norm(h_)
q = self.q(h_)
k = self.k(h_)
@@ -138,11 +236,30 @@ def attention(self, h_: Tensor) -> Tensor:
return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
def forward(self, x: Tensor) -> Tensor:
+ """Forward pass of the AttnBlock.
+
+ Args:
+ x (Tensor): Input feature map (B, C, H, W).
+
+ Returns:
+ Tensor: Output feature map after self-attention (B, C, H, W).
+ """
return x + self.proj_out(self.attention(x))
class LinearAttention(nn.Module):
+ """Linear Attention block for efficient attention computations.
+
+ Uses linear attention mechanisms to reduce complexity and memory usage.
+ """
+
def __init__(self, dim, heads=4, dim_head=32):
+ """
+ Args:
+ dim (int): Input channel dimension.
+ heads (int, optional): Number of attention heads. Defaults to 4.
+ dim_head (int, optional): Dimension per attention head. Defaults to 32.
+ """
super().__init__()
self.heads = heads
hidden_dim = dim_head * heads
@@ -150,6 +267,14 @@ def __init__(self, dim, heads=4, dim_head=32):
self.to_out = nn.Conv2d(hidden_dim, dim, 1)
def forward(self, x):
+ """Forward pass of the LinearAttention block.
+
+ Args:
+ x (Tensor): Input feature map (B, C, H, W).
+
+ Returns:
+ Tensor: Output feature map after linear attention (B, C, H, W).
+ """
b, c, h, w = x.shape
qkv = self.to_qkv(x)
q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads=self.heads, qkv=3)
@@ -161,15 +286,27 @@ def forward(self, x):
class LinAttnBlock(LinearAttention):
- """
- to match AttnBlock usage
- """
+ """Wrapper class to provide a linear attention block in a form compatible with other attention blocks."""
def __init__(self, in_channels):
+ """
+ Args:
+ in_channels (int): Number of input/output channels.
+ """
super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
def make_attn(in_channels, attn_type="vanilla"):
+ """Factory function to create an attention block.
+
+ Args:
+ in_channels (int): Number of input/output channels.
+ attn_type (str, optional): Type of attention block to create. Options: "vanilla", "linear", "none".
+ Defaults to "vanilla".
+
+ Returns:
+ nn.Module: An instance of the requested attention block.
+ """
assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown'
print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
if attn_type == "vanilla":
diff --git a/nemo/collections/diffusion/vae/contperceptual_loss.py b/nemo/collections/diffusion/vae/contperceptual_loss.py
new file mode 100644
index 000000000000..7021e31f7f3b
--- /dev/null
+++ b/nemo/collections/diffusion/vae/contperceptual_loss.py
@@ -0,0 +1,197 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+
+from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/no?
+
+
+class LPIPSWithDiscriminator(nn.Module):
+ """
+ A perceptual loss module that combines LPIPS with an adversarial discriminator
+ for improved reconstruction quality in variational autoencoders. This class
+ calculates a combination of pixel-level, perceptual (LPIPS), KL, and adversarial
+ losses for training a VAE model with a discriminator.
+ """
+
+ def __init__(
+ self,
+ disc_start,
+ logvar_init=0.0,
+ kl_weight=1.0,
+ pixelloss_weight=1.0,
+ disc_num_layers=3,
+ disc_in_channels=3,
+ disc_factor=1.0,
+ disc_weight=1.0,
+ perceptual_weight=1.0,
+ use_actnorm=False,
+ disc_conditional=False,
+ disc_loss="hinge",
+ ):
+ """
+ Initializes the LPIPSWithDiscriminator module.
+
+ Args:
+ disc_start (int): Iteration at which to start discriminator updates.
+ logvar_init (float): Initial value for the log variance parameter.
+ kl_weight (float): Weight for the KL divergence term.
+ pixelloss_weight (float): Weight for the pixel-level reconstruction loss.
+ disc_num_layers (int): Number of layers in the discriminator.
+ disc_in_channels (int): Number of input channels for the discriminator.
+ disc_factor (float): Scaling factor for the discriminator loss.
+ disc_weight (float): Weight applied to the discriminator gradient balancing.
+ perceptual_weight (float): Weight for the LPIPS perceptual loss.
+ use_actnorm (bool): Whether to use actnorm in the discriminator.
+ disc_conditional (bool): Whether the discriminator is conditional on an additional input.
+ disc_loss (str): Type of GAN loss to use ("hinge" or "vanilla").
+ """
+ super().__init__()
+ assert disc_loss in ["hinge", "vanilla"]
+ self.kl_weight = kl_weight
+ self.pixel_weight = pixelloss_weight
+ self.perceptual_loss = LPIPS().eval()
+ self.perceptual_weight = perceptual_weight
+ # output log variance
+ self.logvar = nn.Parameter(torch.ones(1) * logvar_init)
+
+ self.discriminator = NLayerDiscriminator(
+ input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=use_actnorm
+ ).apply(weights_init)
+ self.discriminator_iter_start = disc_start
+ self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
+ self.disc_factor = disc_factor
+ self.discriminator_weight = disc_weight
+ self.disc_conditional = disc_conditional
+
+ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
+ """
+ Computes an adaptive weight that balances the reconstruction (NLL) and the
+ adversarial (GAN) losses. This ensures stable training by adjusting the
+ impact of the discriminator’s gradient on the generator.
+
+ Args:
+ nll_loss (torch.Tensor): The negative log-likelihood loss.
+ g_loss (torch.Tensor): The generator (adversarial) loss.
+ last_layer (torch.nn.Parameter, optional): Last layer parameters of the model
+ for gradient-based calculations. If None, uses self.last_layer[0].
+
+ Returns:
+ torch.Tensor: The computed adaptive weight for balancing the discriminator.
+ """
+ if last_layer is not None:
+ nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+ else:
+ nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
+ g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
+
+ d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
+ d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
+ d_weight = d_weight * self.discriminator_weight
+ return d_weight
+
+ def forward(
+ self, inputs, reconstructions, posteriors, optimizer_idx, global_step, last_layer=None, cond=None, weights=None
+ ):
+ """
+ Forward pass for computing the combined loss. Depending on the optimizer index,
+ this either computes the generator loss (including pixel, perceptual, KL, and
+ adversarial terms) or the discriminator loss.
+
+ Args:
+ inputs (torch.Tensor): Original inputs to reconstruct.
+ reconstructions (torch.Tensor): Reconstructed outputs from the model.
+ posteriors (object): Posteriors from the VAE model for KL computation.
+ optimizer_idx (int): Indicates which optimizer is being updated
+ (0 for generator, 1 for discriminator).
+ global_step (int): Current training iteration step.
+ last_layer (torch.nn.Parameter, optional): The last layer's parameters for
+ adaptive weight calculation.
+ cond (torch.Tensor, optional): Conditional input for the discriminator.
+ weights (torch.Tensor, optional): Sample-wise weighting for the losses.
+
+ Returns:
+ (torch.Tensor, dict): A tuple of (loss, log_dict) where loss is the computed loss
+ for the current optimizer and log_dict is a dictionary of intermediate values
+ for logging and debugging.
+ """
+ rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
+ if self.perceptual_weight > 0:
+ p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
+ rec_loss = rec_loss + self.perceptual_weight * p_loss
+
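+        # Negative log-likelihood under a Gaussian with a learnable log-variance: rec_loss / exp(logvar) + logvar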
+ nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
+ weighted_nll_loss = nll_loss
+ if weights is not None:
+ weighted_nll_loss = weights * nll_loss
+ weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
+ nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+ kl_loss = posteriors.kl()
+ kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
+
+ # now the GAN part
+ if optimizer_idx == 0:
+ # generator update
+ if cond is None:
+ assert not self.disc_conditional
+ logits_fake = self.discriminator(reconstructions.contiguous())
+ else:
+ assert self.disc_conditional
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
+ g_loss = -torch.mean(logits_fake)
+
+ if self.disc_factor > 0.0:
+ try:
+ d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
+ except RuntimeError:
+ assert not self.training
+ d_weight = torch.tensor(0.0)
+ else:
+ d_weight = torch.tensor(0.0)
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss
+
+ log = {
+ "total_loss": loss.clone().detach().mean(),
+ "logvar": self.logvar.detach().item(),
+ "kl_loss": kl_loss.detach().mean(),
+ "nll_loss": nll_loss.detach().mean(),
+ "rec_loss": rec_loss.detach().mean(),
+ "d_weight": d_weight.detach(),
+ "disc_factor": torch.tensor(disc_factor),
+ "g_loss": g_loss.detach().mean(),
+ }
+ return loss, log
+
+ if optimizer_idx == 1:
+ # discriminator update
+ if cond is None:
+ logits_real = self.discriminator(inputs.contiguous().detach())
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
+ else:
+ logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
+ logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
+
+ disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+ d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
+
+ log = {
+ "disc_loss": d_loss.clone().detach().mean(),
+ "logits_real": logits_real.detach().mean(),
+ "logits_fake": logits_fake.detach().mean(),
+ }
+ return d_loss, log
diff --git a/nemo/collections/diffusion/vae/diffusers_vae.py b/nemo/collections/diffusion/vae/diffusers_vae.py
index 19a056d4a682..fe8f50ce658b 100644
--- a/nemo/collections/diffusion/vae/diffusers_vae.py
+++ b/nemo/collections/diffusion/vae/diffusers_vae.py
@@ -18,12 +18,44 @@
class AutoencoderKLVAE(torch.nn.Module):
+ """
+ A class that wraps the AutoencoderKL model and provides a decode method.
+
+ Attributes:
+ vae (AutoencoderKL): The underlying AutoencoderKL model loaded from a pretrained path.
+ """
+
def __init__(self, path):
+ """
+ Initialize the AutoencoderKLVAE instance.
+
+ Args:
+ path (str): The path to the pretrained AutoencoderKL model.
+ """
super().__init__()
self.vae = AutoencoderKL.from_pretrained(path, torch_dtype=torch.bfloat16)
@torch.no_grad()
def decode(self, x):
+ """
+ Decode a latent representation using the underlying VAE model.
+
+ This method takes a latent tensor `x` and decodes it into an image.
+ If `x` has a temporal dimension `T` of 1, it
+ rearranges the tensor before and after decoding.
+
+ Args:
+ x (torch.Tensor): A tensor of shape (B, C, T, H, W), where:
+ B = batch size
+ C = number of channels
+ T = temporal dimension
+ H = height
+ W = width
+
+ Returns:
+ torch.Tensor: Decoded image tensor in (B, C, T, H, W) layout.
+ """
+
B, C, T, H, W = x.shape
if T == 1:
x = rearrange(x, 'b c t h w -> (b t) c h w')
diff --git a/nemo/collections/diffusion/vae/readme.rst b/nemo/collections/diffusion/vae/readme.rst
new file mode 100644
index 000000000000..ac0f2b2f5e71
--- /dev/null
+++ b/nemo/collections/diffusion/vae/readme.rst
@@ -0,0 +1,131 @@
+===================================
+Pretraining Variational AutoEncoder
+===================================
+
+Variational Autoencoder (VAE) is a data compression technique that compresses high-resolution images into a lower-dimensional latent space, capturing essential features while reducing dimensionality. This allows image data to be stored and processed efficiently. VAEs have been integral to training Stable Diffusion (SD) models, significantly reducing their computational requirements. For instance, SDXL uses a VAE that reduces image dimensions by 8x, greatly optimizing training and inference. In this repository, we provide training code to pretrain a VAE from scratch, enabling users to achieve higher spatial compression ratios, such as 16x or 32x.
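+
+For example, the 16x VAE configuration provided in this directory (``vae16x/config.json``) encodes a 1024x1024x3 image into a 64x64 latent with 16 channels.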
+
+Installation
+============
+
+Please pull the latest NeMo Docker container to get started; see details about the NeMo container `here `_.
+
+Validation
+==========
+We also provide a validation script so you can quickly evaluate our pretrained 16x VAE model on a 50K-image dataset. Once you have started the Docker container, run the following command to start the evaluation.
+
+.. code-block:: bash
+
+ torchrun --nproc-per-node 8 nemo/collections/diffusion/vae/validate_vae.py --yes data.path=path/to/validation/data log.log_dir=/path/to/checkpoint
+
+Configure the following variables:
+
+
+1. ``data.path``: Set this to the directory containing your test data (e.g., `.jpg` or `.png` files). The original and VAE-reconstructed images will be logged side by side in Weights & Biases (wandb).
+
+2. ``log.log_dir``: Set this to the directory containing the pretrained checkpoint. You can find our pretrained checkpoint at ``TODO by ethan``
+
+Here are some sample reconstructions produced by our pretrained VAE.
+
+``Left``: Original Image, ``Right``: 16x VAE Reconstructed Image
+
+.. list-table::
+ :align: center
+
+ * - .. image:: https://github.com/user-attachments/assets/08122f5b-2e65-4d65-87d7-eceae9d158fb
+ :width: 1400
+ :align: center
+ - .. image:: https://github.com/user-attachments/assets/6e805a0d-8783-4d24-a65b-d96a6ba1555d
+ :width: 1400
+ :align: center
+ - .. image:: https://github.com/user-attachments/assets/aab1ef33-35da-444d-90ee-ba3ad58a6b2d
+ :width: 1400
+ :align: center
+
+Data Preparation
+================
+
+1. We expect data in the form of WebDataset tar files. If you have a folder of images, you can use ``tar`` to package them into WebDataset tar files with the following layout (a packaging sketch follows the layout):
+
+ .. code-block:: bash
+
+ 000000.tar
+ ├── 1.jpg
+ ├── 2.jpg
+ 000001.tar
+ ├── 3.jpg
+ ├── 4.jpg
+
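+ For example, a folder of images could be packaged into numbered shards along these lines (shard size, paths, and file names are illustrative):
+
+ .. code-block:: bash
+
+    # Illustrative only: pack every 10000 images into one numbered WebDataset shard.
+    cd images
+    find . -maxdepth 1 -name '*.jpg' -printf '%f\n' | sort | split -l 10000 -d -a 6 - filelist_
+    i=0
+    for f in filelist_*; do
+        tar -cf "$(printf '../%06d.tar' "$i")" --files-from="$f"
+        i=$((i+1))
+    done
+    rm filelist_*
+    cd ..
+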
+2. Next, index the WebDataset with `energon `_. Navigate to the dataset directory and run the following command:
+
+ .. code-block:: bash
+
+ energon prepare . --num-workers 8 --shuffle-tars
+
+3. Then select the dataset type `ImageWebdataset` and specify the type `jpg`. Below is an example of the interactive setup:
+
+ .. code-block:: bash
+
+ Found 2925 tar files in total. The first and last ones are:
+ - 000000.tar
+ - 002924.tar
+ If you want to exclude some of them, cancel with ctrl+c and specify an exclude filter in the command line.
+ Please enter a desired train/val/test split like "0.5, 0.2, 0.3" or "8,1,1": 99,1,0
+ Indexing shards [####################################] 2925/2925
+ Sample 0, keys:
+ - jpg
+ Sample 1, keys:
+ - jpg
+ Found the following part types in the dataset: jpg
+ Do you want to create a dataset.yaml interactively? [Y/n]:
+ The following dataset classes are available:
+ 0. CaptioningWebdataset
+ 1. CrudeWebdataset
+ 2. ImageClassificationWebdataset
+ 3. ImageWebdataset
+ 4. InterleavedWebdataset
+ 5. MultiChoiceVQAWebdataset
+ 6. OCRWebdataset
+ 7. SimilarityInterleavedWebdataset
+ 8. TextWebdataset
+ 9. VQAOCRWebdataset
+ 10. VQAWebdataset
+ 11. VidQAWebdataset
+ Please enter a number to choose a class: 3
+ The dataset you selected uses the following sample type:
+
+ @dataclass
+ class ImageSample(Sample):
+ """Sample type for an image, e.g. for image reconstruction."""
+
+ #: The input image tensor in the shape (C, H, W)
+ image: torch.Tensor
+
+ Do you want to set a simple field_map[Y] (or write your own sample_loader [n])? [Y/n]:
+
+ For each field, please specify the corresponding name in the WebDataset.
+ Available types in WebDataset: jpg
+ Leave empty for skipping optional field
+ You may also access json fields e.g. by setting the field to: json[field][field]
+ You may also specify alternative fields e.g. by setting to: jpg,png
+ Please enter the field_map for ImageWebdataset:
+ Please enter a webdataset field name for 'image' ():
+ That type doesn't exist in the WebDataset. Please try again.
+ Please enter a webdataset field name for 'image' (): jpg
+ Done
+
+4. Finally, you can use the indexed dataset to train the VAE model. Specify ``data.path=/path/to/dataset`` when launching the training script ``train_vae.py``.
+
+Training
+========
+
+We provide a sample training script for launching multi-node training. Simply configure ``data.path`` to point to your prepared dataset to get started.
+
+.. code-block:: bash
+
+ bash nemo/collections/diffusion/vae/train_vae.sh \
+ data.path=xxx
+
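+Other fields of the ``train_vae`` recipe can be overridden on the command line in the same way. For example (illustrative values; the field names come from the default factory in ``train_vae.py``):
+
+.. code-block:: bash
+
+   bash nemo/collections/diffusion/vae/train_vae.sh \
+       data.path=/path/to/dataset \
+       data.global_batch_size=24 \
+       optim.config.lr=5e-6 \
+       log.log_dir=nemo_experiments/train_vae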
+
+
+
+
diff --git a/nemo/collections/diffusion/vae/test_autovae.py b/nemo/collections/diffusion/vae/test_autovae.py
new file mode 100644
index 000000000000..fa414c20c4ce
--- /dev/null
+++ b/nemo/collections/diffusion/vae/test_autovae.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from autovae import VAEGenerator
+
+
+class TestVAEGenerator(unittest.TestCase):
+ """Unit tests for the VAEGenerator class."""
+
+ def setUp(self):
+ # Common setup for tests
+ self.input_resolution = 1024
+ self.compression_ratio = 8
+ self.generator = VAEGenerator(input_resolution=self.input_resolution, compression_ratio=self.compression_ratio)
+
+ def test_initialization_valid(self):
+ """Test that valid initialization parameters set the correct properties."""
+ generator = VAEGenerator(input_resolution=1024, compression_ratio=8)
+ self.assertEqual(generator.input_resolution, 1024)
+ self.assertEqual(generator.compression_ratio, 8)
+
+ generator = VAEGenerator(input_resolution=2048, compression_ratio=16)
+ self.assertEqual(generator.input_resolution, 2048)
+ self.assertEqual(generator.compression_ratio, 16)
+
+ def test_initialization_invalid(self):
+ """Test that invalid initialization parameters raise an error."""
+ with self.assertRaises(NotImplementedError):
+ VAEGenerator(input_resolution=4096, compression_ratio=16)
+
+ def test_generate_input(self):
+ """Test that _generate_input produces a tensor with the correct shape and device."""
+ input_tensor = self.generator._generate_input()
+ expected_shape = (1, 3, self.input_resolution, self.input_resolution)
+ self.assertEqual(input_tensor.shape, expected_shape)
+ self.assertEqual(input_tensor.dtype, torch.float16)
+ self.assertEqual(input_tensor.device.type, "cuda")
+
+ def test_count_parameters(self):
+ """Test that _count_parameters correctly counts model parameters."""
+ model = torch.nn.Sequential(torch.nn.Linear(10, 20), torch.nn.ReLU(), torch.nn.Linear(20, 5))
+ expected_param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ param_count = self.generator._count_parameters(model)
+ self.assertEqual(param_count, expected_param_count)
+
+ def test_load_base_json_skeleton(self):
+ """Test that _load_base_json_skeleton returns the correct skeleton."""
+ skeleton = self.generator._load_base_json_skeleton()
+ expected_keys = {
+ "_class_name",
+ "_diffusers_version",
+ "_name_or_path",
+ "act_fn",
+ "block_out_channels",
+ "down_block_types",
+ "force_upcast",
+ "in_channels",
+ "latent_channels",
+ "layers_per_block",
+ "norm_num_groups",
+ "out_channels",
+ "sample_size",
+ "scaling_factor",
+ "up_block_types",
+ }
+ self.assertEqual(set(skeleton.keys()), expected_keys)
+
+ def test_generate_all_combinations(self):
+ """Test that _generate_all_combinations generates all possible combinations."""
+ attr = {"layers_per_block": [1, 2], "latent_channels": [4, 8]}
+ combinations = self.generator._generate_all_combinations(attr)
+ expected_combinations = [
+ {"layers_per_block": 1, "latent_channels": 4},
+ {"layers_per_block": 1, "latent_channels": 8},
+ {"layers_per_block": 2, "latent_channels": 4},
+ {"layers_per_block": 2, "latent_channels": 8},
+ ]
+ self.assertEqual(len(combinations), len(expected_combinations))
+ for combo in expected_combinations:
+ self.assertIn(combo, combinations)
+
+ def test_assign_attributes(self):
+ """Test that _assign_attributes correctly assigns attributes to the skeleton."""
+ choice = {
+ "down_block_types": ["DownEncoderBlock2D"] * 4,
+ "up_block_types": ["UpDecoderBlock2D"] * 4,
+ "block_out_channels": [64, 128, 256, 512],
+ "layers_per_block": 2,
+ "latent_channels": 16,
+ }
+ skeleton = self.generator._assign_attributes(choice)
+ self.assertEqual(skeleton["down_block_types"], choice["down_block_types"])
+ self.assertEqual(skeleton["up_block_types"], choice["up_block_types"])
+ self.assertEqual(skeleton["block_out_channels"], choice["block_out_channels"])
+ self.assertEqual(skeleton["layers_per_block"], choice["layers_per_block"])
+ self.assertEqual(skeleton["latent_channels"], choice["latent_channels"])
+
+ def test_search_space_16x1024(self):
+ """Test that _search_space_16x1024 returns the correct search space."""
+ search_space = self.generator._search_space_16x1024()
+ expected_keys = {
+ "down_block_types",
+ "up_block_types",
+ "block_out_channels",
+ "layers_per_block",
+ "latent_channels",
+ }
+ self.assertEqual(set(search_space.keys()), expected_keys)
+ self.assertTrue(all(isinstance(v, list) for v in search_space.values()))
+
+ def test_sort_data_in_place(self):
+ """Test that _sort_data_in_place correctly sorts data based on the specified mode."""
+ data = [
+ {"param_diff": 10, "cuda_mem_diff": 100},
+ {"param_diff": 5, "cuda_mem_diff": 50},
+ {"param_diff": -3, "cuda_mem_diff": 30},
+ {"param_diff": 7, "cuda_mem_diff": 70},
+ ]
+ # Test sorting by absolute parameter difference
+ self.generator._sort_data_in_place(data, mode="abs_param_diff")
+ expected_order_param = [-3, 5, 7, 10]
+ actual_order_param = [item["param_diff"] for item in data]
+ self.assertEqual(actual_order_param, expected_order_param)
+
+ # Test sorting by absolute CUDA memory difference
+ self.generator._sort_data_in_place(data, mode="abs_cuda_mem_diff")
+ expected_order_mem = [30, 50, 70, 100]
+ actual_order_mem = [item["cuda_mem_diff"] for item in data]
+ self.assertEqual(actual_order_mem, expected_order_mem)
+
+ # Test sorting by mean squared error (MSE)
+ self.generator._sort_data_in_place(data, mode="mse")
+ expected_order_mse = [-3, 5, 7, 10] # Computed based on MSE values
+ actual_order_mse = [item["param_diff"] for item in data]
+ self.assertEqual(actual_order_mse, expected_order_mse)
+
+ def test_search_for_target_vae_invalid(self):
+ """Test that search_for_target_vae raises an error when no budget is specified."""
+ with self.assertRaises(ValueError):
+ self.generator.search_for_target_vae(parameters_budget=0, cuda_max_mem=0)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/nemo/collections/diffusion/vae/train_vae.py b/nemo/collections/diffusion/vae/train_vae.py
new file mode 100644
index 000000000000..c9748407b011
--- /dev/null
+++ b/nemo/collections/diffusion/vae/train_vae.py
@@ -0,0 +1,365 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Callable, Dict, Optional, Sequence, Tuple
+
+import nemo_run as run
+import torch
+import torch.distributed
+import torch.utils.checkpoint
+import torchvision
+import wandb
+from autovae import VAEGenerator
+from contperceptual_loss import LPIPSWithDiscriminator
+from diffusers import AutoencoderKL
+from megatron.core import parallel_state
+from megatron.core.transformer.enums import ModelType
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.energon import DefaultTaskEncoder, ImageSample
+from torch import Tensor, nn
+
+from nemo import lightning as nl
+from nemo.collections import llm
+from nemo.collections.diffusion.data.diffusion_energon_datamodule import DiffusionDataModule
+from nemo.collections.diffusion.train import pretrain
+from nemo.collections.llm.gpt.model.base import GPTModel
+from nemo.lightning.io.mixin import IOMixin
+from nemo.lightning.megatron_parallel import DataT, MegatronLossReduction, ReductionT
+from nemo.lightning.pytorch.optim import OptimizerModule
+
+
+class AvgLossReduction(MegatronLossReduction):
+ """Performs average loss reduction across micro-batches."""
+
+ def forward(self, batch: DataT, forward_out: Tensor) -> Tuple[Tensor, ReductionT]:
+ """
+ Forward pass for loss reduction.
+
+ Args:
+ batch: The batch of data.
+ forward_out: The output tensor from forward computation.
+
+ Returns:
+ A tuple of (loss, reduction dictionary).
+ """
+ loss = forward_out.mean()
+ return loss, {"avg": loss}
+
+ def reduce(self, losses_reduced_per_micro_batch: Sequence[ReductionT]) -> Tensor:
+ """
+ Reduce losses across multiple micro-batches by averaging them.
+
+ Args:
+ losses_reduced_per_micro_batch: A sequence of loss dictionaries.
+
+ Returns:
+ The averaged loss tensor.
+ """
+ losses = torch.stack([loss["avg"] for loss in losses_reduced_per_micro_batch])
+ return losses.mean()
+
+
+class VAE(MegatronModule):
+ """Variational Autoencoder (VAE) module."""
+
+ def __init__(self, config, pretrained_model_name_or_path, search_vae=False):
+ """
+ Initialize the VAE model.
+
+ Args:
+ config: Transformer configuration.
+ pretrained_model_name_or_path: Path or name of the pretrained model.
+ search_vae: Flag to indicate whether to search for a target VAE using AutoVAE.
+ """
+ super().__init__(config)
+ if search_vae:
+ # Get VAE automatically from AutoVAE
+ self.vae = VAEGenerator(input_resolution=1024, compression_ratio=16)
+ # The line below is commented out because 'generator' is undefined in the original code snippet.
+ # self.vae = generator.search_for_target_vae(parameters_budget=895.178707, cuda_max_mem=0)
+ else:
+ self.vae = AutoencoderKL.from_config(pretrained_model_name_or_path, weight_dtype=torch.bfloat16)
+
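+        # Warm-start from the SDXL VAE below: copy only the pretrained weights whose shapes match the new architecture.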
+ sdxl_vae = AutoencoderKL.from_pretrained(
+ 'stabilityai/stable-diffusion-xl-base-1.0', subfolder="vae", weight_dtype=torch.bfloat16
+ )
+ sd_dict = sdxl_vae.state_dict()
+ vae_dict = self.vae.state_dict()
+ pre_dict = {k: v for k, v in sd_dict.items() if (k in vae_dict) and (vae_dict[k].numel() == v.numel())}
+ self.vae.load_state_dict(pre_dict, strict=False)
+ del sdxl_vae
+
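+        # Combined pixel, perceptual (LPIPS), KL, and adversarial loss; the adversarial term is enabled only after disc_start steps.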
+ self.vae_loss = LPIPSWithDiscriminator(
+ disc_start=50001,
+ logvar_init=0.0,
+ kl_weight=0.000001,
+ pixelloss_weight=1.0,
+ disc_num_layers=3,
+ disc_in_channels=3,
+ disc_factor=1.0,
+ disc_weight=0.5,
+ perceptual_weight=1.0,
+ use_actnorm=False,
+ disc_conditional=False,
+ disc_loss="hinge",
+ )
+
+ def forward(self, target, global_step):
+ """
+ Forward pass through the VAE.
+
+ Args:
+ target: Target images.
+ global_step: Current global step.
+
+ Returns:
+ A tuple (aeloss, log_dict_ae, pred) containing the loss, log dictionary, and predictions.
+ """
+ posterior = self.vae.encode(target).latent_dist
+ z = posterior.sample()
+ pred = self.vae.decode(z).sample
+ aeloss, log_dict_ae = self.vae_loss(
+ inputs=target,
+ reconstructions=pred,
+ posteriors=posterior,
+ optimizer_idx=0,
+ global_step=global_step,
+ last_layer=self.vae.decoder.conv_out.weight,
+ )
+ return aeloss, log_dict_ae, pred
+
+ def set_input_tensor(self, input_tensor: Tensor) -> None:
+ """
+ Set input tensor.
+
+ Args:
+ input_tensor: The input tensor to the model.
+ """
+ pass
+
+
+class VAEModel(GPTModel):
+ """A GPTModel wrapper for the VAE."""
+
+ def __init__(
+ self,
+ pretrained_model_name_or_path: str,
+ optim: Optional[OptimizerModule] = None,
+ model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
+ ):
+ """
+ Initialize the VAEModel.
+
+ Args:
+ pretrained_model_name_or_path: Path or name of the pretrained model.
+ optim: Optional optimizer module.
+ model_transform: Optional function to transform the model.
+ """
+ self.pretrained_model_name_or_path = pretrained_model_name_or_path
+ config = TransformerConfig(num_layers=1, hidden_size=1, num_attention_heads=1)
+ self.model_type = ModelType.encoder_or_decoder
+ super().__init__(config, optim=optim, model_transform=model_transform)
+
+ def configure_model(self) -> None:
+ """Configure the model by initializing the module."""
+ if not hasattr(self, "module"):
+ self.module = VAE(self.config, self.pretrained_model_name_or_path)
+
+ def data_step(self, dataloader_iter) -> Dict[str, Any]:
+ """
+ Perform a single data step to fetch a batch from the iterator.
+
+ Args:
+ dataloader_iter: The dataloader iterator.
+
+ Returns:
+ A dictionary with 'pixel_values' ready for the model.
+ """
+ batch = next(dataloader_iter)[0]
+ return {'pixel_values': batch.image.to(device='cuda', dtype=torch.bfloat16, non_blocking=True)}
+
+ def forward(self, *args, **kwargs):
+ """
+ Forward pass through the underlying module.
+
+ Args:
+ *args: Variable length argument list.
+ **kwargs: Arbitrary keyword arguments.
+
+ Returns:
+ The result of forward pass of self.module.
+ """
+ return self.module(*args, **kwargs)
+
+ def training_step(self, batch, batch_idx=None) -> torch.Tensor:
+ """
+ Perform a single training step.
+
+ Args:
+ batch: The input batch.
+ batch_idx: Batch index.
+
+ Returns:
+ The loss tensor.
+ """
+ loss, log_dict_ae, pred = self(batch["pixel_values"], self.global_step)
+
+ if torch.distributed.get_rank() == 0:
+ self.log_dict(log_dict_ae)
+
+ return loss
+
+ def validation_step(self, batch, batch_idx=None) -> torch.Tensor:
+ """
+ Perform a single validation step.
+
+ Args:
+ batch: The input batch.
+ batch_idx: Batch index.
+
+ Returns:
+ The loss tensor.
+ """
+ loss, log_dict_ae, pred = self(batch["pixel_values"], self.global_step)
+
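+        # Stack originals and reconstructions, undo the -0.5 shift applied in ImageTaskEncoder, and clamp to [0, 1] for logging.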
+ image = torch.cat([batch["pixel_values"].cpu(), pred.cpu()], axis=0)
+ image = (image + 0.5).clamp(0, 1)
+
+ # wandb is on the last rank for megatron, first rank for nemo
+ wandb_rank = 0
+
+ if parallel_state.get_data_parallel_src_rank() == wandb_rank:
+ if torch.distributed.get_rank() == wandb_rank:
+ gather_list = [None for _ in range(parallel_state.get_data_parallel_world_size())]
+ else:
+ gather_list = None
+ torch.distributed.gather_object(
+ image, gather_list, wandb_rank, group=parallel_state.get_data_parallel_group()
+ )
+ if gather_list is not None:
+ self.log_dict(log_dict_ae)
+ wandb.log(
+ {
+ "Original (left), Reconstruction (right)": [
+ wandb.Image(torchvision.utils.make_grid(image)) for _, image in enumerate(gather_list)
+ ]
+ },
+ )
+
+ return loss
+
+ @property
+ def training_loss_reduction(self) -> AvgLossReduction:
+ """Returns the loss reduction method for training."""
+ if not self._training_loss_reduction:
+ self._training_loss_reduction = AvgLossReduction()
+ return self._training_loss_reduction
+
+ @property
+ def validation_loss_reduction(self) -> AvgLossReduction:
+ """Returns the loss reduction method for validation."""
+ if not self._validation_loss_reduction:
+ self._validation_loss_reduction = AvgLossReduction()
+ return self._validation_loss_reduction
+
+ def on_validation_model_zero_grad(self) -> None:
+ """
+ Hook to handle zero grad on validation model step.
+ Used here to skip first validation on resume.
+ """
+ super().on_validation_model_zero_grad()
+ if self.trainer.ckpt_path is not None and getattr(self, '_restarting_skip_val_flag', True):
+ self.trainer.sanity_checking = True
+ self._restarting_skip_val_flag = False
+
+
+def crop_image(img, divisor=16):
+ """
+ Crop the image so that both dimensions are divisible by the given divisor.
+
+ Args:
+ img: Image tensor.
+ divisor: The divisor to use for cropping.
+
+ Returns:
+ The cropped image tensor.
+ """
+ h, w = img.shape[-2], img.shape[-1]
+
+ delta_h = h % divisor
+ delta_w = w % divisor
+
+ delta_h_top = delta_h // 2
+ delta_h_bottom = delta_h - delta_h_top
+
+ delta_w_left = delta_w // 2
+ delta_w_right = delta_w - delta_w_left
+
+ img_cropped = img[..., delta_h_top : h - delta_h_bottom, delta_w_left : w - delta_w_right]
+
+ return img_cropped
+
+
+class ImageTaskEncoder(DefaultTaskEncoder, IOMixin):
+ """Image task encoder that crops and normalizes the image."""
+
+ def encode_sample(self, sample: ImageSample) -> ImageSample:
+ """
+ Encode a single image sample by cropping and shifting its values.
+
+ Args:
+ sample: An image sample.
+
+ Returns:
+ The transformed image sample.
+ """
+ sample = super().encode_sample(sample)
+ sample.image = crop_image(sample.image, 16)
+ sample.image -= 0.5
+ return sample
+
+
+@run.cli.factory(target=llm.train)
+def train_vae() -> run.Partial:
+ """
+ Training factory function for VAE.
+
+ Returns:
+ A run.Partial recipe for training.
+ """
+ recipe = pretrain()
+ recipe.model = run.Config(
+ VAEModel,
+ pretrained_model_name_or_path='nemo/collections/diffusion/vae/vae16x/config.json',
+ )
+ recipe.data = run.Config(
+ DiffusionDataModule,
+ task_encoder=run.Config(ImageTaskEncoder),
+ global_batch_size=24,
+ num_workers=10,
+ )
+ recipe.optim.lr_scheduler = run.Config(nl.lr_scheduler.WarmupHoldPolicyScheduler, warmup_steps=100, hold_steps=1e9)
+ recipe.optim.config.lr = 5e-6
+ recipe.optim.config.weight_decay = 1e-2
+ recipe.log.log_dir = 'nemo_experiments/train_vae'
+ recipe.trainer.val_check_interval = 1000
+ recipe.trainer.callbacks[0].every_n_train_steps = 1000
+
+ return recipe
+
+
+if __name__ == "__main__":
+ run.cli.main(llm.train, default_factory=train_vae)
diff --git a/nemo/collections/diffusion/vae/train_vae.sh b/nemo/collections/diffusion/vae/train_vae.sh
new file mode 100644
index 000000000000..3f5a46ab9f65
--- /dev/null
+++ b/nemo/collections/diffusion/vae/train_vae.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+#SBATCH -p batch -A coreai_dlalgo_llm -t 4:00:00 --nodes=16 --exclusive --mem=0 --overcommit --gpus-per-node 8 --ntasks-per-node=8 --dependency=singleton
+
+export WANDB_RESUME=allow
+export WANDB_NAME=train_vae
+
+DIR=`pwd`
+
+srun --signal=TERM@300 -l --container-image ${IMAGE} --container-mounts "/lustre:/lustre/,/home:/home" --no-container-mount-home --mpi=pmix bash -c "cd ${DIR} ; python -u nemo/collections/diffusion/vae/train_vae.py --yes $*"
diff --git a/nemo/collections/diffusion/vae/vae16x/config.json b/nemo/collections/diffusion/vae/vae16x/config.json
new file mode 100644
index 000000000000..9b363564eed2
--- /dev/null
+++ b/nemo/collections/diffusion/vae/vae16x/config.json
@@ -0,0 +1,35 @@
+{
+ "_class_name": "AutoencoderKL",
+ "_diffusers_version": "0.20.0.dev0",
+ "_name_or_path": "../sdxl-vae/",
+ "act_fn": "silu",
+ "block_out_channels": [
+ 128,
+ 256,
+ 512,
+ 1024,
+ 2048
+ ],
+ "down_block_types": [
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D"
+ ],
+ "force_upcast": false,
+ "in_channels": 3,
+ "latent_channels": 16,
+ "layers_per_block": 2,
+ "norm_num_groups": 32,
+ "out_channels": 3,
+ "sample_size": 1024,
+ "scaling_factor": 0.13025,
+ "up_block_types": [
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D"
+ ]
+ }
diff --git a/nemo/collections/diffusion/vae/validate_vae.py b/nemo/collections/diffusion/vae/validate_vae.py
new file mode 100644
index 000000000000..dd143d9e0b33
--- /dev/null
+++ b/nemo/collections/diffusion/vae/validate_vae.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import nemo_run as run
+from nemo.collections import llm
+from nemo.collections.diffusion.vae.train_vae import train_vae
+
+
+@run.cli.factory(target=llm.validate)
+def validate_vae() -> run.Partial:
+ """
+ Create a partial function for validating a VAE (Variational Autoencoder) model.
+
+ This function uses the training recipe defined in `train_vae()` to set up
+ the model, data, trainer, logging, and optimization configurations for
+ validation. It returns a Partial object that can be used by the NeMo run CLI
+ to execute the validation procedure on the provided model and data.
+
+ Returns:
+ run.Partial: A partial object configured with llm.validate target
+ and all necessary arguments extracted from the VAE training recipe.
+ """
+ recipe = train_vae()
+ return run.Partial(
+ llm.validate,
+ model=recipe.model,
+ data=recipe.data,
+ trainer=recipe.trainer,
+ log=recipe.log,
+ optim=recipe.optim,
+ tokenizer=None,
+ resume=recipe.resume,
+ model_transform=None,
+ )
+
+
+if __name__ == "__main__":
+ run.cli.main(llm.validate, default_factory=validate_vae)
diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py
index 4a792b44b056..5f92dcf81955 100644
--- a/nemo/collections/llm/__init__.py
+++ b/nemo/collections/llm/__init__.py
@@ -18,9 +18,12 @@
safe_import("transformer_engine")
from nemo.collections.llm import peft
-from nemo.collections.llm.bert.data import BERTMockDataModule, BERTPreTrainingDataModule
+from nemo.collections.llm.bert.data import BERTMockDataModule, BERTPreTrainingDataModule, SpecterDataModule
from nemo.collections.llm.bert.model import (
BertConfig,
+ BertEmbeddingLargeConfig,
+ BertEmbeddingMiniConfig,
+ BertEmbeddingModel,
BertModel,
HuggingFaceBertBaseConfig,
HuggingFaceBertConfig,
@@ -135,7 +138,15 @@
from nemo.collections.llm.t5.data import MockDataModule as T5MockDataModule
from nemo.collections.llm.t5.data import PreTrainingDataModule as T5PreTrainingDataModule
from nemo.collections.llm.t5.data import SquadDataModule as T5SquadDataModule
-from nemo.collections.llm.t5.model import T5Config, T5Model, t5_data_step, t5_forward_step
+from nemo.collections.llm.t5.model import (
+ T5Config,
+ T5Config3B,
+ T5Config11B,
+ T5Config220M,
+ T5Model,
+ t5_data_step,
+ t5_forward_step,
+)
__all__ = [
"MockDataModule",
@@ -146,8 +157,14 @@
"gpt_forward_step",
"T5Model",
"T5Config",
+ "T5Config220M",
+ "T5Config3B",
+ "T5Config11B",
"BertConfig",
+ "BertEmbeddingModel",
"BertModel",
+ "BertEmbeddingLargeConfig",
+ "BertEmbeddingMiniConfig",
"t5_data_step",
"t5_forward_step",
"MaskedTokenLossReduction",
@@ -238,6 +255,7 @@
"MegatronBertLargeConfig",
"BERTMockDataModule",
"BERTPreTrainingDataModule",
+ "SpecterDataModule",
"DollyDataModule",
"tokenizer",
"mock",
diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index 7d7762edef3c..e84703c70352 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -326,7 +326,8 @@ def deploy(
model_type: str = "llama",
triton_model_name: str = "triton_model",
triton_model_version: Optional[int] = 1,
- triton_port: int = 8000,
+ triton_http_port: int = 8000,
+ triton_grpc_port: int = 8001,
triton_http_address: str = "0.0.0.0",
triton_request_timeout: int = 60,
triton_model_repository: Path = None,
@@ -337,16 +338,10 @@ def deploy(
max_input_len: int = 256,
max_output_len: int = 256,
max_batch_size: int = 8,
- start_rest_service: bool = True,
- rest_service_http_address: str = "0.0.0.0",
- rest_service_port: int = 8080,
- openai_format_response: bool = True,
output_generation_logits: bool = True,
):
"""
Deploys nemo model on a PyTriton server by converting the nemo ckpt to trtllm.
- Also starts rest service that is used to send OpenAI API compatible input request
- to the PyTiton server.
Args:
nemo_checkpoint (Path): Path for nemo checkpoint.
@@ -355,7 +350,8 @@ def deploy(
name is passed to the evalute method for the model to be accessible while sending evalution requests.
Default: 'triton_model'.
triton_model_version (Optional[int]): Version for the triton model. Default: 1.
- triton_port (int): Port for the PyTriton server. Default: 8000.
+ triton_http_port (int): HTTP port for the PyTriton server. Default: 8000.
+ triton_grpc_port (int): gRPC port for the PyTriton server. Default: 8001.
triton_http_address (str): HTTP address for the PyTriton server. Default: "0.0.0.0".
triton_request_timeout (int): Timeout in seconds for Triton server. Default: 60.
triton_model_repository (Path): Folder for the trt-llm conversion, trt-llm engine gets saved in this specified
@@ -367,10 +363,7 @@ def deploy(
max_input_len (int): Max input length of the model. Default: 256.
max_output_len (int): Max output length of the model. Default: 256.
max_batch_size (int): Max batch size of the model. Default: 8.
- start_rest_service (bool): Start rest service that is used to send evaluation requests to the PyTriton server.
Needs to be True to be able to run evaluation. Default: True.
- rest_service_http_address (str): HTTP address for the rest service. Default: "0.0.0.0".
- rest_service_port (int): Port for the rest service. Default: 8080.
openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to
be True while running evaluation. Default: True.
output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True.
@@ -380,16 +373,6 @@ def deploy(
from nemo.deploy import DeployPyTriton
unset_environment_variables()
- if start_rest_service:
- if triton_port == rest_service_port:
- logging.error("REST service port and Triton server port cannot use the same port.")
- return
- # Store triton ip, port and other args relevant for REST API as env vars to be accessible by rest_model_api.py
- os.environ["TRITON_HTTP_ADDRESS"] = triton_http_address
- os.environ["TRITON_PORT"] = str(triton_port)
- os.environ["TRITON_REQUEST_TIMEOUT"] = str(triton_request_timeout)
- os.environ["OPENAI_FORMAT_RESPONSE"] = str(openai_format_response)
- os.environ["OUTPUT_GENERATION_LOGITS"] = str(output_generation_logits)
triton_deployable = get_trtllm_deployable(
nemo_checkpoint,
@@ -411,7 +394,8 @@ def deploy(
triton_model_name=triton_model_name,
triton_model_version=triton_model_version,
max_batch_size=max_batch_size,
- port=triton_port,
+ http_port=triton_http_port,
+ grpc_port=triton_grpc_port,
address=triton_http_address,
)
@@ -422,26 +406,8 @@ def deploy(
logging.error("Error message has occurred during deploy function. Error message: " + str(error))
return
- uvicorn_supported = True
try:
- import uvicorn
- except ImportError as error:
- logging.warning(f"uvicorn could not be imported: {error}")
- uvicorn_supported = False
-
- try:
- logging.info("Model serving on Triton is will be started.")
- if start_rest_service and uvicorn_supported:
- try:
- logging.info("REST service will be started.")
- uvicorn.run(
- "nemo.deploy.service.rest_model_api:app",
- host=rest_service_http_address,
- port=rest_service_port,
- reload=True,
- )
- except Exception as error:
- logging.error("Error message has occurred during REST service start. Error message: " + str(error))
+ logging.info("Model serving on Triton will be started.")
nm.serve()
except Exception as error:
logging.error("Error message has occurred during deploy function. Error message: " + str(error))
@@ -453,7 +419,7 @@ def deploy(
def evaluate(
nemo_checkpoint_path: Path,
- url: str = "http://0.0.0.0:8080/v1",
+ url: str = "grpc://0.0.0.0:8001",
model_name: str = "triton_model",
eval_task: str = "gsm8k",
num_fewshot: Optional[int] = None,
@@ -473,10 +439,7 @@ def evaluate(
Args:
nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt
which is required to tokenize the evaluation input and output prompts.
- url (str): rest service url and port that were used in the deploy method above in the format:
- http://{rest_service_http}:{rest_service_port}. Post requests with evaluation input prompts
- (from lm-eval-harness) are sent to this url which is then passed to the model deployed on PyTriton server.
- The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server.
+ url (str): gRPC service URL that was used in the deploy method above, in the format grpc://{grpc_service_ip}:{grpc_port}.
model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as
triton_model_name passed to the deploy method above to be able to launch evaluation. Deafult: "triton_model".
eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k".
@@ -513,8 +476,6 @@ def evaluate(
# Get tokenizer from nemo ckpt. This works only with NeMo 2.0 ckpt.
tokenizer = io.load_context(nemo_checkpoint_path + "/context", subpath="model.tokenizer")
- # Wait for rest service to be ready before starting evaluation
- evaluation.wait_for_rest_service(rest_url=f"{url}/v1/health")
# Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate
model = evaluation.NeMoFWLMEval(
model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos
diff --git a/nemo/collections/llm/bert/data/__init__.py b/nemo/collections/llm/bert/data/__init__.py
index bd309ce4ab62..94c3721c30c1 100644
--- a/nemo/collections/llm/bert/data/__init__.py
+++ b/nemo/collections/llm/bert/data/__init__.py
@@ -1,4 +1,5 @@
from nemo.collections.llm.bert.data.mock import BERTMockDataModule
from nemo.collections.llm.bert.data.pre_training import BERTPreTrainingDataModule
+from nemo.collections.llm.bert.data.specter import SpecterDataModule
-__all__ = ["BERTPreTrainingDataModule", "BERTMockDataModule"]
+__all__ = ["BERTPreTrainingDataModule", "BERTMockDataModule", "SpecterDataModule"]
diff --git a/nemo/collections/llm/bert/data/core.py b/nemo/collections/llm/bert/data/core.py
new file mode 100644
index 000000000000..3d1adb47a042
--- /dev/null
+++ b/nemo/collections/llm/bert/data/core.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+
+from nemo.collections.nlp.data.information_retrieval.bert_embedding_dataset import BertEmbeddingDataset
+from nemo.lightning.base import NEMO_DATASETS_CACHE
+
+if TYPE_CHECKING:
+ from nemo.collections.common.tokenizers import TokenizerSpec
+
+
+def get_dataset_root(name: str) -> Path:
+ """Retrieve the root path for the dataset. Create the folder if not exists."""
+ output = Path(NEMO_DATASETS_CACHE) / name
+ output.mkdir(parents=True, exist_ok=True)
+
+ return output
+
+
+def create_sft_dataset(
+ path: Path,
+ tokenizer: "TokenizerSpec",
+ seq_length: int = 2048,
+ add_bos: bool = False,
+ add_eos: bool = True,
+ seed: int = 1234,
+ index_mapping_dir: Optional[str] = None,
+ truncation_method: str = 'right',
+ memmap_workers: int = 2,
+ data_type: str = 'train',
+ num_hard_negatives: int = 1,
+ **kwargs,
+) -> "BertEmbeddingDataset":
+ """Create BertEmbeddingDataset for SFT training."""
+
+ return BertEmbeddingDataset(
+ file_path=str(path),
+ tokenizer=tokenizer,
+ max_seq_length=seq_length,
+ add_bos=add_bos,
+ add_eos=add_eos,
+ memmap_workers=memmap_workers,
+ seed=seed,
+ index_mapping_dir=index_mapping_dir,
+ truncation_method=truncation_method,
+ data_type=data_type,
+ num_hard_negatives=num_hard_negatives,
+ **kwargs,
+ )
diff --git a/nemo/collections/llm/bert/data/fine_tuning.py b/nemo/collections/llm/bert/data/fine_tuning.py
new file mode 100644
index 000000000000..0edc9862f4f4
--- /dev/null
+++ b/nemo/collections/llm/bert/data/fine_tuning.py
@@ -0,0 +1,227 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from functools import lru_cache
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+import lightning.pytorch as pl
+from torch.utils.data import DataLoader
+
+from nemo.collections.common.tokenizers import AutoTokenizer
+from nemo.collections.llm.bert.data.core import create_sft_dataset
+from nemo.lightning.data import WrappedDataLoader
+from nemo.lightning.pytorch.plugins import MegatronDataSampler
+from nemo.utils import logging
+
+if TYPE_CHECKING:
+ from nemo.collections.common.tokenizers import TokenizerSpec
+
+
+class FineTuningDataModule(pl.LightningDataModule):
+ """Base class for fine-tuning an Bert.
+
+ This class provides a foundation for building custom data modules for fine-tuning Nemo NLP models. It inherits from
+ `pl.LightningDataModule` from the PyTorch Lightning library and handles data loading, preprocessing, and batch
+ creation for training, validation, and testing.
+
+ Args:
+ dataset_root (Union[str, Path]): The root directory containing the training, validation, and test data.
+ seq_length (int, optional): The maximum sequence length for the input and output text. Defaults to 2048.
+ tokenizer (Optional[TokenizerSpec], optional): The tokenizer to use for preprocessing the text.
+ If not provided, a Megatron GPT2 BPE tokenizer will be used.
+ micro_batch_size (int, optional): The micro batch size for training. Defaults to 4.
+ global_batch_size (int, optional): The global batch size for training. Defaults to 8.
+ rampup_batch_size (Optional[List[int]], optional): A list of batch sizes for ramping up during training.
+ Defaults to None.
+ seed (int, optional): The random seed for data shuffling. Defaults to 1234.
+ memmap_workers (int, optional): The number of worker processes for loading data using TextMemMapDataset.
+ Defaults to 1.
+ num_workers (int, optional): The number of worker processes for data loading. Defaults to 8.
+ pin_memory (bool, optional): Whether to pin memory during data loading for faster GPU training.
+ Defaults to True.
+ persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs.
+ Defaults to False.
+ dataset_kwargs (Optional[Dict[str, Any]], optional): Keyword arguments to pass into the ``BertEmbeddingDataset`` class.
+ """
+
+ def __init__(
+ self,
+ dataset_root: Union[str, Path],
+ seq_length: int = 2048,
+ tokenizer: Optional["TokenizerSpec"] = None,
+ micro_batch_size: int = 4,
+ global_batch_size: int = 8,
+ rampup_batch_size: Optional[List[int]] = None,
+ seed: int = 1234,
+ memmap_workers: int = 1,
+ num_workers: int = 8,
+ pin_memory: bool = True,
+ persistent_workers: bool = False,
+ dataset_kwargs: Optional[Dict[str, Any]] = None,
+ ):
+ super().__init__()
+ self.seq_length = seq_length
+ self.seed = seed
+ self.dataset_root = Path(dataset_root)
+ self.tokenizer = tokenizer
+ self.memmap_workers = memmap_workers
+ self.num_workers = num_workers
+ self.pin_memory = pin_memory
+ self.persistent_workers = persistent_workers
+ self.micro_batch_size = micro_batch_size
+ self.global_batch_size = global_batch_size
+ self.rampup_batch_size = rampup_batch_size
+ self.data_sampler = None
+ self.max_train_samples = None
+ self.dataset_kwargs = dataset_kwargs or {}
+
+ def setup(self, stage: str):
+ """Called by pytorch lightning in datamodule setup"""
+
+ # data_sampler is used in `setup_data_sampler` in MegatronStrategy.setup
+ self.data_sampler = MegatronDataSampler(
+ seq_len=self.seq_length,
+ micro_batch_size=self.micro_batch_size,
+ global_batch_size=self.global_batch_size,
+ rampup_batch_size=self.rampup_batch_size,
+ dataloader_type="batch",
+ )
+
+ # Follows the calculation in nemo.collections.nlp.data.language_modeling.megatron.
+ # base_dataset_utils.get_datasets_weights_and_num_samples
+ self.max_train_samples = int(math.ceil(self.global_batch_size * self.trainer.max_steps * 1.005))
+
+ def state_dict(self) -> Dict[str, Any]:
+ """Called when saving a checkpoint, implement to generate and save datamodule state.
+
+ Returns:
+ A dictionary containing datamodule state.
+
+ """
+ consumed_samples = self.data_sampler.compute_consumed_samples(
+ self.trainer.global_step - self.data_sampler.init_global_step
+ )
+ return {"consumed_samples": consumed_samples}
+
+ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+ """Called when loading a checkpoint, implement to reload datamodule state given datamodule stat
+
+ Args:
+ state_dict: the datamodule state returned by ``state_dict``.
+
+ """
+ try:
+ from megatron.core.num_microbatches_calculator import update_num_microbatches
+
+ except (ImportError, ModuleNotFoundError):
+ logging.warning("Megatron num_microbatches_calculator not found, using Apex version.")
+ from apex.transformer.pipeline_parallel.utils import update_num_microbatches
+ consumed_samples = state_dict["consumed_samples"]
+ self.data_sampler.init_consumed_samples = consumed_samples
+ self.data_sampler.prev_consumed_samples = consumed_samples
+
+ update_num_microbatches(
+ consumed_samples=consumed_samples,
+ consistency_check=False,
+ )
+ self.data_sampler.if_first_step = 1
+
+ def train_dataloader(self) -> DataLoader:
+ # pylint: disable=C0115,C0116
+ return self._create_dataloader(
+ self._create_dataset(
+ self.train_path,
+ max_num_samples=self.max_train_samples,
+ **self.dataset_kwargs,
+ ),
+ mode="train",
+ )
+
+ def val_dataloader(self) -> DataLoader:
+ # pylint: disable=C0115,C0116
+ return self._create_dataloader(
+ self._create_dataset(
+ self.validation_path,
+ **self.dataset_kwargs,
+ ),
+ mode="validation",
+ )
+
+ def test_dataloader(self) -> DataLoader:
+ # pylint: disable=C0115,C0116
+ return self._create_dataloader(
+ self._create_dataset(
+ self.test_path,
+ **self.dataset_kwargs,
+ ),
+ mode="test",
+ )
+
+ @lru_cache
+ def _create_dataset(self, path, **kwargs):
+ return create_sft_dataset(
+ path,
+ tokenizer=self.tokenizer,
+ seq_length=self.seq_length,
+ memmap_workers=self.memmap_workers,
+ seed=self.seed,
+ **kwargs,
+ )
+
+ def _create_dataloader(self, dataset, mode, **kwargs) -> DataLoader:
+ return WrappedDataLoader(
+ mode=mode,
+ dataset=dataset,
+ num_workers=self.num_workers,
+ pin_memory=self.pin_memory,
+ persistent_workers=self.persistent_workers,
+ collate_fn=dataset.collate_fn,
+ **kwargs,
+ )
+
+ @property
+ def train_path(self) -> Path:
+ """Path to training dataset file"""
+ return self.dataset_root / "training.jsonl"
+
+ @property
+ def validation_path(self) -> Path:
+ """Path to validation dataset file"""
+ return self.dataset_root / "validation.jsonl"
+
+ @property
+ def test_path(self) -> Path:
+ """Path to test dataset file"""
+ return self.dataset_root / "test.jsonl"
+
+ def _extract_tokenizer_model_name(self) -> str:
+ """Automatically get the model name from model path."""
+ if isinstance(self.tokenizer, AutoTokenizer):
+ name = self.tokenizer.tokenizer.name_or_path
+ if name.endswith("context/nemo_tokenizer"):
+ # NEMO_HOME/hf_org/hf_model/context/nemo_tokenizer => hf_org--hf_model
+ tokenizer_model_name = '--'.join(name.split("/")[-4:-2])
+ elif name.endswith("nemo_tokenizer"):
+ # NEMO_HOME/hf_org/hf_model/nemo_tokenizer => hf_org--hf_model
+ tokenizer_model_name = '--'.join(name.split("/")[-3:-1])
+ else:
+ # hf_org/hf_model => hf_org--hf_model
+ tokenizer_model_name = name.replace("/", "--")
+ else:
+ tokenizer_model_name = f"unknown_tokenizer_{hash(self.tokenizer)}"
+ return tokenizer_model_name
diff --git a/nemo/collections/llm/bert/data/specter.py b/nemo/collections/llm/bert/data/specter.py
new file mode 100644
index 000000000000..7784b32e6bd9
--- /dev/null
+++ b/nemo/collections/llm/bert/data/specter.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import shutil
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from datasets import DatasetDict, load_dataset
+
+from nemo.collections.llm.bert.data.core import get_dataset_root
+from nemo.collections.llm.bert.data.fine_tuning import FineTuningDataModule
+from nemo.lightning.io.mixin import IOMixin
+from nemo.utils import logging
+
+if TYPE_CHECKING:
+ from nemo.collections.common.tokenizers import TokenizerSpec
+
+
+class SpecterDataModule(FineTuningDataModule, IOMixin):
+ """A data module for fine-tuning on the Specter dataset.
+
+ This class inherits from the `FineTuningDataModule` class and is specifically designed for fine-tuning models
+ on the SPECTER dataset. It handles data download, preprocessing, splitting, and preparing the data
+ in a format suitable for training, validation, and testing.
+
+ Args:
+ force_redownload (bool, optional): Whether to force re-download the dataset even if it exists locally.
+ Defaults to False.
+ delete_raw (bool, optional): Whether to delete the raw downloaded dataset after preprocessing.
+ Defaults to True.
+ See FineTuningDataModule for the other arguments.
+ """
+
+ def __init__(
+ self,
+ seq_length: int = 512,
+ tokenizer: Optional["TokenizerSpec"] = None,
+ micro_batch_size: int = 4,
+ global_batch_size: int = 8,
+ rampup_batch_size: Optional[List[int]] = None,
+ force_redownload: bool = False,
+ delete_raw: bool = True,
+ seed: int = 1234,
+ memmap_workers: int = 1,
+ num_workers: int = 0,
+ pin_memory: bool = True,
+ persistent_workers: bool = False,
+ dataset_kwargs: Optional[Dict[str, Any]] = None,
+ ):
+ self.force_redownload = force_redownload
+ self.delete_raw = delete_raw
+
+ super().__init__(
+ dataset_root=get_dataset_root("specter"),
+ seq_length=seq_length,
+ tokenizer=tokenizer,
+ micro_batch_size=micro_batch_size,
+ global_batch_size=global_batch_size,
+ rampup_batch_size=rampup_batch_size,
+ seed=seed,
+ memmap_workers=memmap_workers,
+ num_workers=num_workers,
+ pin_memory=pin_memory,
+ persistent_workers=persistent_workers,
+ dataset_kwargs=dataset_kwargs,
+ )
+
+ def prepare_data(self) -> None:
+ """Prepare dataset for fine-tuning."""
+ # skip download/preprocessing if the train file already exists, unless a re-download is forced
+ if not self.train_path.exists() or self.force_redownload:
+ dset = self._download_data()
+ self._preprocess_and_split_data(dset)
+ super().prepare_data()
+
+ def _download_data(self):
+ logging.info(f"Downloading {self.__class__.__name__}...")
+ return load_dataset(
+ "sentence-transformers/specter",
+ "triplet",
+ cache_dir=str(self.dataset_root),
+ download_mode="force_redownload" if self.force_redownload else None,
+ )
+
+ def _preprocess_and_split_data(self, dset: DatasetDict, train_ratio: float = 0.80, val_ratio: float = 0.15):
+ """Preprocesses and splits the downloaded dataset into training, validation, and test sets.
+
+ Args:
+ dset (DatasetDict): The downloaded dataset object.
+ train_ratio (float, optional): Fraction of the data used for the training split. Defaults to 0.80.
+ val_ratio (float, optional): Fraction of the data used for the validation split; the remainder
+ becomes the test split. Defaults to 0.15.
+ """
+ logging.info(f"Preprocessing {self.__class__.__name__} to jsonl format and splitting...")
+
+ test_ratio = 1 - train_ratio - val_ratio
+ save_splits = {}
+ dataset = dset.get('train')
+ split_dataset = dataset.train_test_split(test_size=val_ratio + test_ratio, seed=self.seed)
+ split_dataset2 = split_dataset['test'].train_test_split(
+ test_size=test_ratio / (val_ratio + test_ratio), seed=self.seed
+ )
+ save_splits['training'] = split_dataset['train']
+ save_splits['validation'] = split_dataset2['train']
+ save_splits['test'] = split_dataset2['test']
+
+ for split_name, dataset in save_splits.items():
+ output_file = self.dataset_root / f"{split_name}.jsonl"
+ with output_file.open("w", encoding="utf-8") as f:
+ for o in dataset:
+ f.write(
+ json.dumps({"query": o["anchor"], "pos_doc": o["positive"], "neg_doc": [o["negative"]]}) + "\n"
+ )
+
+ logging.info(f"{split_name} split saved to {output_file}")
+
+ if self.delete_raw:
+ for p in self.dataset_root.iterdir():
+ if p.is_dir():
+ shutil.rmtree(p)
+ elif '.jsonl' not in str(p.name):
+ p.unlink()
+
+ def reconfigure_limit_batches(self):
+ """No need to reconfigure trainer.limit_val_batches for finetuning"""
+ return
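For orientation, a minimal usage sketch of the new data module follows; it assumes a NeMo environment with a tokenizer and Lightning trainer configured elsewhere and uses only the constructor arguments shown above.

from nemo.collections.llm.bert.data.specter import SpecterDataModule

# On first use, prepare_data() (invoked by Lightning) downloads sentence-transformers/specter,
# splits it 80/15/5 into training/validation/test, and writes one JSONL file per split
# under the dataset root resolved by get_dataset_root("specter").
datamodule = SpecterDataModule(
    seq_length=512,
    micro_batch_size=4,
    global_batch_size=8,
    force_redownload=False,  # set True to re-fetch and re-split the raw dataset
)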
diff --git a/nemo/collections/llm/bert/loss.py b/nemo/collections/llm/bert/loss.py
index 42729e28848d..6fd34a4d3fa3 100644
--- a/nemo/collections/llm/bert/loss.py
+++ b/nemo/collections/llm/bert/loss.py
@@ -12,12 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Dict, List, Tuple
+from typing import Dict, List, Literal, Tuple
import torch
import torch.nn.functional as F
-from torch import Tensor
+from torch import Tensor, nn
+from torch.distributed import all_gather as all_gather_no_backprop
+from torch.distributed.nn.functional import all_gather as all_gather_with_backprop
+from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction, MegatronLossReduction
@@ -96,6 +99,125 @@ def reduce(self, losses_reduced_per_micro_batch) -> torch.Tensor:
return torch.tensor(0.0, device=torch.cuda.current_device())
+class BERTInBatchExclusiveHardNegativesRankingLoss(MegatronLossReduction):
+ """
+ This loss uses in-batch negative samples + hard-negative samples.
+ The difference between this loss and the default MultipleNegativesRankingLoss
+ from Sentence Transformers is that the latter shares the hard negatives
+ as negatives for all examples, whereas this loss uses hard negatives
+ exclusively for the example they are associated with.
+
+ This loss is also capable of using in-batch negatives from all ranks during training.
+ """
+
+ def __init__(
+ self,
+ validation_step: bool = False,
+ val_drop_last: bool = True,
+ num_hard_negatives: int = 1,
+ scale: float = 20,
+ label_smoothing: float = 0.0,
+ global_in_batch_negatives: bool = False,
+ backprop_type: Literal["local", "global"] = 'local',
+ ) -> None:
+ super().__init__()
+ self.validation_step = validation_step
+ self.val_drop_last = val_drop_last
+ self.num_hard_negatives = num_hard_negatives
+ self.scale = scale
+ self.cross_entropy_loss = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
+ self.global_in_batch_negatives = global_in_batch_negatives
+ self.backprop_type = backprop_type
+
+ def _gather_global_in_batch_representations(self, local_tensor):
+ from megatron.core import parallel_state
+
+ local_tensor = local_tensor.contiguous()
+ if self.backprop_type == 'local':
+ global_tensors = [
+ torch.zeros_like(local_tensor) for _ in range(parallel_state.get_data_parallel_world_size())
+ ]
+ all_gather_no_backprop(global_tensors, local_tensor, group=parallel_state.get_data_parallel_group())
+ global_tensors[parallel_state.get_data_parallel_rank()] = local_tensor
+ global_tensors = torch.cat(global_tensors, dim=0)
+
+ else:
+ global_tensors = all_gather_with_backprop(local_tensor)
+ global_tensors = torch.cat(global_tensors, dim=0)
+
+ return global_tensors
+
+ def forward(
+ self, batch: Dict[str, torch.Tensor], forward_out: torch.Tensor
+ ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+ from megatron.core import parallel_state
+
+ cp_size = parallel_state.get_context_parallel_world_size()
+ if cp_size != 1:
+ raise NotImplementedError(f'CP is not supported for {self.__class__} yet.')
+
+ if self.global_in_batch_negatives and not self.validation_step:
+ forward_out = self._gather_global_in_batch_representations(forward_out)
+
+ num_tensors_per_example = 2 + self.num_hard_negatives
+ batch_size = forward_out.shape[0] // num_tensors_per_example
+ chunks = forward_out.chunk(batch_size)
+ # Get Queries, Positives, Negatives
+ queries = torch.stack([item[0] for item in chunks])
+ positives = torch.stack([item[1] for item in chunks])
+ hard_negs = [
+ torch.stack([item[i + 2] for item in chunks]) for i in range(self.num_hard_negatives)
+ ] # List of length "num_negatives", each tensor of shape (bs, embedding_dim)
+
+ # Calculate scores
+ pos_in_batch_negs_scores = torch.mm(
+ queries, positives.transpose(0, 1) # shape (bs, bs); each positive is negative for other queries.
+ )
+ hard_negs_scores = (
+ torch.multiply(
+ queries.unsqueeze(0).repeat(len(hard_negs), 1, 1),
+ torch.stack(hard_negs),
+ )
+ .sum(axis=-1)
+ .T
+ ) # shape = (bs, num_negatives); Hard negatives are not shared between queries.
+ scores = torch.cat([pos_in_batch_negs_scores, hard_negs_scores], axis=1)
+
+ scores = scores.clamp(-1.0, 1.0)
+ scores *= self.scale
+ labels = torch.tensor(
+ range(len(scores)), dtype=torch.long, device=scores.device
+ ) # Indices of the (query, positive) pairs
+ ce_loss = self.cross_entropy_loss(scores, labels)
+ reduced_loss = average_losses_across_data_parallel_group([ce_loss])
+ return ce_loss, {"avg": reduced_loss}
+
+ def reduce(self, losses_reduced_per_micro_batch) -> torch.Tensor:
+ """Taken from: https://github.com/NVIDIA/NeMo/blob/main
+ /nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L535-L552 ."""
+ if losses_reduced_per_micro_batch:
+ if "avg" in losses_reduced_per_micro_batch[0]:
+ loss_tensors_list = [loss_reduced["avg"] for loss_reduced in losses_reduced_per_micro_batch]
+ loss_tensor = torch.concat(loss_tensors_list)
+
+ return loss_tensor.mean()
+
+ # Get the total loss since micro batch sizes are not uniform
+ loss_sum_tensors_list: List[torch.Tensor] = [
+ loss_sum["loss_sum_and_ub_size"]
+ for loss_sum in losses_reduced_per_micro_batch
+ if loss_sum["loss_sum_and_ub_size"][1] > 0
+ ]
+ loss_sum = (
+ torch.vstack(loss_sum_tensors_list).sum(dim=0)
+ if len(loss_sum_tensors_list) > 0
+ else torch.tensor([0.0, 0.0], device=torch.cuda.current_device())
+ )
+ return loss_sum
+
+ return torch.tensor(0.0, device=torch.cuda.current_device())
+
+
def masked_token_with_zero(tensor: Tensor, mask: Tensor):
"""Calculate masked token loss with consideration of possible NaN.
Sometimes when the number of tokens is very small, none of the tokens get masked for prediction.
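To make the tensor bookkeeping in BERTInBatchExclusiveHardNegativesRankingLoss easier to follow, here is a small shape sketch in plain PyTorch (no Megatron); the batch size, number of hard negatives, and embedding dimension are arbitrary assumptions.

import torch
import torch.nn.functional as F

B, H, D = 4, 1, 8                                  # batch size, hard negatives per query, embedding dim
forward_out = torch.randn((2 + H) * B, D)          # model emits query, positive, then H hard negatives per example

chunks = forward_out.chunk(B)                      # B chunks, each of shape (2 + H, D)
queries = torch.stack([c[0] for c in chunks])      # (B, D)
positives = torch.stack([c[1] for c in chunks])    # (B, D)
hard_negs = [torch.stack([c[i + 2] for c in chunks]) for i in range(H)]

in_batch = queries @ positives.T                   # (B, B): every other positive acts as a shared negative
hard = torch.stack([(queries * hn).sum(-1) for hn in hard_negs], dim=1)  # (B, H): exclusive per query
scores = torch.cat([in_batch, hard], dim=1).clamp(-1.0, 1.0) * 20.0      # scale = 20
labels = torch.arange(B)                           # diagonal of the in-batch block marks the positive pair
loss = F.cross_entropy(scores, labels)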
diff --git a/nemo/collections/llm/bert/model/__init__.py b/nemo/collections/llm/bert/model/__init__.py
index 45e25279f7b0..71e4bd9a0b77 100644
--- a/nemo/collections/llm/bert/model/__init__.py
+++ b/nemo/collections/llm/bert/model/__init__.py
@@ -8,10 +8,18 @@
MegatronBertConfig,
MegatronBertLargeConfig,
)
+from nemo.collections.llm.bert.model.embedding import (
+ BertEmbeddingLargeConfig,
+ BertEmbeddingMiniConfig,
+ BertEmbeddingModel,
+)
__all__ = [
"BertConfig",
+ "BertEmbeddingModel",
"BertModel",
+ "BertEmbeddingLargeConfig",
+ "BertEmbeddingMiniConfig",
"HuggingFaceBertBaseConfig",
"HuggingFaceBertLargeConfig",
"HuggingFaceBertConfig",
diff --git a/nemo/collections/llm/bert/model/base.py b/nemo/collections/llm/bert/model/base.py
index 8c11dfc0500c..1b7dc4077492 100644
--- a/nemo/collections/llm/bert/model/base.py
+++ b/nemo/collections/llm/bert/model/base.py
@@ -264,6 +264,7 @@ def forward(
lm_labels: Tensor = None,
loss_mask: Tensor = None,
inference_params=None,
+ hidden_states_only=False,
):
"""Forward function of BERT model
@@ -280,7 +281,7 @@ def forward(
hidden_states = super().forward(input_ids, attention_mask, tokentype_ids, lm_labels, inference_params)
self.post_process = original_post_process
- if not self.post_process:
+ if not self.post_process or hidden_states_only:
return hidden_states
if self.return_embeddings:
diff --git a/nemo/collections/llm/bert/model/embedding.py b/nemo/collections/llm/bert/model/embedding.py
new file mode 100644
index 000000000000..289aedbd68d3
--- /dev/null
+++ b/nemo/collections/llm/bert/model/embedding.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+from dataclasses import dataclass
+from typing import Callable, Dict, Literal, Optional
+
+import lightning.pytorch as L
+import torch
+import torch.nn.functional as F
+from megatron.core import parallel_state
+from torch import Tensor, nn
+
+from nemo.collections.common.tokenizers import TokenizerSpec
+from nemo.collections.llm.bert.loss import BERTInBatchExclusiveHardNegativesRankingLoss
+from nemo.collections.llm.bert.model import BertConfig, BertModel
+from nemo.collections.llm.bert.model.base import get_batch_on_this_context_parallel_rank, get_packed_seq_params
+from nemo.collections.llm.bert.model.bert import HuggingFaceBertImporter
+from nemo.lightning import io
+from nemo.lightning.pytorch.optim import OptimizerModule
+
+
+def bert_embedding_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
+ """Set up the BERT dataloader batch."""
+ batch = next(dataloader_iter)
+
+ _batch: dict
+ if isinstance(batch, tuple) and len(batch) == 3:
+ _batch = batch[0]
+ else:
+ _batch = batch
+
+ required_keys = set()
+ required_keys.add("attention_mask")
+ required_keys.add("token_type_ids")
+
+ if parallel_state.is_pipeline_first_stage():
+ required_keys.add("input_ids")
+
+ _batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in _batch.items()}
+ # slice batch along sequence dimension for context parallelism
+ output = get_batch_on_this_context_parallel_rank(_batch)
+
+ return output
+
+
+def bert_embedding_forward_step(model: L.LightningModule, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
+ """
+ Subset the batch to the keys actually used by the model's forward pass, then call it.
+ If "cu_seqlens" is present in the batch, the packed-sequence parameters are also passed
+ to the model for a more efficient forward pass.
+ """
+ forward_args = {
+ "input_ids": batch["input_ids"],
+ "attention_mask": batch["attention_mask"],
+ }
+
+ if model.config.num_tokentypes != 0:
+ forward_args["tokentype_ids"] = batch["token_type_ids"]
+
+ if "cu_seqlens" in batch:
+ forward_args["packed_seq_params"] = get_packed_seq_params(batch)
+
+ return model(**forward_args)
+
+
+@dataclass
+class BertEmbeddingConfig(BertConfig):
+ """Bert Embedding Config"""
+
+ bert_type: Literal["huggingface", "megatron"] = 'huggingface'
+ ce_loss_scale: float = 20
+ label_smoothing: float = 0.0
+ add_lm_head: bool = False
+ bert_binary_head: bool = False
+ num_hard_negatives: int = 1
+ num_tokentypes: int = 2
+ global_in_batch_negatives: bool = True
+ backprop_type: Literal["local", "global"] = 'local'
+ forward_step_fn: Callable = bert_embedding_forward_step
+ data_step_fn: Callable = bert_embedding_data_step
+
+
+@dataclass
+class BertEmbeddingLargeConfig(BertEmbeddingConfig):
+ """Bert Embedding model follows Bert-large architecture."""
+
+ num_layers: int = 24
+ hidden_size: int = 1024
+ intermediate_size: int = 4096
+ num_attention_heads: int = 16
+
+
+@dataclass
+class BertEmbeddingMiniConfig(BertEmbeddingConfig):
+ """Bert Embedding model follows Bert-mini (384 hidden size) architecture."""
+
+ num_layers: int = 6
+ hidden_size: int = 384
+ intermediate_size: int = 1536
+ num_attention_heads: int = 12
+
+
+class BertEmbeddingHead(nn.Module):
+ """Performs mean pooling on the token embeddings."""
+
+ def __init__(
+ self,
+ word_embedding_dimension: int,
+ pooling_mode_mean_tokens: bool = True,
+ ):
+ super(BertEmbeddingHead, self).__init__()
+
+ self.config_keys = [
+ "word_embedding_dimension",
+ "pooling_mode_mean_tokens",
+ ]
+ self.word_embedding_dimension = word_embedding_dimension
+ self.pooling_mode_mean_tokens = pooling_mode_mean_tokens
+
+ def forward(self, token_embeddings: Tensor, attention_mask: Tensor):
+ """Forward function for embedding head. Performs mean pooling."""
+ token_embeddings = token_embeddings.permute(1, 0, 2)
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+ sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+
+ sum_mask = input_mask_expanded.sum(1)
+ sum_mask = torch.clamp(sum_mask, min=1e-9)
+
+ output_vector = sum_embeddings / sum_mask
+ output_vector = F.normalize(output_vector, p=2, dim=1)
+
+ return output_vector
+
+
+class BertEmbeddingModel(BertModel):
+ """Bert Lightning Module"""
+
+ def __init__(
+ self,
+ config: BertConfig,
+ # TODO: Add transformer_layer_spec when we update mcore
+ optim: Optional[OptimizerModule] = None,
+ tokenizer: Optional["TokenizerSpec"] = None,
+ model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
+ ):
+ super().__init__(config, optim, tokenizer, model_transform)
+
+ def configure_model(self) -> None:
+ """Setup the BERT Model based on config definition."""
+ if not hasattr(self, "module"):
+ self.module = self.config.configure_model(self.tokenizer)
+ self.embedding_head = BertEmbeddingHead(
+ word_embedding_dimension=self.config.hidden_size,
+ pooling_mode_mean_tokens=True,
+ )
+
+ def forward(
+ self,
+ *args,
+ **kwargs,
+ ) -> torch.Tensor:
+ """Call the forward method of the underlying model, and return whatever it outputs."""
+ assert "attention_mask" in kwargs, "attention mask is required for BERT Embedding Model."
+ output_tensor = self.module(
+ hidden_states_only=True, *args, **kwargs
+ ) # for now just pass through to the underlying model
+ embeddings_out = self.embedding_head(output_tensor, kwargs["attention_mask"])
+ return embeddings_out
+
+ @property
+ def training_loss_reduction(self) -> BERTInBatchExclusiveHardNegativesRankingLoss: # pylint: disable=C0115,C0116
+ if not self._training_loss_reduction:
+ self._training_loss_reduction = BERTInBatchExclusiveHardNegativesRankingLoss(
+ validation_step=False,
+ num_hard_negatives=self.config.num_hard_negatives,
+ scale=self.config.ce_loss_scale,
+ label_smoothing=self.config.label_smoothing,
+ global_in_batch_negatives=self.config.global_in_batch_negatives,
+ backprop_type=self.config.backprop_type,
+ )
+
+ return self._training_loss_reduction
+
+ @property
+ def validation_loss_reduction(self) -> BERTInBatchExclusiveHardNegativesRankingLoss: # pylint: disable=C0115,C0116
+ if not self._validation_loss_reduction:
+ self._validation_loss_reduction = BERTInBatchExclusiveHardNegativesRankingLoss(
+ validation_step=True,
+ num_hard_negatives=self.config.num_hard_negatives,
+ scale=self.config.ce_loss_scale,
+ label_smoothing=self.config.label_smoothing,
+ )
+
+ return self._validation_loss_reduction
+
+
+@io.model_importer(BertEmbeddingModel, "hf")
+class BertEmbeddingImporter(HuggingFaceBertImporter):
+ """
+ Importer for BertEmbedding Model.
+ HuggingFace uses the same model class for the BERT embedding model and the BERT model, so the connector is identical.
+ """
+
+ def __init__(self, *args, **kwargs):
+ if sys.version_info > (3, 11):
+ # In Python versions <= 3.11, *Path classes don’t have a __init__ method,
+ # and do all their initialization in __new__/ helper methods.
+ # Only need to call super().__init__ if version > 3.11
+ super().__init__(*args)
+ self.type = 'model'
+
+ def init(self) -> BertEmbeddingModel:
+ return BertEmbeddingModel(self.config, tokenizer=self.tokenizer)
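A short sketch of the mean-pooling head in isolation; it assumes NeMo and Megatron-Core are importable and feeds random tensors only to illustrate the expected [sequence, batch, hidden] input layout.

import torch
from nemo.collections.llm.bert.model.embedding import BertEmbeddingHead

seq_len, batch, hidden = 6, 2, 384
head = BertEmbeddingHead(word_embedding_dimension=hidden)

token_embeddings = torch.randn(seq_len, batch, hidden)  # encoder output, permuted to [b, s, h] inside forward
attention_mask = torch.ones(batch, seq_len)             # 1 for real tokens, 0 for padding
embeddings = head(token_embeddings, attention_mask)     # (batch, hidden), L2-normalized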
diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py
index e21198f5884b..0ff32329dc74 100644
--- a/nemo/collections/llm/deploy/base.py
+++ b/nemo/collections/llm/deploy/base.py
@@ -102,7 +102,6 @@ def get_trtllm_deployable(
trt_llm_exporter.export(
nemo_checkpoint_path=nemo_checkpoint,
model_type=model_type,
- n_gpus=num_gpus,
tensor_parallelism_size=tensor_parallelism_size,
pipeline_parallelism_size=pipeline_parallelism_size,
max_input_len=max_input_len,
diff --git a/nemo/collections/llm/evaluation/__init__.py b/nemo/collections/llm/evaluation/__init__.py
index 3012689bb8da..884fdb575675 100644
--- a/nemo/collections/llm/evaluation/__init__.py
+++ b/nemo/collections/llm/evaluation/__init__.py
@@ -1,3 +1,3 @@
-from nemo.collections.llm.evaluation.base import NeMoFWLMEval, wait_for_rest_service
+from nemo.collections.llm.evaluation.base import NeMoFWLMEval
-__all__ = ["NeMoFWLMEval", "wait_for_rest_service"]
+__all__ = ["NeMoFWLMEval"]
diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py
index f8f6639e3f3c..91dfcf8f2f44 100644
--- a/nemo/collections/llm/evaluation/base.py
+++ b/nemo/collections/llm/evaluation/base.py
@@ -12,18 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import time
-
-import requests
import torch
import torch.nn.functional as F
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
-from requests.exceptions import RequestException
+from tqdm import tqdm
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
-from nemo.utils import logging
+from nemo.deploy.nlp import NemoQueryLLM
class NeMoFWLMEval(LM):
@@ -49,21 +46,22 @@ def _generate_tokens_logits(self, payload, return_text: bool = False, return_log
A private method that sends post request to the model on PyTriton server and returns either generated text or
logits.
"""
- # send a post request to /v1/completions/ endpoint with the payload
- response = requests.post(f"{self.api_url}/v1/completions/", json=payload)
- response_data = response.json()
-
- if 'error' in response_data:
- raise Exception(f"API Error: {response_data['error']}")
+ nq = NemoQueryLLM(url=self.api_url, model_name=payload['model'])
+
+ response = nq.query_llm(
+ prompts=payload['prompt'] if isinstance(payload['prompt'], list) else [payload['prompt']],
+ max_output_len=payload['max_tokens'],
+ top_k=payload['top_k'],
+ top_p=payload['top_p'],
+ temperature=payload['temperature'],
+ output_generation_logits=True,
+ openai_format_response=True,
+ )
- # Assuming the response is in OpenAI format
if return_text:
- # in case of generate_until tasks return just the text
- return response_data['choices'][0]['text']
-
+ return response["choices"][0]["text"] # shape[batch_size, 1]
if return_logits:
- # in case of loglikelihood tasks return the logits
- return response_data['choices'][0]['generation_logits']
+ return response["choices"][0]["generation_logits"] # shape[batch_size, 1, num_tokens, vocab_size]
def tokenizer_type(self, tokenizer):
"""
@@ -93,7 +91,7 @@ def loglikelihood(self, requests: list[Instance]):
special_tokens_kwargs['add_special_tokens'] = self.add_bos
results = []
- for request in requests:
+ for request in tqdm(requests):
# get the input prompt from the request
context = request.arguments[0]
# get the output prompt from the request
@@ -165,46 +163,3 @@ def generate_until(self, inputs: list[Instance]):
results.append(generated_text)
return results
-
-
-def wait_for_rest_service(rest_url, max_retries=600, retry_interval=2):
- """
- Wait for REST service to be ready.
-
- Args:
- rest_url (str): URL of the REST service's health endpoint
- max_retries (int): Maximum number of retry attempts. Defaul: 60.
- retry_interval (int): Time to wait between retries in seconds. Default: 2.
-
- Returns:
- bool: True if rest service is ready, False otherwise
- """
-
- def check_service(url):
- """
- Check if the service is ready by making a GET request to its health endpoint.
-
- Args:
- url (str): URL of the service's health endpoint
-
- Returns:
- bool: True if the service is ready, False otherwise
- """
- try:
- response = requests.get(url, timeout=5)
- return response.status_code == 200
- except RequestException:
- return False
-
- for _ in range(max_retries):
- rest_ready = check_service(rest_url)
-
- if rest_ready:
- logging.info("REST service is ready.")
- return True
-
- logging.info(f"REST Service not ready yet. Retrying in {retry_interval} seconds...")
- time.sleep(retry_interval)
-
- logging.info("Timeout: REST service did not become ready.")
- return False
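The evaluation path now queries the PyTriton server through NemoQueryLLM instead of the removed REST helpers; a hedged sketch of that call follows, with the URL and model name as placeholder assumptions and the keyword arguments taken from the change above.

from nemo.deploy.nlp import NemoQueryLLM

nq = NemoQueryLLM(url="localhost:8000", model_name="triton_model")  # placeholder URL and model name
response = nq.query_llm(
    prompts=["The capital of France is"],
    max_output_len=16,
    top_k=1,
    top_p=0.0,
    temperature=1.0,
    output_generation_logits=True,   # needed for loglikelihood-style tasks
    openai_format_response=True,     # response mirrors the OpenAI completions schema
)
text = response["choices"][0]["text"]
logits = response["choices"][0]["generation_logits"]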
diff --git a/nemo/collections/llm/gpt/data/chat.py b/nemo/collections/llm/gpt/data/chat.py
index 1b51c4aa1524..3b2ac56dadb5 100644
--- a/nemo/collections/llm/gpt/data/chat.py
+++ b/nemo/collections/llm/gpt/data/chat.py
@@ -27,7 +27,7 @@ class ChatDataModule(FineTuningDataModule):
"""
@lru_cache
- def _create_dataset(self, path, is_test=False, **kwargs):
+ def _create_dataset(self, path, pack_metadata_path=None, is_test=False, **kwargs):
# pylint: disable=C0115,C0116
return create_sft_dataset(
path,
@@ -37,5 +37,7 @@ def _create_dataset(self, path, is_test=False, **kwargs):
seed=self.seed,
chat=True,
is_test=is_test,
+ pack_metadata_file_path=None, # packing is not supported
+ pad_cu_seqlens=False,
**kwargs,
)
diff --git a/nemo/collections/llm/gpt/data/core.py b/nemo/collections/llm/gpt/data/core.py
index 54eb9e31c53a..b06708cec537 100644
--- a/nemo/collections/llm/gpt/data/core.py
+++ b/nemo/collections/llm/gpt/data/core.py
@@ -47,42 +47,55 @@ def create_sft_dataset(
memmap_workers: int = 2,
hf_dataset: bool = False,
global_sample_mapping: bool = False,
+ pack_metadata_file_path: Path = None,
+ pad_cu_seqlens: bool = False,
chat: bool = False,
**kwargs,
) -> "GPTSFTDataset":
"""
Create the dataset class (GPTSFTDataset, GPTSFTChatDataset or GPTSFTPackedDataset)
"""
+
+ gpt_sft_dataset_kwargs = {
+ 'file_path': str(path),
+ 'tokenizer': tokenizer,
+ 'max_seq_length': seq_length,
+ 'memmap_workers': memmap_workers,
+ 'hf_dataset': hf_dataset,
+ 'global_sample_mapping': global_sample_mapping,
+ 'add_bos': add_bos,
+ 'add_eos': add_eos,
+ 'add_sep': add_sep,
+ 'seed': seed,
+ 'label_key': label_key,
+ 'answer_only_loss': answer_only_loss,
+ 'truncation_field': truncation_field,
+ 'pad_to_max_length': pad_to_max_length,
+ 'index_mapping_dir': index_mapping_dir,
+ 'prompt_template': prompt_template,
+ 'truncation_method': truncation_method,
+ }
+
if chat:
from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import GPTSFTChatDataset
- dataset_cls = GPTSFTChatDataset
+ return GPTSFTChatDataset(
+ **gpt_sft_dataset_kwargs,
+ **kwargs,
+ )
elif path.suffix == '.npy':
from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTPackedDataset
- dataset_cls = GPTSFTPackedDataset
+ return GPTSFTPackedDataset(
+ pack_metadata_file_path=pack_metadata_file_path,
+ pad_cu_seqlens=pad_cu_seqlens,
+ **gpt_sft_dataset_kwargs,
+ **kwargs,
+ )
else:
from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset
- dataset_cls = GPTSFTDataset
-
- return dataset_cls(
- file_path=str(path),
- tokenizer=tokenizer,
- max_seq_length=seq_length,
- memmap_workers=memmap_workers,
- hf_dataset=hf_dataset,
- global_sample_mapping=global_sample_mapping,
- add_bos=add_bos,
- add_eos=add_eos,
- add_sep=add_sep,
- seed=seed,
- label_key=label_key,
- answer_only_loss=answer_only_loss,
- truncation_field=truncation_field,
- pad_to_max_length=pad_to_max_length,
- index_mapping_dir=index_mapping_dir,
- prompt_template=prompt_template,
- truncation_method=truncation_method,
- **kwargs,
- )
+ return GPTSFTDataset(
+ **gpt_sft_dataset_kwargs,
+ **kwargs,
+ )
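A sketch of how the refactored factory is expected to dispatch; the tokenizer and file paths are assumptions, and the packing-specific arguments only take effect on the .npy branch, as in the code above.

from pathlib import Path
from nemo.collections.llm.gpt.data.core import create_sft_dataset

# .jsonl -> GPTSFTDataset, .npy -> GPTSFTPackedDataset, chat=True -> GPTSFTChatDataset
packed_ds = create_sft_dataset(
    Path("/data/packed/training_2048.npy"),            # packed dataset produced offline (assumed path)
    tokenizer=tokenizer,                               # any NeMo TokenizerSpec (assumed to exist)
    seq_length=2048,
    pack_metadata_file_path=Path("/data/packed/train_2048_metadata.jsonl"),
    pad_cu_seqlens=True,                               # constant-shape cu_seqlens, needed for cudagraphs
)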
diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py
index 8efe5cdbd918..53ed50de7b21 100644
--- a/nemo/collections/llm/gpt/data/fine_tuning.py
+++ b/nemo/collections/llm/gpt/data/fine_tuning.py
@@ -93,6 +93,7 @@ def __init__(
self.packed_sequence_size = -1 if not packed_sequence_specs else packed_sequence_specs.packed_sequence_size
self.validate_batch_size_for_packed_sequence()
self.dataset_kwargs = dataset_kwargs or {}
+ self._pad_cu_seqlens = False if not packed_sequence_specs else packed_sequence_specs.pad_cu_seqlens
self.init_global_step = 0
def validate_batch_size_for_packed_sequence(self):
@@ -110,8 +111,7 @@ def validate_batch_size_for_packed_sequence(self):
f"Set packed sequence length to {self.packed_sequence_size*self.micro_batch_size} "
f"(currently {self.packed_sequence_size}) \n"
f"For details please visit "
- f"https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/optimizations/"
- f"sequence_packing.html"
+ f"https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/packed_sequence.html"
)
def prepare_data(self) -> None:
@@ -129,6 +129,7 @@ def prepare_data(self) -> None:
tokenizer=self.tokenizer,
max_seq_length=self.seq_length,
seed=self.seed,
+ output_metadata_path=self.train_pack_metadata,
)
if not self.validation_path_packed.is_file():
@@ -139,6 +140,7 @@ def prepare_data(self) -> None:
tokenizer=self.tokenizer,
max_seq_length=self.seq_length,
seed=self.seed,
+ output_metadata_path=self.val_pack_metadata,
)
def setup(self, stage: str):
@@ -195,6 +197,7 @@ def train_dataloader(self) -> DataLoader:
return self._create_dataloader(
self._create_dataset(
self.train_path if self.packed_sequence_size <= 0 else self.train_path_packed,
+ pack_metadata_path=None if self.packed_sequence_size <= 0 else self.train_pack_metadata,
max_num_samples=self.max_train_samples,
**self.dataset_kwargs,
),
@@ -206,6 +209,7 @@ def val_dataloader(self) -> DataLoader:
return self._create_dataloader(
self._create_dataset(
self.validation_path if self.packed_sequence_size <= 0 else self.validation_path_packed,
+ pack_metadata_path=None if self.packed_sequence_size <= 0 else self.val_pack_metadata,
is_test=True,
**self.dataset_kwargs,
),
@@ -225,15 +229,18 @@ def test_dataloader(self) -> DataLoader:
)
@lru_cache
- def _create_dataset(self, path, is_test=False, **kwargs):
+ def _create_dataset(self, path, pack_metadata_path=None, is_test=False, **kwargs):
# pylint: disable=C0115,C0116
+ is_not_packing = is_test or self.packed_sequence_size <= 0
return create_sft_dataset(
path,
tokenizer=self.tokenizer,
- seq_length=(self.seq_length if is_test or self.packed_sequence_size <= 0 else self.packed_sequence_size),
+ seq_length=(self.seq_length if is_not_packing else self.packed_sequence_size),
memmap_workers=self.memmap_workers,
seed=self.seed,
is_test=is_test,
+ pack_metadata_file_path=None if is_not_packing else pack_metadata_path,
+ pad_cu_seqlens=False if is_not_packing else self.pad_cu_seqlens,
**kwargs,
)
@@ -256,6 +263,32 @@ def train_path(self) -> Path:
"""Path to training dataset file"""
return self.dataset_root / "training.jsonl"
+ @property
+ def train_pack_metadata(self) -> Path:
+ """Path to metadata dataset file for packed sequence."""
+ if self.packed_sequence_size > 0:
+ if self.packed_sequence_specs.packed_train_metadata_path is not None:
+ return self.packed_sequence_specs.packed_train_metadata_path
+ tokenizer_model_name = self._extract_tokenizer_model_name()
+ folder_name = self.dataset_root / "packed" / tokenizer_model_name
+ folder_name.mkdir(parents=True, exist_ok=True)
+ return folder_name / f"train_{self.packed_sequence_size}_metadata.jsonl"
+ else:
+ raise ValueError("`train_pack_metadata invalid since packed sequence size is not specified.")
+
+ @property
+ def val_pack_metadata(self) -> Path:
+ """Path to metadata dataset file for packed sequence."""
+ if self.packed_sequence_size > 0:
+ if self.packed_sequence_specs.packed_val_metadata_path is not None:
+ return self.packed_sequence_specs.packed_val_metadata_path
+ tokenizer_model_name = self._extract_tokenizer_model_name()
+ folder_name = self.dataset_root / "packed" / tokenizer_model_name
+ folder_name.mkdir(parents=True, exist_ok=True)
+ return folder_name / f"val_{self.packed_sequence_size}_metadata.jsonl"
+ else:
+ raise ValueError("val_pack_metadata invalid since packed sequence size is not specified.")
+
@property
def train_path_packed(self) -> Path:
"""Path to training dataset file for packed sequence. The file path contains a reference to the
@@ -294,6 +327,16 @@ def test_path(self) -> Path:
"""Path to test dataset file"""
return self.dataset_root / "test.jsonl"
+ @property
+ def pad_cu_seqlens(self) -> bool:
+ """Whether to pad cu_seqlens to a constant shape"""
+ if self.packed_sequence_size > 0:
+ if self.packed_sequence_specs.pad_cu_seqlens is not None:
+ return self.packed_sequence_specs.pad_cu_seqlens
+ else:
+ return self._pad_cu_seqlens
+ return False
+
def _extract_tokenizer_model_name(self) -> str:
"""Automatically get the model name from model path."""
if self.packed_sequence_specs.tokenizer_model_name is not None:
diff --git a/nemo/collections/llm/gpt/data/mock.py b/nemo/collections/llm/gpt/data/mock.py
index 1dea7a823253..c07ea37b4399 100644
--- a/nemo/collections/llm/gpt/data/mock.py
+++ b/nemo/collections/llm/gpt/data/mock.py
@@ -45,6 +45,8 @@ def __init__(
pin_memory: bool = True,
persistent_workers: bool = False,
create_attention_mask: bool = False,
+ vocab_file: Optional[str] = None,
+ merges_file: Optional[str] = None,
):
super().__init__()
self.seq_length = seq_length
@@ -61,7 +63,9 @@ def __init__(
if tokenizer is None:
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
- self.tokenizer = get_nmt_tokenizer("megatron", "GPT2BPETokenizer")
+ self.tokenizer = get_nmt_tokenizer(
+ "megatron", "GPT2BPETokenizer", vocab_file=vocab_file, merges_file=merges_file
+ )
else:
self.tokenizer = tokenizer
diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py
index 1f43bce99e62..11208ffc7929 100644
--- a/nemo/collections/llm/gpt/data/packed_sequence.py
+++ b/nemo/collections/llm/gpt/data/packed_sequence.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import json
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
@@ -50,6 +51,7 @@ def tokenize_dataset(path: Path, tokenizer: TokenizerSpec, max_seq_length: int,
def prepare_packed_sequence_data(
input_path: Path,
output_path: Path,
+ output_metadata_path: Path,
packed_sequence_size: int,
tokenizer: TokenizerSpec,
max_seq_length: int,
@@ -77,11 +79,15 @@ def prepare_packed_sequence_data(
dataset = tokenize_dataset(input_path, tokenizer, max_seq_length, seed)
sequences, histogram = create_hist(dataset, max_seq_length)
- assignments = create_packing_strategy(histogram, packed_sequence_size, packing_algorithm)
+ assignments, packing_metadata = create_packing_strategy(histogram, packed_sequence_size, packing_algorithm)
output_data = fill_packing_strategy(assignments, sequences, packed_sequence_size, tokenizer.eos_id)
# save output data
np.save(output_path, output_data)
+ # save packing metadata
+ if output_metadata_path is not None:
+ with open(output_metadata_path, "w") as f:
+ json.dump(packing_metadata, f)
logging.info(f"Packed sequence is prepared and saved to {output_path}")
@@ -111,6 +117,21 @@ class PackedSequenceSpecs:
If specified, use this file for the packed validation dataset instead of the default path.
"""
+ packed_train_metadata_path: str = None
+ """
+ If specified, use this file for the train packing metadata instead of the default path.
+ """
+
+ packed_val_metadata_path: str = None
+ """
+ If specified, use this file for the val packing metadata instead of the default path.
+ """
+
+ pad_cu_seqlens: bool = False
+ """
+ If True, pad cu_seqlens to a constant size, which is required for use with cudagraphs.
+ """
+
def __post_init__(self):
if self.packed_train_data_path is not None:
self.packed_train_data_path = Path(self.packed_train_data_path)
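To connect the new fields to the fine-tuning data module, a configuration sketch follows; the values are illustrative assumptions, and when the metadata paths are omitted the data module derives them under <dataset_root>/packed/<tokenizer_model_name>/ as shown earlier.

from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs

specs = PackedSequenceSpecs(
    packed_sequence_size=2048,       # enables packing and the metadata properties
    pad_cu_seqlens=True,             # padded cu_seqlens, required for cudagraphs
    packed_train_metadata_path="/data/packed/train_2048_metadata.jsonl",   # optional override
    packed_val_metadata_path="/data/packed/val_2048_metadata.jsonl",       # optional override
)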
diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py
index c283b802a118..df3263559338 100644
--- a/nemo/collections/llm/gpt/model/baichuan.py
+++ b/nemo/collections/llm/gpt/model/baichuan.py
@@ -20,13 +20,13 @@
import torch.nn.functional as F
from torch import nn
-from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel
+from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel, torch_dtype_from_mcore_config
from nemo.collections.llm.utils import Config
from nemo.lightning import OptimizerModule, io, teardown
from nemo.lightning.pytorch.utils import dtype_from_hf
if TYPE_CHECKING:
- from transformers import AutoConfig, AutoModelForCausalLM
+ from transformers import AutoModelForCausalLM
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
@@ -142,16 +142,23 @@ def make_vocab_size_divisible_by(vocab_size):
@io.model_exporter(Baichuan2Model, "hf")
class HFBaichuan2Exporter(io.ModelConnector[Baichuan2Model, "AutoModelForCausalLM"]):
- def init(self) -> "AutoModelForCausalLM":
+ def init(self, dtype=torch.bfloat16, model_name="baichuan-inc/Baichuan2-7B-Base") -> "AutoModelForCausalLM":
from transformers import AutoModelForCausalLM
from transformers.modeling_utils import no_init_weights
with no_init_weights(True):
- return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True)
+ # Since Baichuan is not importable from transformers, we can only initialize the HF model
+ # from a known checkpoint. If more than 1 Baichuan model is supported in NeMo in the future,
+ # the model_name will need to be passed in.
+ return AutoModelForCausalLM.from_pretrained(
+ model_name,
+ trust_remote_code=True,
+ torch_dtype=dtype,
+ )
def apply(self, output_path: Path) -> Path:
- target = self.init()
source, _ = self.nemo_load(str(self))
+ target = self.init(torch_dtype_from_mcore_config(source.config))
target = self.convert_state(source, target)
target = target.cpu()
@@ -177,23 +184,6 @@ def convert_state(self, source, target):
def tokenizer(self):
return io.load_context(str(self)).model.tokenizer.tokenizer
- @property
- def config(self) -> "AutoConfig":
- source: Baichuan2Config = io.load_context(str(self)).model.config
-
- return AutoConfig(
- num_hidden_layers=source.num_layers,
- hidden_size=source.hidden_size,
- intermediate_size=source.ffn_hidden_size,
- num_attention_heads=source.num_attention_heads,
- max_position_embeddings=source.seq_length,
- initializer_range=source.init_method_std,
- rms_norm_eps=source.layernorm_epsilon,
- num_key_value_heads=source.num_query_groups,
- rope_theta=source.rotary_base,
- vocab_size=self.tokenizer.vocab_size,
- )
-
@io.state_transform(
source_key="model.layers.*.self_attn.W_pack.weight",
diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py
index 563a2cde5854..1a28dc26b25c 100644
--- a/nemo/collections/llm/gpt/model/base.py
+++ b/nemo/collections/llm/gpt/model/base.py
@@ -116,7 +116,10 @@ def transformer_engine_layer_spec(config: "GPTConfig") -> ModuleSpec:
from megatron.core.models.gpt import gpt_layer_specs
return gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec(
- num_experts=config.num_moe_experts, moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm
+ num_experts=config.num_moe_experts,
+ moe_grouped_gemm=config.moe_grouped_gemm,
+ qk_layernorm=config.qk_layernorm,
+ fp8=bool(config.num_moe_experts and (config.fp8 is not None)),
)
@@ -173,6 +176,7 @@ class GPTConfig(TransformerConfig, io.IOMixin):
gradient_accumulation_fusion: bool = _grad_accum_fusion_available
deallocate_pipeline_outputs: bool = True
scatter_embedding_sequence_parallel: bool = True
+ tp_only_amax_red: bool = False
use_transformer_engine_full_layer_spec: bool = False
transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = default_layer_spec
@@ -181,6 +185,13 @@ class GPTConfig(TransformerConfig, io.IOMixin):
data_step_fn: Callable = gpt_data_step
def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MCoreGPTModel":
+ if self.enable_cuda_graph:
+ assert HAVE_TE, "Transformer Engine is required for cudagraphs."
+ assert getattr(self, 'use_te_rng_tracker', False), (
+ "Transformer engine's RNG tracker is required for cudagraphs, it can be "
+ "enabled with use_te_rng_tracker=True'."
+ )
+
vp_size = self.virtual_pipeline_model_parallel_size
if vp_size:
p_size = self.pipeline_model_parallel_size
@@ -196,10 +207,11 @@ def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MC
if hasattr(self, 'vocab_size'):
vocab_size = self.vocab_size
- logging.info(
- f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:"
- f" {vocab_size - tokenizer.vocab_size}."
- )
+ if tokenizer is not None:
+ logging.info(
+ f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:"
+ f" {vocab_size - tokenizer.vocab_size}."
+ )
else:
vocab_size = get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by)
@@ -395,11 +407,21 @@ def get_inference_wrapper(self, params_dtype, inference_batch_times_seqlen_thres
if mcore_model is None or type(mcore_model) is not MCoreGPTModel:
raise ValueError("Exact McoreGPTModel instance not found in the model structure.")
+ vocab_size = None
+ if self.tokenizer is not None:
+ vocab_size = self.tokenizer.vocab_size
+ elif hasattr(self.config, 'vocab_size'):
+ vocab_size = self.config.vocab_size
+ else:
+ raise ValueError(
+ 'Unable to find vocab size. Either pass in a tokenizer with vocab size, or set vocab size in the model config'
+ )
+
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=mcore_model.config.hidden_size,
params_dtype=params_dtype,
inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold,
- padded_vocab_size=self.tokenizer.vocab_size,
+ padded_vocab_size=vocab_size,
)
model_inference_wrapper = GPTInferenceWrapper(mcore_model, inference_wrapper_config)
diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py
index 04540294d82a..0cd6cea1aa8c 100644
--- a/nemo/collections/llm/gpt/model/llama.py
+++ b/nemo/collections/llm/gpt/model/llama.py
@@ -11,12 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
+import json
import math
from dataclasses import dataclass
from functools import partial
from pathlib import Path
-from typing import TYPE_CHECKING, Annotated, Callable, Optional
+from typing import TYPE_CHECKING, Annotated, Callable, Optional, Union
import torch
import torch.nn.functional as F
@@ -25,11 +25,15 @@
from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel, torch_dtype_from_mcore_config
from nemo.collections.llm.utils import Config
from nemo.lightning import OptimizerModule, io, teardown
+from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME
+from nemo.lightning.io.pl import ckpt_to_weights_subdir
+from nemo.lightning.io.state import TransformFns
from nemo.lightning.pytorch.utils import dtype_from_hf
from nemo.utils import logging
if TYPE_CHECKING:
from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel
+ from peft import PeftConfig
from transformers import LlamaConfig as HFLlamaConfig
from transformers import LlamaForCausalLM
@@ -345,7 +349,7 @@ def apply(self, output_path: Path) -> Path:
target = target.cpu()
target.save_pretrained(output_path)
try:
- self.tokenizer.save_pretrained(output_path)
+ self.tokenizer.tokenizer.save_pretrained(output_path)
except Exception:
logging.warning("Failed to save tokenizer")
@@ -359,17 +363,20 @@ def convert_state(self, source, target):
"decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
"decoder.final_layernorm.weight": "model.norm.weight",
}
+ transforms = [_export_qkv, _export_linear_fc1, _export_embedding]
+ if not self.config.tie_word_embeddings:
+ transforms.append(_export_head)
return io.apply_transforms(
source,
target,
mapping=mapping,
- transforms=[_export_qkv, _export_linear_fc1, _export_embedding, _export_head],
+ transforms=transforms,
)
@property
- def tokenizer(self):
- return io.load_context(str(self), subpath="model").tokenizer.tokenizer
+ def tokenizer(self) -> "TokenizerSpec":
+ return io.load_context(str(self), subpath="model").tokenizer
@property
def config(self) -> "HFLlamaConfig":
@@ -392,6 +399,154 @@ def config(self) -> "HFLlamaConfig":
)
+@io.model_exporter(LlamaModel, "hf-peft")
+class HFLlamaPEFTExporter(HFLlamaExporter):
+ def init(self, dtype=torch.bfloat16) -> "AutoPeftModelForCausalLM":
+ from peft import get_peft_model
+
+ model = super().init(dtype=dtype)
+
+ # Infer base model checkpoint from checkpoint metadata file
+ adapter_meta_path = ckpt_to_weights_subdir(str(self), is_saving=False) / ADAPTER_META_FILENAME
+ with open(adapter_meta_path, "r") as f:
+ model_ckpt_path = json.load(f)['model_ckpt_path']
+ model.name_or_path = '/'.join(model_ckpt_path.split("/")[-2:])
+
+ return get_peft_model(model, self.peft_config, autocast_adapter_dtype=False)
+
+ def apply(self, output_path: Path) -> Path:
+ from nemo.collections.llm.peft import CanonicalLoRA, DoRA, LoRA
+
+ self.peft_obj: Union[LoRA, DoRA, CanonicalLoRA] = io.load_context(str(self)).model.model_transform
+
+ source, _ = self.nemo_load(str(self))
+ target = self.init(torch_dtype_from_mcore_config(source.config))
+ target = self.convert_state(source, target)
+ target = target.cpu()
+ target.save_pretrained(output_path, save_embedding_layers=False)
+
+ return output_path
+
+ def convert_state(self, source, target):
+ from nemo.collections.llm.peft import CanonicalLoRA
+
+ # nemo and HF prefixes
+ pn = "decoder.layers."
+ ph = "base_model.model.model.layers."
+
+ mapping = {
+ # linear_proj for both canonical and performant lora
+ f"{pn}*.self_attention.linear_proj.adapter.linear_in.weight": f"{ph}*.self_attn.o_proj.lora_A.default.weight",
+ f"{pn}*.self_attention.linear_proj.adapter.linear_out.weight": f"{ph}*.self_attn.o_proj.lora_B.default.weight",
+ # linear_fc2 for both canonical and performant lora
+ f"{pn}*.mlp.linear_fc2.adapter.linear_in.weight": f"{ph}*.mlp.down_proj.lora_A.default.weight",
+ f"{pn}*.mlp.linear_fc2.adapter.linear_out.weight": f"{ph}*.mlp.down_proj.lora_B.default.weight",
+ }
+ transforms = []
+
+ if isinstance(self.peft_obj, CanonicalLoRA):
+ mapping.update(
+ {
+ # linear_qkv for canonical lora
+ f"{pn}*.self_attention.linear_qkv.adapter.adapter_q.linear_in.weight": f"{ph}*.self_attn.q_proj.lora_A.default.weight",
+ f"{pn}*.self_attention.linear_qkv.adapter.adapter_q.linear_out.weight": f"{ph}*.self_attn.q_proj.lora_B.default.weight",
+ f"{pn}*.self_attention.linear_qkv.adapter.adapter_k.linear_in.weight": f"{ph}*.self_attn.k_proj.lora_A.default.weight",
+ f"{pn}*.self_attention.linear_qkv.adapter.adapter_k.linear_out.weight": f"{ph}*.self_attn.k_proj.lora_B.default.weight",
+ f"{pn}*.self_attention.linear_qkv.adapter.adapter_v.linear_in.weight": f"{ph}*.self_attn.v_proj.lora_A.default.weight",
+ f"{pn}*.self_attention.linear_qkv.adapter.adapter_v.linear_out.weight": f"{ph}*.self_attn.v_proj.lora_B.default.weight",
+ # linear_fc1 for canonical lora
+ f"{pn}*.mlp.linear_fc1.adapter.adapter_up.linear_in.weight": f"{ph}*.mlp.up_proj.lora_A.default.weight",
+ f"{pn}*.mlp.linear_fc1.adapter.adapter_up.linear_out.weight": f"{ph}*.mlp.up_proj.lora_B.default.weight",
+ f"{pn}*.mlp.linear_fc1.adapter.adapter_gate.linear_in.weight": f"{ph}*.mlp.gate_proj.lora_A.default.weight",
+ f"{pn}*.mlp.linear_fc1.adapter.adapter_gate.linear_out.weight": f"{ph}*.mlp.gate_proj.lora_B.default.weight",
+ }
+ )
+ else:
+ transforms.extend(
+ [
+ # linear_qkv for performant lora
+ io.state_transform(
+ source_key=f"{pn}*.self_attention.linear_qkv.adapter.linear_in.weight",
+ target_key=(
+ f"{ph}*.self_attn.q_proj.lora_A.default.weight",
+ f"{ph}*.self_attn.k_proj.lora_A.default.weight",
+ f"{ph}*.self_attn.v_proj.lora_A.default.weight",
+ ),
+ fn=TransformFns.duplicate3,
+ ),
+ io.state_transform(
+ source_key=f"{pn}*.self_attention.linear_qkv.adapter.linear_out.weight",
+ target_key=(
+ f"{ph}*.self_attn.q_proj.lora_B.default.weight",
+ f"{ph}*.self_attn.k_proj.lora_B.default.weight",
+ f"{ph}*.self_attn.v_proj.lora_B.default.weight",
+ ),
+ fn=TransformFns.split_qkv,
+ ),
+ # linear_fc1 for performant lora
+ io.state_transform(
+ source_key=f"{pn}*.mlp.linear_fc1.adapter.linear_in.weight",
+ target_key=(
+ f"{ph}*.mlp.gate_proj.lora_A.default.weight",
+ f"{ph}*.mlp.up_proj.lora_A.default.weight",
+ ),
+ fn=TransformFns.duplicate2,
+ ),
+ io.state_transform(
+ source_key=f"{pn}*.mlp.linear_fc1.adapter.linear_out.weight",
+ target_key=(
+ f"{ph}*.mlp.gate_proj.lora_B.default.weight",
+ f"{ph}*.mlp.up_proj.lora_B.default.weight",
+ ),
+ fn=TransformFns.split_fc1,
+ ),
+ ]
+ )
+
+ return io.apply_transforms(
+ source,
+ target,
+ mapping=mapping,
+ transforms=transforms,
+ )
+
+ @property
+ def peft_config(self) -> "PeftConfig":
+ from peft import LoraConfig
+
+ from nemo.collections.llm.peft import DoRA
+
+ assert (
+ not self.peft_obj.dropout or self.peft_obj.dropout_position == 'pre'
+ ), "LoRA dropout_position must be 'pre' to convert to HF."
+
+ NEMO2HF = {
+ 'linear_q': ['q_proj'],
+ 'linear_k': ['k_proj'],
+ 'linear_v': ['v_proj'],
+ 'linear_qkv': ['q_proj', 'k_proj', 'v_proj'],
+ 'linear_proj': ['o_proj'],
+ 'linear_fc1_up': ['up_proj'],
+ 'linear_fc1_gate': ['gate_proj'],
+ 'linear_fc1': ['up_proj', 'gate_proj'],
+ 'linear_fc2': ['down_proj'],
+ }
+
+ # Infer HF target modules from NeMo target modules
+ hf_target_modules = []
+ for tm in self.peft_obj.target_modules:
+ hf_target_modules.extend(NEMO2HF[tm])
+
+ return LoraConfig(
+ r=self.peft_obj.dim,
+ target_modules=hf_target_modules,
+ lora_alpha=self.peft_obj.alpha,
+ lora_dropout=self.peft_obj.dropout,
+ use_dora=isinstance(self.peft_obj, DoRA),
+ )
+
+
@io.state_transform(
source_key=(
"model.layers.*.self_attn.q_proj.weight",
diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py
index 919705fc02f1..123e01aeccb8 100644
--- a/nemo/collections/llm/gpt/model/mixtral.py
+++ b/nemo/collections/llm/gpt/model/mixtral.py
@@ -71,6 +71,12 @@ class MixtralConfig(GPTConfig):
bf16: bool = True
params_dtype: torch.dtype = torch.bfloat16
+ # fusions
+ apply_rope_fusion: bool = True
+ bias_activation_fusion: bool = True
+ bias_dropout_fusion: bool = True
+ masked_softmax_fusion: bool = False
+
@dataclass
class MixtralConfig8x3B(MixtralConfig):
diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py
index e4ad22f66239..3e6c6ad14474 100644
--- a/nemo/collections/llm/gpt/model/nemotron.py
+++ b/nemo/collections/llm/gpt/model/nemotron.py
@@ -51,6 +51,7 @@ class NemotronConfig(GPTConfig):
bias_dropout_add_fusion: bool = False
layernorm_zero_centered_gamma: bool = True
cross_entropy_loss_fusion: bool = True
+ apply_rope_fusion: bool = True
# Nemotron3Config4B as default configs
num_layers: int = 32
diff --git a/nemo/collections/llm/peft/__init__.py b/nemo/collections/llm/peft/__init__.py
index 80d32e242302..a9a3307eef76 100644
--- a/nemo/collections/llm/peft/__init__.py
+++ b/nemo/collections/llm/peft/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from nemo.collections.llm.peft.api import gpt_lora, merge_lora
+from nemo.collections.llm.peft.api import export_lora, gpt_lora, merge_lora
from nemo.collections.llm.peft.canonical_lora import CanonicalLoRA
from nemo.collections.llm.peft.dora import DoRA
from nemo.collections.llm.peft.lora import LoRA
@@ -26,4 +26,4 @@
"canonical_lora": CanonicalLoRA,
}
-__all__ = ["LoRA", "DoRA", "CanonicalLoRA", "gpt_lora", "PEFT_STR2CLS", "merge_lora"]
+__all__ = ["LoRA", "DoRA", "CanonicalLoRA", "gpt_lora", "PEFT_STR2CLS", "merge_lora", "export_lora"]
diff --git a/nemo/collections/llm/peft/api.py b/nemo/collections/llm/peft/api.py
index a089a6d17515..c05fd0b8edde 100644
--- a/nemo/collections/llm/peft/api.py
+++ b/nemo/collections/llm/peft/api.py
@@ -17,17 +17,19 @@
from typing import Tuple, Union
import pytorch_lightning as pl
+import torch
from megatron.core import dist_checkpointing
from pytorch_lightning.trainer.states import TrainerFn
+from rich.console import Console
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.llm.peft.lora import LoRA, LoRAMerge
from nemo.collections.llm.utils import factory
from nemo.lightning import MegatronStrategy, Trainer, _strategy_lib, io
from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME, ckpt_to_context_subdir
+from nemo.lightning.io import api
from nemo.lightning.io.pl import TrainerContext, ckpt_to_weights_subdir
from nemo.lightning.pytorch.callbacks import PEFT
-from nemo.lightning.pytorch.callbacks.peft import PEFT
from nemo.lightning.pytorch.strategies.utils import RestoreConfig
from nemo.utils import logging
@@ -37,6 +39,39 @@ def gpt_lora() -> PEFT:
return LoRA()
+def export_lora(
+ lora_checkpoint_path: str,
+ output_path: str,
+):
+ """
+ Export the LoRA adapter weights to HF format. Requires an implementation of HF PEFT exporter class.
+ See HFLlamaPEFTExporter for an example.
+
+ Python Usage:
+ ```python
+ if __name__ == '__main__':
+ llm.peft.export_lora(
+ lora_checkpoint_path=your_lora_checkpoint_path,
+ output_path=your_output_path,
+ )
+ ```
+
+ Args:
+ lora_checkpoint_path: The path to the LoRA checkpoint.
+ output_path: The path to save the HF checkpoint.
+
+ """
+ output = api.export_ckpt(
+ path=Path(lora_checkpoint_path),
+ target="hf-peft",
+ output_path=Path(output_path),
+ )
+
+ console = Console()
+ console.print(f"[green]✓ LoRA checkpoint exported to {output}[/green]")
+ return output
+
+
def merge_lora(
lora_checkpoint_path: str,
output_path: str,
@@ -74,11 +109,15 @@ def merge_lora(
merged_weights = {k: v for k, v in merged_model.sharded_state_dict().items() if ".adapter." not in k}
_save_merged_weight(output_path, merged_weights, model, trainer)
+ console = Console()
+ console.print(f"[green]✓ LoRA checkpoint merged and saved to {output_path}[/green]")
+
def _load_base_model_and_lora(lora_checkpoint_path: Path) -> Tuple[pl.LightningModule, LoRA]:
model = io.load_context(ckpt_to_context_subdir(lora_checkpoint_path), "model")
model.model_transform, model.__io__.model_transform = None, None
- model.config.bf16 = False
+ model.config.bf16 = True
+ model.config.params_dtype = torch.bfloat16
lora: Union[io.TrainerContext, LoRA] = io.load_context(
ckpt_to_context_subdir(lora_checkpoint_path), "model.model_transform"
)
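
For reference, the two PEFT entry points touched above can be driven roughly as follows. This is a minimal sketch: the checkpoint and output paths are hypothetical, and it assumes `llm.peft` is reached via `nemo.collections.llm` as in the `export_lora` docstring.

```python
# Minimal sketch (hypothetical paths): merge LoRA weights into the base model,
# then export the adapter weights alone in HF PEFT format.
from nemo.collections import llm

if __name__ == "__main__":
    # Merge the adapter into the base weights and save a standalone NeMo checkpoint.
    llm.peft.merge_lora(
        lora_checkpoint_path="/results/llama3_lora/checkpoints/last",  # hypothetical
        output_path="/results/llama3_lora_merged",                     # hypothetical
    )

    # Export only the LoRA adapter weights in HF PEFT format (requires an HF PEFT
    # exporter implementation for the model, e.g. HFLlamaPEFTExporter).
    llm.peft.export_lora(
        lora_checkpoint_path="/results/llama3_lora/checkpoints/last",  # hypothetical
        output_path="/results/llama3_lora_hf",                         # hypothetical
    )
```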
diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py
index 0ce6138d1c6b..6c7e7e93ae8f 100644
--- a/nemo/collections/llm/peft/lora.py
+++ b/nemo/collections/llm/peft/lora.py
@@ -17,6 +17,7 @@
from typing import List, Literal
import torch
+import torch.nn.functional as F
from torch import nn
from nemo.collections.llm.peft.utils import get_adapter_attributes_from_linear, is_expert_linear, wildcard_match
@@ -39,9 +40,23 @@ def forward(self, x):
return linear_output + adapter_output, bias
-class LinearAdapter(nn.Module):
+class LinearAdapter(nn.Linear):
"""
- A simple LoRA linear module for non-megatron models.
+ Linear + LoRA, maintains ckpt structure (i.e. Linear's weight/bias remain at the same FQN)
+
+ The _init_adapter and _forward methods provide the LoRA functionality. We want to be able to
+ use those inside LinearAdapter but also for monkey-patching modules, without repeating the
+ same code -> therefore those are decorated with @staticmethod.
+
+ Args:
+ orig_linear (nn.Module): the linear module to augment.
+ dim (int): lora's dim in_features -> dim -> out_features.
+ alpha (int): lora's scaling alpha.
+ dropout (float): dropout prob (default: 0.1).
+ dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
+ lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
+ lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
+ are quantized weights (e.g. 4bit) needs to be specified explicitly.
"""
def __init__(
@@ -54,46 +69,134 @@ def __init__(
lora_A_init_method='xavier',
lora_dtype=None,
):
- super(LinearAdapter, self).__init__()
assert isinstance(orig_linear, nn.Linear)
+ super(LinearAdapter, self).__init__(
+ in_features=orig_linear.in_features,
+ out_features=orig_linear.out_features,
+ bias=orig_linear.bias is not None,
+ device=orig_linear.weight.device,
+ dtype=orig_linear.weight.dtype,
+ )
+ # copy weights
+ self.weight.data.copy_(orig_linear.weight.data)
+ if orig_linear.bias is not None:
+ self.bias.data.copy_(orig_linear.bias.data)
+ # initialize the adapter
+ LinearAdapter._init_adapter(self)
+
+ @staticmethod
+ def _init_adapter(
+ obj,
+ dim=8,
+ alpha=32,
+ dropout=0.1,
+ dropout_position='post',
+ lora_A_init_method='xavier',
+ lora_dtype=None,
+ ):
+ """Adds LoRA weights to obj. The obj is either a LinearAdapter or an nn.Module (when
+ monkey-patching).
- self.orig_linear = orig_linear
- self.dim = dim
- self.scale = alpha / dim
+ Args:
+ obj (LinearAdapter | nn.Module): input module to adapt.
+ dim (int): lora's dim in_features -> dim -> out_features.
+ alpha (int): lora's scaling alpha.
+ dropout (float): dropout prob (default: 0.1).
+ dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
+ lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
+ lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
+ are quantized weights (e.g. 4bit) needs to be specified explicitly.
+ """
+ obj.dim = dim
+ obj.scale = alpha / dim
# Freezer
- device = self.orig_linear.weight.device
- self.orig_linear.weight.requires_grad = False
- if self.orig_linear.bias is not None:
- self.orig_linear.bias.requires_grad = False
+ device = obj.weight.device
+ obj.weight.requires_grad = False
+ if obj.bias is not None:
+ obj.bias.requires_grad = False
- in_features = self.orig_linear.in_features
- out_features = self.orig_linear.out_features
- dtype = lora_dtype or self.orig_linear.weight.dtype
+ in_features = obj.in_features
+ out_features = obj.out_features
+ dtype = lora_dtype or obj.weight.dtype
- self.lora_a = nn.Parameter(torch.zeros((in_features, dim), dtype=dtype, device=device))
- self.lora_b = nn.Parameter(torch.zeros((dim, out_features), dtype=dtype, device=device))
+ obj.lora_a = nn.Parameter(torch.zeros((in_features, dim), dtype=dtype, device=device))
+ obj.lora_b = nn.Parameter(torch.zeros((dim, out_features), dtype=dtype, device=device))
if lora_A_init_method == 'xavier':
- torch.nn.init.uniform_(self.lora_a)
+ torch.nn.init.uniform_(obj.lora_a)
else:
- nn.init.kaiming_uniform_(self.lora_a, a=math.sqrt(5))
+ nn.init.kaiming_uniform_(obj.lora_a, a=math.sqrt(5))
- self.dropout = nn.Dropout(p=dropout)
+ obj.dropout = nn.Dropout(p=dropout)
assert dropout_position in ['pre', 'post'], dropout_position
- self.dropout_position = dropout_position
+ obj.dropout_position = dropout_position
- def forward(self, x):
+ @staticmethod
+ def _forward(obj, x, fwd=None):
# pylint: disable=C0115,C0116
- res = self.orig_linear(x)
- if self.dropout_position == 'pre':
- x = self.dropout(x)
- lora_res = x @ self.lora_a
- lora_res = lora_res @ self.lora_b
- lora_res = lora_res * self.scale
- if self.dropout_position == 'post':
- lora_res = self.dropout(lora_res)
+ if fwd is not None:
+ res = fwd(x)
+ else:
+ res = F.linear(x, obj.weight, obj.bias)
+ if obj.dropout_position == 'pre':
+ x = obj.dropout(x)
+ lora_res = x @ obj.lora_a
+ lora_res = lora_res @ obj.lora_b
+ lora_res = lora_res * obj.scale
+ if obj.dropout_position == 'post':
+ lora_res = obj.dropout(lora_res)
return res + lora_res
+ def forward(self, x):
+ return LinearAdapter._forward(self, x)
+
+
+def patch_linear_module(
+ orig_linear,
+ dim=8,
+ alpha=32,
+ dropout=0.1,
+ dropout_position='post',
+ lora_A_init_method='xavier',
+ lora_dtype=None,
+):
+ """Monkey-patches a nn.Linear (orig_linear param) to be a LinearAdapter, for all purposes
+ think of this function as replacing a nn.Linear with a LinearAdapter defined above.
+
+ The orig_linear might not contain valid weights, for example, the given orig_linear was
+ initialized within a context-manager that uses a "meta" device. Therefore, we cannot copy
+ the weight/bias from the orig_linear to the LinearAdapter, since those have not been allocated.
+
+ To circumvent this scenario, LinearAdapter's additional functionality (_init_adapter, _forward)
+ is based on static functions, so that we can use them for patching or when allocating a
+ new LinearAdapter object.
+
+ Args:
+ orig_linear (nn.Linear): the module we add adapter to.
+ dim (int, optional): Lora dim. Defaults to 8.
+ alpha (int, optional): Lora alpha scale. Defaults to 32.
+ dropout (float, optional): dropout prob. Defaults to 0.1.
+ dropout_position (str, optional): location to apply dropout wrt lora.
+ Defaults to 'post' (choices: 'pre', 'post').
+ lora_A_init_method (str, optional): lora_a init method. Defaults to 'xavier'.
+ lora_dtype (_type_, optional): Lora weights' dtype. By default will use orig_linear's dtype
+ but orig_linear might use non-trainable dtype (e.g. 4bit), in which case the user must
+ specify the dtype manually. Defaults to None.
+
+ Returns:
+ (nn.Module): the monkey-patched (nn.Linear + LoRA) nn.Module
+ """
+
+ assert isinstance(orig_linear, nn.Linear)
+
+ LinearAdapter._init_adapter(orig_linear, dim, alpha, dropout, dropout_position, lora_A_init_method, lora_dtype)
+ fwd = None
+ # If the model uses quantized weights, we want to use orig_linear's forward
+ if orig_linear.weight.dtype == torch.uint8:
+ fwd = orig_linear.forward
+ orig_linear.forward = lambda x: LinearAdapter._forward(orig_linear, x, fwd)
+ return orig_linear
+
@dataclass
class LoRA(PEFT):
@@ -168,7 +271,12 @@ def transform(self, m: nn.Module, name=None, prefix=None):
full_name = f"{prefix}.{name}" if prefix else name
if name in self.target_modules or any(wildcard_match(pattern, full_name) for pattern in self.target_modules):
if isinstance(m, nn.Linear):
- return LinearAdapter(
+ if self._is_fsdp_v1 or m.weight.data.dtype == torch.uint8:
+ lora_cls = patch_linear_module
+ else:
+ lora_cls = LinearAdapter
+
+ return lora_cls(
m,
dim=self.dim,
alpha=self.alpha,
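
The `transform` hunk above picks between wrapping a layer in `LinearAdapter` and monkey-patching it in place with `patch_linear_module`. A rough, self-contained sketch of both paths on toy `nn.Linear` layers (dimensions arbitrary) is shown below.

```python
# Sketch only: exercising both LoRA code paths on toy nn.Linear layers.
import torch
from torch import nn

from nemo.collections.llm.peft.lora import LinearAdapter, patch_linear_module

# Path 1: wrap in LinearAdapter, an nn.Linear subclass (weights copied, base frozen).
wrapped = LinearAdapter(nn.Linear(64, 64))

# Path 2: monkey-patch the module in place (used for FSDP v1 or quantized uint8
# weights, where copying weights into a freshly allocated module is not possible).
patched = patch_linear_module(nn.Linear(64, 64), dim=8, alpha=32)

x = torch.randn(2, 64)
print(wrapped(x).shape, patched(x).shape)  # both torch.Size([2, 64])
```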
diff --git a/nemo/collections/llm/quantization/quantizer.py b/nemo/collections/llm/quantization/quantizer.py
index 4779cc3915c8..16ae1319e733 100644
--- a/nemo/collections/llm/quantization/quantizer.py
+++ b/nemo/collections/llm/quantization/quantizer.py
@@ -198,7 +198,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None):
# TODO: Investigate why enabling FP8 kv cache will cause accuracy regressions for Nemotron.
enable_quant_kv_cache = self.quantization_config.enable_kv_cache
if enable_quant_kv_cache is None:
- enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gptnext"
+ enable_quant_kv_cache = "int8" not in algorithm and decoder_type != "gpt"
logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
quant_cfg["quant_cfg"]["*output_quantizer"] = {
"num_bits": 8 if algorithm == "int8_sq" else (4, 3),
@@ -212,7 +212,7 @@ def quantize(self, model: MegatronParallel, forward_loop=None):
unwrapped_model = mtq.quantize(unwrapped_model, quant_cfg, forward_loop)
- if decoder_type == "gptnext":
+ if decoder_type == "gpt":
# We found squared_relu may have an under-calibration problem.
# Clamp the scaling_factor with a min threshold to avoid under-calibration.
match algorithm:
diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py
index aa7fa61f1b38..13859260f3e2 100644
--- a/nemo/collections/llm/quantization/utils.py
+++ b/nemo/collections/llm/quantization/utils.py
@@ -33,10 +33,10 @@ def get_modelopt_decoder_type(model: llm.GPTModel) -> str:
(llm.LlamaModel, "llama"),
(llm.MistralModel, "llama"),
(llm.MixtralModel, "llama"),
- (llm.NemotronModel, "gptnext"),
+ (llm.NemotronModel, "gpt"),
(llm.Qwen2Model, "qwen"),
- (llm.StarcoderModel, "gptnext"),
- (llm.Starcoder2Model, "gptnext"),
+ (llm.StarcoderModel, "gpt"),
+ (llm.Starcoder2Model, "gpt"),
(llm.Phi3Model, "phi3"),
]
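
To make the `gptnext` -> `gpt` rename concrete, the sketch below restates the KV-cache gate from the `quantizer.py` hunk above as a standalone function; the inputs are hypothetical and this is not the actual `Quantizer` API.

```python
# Standalone restatement of the KV-cache quantization gate shown above.
def kv_cache_enabled(algorithm: str, decoder_type: str, user_override=None) -> bool:
    if user_override is not None:
        return user_override
    # FP8 KV cache is skipped for int8 algorithms and for "gpt" decoders
    # (the Nemotron/Starcoder families after the rename above).
    return "int8" not in algorithm and decoder_type != "gpt"


print(kv_cache_enabled("fp8", "llama"))      # True
print(kv_cache_enabled("int8_sq", "llama"))  # False
print(kv_cache_enabled("fp8", "gpt"))        # False
```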
diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
index 6fa29957e305..d7cd84a615c8 100644
--- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
+++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md
@@ -5,7 +5,7 @@
- https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/model_parallel_config.py
- The parameter values are defaults as defined in the base class for a module.
-- Clik on the links to see complete list of configuration options for a module.
+- Click on the links to see complete list of configuration options for a module.
recipe
@@ -16,135 +16,213 @@
+ ```sh
+ accumulate_grad_batches: int = 1 # Accumulates gradients over k batches before stepping the optimizer
+ limit_test_batches: int = 1 # How much of test dataset to check (float = fraction, int = num_batches)
+ limit_val_batches: int = 1 # How much of validation dataset to check (float = fraction, int = num_batches)
+ log_every_n_steps: int = 50 # How often to log within steps
+ max_steps: int = -1 # Stop training after this number of steps; disabled by default; If 'max_steps = -1' and 'max_epochs = None', will default to 'max_epochs = 1000'
+ num_nodes: int = 1 # Number of GPU nodes for distributed training
+ use_distributed_sampler: bool = False # Whether to wrap the DataLoader's sampler with :class:`torch.utils.data.DistributedSampler`. If not specified this is toggled automatically for strategies that require it. By default, it will add ``shuffle=True`` for the train sampler and 'shuffle=False' for validation/test/predict samplers. If you want to disable this logic, you can pass 'False' and add your own distributed sampler in the dataloader hooks. If ``True`` and a distributed sampler was already added, Lightning will not replace the existing one
+ val_check_interval: [int, float] = 1 # How often to check the validation set. Pass a 'float' in the range [0.0, 1.0] to check after a fraction of the training epoch. Pass an 'int' to check after a fixed number of training batches. An 'int' value can only be higher than the number of training batches when 'check_val_every_n_epoch=None', which validates after every 'N' training batches across epochs or during iteration-based training
+ max_epochs: Optional[int] = None # Stop training once this number of epochs is reached. If both max_epochs and max_steps are not specified, defaults to "max_epochs = 1000". To enable infinite training, set "max_epochs = -1"
+
+ ```
+
+
+
+
strategy (MegatronStrategy)
```sh
- tensor_model_parallel_size: int = 1
- pipeline_model_parallel_size: int = 1
- virtual_pipeline_model_parallel_size: Optional[int] = None
- context_parallel_size: int = 1
- sequence_parallel: bool = False
- expert_model_parallel_size: int = 1
- pipeline_dtype: Optional[torch.dtype] = None,
+ tensor_model_parallel_size: int = 1 # Intra-layer model parallelism. Splits tensors across GPU ranks
+ pipeline_model_parallel_size: int = 1 # Inter-layer model parallelism. Splits transformer layers across GPU ranks
+ virtual_pipeline_model_parallel_size: Optional[int] = None # Number of virtual blocks per pipeline model parallel rank is the virtual model parallel size
+ context_parallel_size: int = 1 # Splits network input along sequence dimension across GPU ranks
+ sequence_parallel: bool = False # Parallelizes layer norms and dropout sequentially
+ expert_model_parallel_size: int = 1 # Distributes MoE experts across the sub data parallel dimension
+ pipeline_dtype: Optional[torch.dtype] = None # dtype used in p2p communication
```
- ddp: Union[DDPLiteral, DistributedDataParallelConfig] = "megatron"
+ ddp: Union[DDPLiteral, DistributedDataParallelConfig] = "megatron"
```sh
- grad_reduce_in_fp32: bool = False # this is overridden by same config in MegatronMixedPrecision
- overlap_grad_reduce: bool = False
- overlap_param_gather: bool = False
- align_param_gather: bool = False
- use_distributed_optimizer: bool = False # this is overridden by same config in OptimizerConfig
- check_for_nan_in_grad: bool = False
- bucket_size: Optional[int] = None
- average_in_collective: bool = False
- fp8_param_gather: bool = False
+ grad_reduce_in_fp32: bool = False # If true, reduce grads in fp32; this is overridden by same config in MegatronMixedPrecision
+ overlap_grad_reduce: bool = False # If true, overlap grad all-reduce / reduce-scatter with backward compute
+ overlap_param_gather: bool = False # If true, overlap param all-gather with forward compute
+ align_param_gather: bool = False # If true, all PP stages will launch param all-gathers simultaneously
+ use_distributed_optimizer: bool = False # If true, issue reduce-scatter collectives to aggregate gradients and clean up originally allocated model parameters, otherwise issue all-reduce collectives ; this is overridden by same config in OptimizerConfig
+ check_for_nan_in_grad: bool = False # If true, check for NaNs in gradients _before_ communication collective
+ bucket_size: Optional[int] = None # Maximum number of parameters in each bucket
+ average_in_collective: bool = False # If true, compute average in collective directly, as opposed to dividing by the dp_size first and then computing sum in the collective
+ fp8_param_gather: bool = False # If true, keep the compute param in fp8 (do not use any other intermediate dtype) and perform the param all-gather in fp8
```
-
+
callbacks (Optional[List[Callback]]=None)
- TimingCallback (Callback)
+ TimingCallback (Callback)
```sh
- reduction: str = "mean"
- sync_cuda: bool = False
- buffer_size: int = 1
+ reduction: str = "mean" # reduction over multiple timings of the same timer
+ sync_cuda: bool = False # if True torch.cuda.synchronize() is called for start/stop
+ buffer_size: int = 1 # if positive, limits the number of stored measures per name
```
- MegatronCommOverlapCallback (Callback)
+ MegatronCommOverlapCallback (Callback)
```sh
- tp_comm_overlap: bool = None
- tp_comm_overlap_cfg: TransformerLayerTPOverlapCfg = None
- overlap_p2p_comm: bool = None
- batch_p2p_comm: bool = None
- overlap_grad_reduce: bool = None
- overlap_param_gather: bool = None
- overlap_param_gather_with_optimizer_step: bool = None
- align_param_gather: bool = None
- bucket_size: int = None
- defer_embedding_wgrad_compute: bool = None
- wgrad_deferral_limit: int = None
+ tp_comm_overlap: bool = None # Enable tensor parallel overlap
+ tp_comm_overlap_cfg: TransformerLayerTPOverlapCfg = None # Tensor parallel overlap config
+ overlap_p2p_comm: bool = None # Enable pipeline parallel communication overlap
+ batch_p2p_comm: bool = None # Batch pipeline parallel send and recv into a single op
+ overlap_grad_reduce: bool = None # Overlap data parallel gradient reduction with compute
+ overlap_param_gather: bool = None # Overlap data parallel parameter gather with compute
+ overlap_param_gather_with_optimizer_step: bool = None # Overlap the first data parallel parameter gather chunk with optimizer step
+ align_param_gather: bool = None # Align data parallel parameter gather across virtual pipeline chunks
+ bucket_size: int = None # The DDP bucket size, controls the data parallel overlap granularity
+ defer_embedding_wgrad_compute: bool = None # Overlap wgrads with the pipeline drain bubble for the last pipeline stage
+ wgrad_deferral_limit: int = None # Limit of how many outstanding wgrads may be overlapped with the pipeline drain bubble
```
- NsysCallback (Callback)
+ NsysCallback (Callback)
```sh
- start_step: int
- end_step: int
- ranks: List[int] = [0]
- gen_shape: bool = False
+ start_step: int # Global batch to start profiling
+ end_step: int # Global batch to end profiling
+ ranks: List[int] = [0] # Global rank IDs to profile
+ gen_shape: bool = False # Generate model and kernel details including input shapes
```
- MemoryProfileCallback (Callback)
+ MemoryProfileCallback (Callback)
```sh
- dir: str = "/mem_profile"
- warn_cycles: bool = True
- ranks: List = []
+ dir: str = "/mem_profile" # Directory to store the memory profile dump
+ warn_cycles: bool = True # Whether to enable [reference cycle detection](https://pytorch.org/blog/understanding-gpu-memory-2/)
+ ranks: List = [] # List of ranks to collect snapshot on, defaults to all if list is empty
```
- GarbageCollectionCallback (Callback)
+ GarbageCollectionCallback (Callback)
```sh
- gc_interval_train: int
- gc_interval_val: int
+ gc_interval_train: int # Number of global train steps at which garbage collection is done
+ gc_interval_val: int # Number of global validation steps at which garbage collection is done
```
-
+
plugins (nemo_run.Plugin)
- MegatronMixedPrecision
+ MegatronMixedPrecision
```sh
- precision: Literal["16-mixed", "bf16-mixed", "32"]
- params_dtype: torch.dtype = None
- pipeline_dtype: torch.dtype = None
+ precision: Literal["16-mixed", "bf16-mixed", "32"] # dtype for mixed precision training
+ params_dtype: torch.dtype = None # dtype used when initializing the weights
+ pipeline_dtype: torch.dtype = None # dtype used in p2p communication, usually params_dtype
autocast_enabled: bool = False
- grad_reduce_in_fp32: bool = True # this overrides same config in DistributedDataParallelConfig
- fp8: str = None
- fp8_margin: int = 0
- fp8_amax_history_len: int = 1
- fp8_amax_compute_algo: str = "most_recent"
- fp8_params: bool = False
+ grad_reduce_in_fp32: bool = True # If true, reduce grads in fp32; this overrides same config in DistributedDataParallelConfig
+ fp8: str = None # If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined choices- 'e4m3' and 'hybrid'
+ fp8_margin: int = 0 # Margin for the scaling factor computation
+ fp8_amax_history_len: int = 1 # The length of the amax history window used for scaling factor computation
+ fp8_amax_compute_algo: str = "most_recent" # Algorithm used for choosing the `amax` value for the scaling factor computation. There are 2 predefined choices- `max` and 'most_recent'
+ fp8_params: bool = False # fp8 dtype weights, sets 'transformer_engine.pytorch.fp8.FP8GlobalStateManager.FP8_PARAMETERS=True' and 'fp8_param_gather=True'
+ ```
+
+
+
+ PerfEnvPlugin
+
+
+ ```sh
+ enable_layernorm_sm_margin: bool = True # Set SM margin for TransformerEngine's Layernorm so that it does not block DP-level communication overlap
+ layernorm_sm_margin: int = 16 # The SM margin for TransformerEngine Layernorm
+ enable_vboost: bool = False # Whether to steer more power towards tensor cores via `sudo nvidia-smi boost-slider --vboost 1`. May not work on all systems
+ ```
+
+
+
+ PreemptionPlugin
+
+
+ ```sh
+ preempt_time: int = 60 # Time, in seconds, before the task's time limit at which the executor will send a SIGTERM preemption signal. This allows tasks to be gracefully stopped before reaching their time limit. (only applicable for nemo_run.SlurmExecutor)
+ callbacks: list[nemo_run.Config[Callback]]= [nemo_run.Config(PreemptionCallback)] # A list of callback configurations that the plugin will merge with the task's existing callbacks
+ ```
+
+
+
+ FaultTolerancePlugin
+
+
+ Note: FaultTolerancePlugin does not work with the NsysPlugin.
+
+ ```sh
+ num_in_job_restarts: int = 3 # Max number of restarts on failure, within the same job
+ num_job_retries_on_failure: int = 2 # Max number of new job restarts on failure
+ initial_rank_heartbeat_timeout: int = 1800 # Timeouts are time intervals used by a rank monitor to detect that a rank is not alive. This is the max timeout for the initial heartbeat
+ rank_heartbeat_timeout: int = 300 # This is the timeout for subsequent heartbeats after the initial heartbeat
```
- PerfEnvPlugin
+ NsysPlugin
```sh
- enable_layernorm_sm_margin: bool = True
- layernorm_sm_margin: int = 16
- enable_vboost: bool = False
+ start_step: int # The step at which to start the nsys profiling.
+ end_step: int # The step at which to end the nsys profiling.
+ ranks: Optional[list[int]] = None # The ranks on which to run the nsys profiling. If not specified, profiling will be run on rank 0
+ nsys_trace: Optional[list[str]] = None # The events to trace during profiling. If not specified, 'nvtx' and 'cuda' events will be traced
+ ```
+
+
+
+ WandbPlugin
+
+
+ NOTE: This plugin is only activated if the ``WANDB_API_KEY`` environment variable is set. The ``WANDB_API_KEY`` environment variable will also be set in the executor's environment variables. Follow https://docs.wandb.ai/quickstart to retrieve your ``WANDB_API_KEY``.
+
+ ```sh
+ name: str # The name for the Weights & Biases run
+ logger_fn: Callable[..., run.Config[WandbLogger]] # A callable that returns a Config of ``WandbLogger``
+ log_task_config: bool # Whether to log the task configuration to the logger
+ ```
+
+
+
+ ConfigValidationPlugin
+
+
+ ```sh
+ validate_preemption: bool = True # Whether to validate the preemption callback. If set to True, the plugin will assert that the task has a 'PreemptionCallback'
+ validate_checkpoint_dir: bool = True # Whether to validate the checkpoint directory. If set to True and the executor is a 'SlurmExecutor' the plugin will assert that the task's log directory exists in the mounts specified in the `SlurmExecutor`
+ validate_serialization: bool = True # Whether to validate task serialization. If set to True, the plugin will assert that the task can be successfully serialized and deserialized using NeMo-Run's 'ZlibJSONSerializer'
+ validate_wandb: bool = False # Whether to validate Weights and Biases integration. If set to True, the plugin will assert that the executor's environment variables contain a `WANDB_API_KEY` and that NeMo Logger's `wandb` is set
+ validate_nodes_and_devices: bool = True # Whether to validate the number of devices and nodes. If set to True, the plugin will assert that the task's trainer is configured to use the same number of nodes and devices as the executor
```
@@ -156,42 +234,42 @@
-
+
model (pytorch_lightning.LightningModule)
- config (GPTConfig)
+ config (GPTConfig)
```sh
- seq_length: int = 1024
- attention_softmax_in_fp32: bool = False
- num_layers: int = 0
- hidden_size: int = 0
- num_attention_heads: int = 0
- num_query_groups: Optional[int] = None
- ffn_hidden_size: Optional[int] = None
- hidden_dropout: float = 0.1
- attention_dropout: float = 0.1
- add_bias_linear: bool = True
- gated_linear_unit: bool = False
- activation_func: Callable = F.gelu
- normalization: bool = "LayerNorm"
- layernorm_epsilon: float = 1e-5
- layernorm_zero_centered_gamma: bool = False
+ seq_length: int = 1024 # Number of tokens in a single sequence
+ attention_softmax_in_fp32: bool = False # If True, run attention masking and softmax in fp32. This should be True if apply_query_key_layer_scaling is True
+ num_layers: int = 0 # Number of transformer layers in a transformer block
+ hidden_size: int = 0 # Transformer hidden size
+ num_attention_heads: int = 0 # Number of transformer attention heads
+ num_query_groups: Optional[int] = None # Number of query groups for group query attention. If None, normal attention is used
+ ffn_hidden_size: Optional[int] = None # Transformer Feed-Forward Network hidden size. This is set to 4*hidden_size if not provided
+ hidden_dropout: float = 0.1 # Dropout probability for transformer hidden state
+ attention_dropout: float = 0.1 # Post attention dropout probability
+ add_bias_linear: bool = True # Include a bias term in all linear layers (QKV projections, after core attention, and two in MLP layer)
+ gated_linear_unit: bool = False # Use a gated linear unit for the first linear layer in the MLP
+ activation_func: Callable = F.gelu # Activation function to use for the non-linearity in the MLP
+ normalization: bool = "LayerNorm" # Which norm to use for normalization layers, valid options are `LayerNorm` and `RMSNorm`
+ layernorm_epsilon: float = 1e-5 # Epsilon value for any LayerNorm operations
+ layernorm_zero_centered_gamma: bool = False # If set to True, the LayerNorm is adjusted to center the gamma values around 0. This improves numerical stability
# Fusions
- masked_softmax_fusion: bool = True
- cross_entropy_loss_fusion: bool = True
- gradient_accumulation_fusion: bool = _grad_accum_fusion_available # Requires the custom CUDA extension fused_weight_gradient_mlp_cuda module
- bias_activation_fusion: bool = False
- bias_dropout_fusion: bool = False
- apply_rope_fusion: bool = False
+ masked_softmax_fusion: bool = True # If True, uses softmax fusion
+ cross_entropy_loss_fusion: bool = True # If this is enabled, the fused cross entropy implementation would be used
+ gradient_accumulation_fusion: bool = _grad_accum_fusion_available # If true, fuses weight gradient accumulation to GEMMs. Requires the custom CUDA extension fused_weight_gradient_mlp_cuda module
+ bias_activation_fusion: bool = False # If True, fuses bias addition and the activation function when possible
+ bias_dropout_fusion: bool = False # If True, uses bias dropout fusion
+ apply_rope_fusion: bool = False # If True, use fused RoPE kernel
recompute_granularity: Optional[str] = None # Determines which type of activation recompute to use. If set, must be 'selective' or 'full'.
recompute_method: Optional[str] = None # Determines which transformer layers will be recomputed. If set, must be 'uniform' or 'block'.
- recompute_num_layers: Optional[int] = None
+ recompute_num_layers: Optional[int] = None # Number of transformer layers to recompute in each recompute unit when recompute_method is 'uniform', or per pipeline stage when it is 'block'
distribute_saved_activations: Optional[bool] = None # If True, distribute recomputed activations across the model parallel group.
enable_cuda_graph: bool = False # When set to true, TransformerLayer layers are swapped with a CUDA graphed version.
@@ -204,47 +282,48 @@
-
+
data (pytorch_lightning.LightningDataModule)
- MockDataModule
+ MockDataModule
```sh
- seq_length: int = 2048
- tokenizer: Optional["TokenizerSpec"] = None
- micro_batch_size: int = 4
- global_batch_size: int = 8
- rampup_batch_size: Optional[List[int]] = None
- num_train_samples: int = 10_000
- num_val_samples: int = 10_000
- num_test_samples: int = 10_000
- num_workers: int = 8
- pin_memory: bool = True
- persistent_workers: bool = False
- create_attention_mask: bool = False
+ seq_length: int = 2048 # Number of tokens in a single sequence
+ tokenizer: Optional["TokenizerSpec"] = None # TokenizerSpec object to convert sequences to tokens
+ micro_batch_size: int = 4 # The size of each micro batch
+ global_batch_size: int = 8 # The size of each global batch
+ rampup_batch_size: Optional[List[int]] = None # Rampup batch size, should be in format of [start_global_batch_size, batch_size_increment, rampup_samples]
+ num_train_samples: int = 10_000 # The number of samples to use for training
+ num_val_samples: int = 10_000 # The number of samples to use for validation
+ num_test_samples: int = 10_000 # The number of samples to use for testing
+ num_workers: int = 8 # How many subprocesses to use for data loading
+ pin_memory: bool = True # If True, the data loader will copy Tensors into device/CUDA pinned memory before returning them
+ persistent_workers: bool = False # If True, the data loader will not shut down the worker processes after a dataset has been consumed once
+ create_attention_mask: bool = False # Option to enable the attention masks generation
```
- SquadDataModule (For fine-tuning jobs)
+ SquadDataModule (For fine-tuning jobs)
```sh
- seq_length: int = 2048
- tokenizer: Optional["TokenizerSpec"] = None
- micro_batch_size: int = 4
- global_batch_size: int = 8
+ seq_length: int = 2048 # Number of tokens in a single sequence
+ tokenizer: Optional["TokenizerSpec"] = None # TokenizerSpec object to convert sequences to tokens
+ micro_batch_size: int = 4 # The size of each micro batch
+ global_batch_size: int = 8 # The size of each global batch
```
- packed_sequence_specs (Optional[PackedSequenceSpecs] = None)
+ packed_sequence_specs (Optional[PackedSequenceSpecs] = None)
```sh
- packed_sequence_size: int = -1
- tokenizer_model_name: str = None
- packed_data_path: str = None
+ packed_sequence_size: int = -1 # If a positive integer, this arg enables training with sequence packing and specifies the pack size
+ tokenizer_model_name: str = None # Keep track of tokenizer model name, since each tokenizer produces a different packed sequence dataset file. This field is set by llm.finetune api
+ packed_train_data_path: str = None # If specified, use this file for the packed training dataset instead of the default path
+ packed_val_data_path: str = None # If specified, use this file for the packed validation dataset instead of the default path
```
@@ -255,47 +334,47 @@
-
+
log (NeMoLogger)
```sh
- log_dir: Optional[str] = None
- log_local_rank_0_only: bool = False
- log_global_rank_0_only: bool = False
+ log_dir: Optional[str] = None # Directory to save logs
+ log_local_rank_0_only: bool = False # Log only on local rank 0
+ log_global_rank_0_only: bool = False # Log only on global rank 0
```
- ckpt (Optional[ModelCheckpoint] = None)
+ ckpt (Optional[ModelCheckpoint] = None)
```sh
- save_last: Optional[bool] = True
- save_top_k: int = 3
- every_n_epochs: int = None
- every_n_train_steps: Optional[int] = None
- save_on_train_epoch_end: Optional[bool] = False
- train_time_interval: Optional[timedelta] = None
+ save_last: Optional[bool] = True # saves a `*-last` copy whenever a checkpoint file gets saved
+ save_top_k: int = 3 # saves the top-k checkpoints according to 'monitor'
+ every_n_epochs: int = None # Number of epochs between checkpoints
+ every_n_train_steps: Optional[int] = None # Number of train steps between checkpoints
+ save_on_train_epoch_end: Optional[bool] = False # Whether to run checkpointing at the end of the training epoch
+ train_time_interval: Optional[timedelta] = None # After each interval, monitor checkpoints. Not to be used with 'every_n_epochs' or 'every_n_train_steps'
```
- tensorboard (Optional[TensorBoardLogger] = None)
+ tensorboard (Optional[TensorBoardLogger] = None)
```sh
- save_dir: Union[str, Path]
- name: Optional[str] = "lightning_logs"
+ save_dir: Union[str, Path] # Directory where the tensorboard log file will be saved
+ name: Optional[str] = "lightning_logs" # Experiment name
```
- wandb: Optional[WandbLogger] = None
+ wandb: Optional[WandbLogger] = None
```sh
- name: Optional[str] = None
- project: Optional[str] = None
- config: Dict
+ name: Optional[str] = None # Display name for the run
+ project: Optional[str] = None # The name of the project to which this run will belong. If not set, the environment variable 'WANDB_PROJECT' will be used as a fallback. If both are not set, it defaults to 'lightning_logs'
+ config: Dict # Add other config parameters, wandb_logger.experiment.config["key"] = value, wandb_logger.experiment.config.update({key1: val1, key2: val2}), wandb.config["key"] = value
```
@@ -304,39 +383,39 @@
-
+
optim (OptimizerModule) # Use either MegatronOptimizerModule or PytorchOptimizerModule
- MegatronOptimizerModule
+ MegatronOptimizerModule
- config (OptimizerConfig)
+ config (OptimizerConfig)
```sh
- optimizer: str = 'adam'
- lr: Optional[float] = None
- weight_decay: float = 0.01
- bf16: bool = False
- fp16: bool = False
- adam_beta1: float = 0.9
- adam_beta2: float = 0.999
- adam_eps: float = 1e-08
- use_distributed_optimizer: bool = False # this overrides same config in DistributedDataParallelConfig
- clip_grad: float 1.0
+ optimizer: str = 'adam' # Optimizer to use (one of Adam or SGD)
+ lr: Optional[float] = None # Initial learning rate. Depending on decay style and initial warmup, the learning rate at each iteration would be different
+ weight_decay: float = 0.01 # Weight decay coefficient for L2 regularization
+ bf16: bool = False # If true, train with bf16 mixed precision training
+ fp16: bool = False # If true, train with fp16 mixed precision training
+ adam_beta1: float = 0.9 # First coefficient for computing running averages of gradient and its square in Adam optimizer
+ adam_beta2: float = 0.999 # Second coefficient for computing running averages of gradient and its square in Adam optimizer
+ adam_eps: float = 1e-08 # Term added to the denominator to improve numerical stability in Adam optimizer
+ use_distributed_optimizer: bool = False # Distribute optimizer state over data-parallel replicas; this overrides same config in DistributedDataParallelConfig
+ clip_grad: float = 1.0 # Gradient clipping based on global L2 norm
```
- lr_scheduler (Optional[LRSchedulerModule] = None)
+ lr_scheduler (Optional[CosineAnnealingScheduler[LRSchedulerModule]] = None)
```sh
- warmup_steps: int = 750
- constant_steps: int = 80000
- min_lr: float = 6e-5
+ warmup_steps: int = 750 # Number of training steps in warmup stage
+ constant_steps: int = 80000 # Number of steps to keep lr constant at
+ min_lr: float = 6e-5 # Minimum lr to hold the learning rate after decay
```
@@ -344,13 +423,12 @@
- PytorchOptimizerModule
+ PytorchOptimizerModule
```sh
- optim_cls # Eg. torch.optim.Adam
- config: dict = {'lr': 3e-4}
- lr_scheduler: Optional[LRSchedulerModule] = None
+ optimizer_fn: torch.optim.Optimizer # Eg. torch.optim.Adam
+ lr_scheduler: Optional[LRSchedulerModule] = None # The learning rate scheduler module
```
@@ -359,25 +437,23 @@
-
+
resume (AutoResume)
```sh
- restore_config: Optional[RestoreConfig] = None
- resume_from_directory: Optional[str] = None
- resume_from_path: Optional[str] = None
- adapter_path: Optional[str] = None
- resume_if_exists: bool = False
- resume_past_end: bool = False
- resume_ignore_no_checkpoint: bool = False
+ restore_config: Optional[RestoreConfig] = None # Optional config for selectively restoring specific parts like model weights, optimizer states, etc. If the config contains a path from HF or another non-NeMo checkpoint format, the checkpoint will be automatically converted to a NeMo compatible format
+ resume_from_directory: Optional[str] = None # Path to the checkpointing directory to restore from; this takes precedence over 'restore_config'
+ resume_from_path: Optional[str] = None # Path to a specific checkpoint to restore from
+ adapter_path: Optional[str] = None # Path to any adapter checkpoints
+ resume_if_exists: bool = False # Whether this experiment is resuming from a previous run. If True, it sets trainer._checkpoint_connector._ckpt_path so that the trainer should auto-resume
```
-
+
LoRA (PEFT)
@@ -385,13 +461,11 @@
```sh
target_modules: List[str] = field(
default_factory=lambda: ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2']
- )
- dim: int = 32
- alpha: int = 32
- dropout: float = 0.0
- dropout_position: Literal['pre', 'post'] = 'post'
- lora_A_init_method: str = "xavier"
- lora_B_init_method: str = "zero"
+ ) # A list of module names to apply LoRA to
+ dim: int = 32 # Dimension of the low-rank projection space
+ alpha: int = 32 # Weighting factor for the low-rank projection
+ dropout: float = 0.0 # Dropout rate for the low-rank projection
+ dropout_position: Literal['pre', 'post'] = 'post' # Position for applying dropout
```
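
Since the hierarchy documented above maps onto nested attributes of a recipe object, a minimal sketch of overriding a few of the listed fields is shown below; the `llama3_8b` recipe module and the chosen values are illustrative only.

```python
# Sketch: overriding fields from the documented hierarchy on a recipe object.
from nemo.collections import llm

recipe = llm.llama3_8b.pretrain_recipe(
    name="llama3_8b_example",  # hypothetical experiment name
    dir="/results",            # hypothetical log/checkpoint directory
    num_nodes=1,
    num_gpus_per_node=8,
)

# trainer (pytorch_lightning.Trainer)
recipe.trainer.max_steps = 100
recipe.trainer.val_check_interval = 50

# strategy (MegatronStrategy)
recipe.trainer.strategy.tensor_model_parallel_size = 2
recipe.trainer.strategy.sequence_parallel = True

# optim -> config (OptimizerConfig)
recipe.optim.config.lr = 3e-4
```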
diff --git a/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py
index 6eeaedbddca2..334e4a96763b 100644
--- a/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py
+++ b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py
@@ -127,6 +127,7 @@ def pretrain_recipe(
num_gpus_per_node: int = 8,
fn=pretrain,
model_name: str = '',
+ max_steps: int = 100,
) -> run.Partial:
"""
Create a pre-training recipe for a HFAutoModelForCausalLM model.
@@ -159,6 +160,7 @@ def pretrain_recipe(
num_nodes=num_nodes,
num_gpus_per_node=num_gpus_per_node,
callbacks=[run.Config(TimingCallback)],
+ max_steps=max_steps,
),
data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
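
The new `max_steps` argument above is forwarded into the recipe's trainer config; a hedged usage sketch follows (the HF model id is a placeholder).

```python
# Sketch: passing the new max_steps argument when building the recipe.
from nemo.collections.llm.recipes import hf_auto_model_for_causal_lm

recipe = hf_auto_model_for_causal_lm.pretrain_recipe(
    model_name="meta-llama/Llama-3.2-1B",  # placeholder HF model id
    max_steps=100,  # now forwarded to the Trainer config
)
```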
diff --git a/nemo/collections/llm/recipes/nemotron4_15b.py b/nemo/collections/llm/recipes/nemotron4_15b.py
index 49f92fcc1616..2ac3911a540a 100644
--- a/nemo/collections/llm/recipes/nemotron4_15b.py
+++ b/nemo/collections/llm/recipes/nemotron4_15b.py
@@ -62,7 +62,7 @@ def pretrain_recipe(
pipeline_parallelism_type: Optional[torch.dtype] = None,
virtual_pipeline_parallelism: Optional[int] = None,
context_parallelism: int = 1,
- sequence_parallelism: bool = False,
+ sequence_parallelism: bool = True,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
max_steps: int = 300000,
diff --git a/nemo/collections/llm/t5/model/__init__.py b/nemo/collections/llm/t5/model/__init__.py
index 088173857efd..3834b4751c0c 100644
--- a/nemo/collections/llm/t5/model/__init__.py
+++ b/nemo/collections/llm/t5/model/__init__.py
@@ -1,6 +1,9 @@
from nemo.collections.llm.t5.model.t5 import (
MaskedTokenLossReduction,
T5Config,
+ T5Config3B,
+ T5Config11B,
+ T5Config220M,
T5Model,
local_layer_spec,
t5_data_step,
@@ -10,6 +13,9 @@
__all__ = [
"T5Config",
+ "T5Config220M",
+ "T5Config3B",
+ "T5Config11B",
"T5Model",
"MaskedTokenLossReduction",
"t5_data_step",
diff --git a/nemo/collections/llm/t5/model/t5.py b/nemo/collections/llm/t5/model/t5.py
index 743d16f57c2b..e4e1294b52f1 100644
--- a/nemo/collections/llm/t5/model/t5.py
+++ b/nemo/collections/llm/t5/model/t5.py
@@ -14,11 +14,13 @@
import copy
from dataclasses import dataclass
+from pathlib import Path
from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union
import lightning.pytorch as L
import torch
import torch.distributed
+import torch.nn.functional as F
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
from megatron.core.models.T5.t5_model import T5Model as MCoreT5Model
@@ -27,8 +29,11 @@
from megatron.core.transformer.transformer_config import TransformerConfig
from torch import nn
+from transformers import T5Config as HFT5Config
+from transformers import T5ForConditionalGeneration
+
from nemo.collections.llm import fn
-from nemo.lightning import get_vocab_size, io
+from nemo.lightning import get_vocab_size, io, teardown
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule
@@ -43,6 +48,8 @@
def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
+ """Processing data for one step of T5 model"""
+
from megatron.core import parallel_state
from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType
@@ -96,6 +103,7 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
def t5_forward_step(model, batch) -> torch.Tensor:
+ """Processing a forward step for T5 model"""
forward_args = {
"encoder_input_ids": batch["text_enc"],
"decoder_input_ids": batch["text_dec"],
@@ -109,6 +117,7 @@ def t5_forward_step(model, batch) -> torch.Tensor:
def transformer_engine_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec:
+ """Spec for T5 when using transformer_engine mcore implementation"""
from megatron.core.models.T5.t5_spec import (
get_t5_decoder_with_transformer_engine_block_spec,
get_t5_encoder_with_transformer_engine_block_spec,
@@ -121,6 +130,7 @@ def transformer_engine_layer_spec(encoder_config: "T5Config", decoder_config: "T
def local_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec:
+ """Spec for T5 when using local mcore implementation"""
from megatron.core.models.T5.t5_spec import (
get_t5_decoder_with_local_block_spec,
get_t5_encoder_with_local_block_spec,
@@ -133,6 +143,7 @@ def local_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") ->
def default_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec:
+ """Set layer spec conditioning on whether transformer_engine is available"""
if HAVE_TE:
return transformer_engine_layer_spec(encoder_config, decoder_config)
else:
@@ -141,7 +152,8 @@ def default_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -
@dataclass
class T5Config(TransformerConfig, io.IOMixin):
- # From megatron.core.models.t5.t5_model.T5Model
+ """Model config for T5 model. Adpated from megatron.core.models.t5.t5_model.T5Model"""
+
encoder_num_layers: int = None
fp16_lm_cross_entropy: bool = False
parallel_output: bool = True
@@ -150,6 +162,8 @@ class T5Config(TransformerConfig, io.IOMixin):
position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute"
apply_rope_fusion: bool = True
max_position_embeddings: int = 512
+ relative_attention_num_buckets: int = 32
+ relative_attention_max_distance: int = 128
rotary_percent: float = 1.0
seq_len_interpolation_factor: Optional[float] = None
seq_length: int = 512
@@ -172,6 +186,8 @@ class T5Config(TransformerConfig, io.IOMixin):
data_step_fn: Callable = t5_data_step
def configure_model(self, tokenizer) -> "MCoreT5Model":
+ """Setup the T5 Model based on config definition."""
+
vp_size = self.virtual_pipeline_model_parallel_size
if vp_size:
p_size = self.pipeline_model_parallel_size
@@ -227,6 +243,8 @@ class T5Config220M(T5Config):
@dataclass
class T5Config3B(T5Config):
+ """Config for 3B T5 model"""
+
num_layers: int = 24
encoder_num_layers: int = 24
hidden_size: int = 2048
@@ -236,6 +254,8 @@ class T5Config3B(T5Config):
@dataclass
class T5Config11B(T5Config):
+ """Config for 11B T5 model"""
+
num_layers: int = 24
encoder_num_layers: int = 24
hidden_size: int = 4096
@@ -244,6 +264,8 @@ class T5Config11B(T5Config):
class T5Model(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin):
+ """T5 Lightning Module"""
+
def __init__(
self,
config: T5Config,
@@ -262,6 +284,7 @@ def __init__(
self._validation_loss_reduction = None
def configure_model(self) -> None:
+ """Setup the T5 Model based on config definition."""
if not hasattr(self, "module"):
self.module = self.config.configure_model(self.tokenizer)
@@ -275,6 +298,7 @@ def forward(
lm_labels: Optional[torch.Tensor] = None,
inference_params=None,
) -> torch.Tensor:
+ """Call the forward method of the underlying model, and return whatever it outputs."""
output_tensor = self.module(
encoder_input_ids=encoder_input_ids,
@@ -288,23 +312,23 @@ def forward(
return output_tensor
- def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]:
+ def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]: # pylint: disable=C0115,C0116
return self.config.data_step_fn(dataloader_iter)
- def forward_step(self, batch) -> torch.Tensor:
+ def forward_step(self, batch) -> torch.Tensor: # pylint: disable=C0115,C0116
return self.config.forward_step_fn(self, batch)
- def training_step(self, batch, batch_idx=None) -> torch.Tensor:
+ def training_step(self, batch, batch_idx=None) -> torch.Tensor: # pylint: disable=C0115,C0116
# In mcore the loss-function is part of the forward-pass (when labels are provided)
return self.forward_step(batch)
- def validation_step(self, batch, batch_idx=None) -> torch.Tensor:
+ def validation_step(self, batch, batch_idx=None) -> torch.Tensor: # pylint: disable=C0115,C0116
# In mcore the loss-function is part of the forward-pass (when labels are provided)
return self.forward_step(batch)
def get_inference_wrapper(self, params_dtype, inference_batch_times_seqlen_threshold) -> torch.Tensor:
- # This is to get the MCore model required in T5InferenceWrapper.
+ """This is to get the MCore model required in T5InferenceWrapper"""
mcore_model = self.module
while mcore_model:
if type(mcore_model) is MCoreT5Model:
@@ -325,20 +349,480 @@ def get_inference_wrapper(self, params_dtype, inference_batch_times_seqlen_thres
return model_inference_wrapper
@property
- def training_loss_reduction(self) -> MaskedTokenLossReduction:
+ def training_loss_reduction(self) -> MaskedTokenLossReduction: # pylint: disable=C0115,C0116
if not self._training_loss_reduction:
self._training_loss_reduction = MaskedTokenLossReduction()
return self._training_loss_reduction
@property
- def validation_loss_reduction(self) -> MaskedTokenLossReduction:
+ def validation_loss_reduction(self) -> MaskedTokenLossReduction: # pylint: disable=C0115,C0116
if not self._validation_loss_reduction:
self._validation_loss_reduction = MaskedTokenLossReduction(validation_step=True)
return self._validation_loss_reduction
+@io.model_importer(T5Model, "hf")
+class HFT5Importer(io.ModelConnector["T5ForConditionalGeneration", T5Model]):
+ """Importer Connector for converting HF Google T5 Model to NeMo"""
+
+ def init(self) -> T5Model:
+ return T5Model(self.config, tokenizer=self.tokenizer)
+
+ def apply(self, output_path: Path) -> Path:
+ from transformers import T5ForConditionalGeneration
+
+ source = T5ForConditionalGeneration.from_pretrained(str(self), torch_dtype='auto')
+ target = self.init()
+ trainer = self.nemo_setup(target)
+ self.convert_state(source, target)
+
+ self.nemo_save(output_path, trainer)
+
+ print(f"Converted T5 model to Nemo, model saved to {output_path} in {source.dtype}.")
+
+ teardown(trainer, target)
+ del trainer, target
+
+ return output_path
+
+ def convert_state(self, source, target):
+ """Converting HF state dict to NeMo state dict."""
+ mapping = {
+ "shared.weight": "embedding.word_embeddings.weight",
+ "lm_head.weight": "lm_head.output_layer.weight",
+ "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "encoder_relative_pos_emb.relative_attention_bias.weight",
+ "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "decoder_relative_pos_emb.relative_attention_bias.weight",
+ "encoder.block.*.layer.0.layer_norm.weight": "encoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
+ "encoder.block.*.layer.0.SelfAttention.o.weight": "encoder.layers.*.self_attention.linear_proj.weight",
+ "encoder.block.*.layer.1.layer_norm.weight": "encoder.layers.*.mlp.linear_fc1.layer_norm_weight",
+ "encoder.block.*.layer.1.DenseReluDense.wo.weight": "encoder.layers.*.mlp.linear_fc2.weight",
+ "encoder.final_layer_norm.weight": "encoder.final_layernorm.weight",
+ "decoder.block.*.layer.0.layer_norm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
+ "decoder.block.*.layer.0.SelfAttention.o.weight": "decoder.layers.*.self_attention.linear_proj.weight",
+ "decoder.block.*.layer.1.layer_norm.weight": "decoder.layers.*.pre_cross_attn_layernorm.weight",
+ "decoder.block.*.layer.1.EncDecAttention.q.weight": "decoder.layers.*.cross_attention.linear_q.weight",
+ "decoder.block.*.layer.1.EncDecAttention.o.weight": "decoder.layers.*.cross_attention.linear_proj.weight",
+ "decoder.block.*.layer.2.layer_norm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
+ "decoder.block.*.layer.2.DenseReluDense.wo.weight": "decoder.layers.*.mlp.linear_fc2.weight",
+ "decoder.final_layer_norm.weight": "decoder.final_layernorm.weight",
+ }
+ if getattr(source.config, "tie_word_embeddings", False):
+ del mapping["lm_head.weight"]
+
+ return io.apply_transforms(
+ source,
+ target,
+ mapping=mapping,
+ transforms=[
+ _import_encoder_qkv,
+ _import_encoder_linear_fc1,
+ _import_decoder_qkv,
+ _import_decoder_kv,
+ _import_decoder_linear_fc1,
+ ],
+ state_dict_ignored_entries=['output_layer.weight'],
+ )
+
+ @property
+ def tokenizer(self) -> "AutoTokenizer":
+ """Retrieve Tokenizer from HF"""
+ from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+
+ # Set special tokens to match HF
+ bos_token = ""
+
+ return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), bos_token=bos_token)
+
+ @property
+ def config(self) -> T5Config:
+ """Generate NeMo Config based on HF config"""
+ from transformers import T5Config as HFT5Config
+
+ source = HFT5Config.from_pretrained(str(self))
+
+ def make_vocab_size_divisible_by(vocab_size):
+ base = 128
+ while vocab_size % base != 0:
+ base //= 2
+ return base
+
+ output = T5Config(
+ num_layers=source.num_layers,
+ encoder_num_layers=source.num_decoder_layers,
+ hidden_size=source.d_model,
+ ffn_hidden_size=source.d_ff,
+ kv_channels=source.d_kv,
+ num_attention_heads=source.num_heads,
+ position_embedding_type="relative",
+ relative_attention_num_buckets=source.relative_attention_num_buckets,
+ relative_attention_max_distance=source.relative_attention_max_distance,
+ activation_func=F.gelu,
+ add_bias_linear=False,
+ init_method_std=source.initializer_factor,
+ normalization="RMSNorm",
+ layernorm_epsilon=source.layer_norm_epsilon,
+ gated_linear_unit=True,
+ make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size),
+ share_embeddings_and_output_weights=getattr(source, "tie_word_embeddings", False),
+ fp16=False,
+ bf16=False,
+ params_dtype=torch.float32,
+ softmax_scale=1.0,
+ )
+
+ return output
+
+
+@io.state_transform(
+ source_key=(
+ "encoder.block.*.layer.0.SelfAttention.q.weight",
+ "encoder.block.*.layer.0.SelfAttention.k.weight",
+ "encoder.block.*.layer.0.SelfAttention.v.weight",
+ ),
+ target_key="encoder.layers.*.self_attention.linear_qkv.weight",
+)
+def _import_encoder_qkv(ctx: io.TransformCTX, q, k, v):
+ # T5 Model does not support GQA
+ megatron_config = ctx.target.config
+
+ head_num = megatron_config.num_attention_heads
+ hidden_size = megatron_config.hidden_size
+ head_size = megatron_config.kv_channels
+
+ old_tensor_shape = q.size()
+ new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
+
+ q = q.view(*new_q_tensor_shape)
+ k = k.view(*new_q_tensor_shape)
+ v = v.view(*new_q_tensor_shape)
+
+ qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:])
+ for i in range(head_num):
+ qkv_weights = torch.cat((qkv_weights, q[i : i + 1, :, :]))
+ qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :]))
+ qkv_weights = torch.cat((qkv_weights, v[i : i + 1, :, :]))
+ qkv_weights = qkv_weights.reshape([head_size * (3 * head_num), hidden_size])
+
+ return qkv_weights
+
+
+@io.state_transform(
+ source_key=(
+ "decoder.block.*.layer.0.SelfAttention.q.weight",
+ "decoder.block.*.layer.0.SelfAttention.k.weight",
+ "decoder.block.*.layer.0.SelfAttention.v.weight",
+ ),
+ target_key="decoder.layers.*.self_attention.linear_qkv.weight",
+)
+def _import_decoder_qkv(ctx: io.TransformCTX, q, k, v):
+ # T5 Model does not support GQA
+ megatron_config = ctx.target.config
+
+ head_num = megatron_config.num_attention_heads
+ hidden_size = megatron_config.hidden_size
+ head_size = megatron_config.kv_channels
+
+ old_tensor_shape = q.size()
+ new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
+
+ q = q.view(*new_q_tensor_shape)
+ k = k.view(*new_q_tensor_shape)
+ v = v.view(*new_q_tensor_shape)
+
+ qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:])
+ for i in range(head_num):
+ qkv_weights = torch.cat((qkv_weights, q[i : i + 1, :, :]))
+ qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :]))
+ qkv_weights = torch.cat((qkv_weights, v[i : i + 1, :, :]))
+ qkv_weights = qkv_weights.reshape([head_size * (3 * head_num), hidden_size])
+
+ return qkv_weights
+
+
+@io.state_transform(
+ source_key=(
+ "decoder.block.*.layer.1.EncDecAttention.k.weight",
+ "decoder.block.*.layer.1.EncDecAttention.v.weight",
+ ),
+ target_key="decoder.layers.*.cross_attention.linear_kv.weight",
+)
+def _import_decoder_kv(ctx: io.TransformCTX, k, v):
+ # T5 Model does not support GQA
+ megatron_config = ctx.target.config
+
+ head_num = megatron_config.num_attention_heads
+ hidden_size = megatron_config.hidden_size
+ head_size = megatron_config.kv_channels
+
+ old_tensor_shape = k.size()
+ new_k_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
+
+ k = k.view(*new_k_tensor_shape)
+ v = v.view(*new_k_tensor_shape)
+
+ kv_weights = torch.empty((0, head_size) + old_tensor_shape[1:])
+ for i in range(head_num):
+ kv_weights = torch.cat((kv_weights, k[i : i + 1, :, :]))
+ kv_weights = torch.cat((kv_weights, v[i : i + 1, :, :]))
+ kv_weights = kv_weights.reshape([head_size * (2 * head_num), hidden_size])
+
+ return kv_weights
+
+
+@io.state_transform(
+ source_key=(
+ "encoder.block.*.layer.1.DenseReluDense.wi_0.weight",
+ "encoder.block.*.layer.1.DenseReluDense.wi_1.weight",
+ ),
+ target_key="encoder.layers.*.mlp.linear_fc1.weight",
+)
+def _import_encoder_linear_fc1(down, gate):
+ return torch.cat((down, gate), axis=0)
+
+
+@io.state_transform(
+ source_key=(
+ "decoder.block.*.layer.2.DenseReluDense.wi_0.weight",
+ "decoder.block.*.layer.2.DenseReluDense.wi_1.weight",
+ ),
+ target_key="decoder.layers.*.mlp.linear_fc1.weight",
+)
+def _import_decoder_linear_fc1(down, gate):
+ return torch.cat((down, gate), axis=0)
+
+
+@io.model_exporter(T5Model, "hf")
+class HFT5Exporter(io.ModelConnector[T5Model, "T5ForConditionalGeneration"]):
+ """Exporter Connector for converting NeMo T5 Model to HF"""
+
+ def init(self) -> "T5ForConditionalGeneration":
+ from transformers import T5ForConditionalGeneration
+ from transformers.modeling_utils import no_init_weights
+
+ with no_init_weights(True):
+ return T5ForConditionalGeneration(config=self.config)
+
+ def apply(self, output_path: Path) -> Path:
+ source, _ = self.nemo_load(str(self))
+ target = self.init()
+ target = self.convert_state(source, target)
+
+ target = target.cpu()
+ target.save_pretrained(output_path)
+ self.tokenizer.save_pretrained(output_path)
+
+ return output_path
+
+ def convert_state(self, source, target):
+ """Convert NeMo state dict to HF style"""
+ mapping = {
+ "embedding.word_embeddings.weight": "shared.weight",
+ "lm_head.output_layer.weight": "lm_head.weight",
+ "encoder_relative_pos_emb.relative_attention_bias.weight": "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight",
+ "decoder_relative_pos_emb.relative_attention_bias.weight": "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight",
+ "encoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "encoder.block.*.layer.0.layer_norm.weight",
+ "encoder.layers.*.self_attention.linear_proj.weight": "encoder.block.*.layer.0.SelfAttention.o.weight",
+ "encoder.layers.*.mlp.linear_fc1.layer_norm_weight": "encoder.block.*.layer.1.layer_norm.weight",
+ "encoder.layers.*.mlp.linear_fc2.weight": "encoder.block.*.layer.1.DenseReluDense.wo.weight",
+ "encoder.final_layernorm.weight": "encoder.final_layer_norm.weight",
+ "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "decoder.block.*.layer.0.layer_norm.weight",
+ "decoder.layers.*.self_attention.linear_proj.weight": "decoder.block.*.layer.0.SelfAttention.o.weight",
+ "decoder.layers.*.pre_cross_attn_layernorm.weight": "decoder.block.*.layer.1.layer_norm.weight",
+ "decoder.layers.*.cross_attention.linear_q.weight": "decoder.block.*.layer.1.EncDecAttention.q.weight",
+ "decoder.layers.*.cross_attention.linear_proj.weight": "decoder.block.*.layer.1.EncDecAttention.o.weight",
+ "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "decoder.block.*.layer.2.layer_norm.weight",
+ "decoder.layers.*.mlp.linear_fc2.weight": "decoder.block.*.layer.2.DenseReluDense.wo.weight",
+ "decoder.final_layernorm.weight": "decoder.final_layer_norm.weight",
+ }
+
+ return io.apply_transforms(
+ source,
+ target,
+ mapping=mapping,
+ transforms=[
+ _export_encoder_qkv,
+ _export_encoder_linear_fc1,
+ _export_decoder_qkv,
+ _export_decoder_kv,
+ _export_decoder_linear_fc1,
+ ],
+ state_dict_ignored_entries=['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'],
+ )
+
+ @property
+ def tokenizer(self):
+ """Retrieve Tokenizer from HF"""
+ # return io.load_context(str(self)).model.tokenizer.tokenizer
+ nemo_tokenizer = io.load_context(str(self)).model.tokenizer
+ self.bos_id = nemo_tokenizer.bos_id
+ self.pad_id = nemo_tokenizer.pad_id
+
+ return nemo_tokenizer.tokenizer
+
+ @property
+ def config(self) -> "HFT5Config":
+ """Generate NeMo Config based on HF config"""
+ source: T5Config = io.load_context(str(self)).model.config
+
+ from transformers import T5Config as HFT5Config
+
+ nemo_tokenizer = io.load_context(str(self)).model.tokenizer
+ bos_id = nemo_tokenizer.bos_id
+ pad_id = nemo_tokenizer.pad_id
+ eos_id = nemo_tokenizer.eos_id
+
+ def round_up_to_divisible(number, divisor):
+ import math
+
+ if divisor == 0:
+ raise ValueError("Divisor cannot be zero.")
+ return int(math.ceil(number / divisor) * divisor)
+
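+ # Worked example (illustrative): round_up_to_divisible(32100, 128) = 32128
+ # (ceil(32100 / 128) = 251 and 251 * 128 = 32128), so a tokenizer with
+ # 32,000 base tokens plus 100 additional special tokens is exported with
+ # vocab_size padded to the next multiple of 128.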
+ return HFT5Config(
+ num_layers=source.num_layers,
+ num_decoder_layers=source.encoder_num_layers,
+ d_model=source.hidden_size,
+ d_ff=source.ffn_hidden_size,
+ d_kv=source.kv_channels,
+ num_heads=source.num_attention_heads,
+ relative_attention_num_buckets=source.relative_attention_num_buckets,
+ relative_attention_max_distance=source.relative_attention_max_distance,
+ initializer_factor=source.init_method_std,
+ layer_norm_epsilon=source.layernorm_epsilon,
+ vocab_size=round_up_to_divisible(
+ self.tokenizer.vocab_size + len(self.tokenizer.additional_special_tokens), 128
+ ),
+ feed_forward_proj="gated-gelu",
+ tie_word_embeddings=source.share_embeddings_and_output_weights,
+ decoder_start_token_id=bos_id,
+ pad_token_id=pad_id,
+ eos_token_id=eos_id,
+ )
+
+
+@io.state_transform(
+ source_key="encoder.layers.*.self_attention.linear_qkv.weight",
+ target_key=(
+ "encoder.block.*.layer.0.SelfAttention.q.weight",
+ "encoder.block.*.layer.0.SelfAttention.k.weight",
+ "encoder.block.*.layer.0.SelfAttention.v.weight",
+ ),
+)
+def _export_encoder_qkv(ctx: io.TransformCTX, linear_qkv):
+ megatron_config = ctx.source.config
+
+ head_num = megatron_config.num_attention_heads
+ num_query_groups = megatron_config.num_query_groups
+ heads_per_group = head_num // num_query_groups
+ hidden_size = megatron_config.hidden_size
+ head_size = megatron_config.kv_channels
+ qkv_total_dim = head_num + 2 * num_query_groups
+
+ linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size])
+ q_slice = torch.cat(
+ [
+ torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+ for i in range(num_query_groups)
+ ]
+ )
+ k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+ v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+
+ q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu()
+ k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu()
+ v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu()
+
+ return q_proj, k_proj, v_proj
+
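+# Worked example for the fused-QKV unpacking above (illustrative, assumed
+# sizes): with head_num=8, num_query_groups=4, head_size=64, hidden_size=512,
+# heads_per_group = 2 and qkv_total_dim = 16. The reshaped rows are laid out
+# per group as [q, q, k, v], so q_slice = [0, 1, 4, 5, 8, 9, 12, 13],
+# k_slice = [2, 6, 10, 14] and v_slice = [3, 7, 11, 15], giving q_proj of
+# shape [8 * 64, 512] and k_proj / v_proj of shape [4 * 64, 512] each.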
+
+@io.state_transform(
+ source_key="decoder.layers.*.self_attention.linear_qkv.weight",
+ target_key=(
+ "decoder.block.*.layer.0.SelfAttention.q.weight",
+ "decoder.block.*.layer.0.SelfAttention.k.weight",
+ "decoder.block.*.layer.0.SelfAttention.v.weight",
+ ),
+)
+def _export_decoder_qkv(ctx: io.TransformCTX, linear_qkv):
+ megatron_config = ctx.source.config
+
+ head_num = megatron_config.num_attention_heads
+ num_query_groups = megatron_config.num_query_groups
+ heads_per_group = head_num // num_query_groups
+ hidden_size = megatron_config.hidden_size
+ head_size = megatron_config.kv_channels
+ qkv_total_dim = head_num + 2 * num_query_groups
+
+ linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size])
+ q_slice = torch.cat(
+ [
+ torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+ for i in range(num_query_groups)
+ ]
+ )
+ k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+ v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+
+ q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu()
+ k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu()
+ v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu()
+
+ return q_proj, k_proj, v_proj
+
+
+@io.state_transform(
+ source_key="decoder.layers.*.cross_attention.linear_kv.weight",
+ target_key=(
+ "decoder.block.*.layer.1.EncDecAttention.k.weight",
+ "decoder.block.*.layer.1.EncDecAttention.v.weight",
+ ),
+)
+def _export_decoder_kv(ctx: io.TransformCTX, linear_kv):
+ megatron_config = ctx.source.config
+
+ num_query_groups = megatron_config.num_query_groups
+ hidden_size = megatron_config.hidden_size
+ head_size = megatron_config.kv_channels
+ kv_total_dim = 2 * num_query_groups
+
+ linear_kv = linear_kv.reshape([kv_total_dim, head_size, hidden_size])
+ k_slice = torch.arange(0, kv_total_dim, 2)
+ v_slice = torch.arange(1, kv_total_dim, 2)
+
+ k_proj = linear_kv[k_slice].reshape(-1, hidden_size).cpu()
+ v_proj = linear_kv[v_slice].reshape(-1, hidden_size).cpu()
+
+ return k_proj, v_proj
+
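+# Note on the cross-attention KV split above (illustrative, assumed sizes):
+# the fused linear_kv weight interleaves rows as [k0, v0, k1, v1, ...], so
+# with num_query_groups=4, head_size=64, hidden_size=512 the reshape yields
+# [8, 64, 512], k_slice = [0, 2, 4, 6], v_slice = [1, 3, 5, 7], and k_proj /
+# v_proj each come out with shape [4 * 64, 512].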
+
+@io.state_transform(
+ source_key="encoder.layers.*.mlp.linear_fc1.weight",
+ target_key=(
+ "encoder.block.*.layer.1.DenseReluDense.wi_0.weight",
+ "encoder.block.*.layer.1.DenseReluDense.wi_1.weight",
+ ),
+)
+def _export_encoder_linear_fc1(linear_fc1):
+ gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0)
+
+ return gate_proj, up_proj
+
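+# Note on the gated-MLP split above (per the key mapping in this file):
+# linear_fc1 stacks the two DenseReluDense projections as [wi_0; wi_1] along
+# dim 0, so torch.chunk(linear_fc1, 2, dim=0) on a weight of shape
+# [2 * ffn_hidden_size, hidden_size] returns two halves of shape
+# [ffn_hidden_size, hidden_size], written back to wi_0 and wi_1 respectively.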
+
+@io.state_transform(
+ source_key="decoder.layers.*.mlp.linear_fc1.weight",
+ target_key=(
+ "decoder.block.*.layer.2.DenseReluDense.wi_0.weight",
+ "decoder.block.*.layer.2.DenseReluDense.wi_1.weight",
+ ),
+)
+def _export_decoder_linear_fc1(linear_fc1):
+ gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0)
+
+ return gate_proj, up_proj
+
+
__all__ = [
"T5Model",
"T5Config",
diff --git a/nemo/collections/multimodal/data/__init__.py b/nemo/collections/multimodal/data/__init__.py
index 7e6ac24828f5..9a78712f026d 100644
--- a/nemo/collections/multimodal/data/__init__.py
+++ b/nemo/collections/multimodal/data/__init__.py
@@ -14,7 +14,7 @@
from nemo.utils.import_utils import safe_import_from
-SimpleMultiModalDataModule, _ = safe_import_from(
- "nemo.collections.multimodal.data.energon", "SimpleMultiModalDataModule"
+EnergonMultiModalDataModule, _ = safe_import_from(
+ "nemo.collections.multimodal.data.energon", "EnergonMultiModalDataModule"
)
-__all__ = ["SimpleMultiModalDataModule"]
+__all__ = ["EnergonMultiModalDataModule"]
diff --git a/nemo/collections/multimodal/data/energon/__init__.py b/nemo/collections/multimodal/data/energon/__init__.py
index 04926758cbac..8c7465880b39 100644
--- a/nemo/collections/multimodal/data/energon/__init__.py
+++ b/nemo/collections/multimodal/data/energon/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
-from nemo.collections.multimodal.data.energon.base import SimpleMultiModalDataModule
+from nemo.collections.multimodal.data.energon.base import EnergonMultiModalDataModule
from nemo.collections.multimodal.data.energon.config import (
ImageTextSample,
ImageToken,
@@ -28,7 +28,7 @@
)
__all__ = [
- "SimpleMultiModalDataModule",
+ "EnergonMultiModalDataModule",
"ImageToken",
"ImageTextSample",
"MultiModalSampleConfig",
diff --git a/nemo/collections/multimodal/data/energon/base.py b/nemo/collections/multimodal/data/energon/base.py
index 8c7819c3d7dd..3dfd495edd82 100644
--- a/nemo/collections/multimodal/data/energon/base.py
+++ b/nemo/collections/multimodal/data/energon/base.py
@@ -30,7 +30,7 @@
from nemo.utils import logging
-class SimpleMultiModalDataModule(pl.LightningDataModule, IOMixin):
+class EnergonMultiModalDataModule(pl.LightningDataModule, IOMixin):
"""
A PyTorch Lightning DataModule for handling multimodal datasets with images and text.
@@ -70,7 +70,7 @@ def __init__(
decoder_seq_length: Optional[int] = None,
) -> None:
"""
- Initialize the SimpleMultiModalDataModule.
+ Initialize the EnergonMultiModalDataModule.
Parameters:
path (str): Path to the dataset.
@@ -80,8 +80,10 @@ def __init__(
micro_batch_size (int, optional): The batch size for training and validation. Defaults to 1.
num_workers (int, optional): Number of workers for data loading. Defaults to 1.
pin_memory (bool, optional): Whether to pin memory in the DataLoader. Defaults to True.
- multimodal_sample_config (MultiModalSampleConfig, optional): Configuration object for multimodal samples. Defaults to MultiModalSampleConfig().
- task_encoder (MultiModalTaskEncoder, optional): Encoder responsible for encoding and batching samples. If not provided, a default (MultimodalTaskEncoder) encoder will be created. Defaults to None.
+ multimodal_sample_config (MultiModalSampleConfig, optional): Configuration object for multimodal samples.
+ Defaults to MultiModalSampleConfig().
+ task_encoder (MultiModalTaskEncoder, optional): Encoder responsible for encoding and batching samples.
+ If not provided, a default (MultimodalTaskEncoder) encoder will be created. Defaults to None.
"""
super().__init__()
@@ -113,7 +115,7 @@ def __init__(
self.val_dataloader_object = None
def io_init(self, **kwargs) -> fdl.Config[Self]:
- # (pleasefixme) image_processor and task_encoder are problematic with Fiddle so we skip serializing them for now
+
cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items() if k not in ['image_processor', 'task_encoder']}
for val in cfg_kwargs.values():
@@ -168,7 +170,8 @@ def train_dataloader(self) -> TRAIN_DATALOADERS:
return self.train_dataloader_object
if not parallel_state.is_initialized():
logging.info(
- f"Muiltimodal data loader parallel state is not initialized, using default worker config with no_workers {self.num_workers}"
+ f"Muiltimodal data loader parallel state is not initialized,"
+ f"using default worker config with no_workers {self.num_workers}"
)
worker_config = WorkerConfig.default_worker_config(self.num_workers)
else:
@@ -176,7 +179,8 @@ def train_dataloader(self) -> TRAIN_DATALOADERS:
world_size = parallel_state.get_data_parallel_world_size()
data_parallel_group = parallel_state.get_data_parallel_group()
logging.info(
- f" Multimodal train dataloader initializing with rank {rank} world_size {world_size} data_parallel_group {data_parallel_group} ****** "
+ f" Multimodal train dataloader initializing with"
+ f"rank {rank} world_size {world_size} data_parallel_group {data_parallel_group} ****** "
)
worker_config = WorkerConfig(
rank=rank,
@@ -206,7 +210,8 @@ def val_dataloader(self) -> EVAL_DATALOADERS:
if not parallel_state.is_initialized():
logging.info(
- f"Muiltimodal val data loader parallel state is not initialized, using default worker config with no_workers {self.num_workers}"
+ f"Muiltimodal val data loader parallel state is not initialized,"
+ "using default worker config with no_workers {self.num_workers}"
)
worker_config = WorkerConfig.default_worker_config(self.num_workers)
else:
@@ -276,7 +281,8 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
"""
if not 'dataloader_state' in state_dict:
logging.warning(
- f"Data loader state cannot be resumed from state_dict, it does not have the required key dataloader_state. It has {state_dict.keys()}"
+ f"Data loader state cannot be resumed from state_dict,"
+ f"it does not have the required key dataloader_state. It has {state_dict.keys()}"
)
return
@@ -288,7 +294,8 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
else:
logging.error(f"Cannot restore state from state_dict {state_dict}")
raise ValueError(
- f"Cannot restore state from state_dict: Is the trainer object is initialized and attached to datamodule???"
+ f"Cannot restore state from state_dict: "
+ f"Is the trainer object is initialized and attached to datamodule???"
)
except Exception as e:
raise RuntimeError(f"Failed to dataloader restore state due to: {e}")
diff --git a/nemo/collections/multimodal/data/energon/conversation.py b/nemo/collections/multimodal/data/energon/conversation.py
index f0749e47dc12..31019ae9c615 100644
--- a/nemo/collections/multimodal/data/energon/conversation.py
+++ b/nemo/collections/multimodal/data/energon/conversation.py
@@ -19,21 +19,24 @@
class BaseConversationTemplateConfig:
"""Conversation template config related parameters"""
- system: Optional[str] = "".format() # fmt: off
+ system: Optional[str] = ""
roles: List[str] = field(default_factory=lambda: ['user', 'assistant'])
stop_string: Optional[str] = None
chat_template = None
+@dataclass
class LLaVATemplateConfig(BaseConversationTemplateConfig):
- """LLava specific template configuration which extends the base config"""
+ """LLava-specific template configuration which extends the base config"""
- system: Optional[str] = (
- "A chat between a curious user and artificial assistant agent. The assistant gives helpful, detailed and polite answers to user's questions.".format()
- ) # fmt: off
+ system: str = field(
+ default="A chat between a curious user and artificial assistant agent. "
+ "The assistant gives helpful, detailed and polite answers to user's questions."
+ )
roles: List[str] = field(default_factory=lambda: ['user', 'assistant'])
- stop_string: str = ""
- chat_template = """
+ stop_string: str = field(default="")
+ chat_template: str = field(
+ default="""
{%- for message in messages %}
{%- if message['role'] == 'system' %}
{{- message['content'].strip() + ' ' -}}
@@ -45,14 +48,17 @@ class LLaVATemplateConfig(BaseConversationTemplateConfig):
{%- endif %}
{%- endfor -%}
"""
+ )
class MLlamaTemplateConfig(BaseConversationTemplateConfig):
- """LLava specific template configuration which extends the base config"""
+ """MLlama specific template configuration which extends the base config"""
- system: Optional[str] = None
+ system: Optional[str] = field(default=None)
roles: List[str] = field(default_factory=lambda: ['user', 'assistant'])
- stop_string: str = None
- chat_template = """
+ stop_string: Optional[str] = field(default=None)
+ chat_template: str = field(
+ default="""
'{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now("%d %b %Y") %}\n {%- else %}\n {%- set date_string = "26 Jul 2024" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0][\'role\'] == \'system\' %}\n {%- set system_message = messages[0][\'content\']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = "" %}\n{%- endif %}\n\n{#- Find out if there are any images #}\n{% set image_ns = namespace(has_images=false) %} \n{%- for message in messages %}\n {%- for content in message[\'content\'] %}\n {%- if content[\'type\'] == \'image\' %}\n {%- set image_ns.has_images = true %}\n {%- endif %}\n {%- endfor %}\n{%- endfor %}\n\n{#- Error out if there are images and system message #}\n{%- if image_ns.has_images and not system_message == "" %}\n {{- raise_exception("Prompting with images is incompatible with system messages.") }}\n{%- endif %}\n\n{#- System message if there are no images #}\n{%- if not image_ns.has_images %}\n {{- "<|start_header_id|>system<|end_header_id|>\\n\\n" }}\n {%- if tools is not none %}\n {{- "Environment: ipython\\n" }}\n {%- endif %}\n {{- "Cutting Knowledge Date: December 2023\\n" }}\n {{- "Today Date: " + date_string + "\\n\\n" }}\n {%- if tools is not none and not tools_in_user_message %}\n {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." 
}}\n {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n {{- "Do not use variables.\\n\\n" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- "\\n\\n" }}\n {%- endfor %}\n {%- endif %}\n {{- system_message }}\n {{- "<|eot_id|>" }}\n{%- endif %}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0][\'content\']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception("Cannot put tools in the first user message when there\'s no first user message!") }}\n{%- endif %}\n {{- \'<|start_header_id|>user<|end_header_id|>\\n\\n\' -}}\n {{- "Given the following functions, please respond with a JSON for a function call " }}\n {{- "with its proper arguments that best answers the given prompt.\\n\\n" }}\n {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n {{- "Do not use variables.\\n\\n" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- "\\n\\n" }}\n {%- endfor %}\n {{- first_user_message + "<|eot_id|>"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == \'ipython\' or message.role == \'tool\' or \'tool_calls\' in message) %}\n {{- \'<|start_header_id|>\' + message[\'role\'] + \'<|end_header_id|>\\n\\n\' }}\n {%- if message[\'content\'] is string %}\n {{- message[\'content\'] }}\n {%- else %}\n {%- for content in message[\'content\'] %}\n {%- if content[\'type\'] == \'image\' %}\n {{- \'<|image|>\' }}\n {%- elif content[\'type\'] == \'text\' %}\n {{- content[\'text\'] }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- \'<|eot_id|>\' }}\n {%- elif \'tool_calls\' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception("This model only supports single tool-calls at once!") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' -}}\n {{- \'{"name": "\' + tool_call.name + \'", \' }}\n {{- \'"parameters": \' }}\n {{- tool_call.arguments | tojson }}\n {{- "}" }}\n {{- "<|eot_id|>" }}\n {%- elif message.role == "tool" or message.role == "ipython" %}\n {{- "<|start_header_id|>ipython<|end_header_id|>\\n\\n" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- "<|eot_id|>" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' }}\n{%- endif %}\n'
"""
+ )
diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
index ed489cf8c547..e35b7008a829 100644
--- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
+++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
@@ -1189,8 +1189,10 @@ def test_epoch_end(self, outputs):
def loss_func(self, loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
- # TODO: add nemo version here
- loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() # sequence level nll
+ valid_tokens = loss_mask.sum()
+ if valid_tokens < 0.5: # no valid tokens
+ valid_tokens += 1.0
+ loss = torch.sum(losses.view(-1) * loss_mask) / valid_tokens # sequence level nll
return loss
def setup(self, stage=None):
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py
index 9da2419520c2..bff4922c10fc 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import json
import math
import re
from typing import List, Mapping, Optional
@@ -524,7 +525,15 @@ def collate_fn(self, batch):
class GPTSFTPackedDataset(GPTSFTDataset):
- def __init__(self, file_path: str, tokenizer: TokenizerSpec, return_cu_seqlen: bool = True, **kwargs):
+ def __init__(
+ self,
+ file_path: str,
+ tokenizer: TokenizerSpec,
+ return_cu_seqlen: bool = True,
+ pad_cu_seqlens: bool = False,
+ pack_metadata_file_path: Optional[str] = None,
+ **kwargs,
+ ):
"""
file_path: See `file_path` in the parent class.
tokenizer: See `tokenizer` in the parent class.
@@ -537,6 +546,20 @@ def __init__(self, file_path: str, tokenizer: TokenizerSpec, return_cu_seqlen: b
assert self.virtual_tokens == 0, "P-Tuning with packed sequence is not supported."
self.return_cu_seqlen = return_cu_seqlen
+ self.pad_cu_seqlens = pad_cu_seqlens
+ if self.pad_cu_seqlens:
+ assert (
+ pack_metadata_file_path is not None
+ ), "a metadata json file is required when pad_cu_seqlens is enabled"
+ assert (
+ self.pad_to_max_length is True
+ ), "'pad_to_max_length=True' is required when pad_cu_seqlens is enabled"
+
+ self.pack_metadata = None
+ if pack_metadata_file_path is not None:
+ with open(pack_metadata_file_path) as f:
+ self.pack_metadata = json.load(f)
+
def __getitem__(self, idx):
if self.samples_mapping is not None:
# assert idx < len(self.samples_mapping)
@@ -665,6 +688,11 @@ def collate_fn(self, batch):
if len(cu_seqlens[-1]) > len(cu_seqlens_unpadded[-1]):
cu_seqlens_unpadded[-1].append(cu_seqlens_unpadded[-1][-1])
+ if self.pad_cu_seqlens:
+ # pad cu_seqlens with zero length sequences
+ pad_num = self.pack_metadata['max_samples_per_bin'] - len(cu_seqlens[-1])
+ cu_seqlens[-1].extend([max_length] * pad_num)
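+ # Example (illustrative): if max_samples_per_bin is 10 and this bin's
+ # cu_seqlens currently holds 7 entries, pad_num = 3 and three copies of
+ # max_length are appended; the extra offsets describe padding-only
+ # trailing segments, keeping the cu_seqlens size fixed across batches as
+ # CUDA graph capture requires.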
+
assert len(input_ids[0]) == len(
position_ids[0]
), "Dataset problem: input_ids and position_ids lengths don't match"
@@ -695,6 +723,15 @@ def collate_fn(self, batch):
cu_seqlens_unpadded = torch.IntTensor(cu_seqlens_unpadded)
cu_seqlens_unpadded_argmin = torch.argmin(cu_seqlens_unpadded, dim=1, keepdim=True)
+ if self.pad_cu_seqlens:
+ # Use the global max seqlen, as 'pad_cu_seqlens' is used mainly
+ # to support cudagraphs, and 'max_seqlen' is a cpu tensor, which means it should
+ # be the same across all batches.
+ max_seqlen = torch.IntTensor([self.pack_metadata['dataset_max_seqlen']] * len(cu_seqlens))
+ else:
+ seqlens = cu_seqlens[:, 1:] - cu_seqlens[:, :-1]
+ max_seqlen, _ = seqlens.max(dim=1, keepdim=True)
+
processed_batch.update(
{
'attention_mask': torch.LongTensor(
diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py
index 31bf493ec776..f066ade86811 100644
--- a/nemo/collections/nlp/modules/common/tokenizer_utils.py
+++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py
@@ -169,6 +169,11 @@ def get_nmt_tokenizer(
It has empirically been shown to improve inference time BLEU scores.
r2l: Whether to return subword IDs from right to left
"""
+ import omegaconf
+ from omegaconf import OmegaConf
+
+ if isinstance(special_tokens, omegaconf.listconfig.ListConfig):
+ special_tokens = OmegaConf.to_container(special_tokens)
if special_tokens is None:
special_tokens_dict = {}
else:
@@ -195,8 +200,10 @@ def get_nmt_tokenizer(
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
logging.info(f'Getting SentencePiece with model: {tokenizer_model}')
+
return SentencePieceTokenizer(
model_path=tokenizer_model,
+ special_tokens=special_tokens,
legacy=legacy,
chat_template=chat_template,
)
diff --git a/nemo/collections/speechlm/__init__.py b/nemo/collections/speechlm/__init__.py
new file mode 100755
index 000000000000..2b19e0be88fd
--- /dev/null
+++ b/nemo/collections/speechlm/__init__.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.collections.speechlm.models import HFAutoModelForSpeechSeq2Seq
+from nemo.utils import logging
+
+__all__ = [
+ "HFAutoModelForSpeechSeq2Seq",
+]
+
+try:
+ import nemo_run as run
+
+ from nemo.collections.llm.recipes import adam
+ from nemo.collections.speechlm.api import finetune, generate, pretrain, train, validate
+
+ __all__.extend(
+ [
+ "train",
+ "pretrain",
+ "validate",
+ "finetune",
+ "generate",
+ ]
+ )
+except ImportError as error:
+ logging.warning(f"Failed to import nemo.collections.speechlm.[api, recipes]: {error}")
diff --git a/nemo/collections/speechlm/api.py b/nemo/collections/speechlm/api.py
new file mode 100644
index 000000000000..2342da6eb45c
--- /dev/null
+++ b/nemo/collections/speechlm/api.py
@@ -0,0 +1,442 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from copy import deepcopy
+from pathlib import Path
+from typing import Any, Callable, Optional, Union
+
+import lightning.pytorch as pl
+import nemo_run as run
+
+from typing_extensions import Annotated
+
+import nemo.lightning as nl
+from nemo.lightning import (
+ AutoResume,
+ NeMoLogger,
+ OptimizerModule,
+ Trainer,
+ configure_no_restart_validation_training_loop,
+)
+from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform
+from nemo.utils import logging
+
+TokenizerType = Any
+
+
+@run.cli.entrypoint(namespace="speechlm")
+def train(
+ model: pl.LightningModule,
+ data: pl.LightningDataModule,
+ trainer: Trainer,
+ log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None,
+ resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None,
+ optim: Optional[OptimizerModule] = None,
+ tokenizer: Optional[TokenizerType] = None,
+ model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None,
+ # TODO: Fix export; export: Optional[str] = None,
+) -> Path:
+ """
+ Trains a model using the specified data and trainer, with optional tokenizer, source, and export.
+
+ Args:
+ model (pl.LightningModule): The model to be trained.
+ data (pl.LightningDataModule): The data module containing training data.
+ trainer (Trainer): The trainer instance configured with a MegatronStrategy.
+ log (NeMoLogger): A nemologger instance.
+ resume (Optional[Union[AutoResume, Resume]]): Resume training from a checkpoint.
+ optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer
+ from the model will be used.
+ tokenizer (Optional[TokenizerType]): Tokenizer setting to be applied. Can be 'data' or 'model'
+ or an instance of TokenizerSpec.
+ export (Optional[str]): Filename to save the exported checkpoint after training.
+ model_transform (Optional[Union[Callable[[nn.Module], nn.Module], PEFT]]): A model transform to be applied.
+
+ Returns
+ -------
+ Path: The directory path where training artifacts are saved.
+
+ Examples
+ --------
+ >>> from nemo.collections import llm
+ >>> from nemo import lightning as nl
+ >>> model = llm.MistralModel()
+ >>> data = llm.SquadDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2)
+ >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed")
+ >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision)
+ >>> llm.train(model, data, trainer, tokenizer="data")
+ PosixPath('/path/to/log_dir')
+ """
+ app_state = _setup(
+ model=model,
+ data=data,
+ trainer=trainer,
+ log=log,
+ resume=resume,
+ optim=optim,
+ tokenizer=tokenizer,
+ model_transform=model_transform,
+ )
+
+ trainer.fit(model, data)
+
+ return app_state.exp_dir
+
+
+@run.cli.entrypoint(namespace="speechlm")
+def pretrain(
+ model: pl.LightningModule,
+ data: pl.LightningDataModule,
+ trainer: Trainer,
+ log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None,
+ resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None,
+ optim: Optional[OptimizerModule] = None,
+) -> Path:
+ """
+ Pretrains a model using the specified data and trainer, with optional logging, resuming, and optimization.
+
+ This function is a wrapper around the `train` function, specifically configured for pretraining tasks.
+ Note, by default it will use the tokenizer from the model.
+
+ Args:
+ model (pl.LightningModule): The model to be pretrained.
+ data (pl.LightningDataModule): The data module containing pretraining data.
+ trainer (Trainer): The trainer instance configured with a MegatronStrategy.
+ log (NeMoLogger): A nemologger instance.
+ resume (Optional[AutoResume]): Resume training from a checkpoint.
+ optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default
+ optimizer from the model will be used.
+
+ Returns:
+ Path: The directory path where pretraining artifacts are saved.
+
+ Examples:
+ >>> from nemo.collections import llm
+ >>> from nemo import lightning as nl
+ >>> model = llm.MistralModel()
+ >>> data = llm.PretrainingDataModule(paths=[...], seq_length=4096, global_batch_size=16, micro_batch_size=2)
+ >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed")
+ >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision)
+ >>> llm.pretrain(model, data, trainer)
+ PosixPath('/path/to/log_dir')
+ """
+ _validate_config(model, data, trainer, log=log, resume=resume, optim=optim)
+ return train(
+ model=model,
+ data=data,
+ trainer=trainer,
+ log=log,
+ resume=resume,
+ optim=optim,
+ tokenizer="data",
+ )
+
+
+@run.cli.entrypoint(namespace="speechlm")
+def finetune(
+ model: pl.LightningModule,
+ data: pl.LightningDataModule,
+ trainer: Trainer,
+ log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None,
+ resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None,
+ optim: Optional[OptimizerModule] = None,
+ peft: Optional[Union[PEFT, ModelTransform, Callable]] = None,
+) -> Path:
+ """
+ Finetunes a model using the specified data and trainer, with optional logging, resuming, and PEFT.
+
+ Note, by default it will use the tokenizer from the model.
+
+ Args:
+ model (pl.LightningModule): The model to be finetuned.
+ data (pl.LightningDataModule): The data module containing finetuning data.
+ trainer (Trainer): The trainer instance configured with a MegatronStrategy.
+ log (NeMoLogger): A nemologger instance.
+ resume (Optional[AutoResume]): Resume training from a checkpoint.
+ optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default
+ optimizer from the model will be used.
+ peft (Optional[PEFT]): A PEFT (Parameter-Efficient Fine-Tuning) configuration to be applied.
+
+ Returns:
+ Path: The directory path where finetuning artifacts are saved.
+
+ Examples:
+ >>> from nemo.collections import llm
+ >>> from nemo import lightning as nl
+ >>> model = llm.MistralModel()
+ >>> data = llm.SquadDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2)
+ >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed")
+ >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision)
+ >>> llm.finetune(model, data, trainer, peft=llm.peft.LoRA())
+ PosixPath('/path/to/log_dir')
+ """
+
+ _validate_config(model, data, trainer, log=log, resume=resume, optim=optim, model_transform=peft)
+ return train(
+ model=model,
+ data=data,
+ trainer=trainer,
+ log=log,
+ resume=resume,
+ optim=optim,
+ tokenizer="model",
+ model_transform=peft,
+ )
+
+
+@run.cli.entrypoint(namespace="speechlm")
+def validate(
+ model: pl.LightningModule,
+ data: pl.LightningDataModule,
+ trainer: Trainer,
+ log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None,
+ resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None,
+ optim: Optional[OptimizerModule] = None,
+ tokenizer: Optional[TokenizerType] = None,
+ model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None,
+) -> Path:
+ """
+ Validates a model using the specified data and trainer, with optional logging, resuming, and model transformations.
+
+ Args:
+ model (pl.LightningModule): The model to be validated.
+ data (pl.LightningDataModule): The data module containing validation data.
+ trainer (Trainer): The trainer instance configured with a MegatronStrategy.
+ log (NeMoLogger): A nemologger instance.
+ resume (Optional[AutoResume]): Resume from a checkpoint for validation.
+ optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer
+ from the model will be used.
+ tokenizer (Optional[TokenizerType]): Tokenizer setting to be applied. Can be 'data' or 'model'
+ or an instance of TokenizerSpec.
+ model_transform (Optional[Union[Callable[[nn.Module], nn.Module], PEFT]]): A model transform to be applied.
+
+ Returns:
+ Path: The directory path where validation artifacts are saved.
+
+ Examples:
+ >>> from nemo.collections import llm
+ >>> from nemo import lightning as nl
+ >>> model = llm.MistralModel()
+ >>> data = llm.SquadDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2)
+ >>> precision = nl.MegatronMixedPrecision(precision="bf16-mixed")
+ >>> trainer = nl.Trainer(strategy=nl.MegatronStrategy(tensor_model_parallel_size=2), plugins=precision)
+ >>> llm.validate(model, data, trainer, tokenizer="data")
+ PosixPath('/path/to/log_dir')
+ """
+ app_state = _setup(
+ model=model,
+ data=data,
+ trainer=trainer,
+ log=log,
+ resume=resume,
+ optim=optim,
+ tokenizer=tokenizer,
+ model_transform=model_transform,
+ )
+
+ trainer.validate(model, data)
+
+ return app_state.exp_dir
+
+
+def evaluate():
+ """
+ Evaluates NeMo SpeechLM model.
+ """
+ raise NotImplementedError("This function will be implemented later")
+
+
+@run.cli.entrypoint(name="generate", namespace="speechlm")
+def generate():
+ """
+ Generates text using a NeMo Speech model.
+ """
+ raise NotImplementedError("This function will be implemented later")
+
+
+def _use_tokenizer(model: pl.LightningModule, data: pl.LightningDataModule, tokenizer: TokenizerType) -> None:
+ if tokenizer == "data":
+ _set_with_io(model, "tokenizer", data.tokenizer)
+ elif tokenizer == "model":
+ _set_with_io(data, "tokenizer", model.tokenizer)
+ else:
+ try:
+ from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
+
+ if isinstance(tokenizer, TokenizerSpec):
+ _set_with_io(model, "tokenizer", tokenizer)
+ _set_with_io(data, "tokenizer", tokenizer)
+ else:
+ raise ValueError(f"Expected TokenizerSpec or 'data' or 'model', got: {tokenizer}")
+ except ImportError:
+ raise ValueError("TokenizerSpec is not available")
+
+
+def _setup(
+ model: pl.LightningModule,
+ data: pl.LightningDataModule,
+ trainer: Trainer,
+ log: Optional[NeMoLogger],
+ resume: Optional[AutoResume],
+ optim: Optional[OptimizerModule],
+ tokenizer: Optional[TokenizerType],
+ model_transform: Optional[Union[PEFT, ModelTransform, Callable]],
+) -> Any: # Return type is Any because app_state's type is not specified
+ configure_no_restart_validation_training_loop(trainer)
+ _log = log or NeMoLogger()
+ if resume and isinstance(model_transform, PEFT) and _log.ckpt:
+ logging.info("Disabling try_restore_best_ckpt restoration for adapters")
+ _log.ckpt.try_restore_best_ckpt = False
+
+ app_state = _log.setup(
+ trainer,
+ resume_if_exists=getattr(resume, "resume_if_exists", False),
+ task_config=getattr(train, "__io__", None),
+ )
+ if resume is not None:
+ resume.setup(trainer, model)
+
+ if optim:
+ optim.connect(model)
+ if tokenizer: # TODO: Improve this
+ _use_tokenizer(model, data, tokenizer)
+
+ if model_transform:
+ _set_with_io(model, "model_transform", model_transform)
+
+ # Add ModelTransform callback to Trainer if needed
+ if getattr(model, "model_transform", None):
+ if not any(isinstance(cb, ModelTransform) for cb in trainer.callbacks):
+ if isinstance(model_transform, ModelTransform):
+ trainer.callbacks.append(model_transform)
+ else:
+ trainer.callbacks.append(ModelTransform())
+
+ return app_state
+
+
+def _set_with_io(obj, attr, value):
+ setattr(obj, attr, value)
+ if hasattr(obj, "__io__") and hasattr(value, "__io__"):
+ setattr(obj.__io__, attr, deepcopy(value.__io__))
+
+
+def _validate_config(
+ model: pl.LightningModule,
+ data: pl.LightningDataModule,
+ trainer: Trainer,
+ log: Optional[NeMoLogger] = None,
+ resume: Optional[AutoResume] = None,
+ optim: Optional[OptimizerModule] = None,
+ tokenizer: Optional[TokenizerType] = None,
+ model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None,
+) -> None:
+
+ ## Model validation
+ if hasattr(model, "config"):
+ assert getattr(model.config, "seq_length", 1) > 0
+ assert getattr(model.config, "max_position_embeddings", 1) > 0
+ assert model.config.num_layers > 0
+ assert model.config.hidden_size > 0
+ assert model.config.num_attention_heads > 0
+ assert model.config.ffn_hidden_size > 0
+
+ if hasattr(model.config, "seq_length"):
+ if getattr(model.config, "max_position_embeddings", None) is not None:
+ assert model.config.seq_length <= model.config.max_position_embeddings
+ else:
+ assert not isinstance(trainer.strategy, nl.MegatronStrategy), "Expected model.config to exist"
+
+ ## Data validation
+ if hasattr(data, 'micro_batch_size'):
+ assert data.micro_batch_size > 0
+ if hasattr(data, 'global_batch_size'):
+ assert data.global_batch_size > 0
+ if hasattr(data, 'seq_length'):
+ assert data.seq_length > 0
+
+ if hasattr(data, 'micro_batch_size') and hasattr(data, 'global_batch_size'):
+ assert (
+ data.global_batch_size % data.micro_batch_size == 0
+ ), "Global batch size must be divisible by micro batch size in data module."
+
+ ## Trainer validation
+
+ # MegatronStrategy validation
+ if isinstance(trainer.strategy, nl.MegatronStrategy):
+ # Basic validation
+ assert trainer.strategy.tensor_model_parallel_size > 0
+ assert trainer.strategy.pipeline_model_parallel_size > 0
+ assert trainer.strategy.context_parallel_size > 0
+
+ # DP validation
+ assert (trainer.num_devices * trainer.num_nodes) % (
+ trainer.strategy.tensor_model_parallel_size
+ * trainer.strategy.pipeline_model_parallel_size
+ * trainer.strategy.context_parallel_size
+ ) == 0, "Number of GPUs must be divisible by the product of all parallelism sizes for data parallel."
+
+ assert (
+ data.global_batch_size
+ % (
+ data.micro_batch_size
+ * (
+ (trainer.num_devices * trainer.num_nodes)
+ / (
+ trainer.strategy.tensor_model_parallel_size
+ * trainer.strategy.pipeline_model_parallel_size
+ * trainer.strategy.context_parallel_size
+ )
+ )
+ )
+ == 0
+ ), "Global batch size must be divisible by the product of micro batch size and data parallel size"
+
+ # TP/SP validation
+ if trainer.strategy.tensor_model_parallel_size == 1:
+ if trainer.strategy.sequence_parallel:
+ warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
+ trainer.strategy.sequence_parallel = False
+
+ # PP/VP validation
+ if trainer.strategy.pipeline_model_parallel_size > 1:
+ assert (
+ trainer.strategy.pipeline_dtype is not None
+ ), "pipeline_dtype must be set if pipeline model parallelism is enabled"
+ else:
+ if trainer.strategy.virtual_pipeline_model_parallel_size is not None:
+ warnings.warn("Disabling virtual pipeline parallelism because pipeline model parallelism is disabled")
+ trainer.strategy.virtual_pipeline_model_parallel_size = None
+ if trainer.strategy.pipeline_dtype is not None:
+ warnings.warn("Setting pipeline dtype to None because pipeline model parallelism is disabled")
+ trainer.strategy.pipeline_dtype = None
+
+ # CP validation
+ if trainer.strategy.context_parallel_size > 1:
+ if hasattr(model, "config"):
+ if model.config.seq_length is not None:
+ assert (
+ model.config.seq_length % (trainer.strategy.context_parallel_size * 2) == 0
+ ), 'Sequence length must be divisible by 2 * context parallel size if context parallel is used.'
+
+ # EP validation
+ if trainer.strategy.expert_model_parallel_size > 1:
+ if hasattr(model, "config"):
+ assert (
+ model.config.num_moe_experts is not None
+ ), "num_experts must be non None to use expert model parallelism"
+ assert (
+ model.config.num_moe_experts % trainer.strategy.expert_model_parallel_size == 0
+ ), "Number of experts should be a multiple of expert model parallel_size."
diff --git a/nemo/collections/speechlm/models/__init__.py b/nemo/collections/speechlm/models/__init__.py
new file mode 100644
index 000000000000..a7d4e02cb4ca
--- /dev/null
+++ b/nemo/collections/speechlm/models/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.collections.speechlm.models.hf_auto_model_for_speech_seq2seq import HFAutoModelForSpeechSeq2Seq
diff --git a/nemo/collections/speechlm/models/hf_auto_model_for_speech_seq2seq.py b/nemo/collections/speechlm/models/hf_auto_model_for_speech_seq2seq.py
new file mode 100644
index 000000000000..a039edc66a39
--- /dev/null
+++ b/nemo/collections/speechlm/models/hf_auto_model_for_speech_seq2seq.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import lightning.pytorch as pl
+import torch
+import torch.nn.functional as F
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
+
+from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+from nemo.collections.llm import fn
+from nemo.lightning import io
+from nemo.utils import logging
+
+
+def masked_cross_entropy(logits, targets, mask=None):
+ if mask is not None:
+ loss = F.cross_entropy(logits, targets, reduction='none')
+ return torch.mean(loss[mask == 1])
+ else:
+ return F.cross_entropy(logits, targets)
+
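+# Usage sketch for masked_cross_entropy above (illustrative): given logits of
+# shape [B * T, V], integer targets of shape [B * T] and a 0/1 mask of the
+# same shape, the loss is the mean per-token cross entropy over positions
+# where mask == 1; with mask=None it reduces to plain F.cross_entropy.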
+
+class HFAutoModelForSpeechSeq2Seq(pl.LightningModule, io.IOMixin, fn.FNMixin):
+ def __init__(
+ self,
+ model_name='gpt2',
+ load_pretrained_weights=True,
+ tokenizer=None,
+ loss_fn=masked_cross_entropy,
+ model_transform=None,
+ model_accelerator=None,
+ trust_remote_code=False,
+ ):
+ super().__init__()
+ self.save_hyperparameters()
+ self.model_name = model_name
+ self._tokenizer = None
+ self._processor = None
+ self.model = None
+ self.loss_fn = loss_fn
+ self.load_pretrained_weights = load_pretrained_weights
+ self.is_hf_model = True
+ self.model_transform = model_transform
+ self.model_accelerator = model_accelerator
+ self.trust_remote_code = trust_remote_code
+
+ @property
+ def tokenizer(self):
+ if self._tokenizer is None:
+ self._tokenizer = AutoTokenizer(
+ self.model_name, include_special_tokens=True, trust_remote_code=self.trust_remote_code
+ )
+ return self._tokenizer
+
+ @tokenizer.setter
+ def tokenizer(self, value):
+ assert self._tokenizer is None
+ self._tokenizer = value
+
+ @property
+ def processor(self):
+ if self._processor is None:
+ self._processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code)
+ return self._processor
+
+ @staticmethod
+ def configure_tokenizer(model_name):
+ return AutoProcessor.from_pretrained(model_name).tokenizer
+
+ def configure_model(self, train=True):
+ # create all your layers here
+ if self.model is None:
+ if self.load_pretrained_weights:
+ self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
+ self.model_name,
+ torch_dtype=torch.bfloat16,
+ trust_remote_code=self.trust_remote_code,
+ use_safetensors=True,
+ )
+ else:
+ from transformers import AutoConfig
+
+ config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code)
+ self.model = AutoModelForSpeechSeq2Seq.from_config(config, trust_remote_code=self.trust_remote_code)
+
+ if train:
+ self.model.train()
+
+ def forward(self, input_features, decoder_input_ids, attention_mask=None):
+ return self.model(
+ input_features=input_features.to(self.model.device),
+ attention_mask=attention_mask,
+ decoder_input_ids=decoder_input_ids,
+ )
+
+ def training_step(self, batch):
+ outputs = self.forward(input_features=batch["input_features"], decoder_input_ids=batch["decoder_input_ids"])
+ loss_mask = batch.get('loss_mask', None)
+ if loss_mask is not None:
+ loss_mask = loss_mask.to(self.model.device).view(-1)
+ n_cls = outputs.logits.shape[-1]
+ logits = outputs.logits.view(-1, n_cls)
+ loss = self.loss_fn(logits, batch["labels"], loss_mask)
+
+ self.log('loss', loss, on_step=True, on_epoch=True, prog_bar=True)
+ return loss
+
+ def validation_step(self, batch):
+ output = self.forward(input_features=batch["input_features"], decoder_input_ids=batch["decoder_input_ids"])
+ loss = output.loss
+ self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
+
+ def save_pretrained(self, path):
+ assert self.model is not None, "Model has to be created first."
+ self.model.save_pretrained(path)
+ if self._tokenizer is not None:
+ self._tokenizer.save_pretrained(path)
+ else:
+ logging.warning("A tokenizer wasn't created before to save.")
+
+ if self._processor is not None:
+ self._processor.save_pretrained(path)
+ else:
+ logging.warning("A processor wasn't created before to save.")
diff --git a/nemo/collections/speechlm/recipes/__init__.py b/nemo/collections/speechlm/recipes/__init__.py
new file mode 100644
index 000000000000..d9155f923f18
--- /dev/null
+++ b/nemo/collections/speechlm/recipes/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo/collections/speechlm/recipes/optim/__init__.py b/nemo/collections/speechlm/recipes/optim/__init__.py
new file mode 100644
index 000000000000..d9155f923f18
--- /dev/null
+++ b/nemo/collections/speechlm/recipes/optim/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo/collections/speechlm/recipes/optim/adam.py b/nemo/collections/speechlm/recipes/optim/adam.py
new file mode 100644
index 000000000000..777c3978c3e0
--- /dev/null
+++ b/nemo/collections/speechlm/recipes/optim/adam.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import nemo_run as run
+
+from nemo.lightning.pytorch.optim import PytorchOptimizerModule
+
+
+@run.cli.factory
+def pytorch_adam_with_flat_lr(
+ lr: float = 1e-5,
+) -> run.Config[PytorchOptimizerModule]:
+ from torch.optim import Adam
+
+ return run.Config(
+ PytorchOptimizerModule,
+ optimizer_fn=run.Partial(
+ Adam,
+ lr=lr,
+ weight_decay=0.1,
+ betas=(0.9, 0.95),
+ eps=1e-8,
+ ),
+ )
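+# Usage sketch (illustrative): pytorch_adam_with_flat_lr(lr=3e-4) returns a
+# run.Config that builds a PytorchOptimizerModule wrapping torch.optim.Adam
+# with a flat learning rate; it is intended to be passed as the `optim`
+# argument of the speechlm train/finetune entrypoints.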
diff --git a/nemo/collections/vlm/inference/base.py b/nemo/collections/vlm/inference/base.py
index bbc85a8ee4a8..77918bae26b9 100644
--- a/nemo/collections/vlm/inference/base.py
+++ b/nemo/collections/vlm/inference/base.py
@@ -12,13 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Optional
+from typing import List, Optional, Union
import pytorch_lightning as pl
import torch
import torch.distributed
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
+from PIL.Image import Image
from transformers import AutoProcessor
import nemo.lightning as nl
@@ -86,8 +87,8 @@ def generate(
wrapped_model: VLMInferenceWrapper,
tokenizer,
image_processor,
- prompts: list[str],
- images: list,
+ prompts: List[str],
+ images: List[Union[Image, List[Image]]],
max_batch_size: int = 4,
random_seed: Optional[int] = None,
inference_params: Optional[CommonInferenceParams] = None,
diff --git a/nemo/collections/vlm/inference/vlm_engine.py b/nemo/collections/vlm/inference/vlm_engine.py
index bce373e7a2f5..6e5fd7fa11ec 100644
--- a/nemo/collections/vlm/inference/vlm_engine.py
+++ b/nemo/collections/vlm/inference/vlm_engine.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import List
+from typing import List, Union
import torch
from megatron.core.inference.common_inference_params import CommonInferenceParams
@@ -26,7 +26,7 @@ class VLMEngine(MCoreEngine):
def generate(
self,
prompts: List[str],
- images: List[Image] = None,
+ images: List[Union[Image, List[Image]]] = None,
common_inference_params: CommonInferenceParams = None,
) -> dict:
# pylint: disable=C0115,C0116
diff --git a/nemo/core/connectors/save_restore_connector.py b/nemo/core/connectors/save_restore_connector.py
index 2c4c826d1daf..cbbb30772036 100644
--- a/nemo/core/connectors/save_restore_connector.py
+++ b/nemo/core/connectors/save_restore_connector.py
@@ -601,7 +601,12 @@ def _is_safe_path(member, extract_to):
# Construct the full path where the member would be extracted
full_path = os.path.join(extract_to, member_path)
# Ensure the member would be extracted within the intended directory
- return os.path.commonprefix([full_path, extract_to]) == extract_to
+ if os.path.commonprefix([full_path, extract_to]) != extract_to:
+ return False
+ # Check if the member is a symbolic link
+ if member.issym() or member.islnk():
+ return False
+ return True
@staticmethod
def _safe_extract(tar, out_folder: str, members=None):
diff --git a/nemo/deploy/deploy_base.py b/nemo/deploy/deploy_base.py
index 41e0e7ddbdc9..aeb94255a273 100644
--- a/nemo/deploy/deploy_base.py
+++ b/nemo/deploy/deploy_base.py
@@ -42,7 +42,8 @@ def __init__(
checkpoint_path: str = None,
model=None,
max_batch_size: int = 128,
- port: int = 8000,
+ http_port: int = 8000,
+ grpc_port: int = 8001,
address="0.0.0.0",
allow_grpc=True,
allow_http=True,
@@ -54,7 +55,8 @@ def __init__(
self.triton_model_version = triton_model_version
self.max_batch_size = max_batch_size
self.model = model
- self.port = port
+ self.http_port = http_port
+ self.grpc_port = grpc_port
self.address = address
self.triton = None
self.allow_grpc = allow_grpc
diff --git a/nemo/deploy/deploy_pytriton.py b/nemo/deploy/deploy_pytriton.py
index 797f805f99b9..2e934847e04f 100644
--- a/nemo/deploy/deploy_pytriton.py
+++ b/nemo/deploy/deploy_pytriton.py
@@ -38,7 +38,7 @@ class DeployPyTriton(DeployBase):
tensor_parallelism_size=1,
)
- nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", port=8000)
+ nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", http_port=8000)
nm.deploy()
nm.run()
nq = NemoQueryLLM(url="localhost", model_name="model_name")
@@ -66,7 +66,8 @@ def __init__(
checkpoint_path: str = None,
model=None,
max_batch_size: int = 128,
- port: int = 8000,
+ http_port: int = 8000,
+ grpc_port: int = 8001,
address="0.0.0.0",
allow_grpc=True,
allow_http=True,
@@ -92,7 +93,8 @@ def __init__(
checkpoint_path=checkpoint_path,
model=model,
max_batch_size=max_batch_size,
- port=port,
+ http_port=http_port,
+ grpc_port=grpc_port,
address=address,
allow_grpc=allow_grpc,
allow_http=allow_http,
@@ -128,7 +130,9 @@ def deploy(self):
else:
triton_config = TritonConfig(
http_address=self.address,
- http_port=self.port,
+ http_port=self.http_port,
+ grpc_address=self.address,
+ grpc_port=self.grpc_port,
allow_grpc=self.allow_grpc,
allow_http=self.allow_http,
)
diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py
index cd9ef54a6035..560615894a32 100644
--- a/nemo/deploy/nlp/__init__.py
+++ b/nemo/deploy/nlp/__init__.py
@@ -12,5 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeploy, MegatronLLMDeployable
from nemo.deploy.nlp.query_llm import NemoQueryLLM, NemoQueryLLMPyTorch
diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py
index a88d3b610cda..e91754297aa7 100644
--- a/nemo/deploy/nlp/query_llm.py
+++ b/nemo/deploy/nlp/query_llm.py
@@ -299,9 +299,8 @@ def query_llm(
"model": self.model_name,
"choices": [{"text": str(sentences)}],
}
- # Convert gneration logits to a list to make it json serializable and add it to openai_response dict
if output_generation_logits:
- openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist()
+ openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"]
return openai_response
else:
return sentences
diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py
index 6b1f8c90aa8f..d9155f923f18 100644
--- a/nemo/export/__init__.py
+++ b/nemo/export/__init__.py
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-from nemo.export.tensorrt_lazy_compiler import trt_compile
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
index 6f7027f12be8..e9f6830c92b5 100644
--- a/nemo/export/quantize/quantizer.py
+++ b/nemo/export/quantize/quantizer.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
import tarfile
from contextlib import nullcontext
from typing import Callable, Optional
@@ -120,7 +121,7 @@ def __init__(self, quantization_config: Optional[DictConfig], export_config: Opt
enable_quant_kv_cache = quantization_config.get("enable_kv_cache", None)
if enable_quant_kv_cache is None:
enable_quant_kv_cache = (
- "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gptnext"
+ "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gpt"
)
logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
quant_cfg["quant_cfg"]["*output_quantizer"] = {
@@ -196,7 +197,7 @@ def quantize(self, model: MegatronGPTModel, forward_loop: Callable[[MegatronGPTM
model = mtq.quantize(model, self.quant_cfg, forward_loop)
- if self.quantization_config.decoder_type == "gptnext":
+ if self.quantization_config.decoder_type == "gpt":
# We found squared_relu may have an under-calibration problem.
# Clamp the scaling_factor with a min threshold to avoid under-calibration.
maxbound = 0
@@ -250,5 +251,6 @@ def export(self, model: MegatronGPTModel):
if dist.get_rank() == 0:
save_artifacts(model, export_dir)
if compress:
- with tarfile.open(self.export_config.save_path, "w:gz") as tar:
+ os.makedirs(os.path.dirname(self.export_config.save_path), exist_ok=True)
+ with tarfile.open(self.export_config.save_path, "w") as tar:
tar.add(export_dir, arcname="./")
diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index d1b5aa0e76d6..7d95bcca5709 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -32,7 +32,7 @@
from nemo.deploy import ITritonDeployable
from nemo.export.tarutils import TarPath, unpack_tarball
-from nemo.export.trt_llm.converter.model_converter import model_to_trtllm_ckpt
+from nemo.export.trt_llm.converter.model_converter import determine_quantization_settings, model_to_trtllm_ckpt
from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import (
dist_model_to_trt_llm_ckpt,
get_layer_prefix,
@@ -324,7 +324,9 @@ def export(
"Supported model types are: {1}.".format(model_type, self.get_supported_models_list)
)
- model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir)
+ model, model_config, self.tokenizer = load_nemo_model(
+ nemo_checkpoint_path, nemo_export_dir, use_mcore_path
+ )
if use_mcore_path:
from megatron.core.export.data_type import DataType
from megatron.core.export.export_config import ExportConfig
@@ -335,35 +337,58 @@ def export(
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from tensorrt_llm.layers import MoeConfig
+ share_embeddings_and_output_weights = model_config.get(
+ "share_embeddings_and_output_weights", False
+ )
+ fp8_quantized, fp8_kvcache = determine_quantization_settings(
+ model_config, fp8_quantized, fp8_kvcache
+ )
+
# We build the transformer config using the nemo model config.
- transformer_config = self.get_transformer_config(model_configs)
+ transformer_config = self.get_transformer_config(model_config)
input_model_type = getattr(ModelType, model_type)
# MCore export supports some default conversion dictionaries
mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT
+
+ # TODO: remove after adding this mapping to mcore
+ from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers
+
+ mcore_model_conversion_dict |= {
+ 'decoder.layers.mlp.experts.experts.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight_mixture_of_experts,
+ 'decoder.layers.mlp.experts.experts.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight_mixture_of_experts,
+ 'decoder.layers.mlp.router.weight': TRTLLMLayers.mlp_router_weight,
+ }
+
# All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models start with "model.decoder.layers.4.blahblah". so we append model. to the keys
nemo_model_conversion_dict = {
f'model.{key}': value for key, value in mcore_model_conversion_dict.items()
+ } | { # Mapping for NeMo 2.0
+ f'module.{key}': value for key, value in mcore_model_conversion_dict.items()
}
+ # TODO: Workaround: Gemma uses gated activation, while mcore does not handle openai-gelu
+ # as a gated function. Remove once !11614 is merged.
+ activation = model_config.get('activation', "gelu")
+ if activation == "openai-gelu" and input_model_type.name == 'gemma':
+ activation = "geglu"
+
trtllm_helper = TRTLLMHelper(
transformer_config=transformer_config,
model_type=input_model_type,
trtllm_conversion_dict=nemo_model_conversion_dict,
- position_embedding_type=model_configs.get('position_embedding_type'),
- max_position_embeddings=model_configs.get('max_position_embeddings'),
- rotary_percentage=model_configs.get('rotary_percentage', 1.0),
- rotary_base=model_configs.get('rotary_base', 10000),
- moe_tp_mode=model_configs.get('moe_tp_mode', 2),
- multi_query_mode=model_configs.get("multi_query_mode", False),
- activation=model_configs.get('activation', "gelu"),
- seq_len_interpolation_factor=model_configs.get("seq_len_interpolation_factor"),
- moe_renorm_mode=model_configs.get(
+ position_embedding_type=model_config.get('position_embedding_type'),
+ max_position_embeddings=model_config.get('max_position_embeddings'),
+ rotary_percentage=model_config.get('rotary_percentage', 1.0),
+ rotary_base=model_config.get('rotary_base', 10000),
+ moe_tp_mode=model_config.get('moe_tp_mode', 2),
+ multi_query_mode=model_config.get("multi_query_mode", False),
+ activation=activation,
+ seq_len_interpolation_factor=model_config.get("seq_len_interpolation_factor"),
+ moe_renorm_mode=model_config.get(
'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE
),
- share_embeddings_and_output_weights=model_configs.get(
- "share_embeddings_and_output_weights", False
- ),
+ share_embeddings_and_output_weights=share_embeddings_and_output_weights,
)
input_dtype = getattr(DataType, dtype)
@@ -371,7 +396,7 @@ def export(
tensor_parallelism_size,
pipeline_parallelism_size,
use_parallel_embedding,
- use_embedding_sharing,
+ share_embeddings_and_output_weights,
)
trtllm_model_weights_list, trtllm_model_config_list = (
@@ -380,6 +405,8 @@ def export(
export_config=export_config,
dtype=input_dtype,
state_dict_split_by_layer_numbers=False,
+ fp8_quantized=fp8_quantized,
+ fp8_kvcache=fp8_kvcache,
)
)
@@ -420,7 +447,7 @@ def export(
weights_dicts, model_configs = model_to_trtllm_ckpt(
model=model,
- nemo_model_config=model_configs,
+ nemo_model_config=model_config,
nemo_export_dir=nemo_export_dir,
decoder_type=model_type,
dtype=dtype,
@@ -490,12 +517,13 @@ def get_transformer_config(self, nemo_model_config):
normalization = nemo_model_config.get('normalization', 'layernorm')
transformer_config_normalization = 'LayerNorm'
- layernorm_zero_centered_gamma = False
+ layernorm_zero_centered_gamma = nemo_model_config.get('layernorm_zero_centered_gamma', False)
if normalization == 'layernorm1p':
layernorm_zero_centered_gamma = True
elif normalization == 'rmsnorm':
transformer_config_normalization = 'RMSNorm'
+ num_moe_experts = nemo_model_config.get('num_moe_experts', 0)
conf = TransformerConfig(
num_layers=nemo_model_config.get('num_layers'),
moe_router_topk=nemo_model_config.get('moe_router_topk', 0),
@@ -506,9 +534,10 @@ def get_transformer_config(self, nemo_model_config):
ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'),
layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'),
add_bias_linear=nemo_model_config.get('bias'),
- num_moe_experts=nemo_model_config.get('num_moe_experts', None),
+ num_moe_experts=num_moe_experts if num_moe_experts > 0 else None,
normalization=transformer_config_normalization,
layernorm_zero_centered_gamma=layernorm_zero_centered_gamma,
+ gated_linear_unit=nemo_model_config.get('gated_linear_unit', False),
)
return conf
@@ -553,10 +582,10 @@ def convert_to_safe_tensors(
tmp_dir = tempfile.TemporaryDirectory()
nemo_export_dir = Path(tmp_dir.name)
- model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir)
+ model, model_config, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir)
weights_dicts, model_configs = model_to_trtllm_ckpt(
model=model,
- nemo_model_config=model_configs,
+ nemo_model_config=model_config,
nemo_export_dir=nemo_export_dir,
decoder_type=model_type,
dtype=dtype,
@@ -790,7 +819,6 @@ def build(
input_model_type = getattr(ModelType, model_type)
nemo_model_conversion_dict = self.get_nemo_to_trtllm_conversion_dict(model_state_dict)
-
self.trtllm_helper = TRTLLMHelper(
transformer_config=transformer_config,
model_type=input_model_type,
@@ -1150,7 +1178,7 @@ def triton_infer_fn(self, **inputs: np.ndarray):
if "output_generation_logits" in inputs:
infer_input["output_generation_logits"] = inputs.pop("output_generation_logits")[0][0]
- if infer_input["output_generation_logits"]:
+ if "output_generation_logits" in inputs:
output_texts, generation_logits = self.forward(**infer_input)
output_dict["generation_logits"] = np.array(generation_logits.cpu().numpy())
else:
diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py
index 9729781e6eba..e459dc31d0fb 100755
--- a/nemo/export/trt_llm/converter/model_converter.py
+++ b/nemo/export/trt_llm/converter/model_converter.py
@@ -15,7 +15,7 @@
import csv
import logging
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import tensorrt_llm
@@ -82,9 +82,23 @@ def prompt_convert(prompt_config, prompt_weights):
def determine_quantization_settings(
- nemo_model_config, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None
+ nemo_model_config: Dict[str, Any], fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None
) -> Tuple[bool, bool]:
- is_nemo_quantized = nemo_model_config.get('fp8', False)
+ """
+ Determines the exported model's quantization settings.
+ Reads from NeMo config, with optional override.
+
+ Args:
+ nemo_model_config (dict): NeMo model configuration
+ fp8_quantized (optional, bool): User-specified quantization flag
+ fp8_kvcache (optional, bool): User-specified cache quantization flag
+ Returns:
+ Tuple[bool, bool]:
+ - Model quantization flag
+ - Model kv-cache quantization flag
+ """
+
+ is_nemo_quantized: bool = nemo_model_config.get('fp8', False)
if fp8_quantized is None:
fp8_quantized = is_nemo_quantized
if fp8_kvcache is None:
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
index 827bbf929796..518a5bad8883 100644
--- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
+++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
@@ -17,11 +17,9 @@
import json
import logging
import os
-import re
import shutil
-from io import BytesIO
from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Union
import numpy as np
import tensorstore # This is important even though not used. Otherwise zarr raises error.
@@ -38,8 +36,17 @@
from nemo.export.tarutils import TarPath, ZarrPathStore
from nemo.export.tiktoken_tokenizer import TiktokenTokenizer
+try:
+ from nemo.lightning import io
+
+ HAVE_NEMO2 = True
+except (ImportError, ModuleNotFoundError):
+ HAVE_NEMO2 = False
+
LOGGER = logging.getLogger("NeMo")
+EXTRA_STATE = "extra_state"
+
def is_nemo_file(path):
flag = False
@@ -70,69 +77,86 @@ def __init__(self, path: Union[Path, TarPath]) -> None:
self.path = path # overwrites path set in super().__init__ call
-def get_extra_state_key(state_dict: dict) -> Optional[str]:
- for key in state_dict.keys():
- if '_extra_state/' in key:
- return key
- return None
-
-
-def unpack_extra_state_key(key: str) -> Tuple[str, int]:
- basename = key.split('/')[0]
- size = int(key.split('/')[1].split('_')[-1])
- return basename, size
-
+def preprocess_scaling_factors_for_local_export(state_dict: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Scaling factors are kept in BytesIO objects.
+ This function reads the stored scales, preparing them for export.
+ Used only for the local (non-mcore) path.
-def clear_loaded_extra_states(state_dict: dict, basename: str) -> dict:
- """The scaling factors are originally saved to state_dict under the keynames 'basename/*'
- The standardized representation is saved to 'basename.*'. This function clears the former from the state.
+ Args:
+ state_dict (dict): Model state dictionary
+ Returns:
+ dict: The same dictionary, with scaling factors explicitly loaded from the BytesIO extra states.
"""
- to_remove = [k for k in state_dict.keys() if basename + '/' in k]
- for key in to_remove:
- state_dict.pop(key)
- return state_dict
+ scales_dict = {k: v for k, v in state_dict.items() if EXTRA_STATE in k}
+ state_dict = {k: v for k, v in state_dict.items() if EXTRA_STATE not in k}
+ scales = {}
+
+ for key, value in scales_dict.items():
+ value.seek(0)
+ extra_state = torch.load(value)
+ if extra_state is not None and 'scale_fwd' in extra_state:
+ scales[key + '.scale_fwd'] = extra_state['scale_fwd'].cpu()
+
+ combined_scales = {}
+ for key in scales:
+ if '.decoder.layers.0' not in key:
+ continue
+
+ # Key has a structure "model.decoder.layers.<layer_num>.<rest_of_key>"
+ decomposed = key.split('.')
+ layer_num_idx = 3
+ # Merges scales from "model.decoder.layers.<layer_num>.<rest_of_key>" keys into a
+ # larger-dimensional tensor under the "model.decoder.layers.<rest_of_key>" key
+ combined = []
+ layer_num = 0
+ decomposed[layer_num_idx] = str(layer_num)
+ while (scale := scales.get('.'.join(decomposed))) is not None:
+ combined.append(scale)
+ layer_num += 1
+ decomposed[layer_num_idx] = str(layer_num)
-def retrieve_scale(bytes: BytesIO) -> Optional[torch.Tensor]:
- bytes.seek(0)
- extra_state = torch.load(bytes)
- if not extra_state or 'scale_fwd' not in extra_state:
- return None
- return extra_state['scale_fwd'].cpu()
+ del decomposed[layer_num_idx]
+ combined_scales['.'.join(decomposed)] = torch.stack(combined)
+ return state_dict | combined_scales
-def load_scales_from_bytes(bytes_list: List[BytesIO]) -> Optional[torch.Tensor]:
- scales = []
- for bytes in bytes_list:
- scale = retrieve_scale(bytes)
- if scale is None:
- return None
- scales.append(scale)
- return torch.stack(scales)
+def rename_extra_states(state_dict: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ This function preprocesses extra states for Megatron export.
-def load_scaling_factors(state_dict: dict, basename: str, size: int) -> Optional[torch.Tensor]:
- keynames = [f'{basename}/shard_{layer}_{size}' for layer in range(size)]
- bytes_list = [state_dict[keyname][0] for keyname in keynames]
- return load_scales_from_bytes(bytes_list)
+ Args:
+ state_dict (dict): Model state dictionary
+ Returns:
+ dict: Model state dictionary, with extra states consumable by mcore export
+ """
+ mcore_extra_states = {}
+ for key, value in state_dict.items():
+ if EXTRA_STATE not in key:
+ continue
-def filter_experts_extra_states(state_dict: dict):
- pattern = (
- r'(model|module)\.decoder\.layers\.mlp\.experts\.experts\.linear_fc\d+\._extra_state/shard_\d+\.\d+_\d+\.\d+'
- )
- return {k: v for k, v in state_dict.items() if not re.fullmatch(pattern, k)}
+ # Keys with the extra states have the following format:
+ # <prefix>.layers.<module>._extra_state/shard_<layer_num>_<num_layers>
+ key_base, shard_key = key.split('/')
+ if '_' not in shard_key:
+ continue
+ shard_layer = shard_key.split('_')[1]
+ if not shard_layer.isnumeric():
+ continue
-def standarize_distributed_scaling_factors(state_dict: dict) -> dict:
- while key := get_extra_state_key(state_dict):
- basename, size = unpack_extra_state_key(key)
- scaling_factors = load_scaling_factors(state_dict, basename, size)
- if scaling_factors is not None:
- state_dict[basename + '.scale_fwd'] = scaling_factors
- state_dict = clear_loaded_extra_states(state_dict, basename)
+ # Renames keys to:
+ # <prefix>.layers.<layer_num>.<module>._extra_state
+ mcore_key = key_base.replace("layers", f"layers.{shard_layer}")
+ if isinstance(value, list):
+ value = value[0]
+ mcore_extra_states[mcore_key] = value
- return state_dict
+ state_dict = {k: v for k, v in state_dict.items() if EXTRA_STATE not in k}
+ return state_dict | mcore_extra_states
def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch_tensor: bool = True):
@@ -154,8 +178,7 @@ def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch
storage_reader=fs_reader,
no_dist=True,
)
- state_dict = filter_experts_extra_states(state_dict)
- state_dict = standarize_distributed_scaling_factors(state_dict)
+ state_dict = rename_extra_states(state_dict)
if not torch_tensor:
for k, v in state_dict.items():
@@ -166,40 +189,21 @@ def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch
return state_dict
-def get_sharded_file(dir: dict, layer_number: int) -> Optional[os.PathLike]:
- pt_file_list = list(dir.glob(f'shard_{layer_number}_*.pt'))
- if pt_file_list == []:
- return None
- return pt_file_list[0]
-
-
def load_sharded_pickle_extra_state_scale(dir: Union[Path, TarPath]):
- def _get_layer_number(file):
- basename = os.path.basename(str(file))
- return int(basename.split('_')[1])
-
pt_files = list(dir.glob('shard_*_*.pt'))
- bytes_list = []
- for file in sorted(pt_files, key=_get_layer_number):
+ extra_states = {}
+ for file in pt_files:
+ shard_name = file.name.split('.')[0]
with file.open('rb') as opened_file:
- bytes_list.append(torch.load(opened_file))
+ extra_states[dir.name + '/' + shard_name] = torch.load(opened_file)
- return load_scales_from_bytes(bytes_list)
+ return rename_extra_states(extra_states)
def contains_extra_states(subdir: Union[Path, TarPath]):
return list(subdir.glob('shard_0_*.pt')) != []
-def load_extra_state_from_pickle(sharded_state_dict: dict, subdir: Union[Path, TarPath]):
- scales = load_sharded_pickle_extra_state_scale(subdir)
- if scales is not None:
- key = subdir.name + '.scale_fwd'
- sharded_state_dict[key] = scales
-
- return sharded_state_dict
-
-
def load_sharded_metadata_zarr(checkpoint_dir: Union[Path, TarPath], torch_tensor=True):
sharded_state_dict = {}
for subdir in checkpoint_dir.iterdir():
@@ -207,7 +211,7 @@ def load_sharded_metadata_zarr(checkpoint_dir: Union[Path, TarPath], torch_tenso
continue
if contains_extra_states(subdir):
- sharded_state_dict = load_extra_state_from_pickle(sharded_state_dict, subdir)
+ sharded_state_dict.update(load_sharded_pickle_extra_state_scale(subdir))
elif (subdir / '.zarray').exists():
key = subdir.name
zstore = ZarrPathStore(subdir)
@@ -289,14 +293,54 @@ def copy_tokenizer_files(config, out_dir):
return config
+def get_tokenizer_from_nemo2_context(model_context_dir: Path):
+ """
+ Retrieve tokenizer configuration from NeMo 2.0 context and instantiate the tokenizer.
+
+ Args:
+ model_context_dir (Path): Path to the model context directory.
+
+ Returns:
+ The instantiated tokenizer (various classes possible).
+ """
+
+ if HAVE_NEMO2:
+ # Use NeMo tokenizer loaded from the NeMo 2.0 model context
+ tokenizer_spec = io.load_context(model_context_dir, subpath="model.tokenizer")
+ return build_tokenizer(tokenizer_spec)
+ else:
+ # Use local nemo.export SentencePieceTokenizer implementation
+ # or directly a HuggingFace tokenizer based on the model config
+ with (model_context_dir / "model.yaml").open("r") as stream:
+ model_config = yaml.safe_load(stream)
+
+ tokenizer_config = model_config["tokenizer"]
+ target_class = tokenizer_config["_target_"]
+ tokenizer_module = "nemo.collections.common.tokenizers."
+ assert target_class.startswith(tokenizer_module)
+ target_class = target_class.removeprefix(tokenizer_module)
+
+ if target_class == "sentencepiece_tokenizer.SentencePieceTokenizer":
+ tokenizer = SentencePieceTokenizer(
+ model_path=str(model_context_dir / tokenizer_config["model_path"]),
+ special_tokens=tokenizer_config.get("special_tokens", None),
+ legacy=tokenizer_config.get("legacy", False),
+ )
+ elif target_class == "huggingface.auto_tokenizer.AutoTokenizer":
+ tokenizer = AutoTokenizer.from_pretrained(
+ str(model_context_dir / tokenizer_config["pretrained_model_name"])
+ )
+ else:
+ raise ValueError(f"Unsupported tokenizer type: {tokenizer_module}{target_class}.")
+
+ return tokenizer
+
+
def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenizer:
"""Loads the tokenizer from the decoded NeMo weights dir."""
tokenizer_dir_or_path = Path(tokenizer_dir_or_path)
if (tokenizer_dir_or_path / "nemo_context").exists():
- from nemo.lightning import io
-
- tokenizer_spec = io.load_context((tokenizer_dir_or_path / "nemo_context"), subpath="model.tokenizer")
- return build_tokenizer(tokenizer_spec)
+ return get_tokenizer_from_nemo2_context(tokenizer_dir_or_path / "nemo_context")
elif os.path.exists(os.path.join(tokenizer_dir_or_path, "vocab.json")):
vocab_path = tokenizer_dir_or_path / "vocab.json" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path
tokenizer_config = {"library": "tiktoken", "vocab_file": str(vocab_path)}
@@ -430,8 +474,7 @@ def get_model_type(nemo_ckpt: Union[str, Path]) -> Optional[str]:
return model_type
-def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Path]):
-
+def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Path], mcore_scales_format: bool = True):
if not os.path.exists(nemo_ckpt):
raise TypeError("%s does not exist", nemo_ckpt)
@@ -448,6 +491,10 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat
dist_ckpt_folder = nemo_dir / "model_weights"
model = load_sharded_metadata(dist_ckpt_folder)
+ if not mcore_scales_format:
+ model.update({k: v[0] for k, v in model.items() if EXTRA_STATE in k and isinstance(v, list)})
+ model = preprocess_scaling_factors_for_local_export(model)
+
nemo_model_config = unpacked_checkpoint_dir.model_config
if nemo_model_config["tokenizer"].get("library", None) == "huggingface":
@@ -476,7 +523,7 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat
elif k == "activation_func":
nemo_model_config["activation"] = v["_target_"].rsplit('.', 1)[-1]
else:
- from nemo.lightning import io
+ assert HAVE_NEMO2, "nemo_toolkit>=2.0.0 is required to load the model context."
config = io.load_context(io_folder, subpath="model.config")
diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py
index 1fb4b4e0a757..884ea416ce10 100644
--- a/nemo/lightning/_strategy_lib.py
+++ b/nemo/lightning/_strategy_lib.py
@@ -91,6 +91,7 @@ def init_parallel_ranks(
use_fp8=fp8,
init_mpi_proc_group=getattr(parallel_config, "tp_comm_overlap", False)
and getattr(parallel_config, "tp_comm_bootstrap_backend", None) == 'mpi',
+ use_te_rng_tracker=getattr(parallel_config, "use_te_rng_tracker", False),
# apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30),
)
diff --git a/nemo/lightning/io/__init__.py b/nemo/lightning/io/__init__.py
index d53fa1e5f57e..388156ecf4a7 100644
--- a/nemo/lightning/io/__init__.py
+++ b/nemo/lightning/io/__init__.py
@@ -2,7 +2,7 @@
from nemo.lightning.io.api import export_ckpt, import_ckpt, load, load_context, model_exporter, model_importer
from nemo.lightning.io.capture import reinit
from nemo.lightning.io.connector import Connector, ModelConnector
-from nemo.lightning.io.mixin import ConnectorMixin, IOMixin, track_io
+from nemo.lightning.io.mixin import ConnectorMixin, IOMixin, drop_unexpected_params, track_io
from nemo.lightning.io.pl import TrainerContext, is_distributed_ckpt
from nemo.lightning.io.state import TransformCTX, apply_transforms, state_transform
@@ -10,6 +10,7 @@
"apply_transforms",
"Connector",
"ConnectorMixin",
+ "drop_unexpected_params",
"IOMixin",
"track_io",
"import_ckpt",
diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py
index 258d2848a63a..602551ae4479 100644
--- a/nemo/lightning/io/connector.py
+++ b/nemo/lightning/io/connector.py
@@ -227,10 +227,16 @@ def nemo_load(
from nemo.lightning.io.api import load_context
model = load_context(path, subpath="model")
+ is_peft_ckpt = model.model_transform is not None
+ callbacks = []
+ if is_peft_ckpt:
+ callbacks.append(model.model_transform)
+
_trainer = trainer or Trainer(
devices=1,
accelerator="cpu" if cpu else "gpu",
strategy=MegatronStrategy(ddp="pytorch", setup_optimizers=False),
+ callbacks=callbacks,
)
_trainer.strategy.connect(model)
@@ -245,7 +251,20 @@ def nemo_load(
model.configure_model()
_trainer.strategy.setup(_trainer)
- _trainer.strategy.load_checkpoint(path)
+ if is_peft_ckpt:
+ from nemo.lightning.io.pl import ckpt_to_weights_subdir
+
+ model.trainer = _trainer
+ model = model.model_transform(model)
+ adapter_sharded_state_dict = {
+ k: v for k, v in _trainer.strategy.megatron_parallel.sharded_state_dict().items() if ".adapter." in k
+ }
+ adapter_state = _trainer.strategy.checkpoint_io.load_checkpoint(
+ ckpt_to_weights_subdir(path, is_saving=False), sharded_state_dict=adapter_sharded_state_dict
+ )
+ _trainer.strategy.load_model_state_dict(adapter_state, strict=False)
+ else:
+ _trainer.strategy.load_checkpoint(path)
return model, _trainer
diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py
index e356caf92162..283cea6943b5 100644
--- a/nemo/lightning/io/mixin.py
+++ b/nemo/lightning/io/mixin.py
@@ -687,6 +687,45 @@ def _artifact_transform_load(cfg: fdl.Config, path: Path):
pass
+def drop_unexpected_params(config: fdl.Config) -> bool:
+ """
+ Analyzes config to detect unexpected keyword arguments -- for example, deprecated parameters -- and
+ updates the config by dropping them. Returns True if the config gets updated and False otherwise.
+
+ Args:
+ config (fdl.Config): The configuration object to analyze.
+ """
+
+ updated = False
+
+ def analyze(config: fdl.Config, prefix: str):
+
+ if isinstance(config, fdl.Config):
+ signature = inspect.signature(config.__fn_or_cls__)
+
+ accept_kwargs = any(param.kind is inspect.Parameter.VAR_KEYWORD for param in signature.parameters.values())
+
+ if not accept_kwargs:
+ to_drop = [param for param in config.__arguments__ if param not in signature.parameters]
+
+ if to_drop:
+ nonlocal updated
+ updated = True
+ logging.warning(f"Deprecated parameters to drop from {prefix}: {to_drop}")
+ for param in to_drop:
+ del config.__arguments__[param]
+ else:
+ logging.debug(f"Skip analyzing {prefix} as it accepts arbitrary keyword arguments.")
+
+ # Proceed recursively for all arguments
+ for key, value in config.__arguments__.items():
+ analyze(value, prefix + "." + key)
+
+ analyze(config, "")
+
+ return updated
+
+
def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] = None, build: bool = True) -> CkptType:
"""
Loads a configuration from a pickle file and constructs an object of the specified type.
@@ -752,6 +791,8 @@ def load(path: Path, output_type: Type[CkptType] = Any, subpath: Optional[str] =
config = serialization.Deserialization(json_config).result
_artifact_transform_load(config, path)
+ drop_unexpected_params(config)
+
if not build:
return config
diff --git a/nemo/lightning/io/state.py b/nemo/lightning/io/state.py
index b807f79fe369..67c005ee9621 100644
--- a/nemo/lightning/io/state.py
+++ b/nemo/lightning/io/state.py
@@ -43,6 +43,7 @@ def apply_transforms(
target: TargetModuleT,
mapping: Dict[str, str],
transforms: Optional[List[Callable[[TransformCTX], TransformCTX]]] = [],
+ state_dict_ignored_entries: List = [],
) -> TargetModuleT:
"""
Applies a series of transformations to adapt the state dictionary of a source module to
@@ -60,6 +61,11 @@ def apply_transforms(
transforms (Optional[List[Callable[[TransformCTX], TransformCTX]]]): A list of functions
that modify the `TransformCTX` object. If None, no transformations beyond key renaming
are applied. Defaults to None.
+ state_dict_ignored_entries: List of entries to ignore in _target.state_dict(). There are cases
+ where multiple entries in the model's state_dict point to one entry in the model's named_parameters.
+ E.g., a model may have multiple pointers to one shared parameter: `encoder.embed_tokens.weight`,
+ `decoder.embed_tokens.weight`, and `shared.weight` all point to `shared.weight`
+ in the T5 Hugging Face implementation. In these cases, the redundant entries are ignored.
Returns
-------
@@ -166,6 +172,7 @@ def scale_weights(ctx):
_module.register_buffer(_key, val)
keys = list(filter(lambda x: x is not None and not x.endswith("_extra_state"), target_state.keys()))
+ keys = [key for key in keys if key not in state_dict_ignored_entries]
if len(keys) != 0:
raise RuntimeError(f"Additional keys: {keys} in checkpoint but not in model.")
@@ -459,3 +466,72 @@ def wrapper(fn) -> StateDictTransform:
return wrapper
return wrapper(fn)
+
+
+class TransformFns:
+ """
+ A collection of common functions used in state dict transformation.
+ """
+
+ @staticmethod
+ def split_qkv(ctx: TransformCTX, linear_qkv):
+ """
+ Split interleave-concatenated qkv to q, k, v
+
+ Example: export layer linear_qkv to HF {q|k|v}_proj
+ """
+ megatron_config = ctx.source.config
+
+ head_num = megatron_config.num_attention_heads
+ num_query_groups = megatron_config.num_query_groups
+ heads_per_group = head_num // num_query_groups
+ # hidden_size = megatron_config.hidden_size
+ head_size = megatron_config.kv_channels
+ qkv_total_dim = head_num + 2 * num_query_groups
+
+ linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, -1])
+ # when converting base model (linear_qkv), hidden size = megatron_config.hidden_size
+ # when converting lora (linear_qkv.adapter.linear_out), hidden size = lora_r
+ hidden_size = linear_qkv.size(-1)
+ q_slice = torch.cat(
+ [
+ torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+ for i in range(num_query_groups)
+ ]
+ )
+ k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+ v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+
+ q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu()
+ k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu()
+ v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu()
+
+ return q_proj, k_proj, v_proj
+
+ @staticmethod
+ def split_fc1(linear_fc1):
+ """
+ Split concatenated fc1 to gate and up proj
+
+ Example: export layer linear_fc1 to HF {gate|up}_proj
+ """
+ gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0)
+ return gate_proj, up_proj
+
+ @staticmethod
+ def duplicate2(param):
+ """
+ Duplicate the source parameter to two target parameters
+
+ Example: export Performant LoRA linear_fc1.adapter.linear_in to HF {gate|up}_proj.lora_A
+ """
+ return param, param
+
+ @staticmethod
+ def duplicate3(param):
+ """
+ Duplicate the source parameter to three target parameters
+
+ Example: export Performant LoRA linear_qkv.adapter.linear_in to HF {q|k|v}_proj.lora_A
+ """
+ return param, param, param
diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py
index d2e93fe9ab42..0c559d1b3990 100644
--- a/nemo/lightning/pytorch/callbacks/peft.py
+++ b/nemo/lightning/pytorch/callbacks/peft.py
@@ -138,6 +138,7 @@ def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str)
super().setup(trainer, pl_module, stage=stage)
+ self._is_fsdp_v1 = type(trainer.strategy).__name__ == 'FSDPStrategy'
trainer.strategy.trainer = trainer
wrapped_io = partial(WrappedAdapterIO, peft=self)
@@ -313,12 +314,11 @@ def state_dict(self, destination=None, prefix='', keep_vars=False):
destination = {}
# Get state dict of the main module
- main_state_dict = self.to_wrap.state_dict(destination, prefix, keep_vars)
+ self.to_wrap.state_dict(destination, prefix, keep_vars)
- # Store adapter state dict under the special "adapters" key in the destination dict
- adapter_state_dict = self.adapter.state_dict(None, prefix, keep_vars)
- destination[f'{prefix}adapters'] = adapter_state_dict
- return main_state_dict
+ # Store adapter state dict under the "adapter" prefix in the destination dict
+ self.adapter.state_dict(destination, f'{prefix}adapter.', keep_vars)
+ return destination
def sharded_state_dict(
self,
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
index 8767df7dad03..8b3daab30b19 100644
--- a/nemo/lightning/pytorch/strategies/megatron_strategy.py
+++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -99,6 +99,7 @@ class ParallelismConfig:
pipeline_dtype: torch.dtype
encoder_tensor_model_parallel_size: int = 0
encoder_pipeline_model_parallel_size: int = 0
+ use_te_rng_tracker: bool = False
class MegatronStrategy(DDPStrategy, io.IOMixin):
@@ -199,6 +200,7 @@ def __init__(
ddp: Union[DDPLiteral, DistributedDataParallelConfig] = "megatron",
lazy_init: bool = False,
pipeline_dtype: Optional[torch.dtype] = None,
+ use_te_rng_tracker: bool = False,
save_ckpt_format: str = "torch_dist",
ckpt_async_save: bool = True,
ckpt_torch_dist_multiproc: int = None, ## TODO(ashors): put elsewhere?
@@ -244,6 +246,7 @@ def __init__(
self.ckpt_load_optimizer = ckpt_load_optimizer
self.ckpt_save_optimizer = ckpt_save_optimizer
self.ckpt_load_strictness = ckpt_load_strictness
+ self.use_te_rng_tracker = use_te_rng_tracker
self._pipeline_dtype = pipeline_dtype
self._setup_optimizers = setup_optimizers
self._init_model_parallel = init_model_parallel
@@ -900,6 +903,7 @@ def parallelism(self) -> ParallelismConfig:
encoder_tensor_model_parallel_size=self.encoder_tensor_model_parallel_size,
encoder_pipeline_model_parallel_size=self.encoder_pipeline_model_parallel_size,
pipeline_dtype=self.pipeline_dtype,
+ use_te_rng_tracker=self.use_te_rng_tracker,
)
@contextmanager
diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py
index 7118d6de0831..0b509df6415f 100644
--- a/nemo/lightning/run/plugins.py
+++ b/nemo/lightning/run/plugins.py
@@ -314,7 +314,7 @@ class PerfEnvPlugin(run.Plugin):
enable_layernorm_sm_margin: bool = True
layernorm_sm_margin: int = 16
enable_vboost: bool = False
- nccl_pp_comm_chunksize: int = None
+ nccl_pp_comm_chunksize: Optional[int] = None
def get_vboost_srun_cmd(self, nodes, job_dir):
"Create the vboost `sudo nvidia-smi boost-slider --vboost 1` command"
@@ -361,7 +361,7 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
# Improve perf by steering power to tensor cores, may not work on all systems
if self.enable_vboost and isinstance(executor, run.SlurmExecutor):
- vboost_cmd = self.get_vboost_srun_cmd(executor.nodes, executor.job_dir)
+ vboost_cmd = self.get_vboost_srun_cmd(executor.nodes, executor.tunnel.job_dir)
executor.setup_lines = (
executor.setup_lines + vboost_cmd
if (executor.setup_lines and len(executor.setup_lines) > 0)
diff --git a/nemo/utils/get_rank.py b/nemo/utils/get_rank.py
index 37d3906760e7..20aef7285085 100644
--- a/nemo/utils/get_rank.py
+++ b/nemo/utils/get_rank.py
@@ -18,8 +18,7 @@
def is_global_rank_zero():
- """ Helper function to determine if the current process is global_rank 0 (the main process)
- """
+ """Helper function to determine if the current process is global_rank 0 (the main process)"""
# Try to get the pytorch RANK env var
# RANK is set by torch.distributed.launch
rank = get_envint("RANK", None)
@@ -46,8 +45,7 @@ def is_global_rank_zero():
def get_rank():
- """ Helper function that returns torch.distributed.get_rank() if DDP has been initialized otherwise it returns 0.
- """
+ """Helper function that returns torch.distributed.get_rank() if DDP has been initialized otherwise it returns 0."""
if is_global_rank_zero():
return 0
diff --git a/nemo/utils/sequence_packing_utils.py b/nemo/utils/sequence_packing_utils.py
index 647a0401bd5b..672975f28bba 100644
--- a/nemo/utils/sequence_packing_utils.py
+++ b/nemo/utils/sequence_packing_utils.py
@@ -153,6 +153,7 @@ def create_packing_strategy(
Returns:
assignments: A list of lists, where each inner list represents a bin and contains the indices of the
sequence lengths assigned to that bin.
+ packing_metadata: A dict that records packing metadata, for instance the max number of samples per bin.
"""
logging.info(f"Packing sequences to length {pack_size}...")
@@ -166,13 +167,17 @@ def create_packing_strategy(
packed_seq_lens = [sum(x) for x in assignments]
packing_factor = len(all_seq_lens) / len(packed_seq_lens)
+ max_seqlen = max(all_seq_lens)
+ max_samples_per_bin = max([len(b) for b in assignments])
+ packing_metadata = {'dataset_max_seqlen': max_seqlen, 'max_samples_per_bin': max_samples_per_bin}
+
logging.debug("Packed sequence lengths:")
logging.debug(packed_seq_lens)
logging.info(f"Packing is {sum(packed_seq_lens)/len(packed_seq_lens)/pack_size*100:.2f}% efficient")
logging.info(
f">>>>> For pack size {pack_size}, average number of sequences per pack is n = {packing_factor:.3f} <<<<<"
)
- return assignments
+ return assignments, packing_metadata
def fill_packing_strategy(
diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt
index adca2283f577..68700ca87990 100644
--- a/requirements/requirements_lightning.txt
+++ b/requirements/requirements_lightning.txt
@@ -1,8 +1,9 @@
cloudpickle
fiddle
hydra-core>1.3,<=1.3.2
-lightning>2.2.1
+lightning>2.2.1,<=2.4.0
omegaconf<=2.3
+peft
torchmetrics>=0.11.0
transformers>=4.45.0
wandb
diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt
index aa33b3b55127..35a060164c5e 100644
--- a/requirements/requirements_multimodal.txt
+++ b/requirements/requirements_multimodal.txt
@@ -1,4 +1,5 @@
addict
+bitsandbytes==0.45.0
clip
decord; sys_platform == 'linux'
diffusers>=0.19.3
diff --git a/scripts/checkpoint_averaging/README.md b/scripts/checkpoint_averaging/README.md
new file mode 100644
index 000000000000..614b4b697e0e
--- /dev/null
+++ b/scripts/checkpoint_averaging/README.md
@@ -0,0 +1,25 @@
+Checkpoint Averaging
+====================
+
+Overview
+--------
+The checkpoint averaging script is used to compute the average of multiple distributed checkpoints. This can be useful for improving model performance by combining multiple training states.
+
+When executed, the script processes checkpoints stored in a specified directory, averages their weights, and generates a new checkpoint containing the averaged weights.
+
+Average Zarr Distributed Checkpoints
+------------------------------------
+Use the following command to run the checkpoint averaging script for zarr distributed checkpoints:
+
+```shell
+python scripts/checkpoint_averaging/zarr_distributed_checkpoint_averaging.py \
+ --name_prefix