From 28f3e4396111662bb83ec02d9da73e7dc4d9a52e Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 30 Oct 2023 07:46:10 -0700 Subject: [PATCH 01/39] Added fault tolerance config for gpt3 126m --- ...un_gpt126m_batch_training_on_dlcluster.txt | 22 +++++++++++++++++++ ..._gpt126m_iteract_training_on_dlcluster.txt | 22 +++++++++++++++++++ launcher_scripts/conf/cluster/bcm.yaml | 2 +- launcher_scripts/conf/training/gpt3/126m.yaml | 12 ++++++++-- 4 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt create mode 100644 examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt new file mode 100644 index 0000000000..e228f4737c --- /dev/null +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt @@ -0,0 +1,22 @@ +LAUNCHER_DIR="/mnt/nvdl/usr/jbieniusiewi/nemo/NeMo-Megatron-Launcher" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/126m \ + stages=["training"] \ + numa_mapping.enable=True \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + cluster.job_name_prefix="nv-test:" \ + training.exp_manager.create_checkpoint_callback=False \ + training.run.name="test" \ + training.trainer.num_nodes=1 \ + training.trainer.devices=8 \ + training.model.global_batch_size=16 \ + training.model.micro_batch_size=2 \ + cluster_type=bcm \ + ++training.cluster_type=bcm \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] \ + ++fault_tolerance.enabled=True \ + diff --git a/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt b/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt new file mode 100644 index 0000000000..e75ae0dbc9 --- /dev/null +++ b/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt @@ -0,0 +1,22 @@ +LAUNCHER_DIR="/mnt/nvdl/usr/jbieniusiewi/nemo/NeMo-Megatron-Launcher" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/126m \ + stages=["training"] \ + numa_mapping.enable=True \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + cluster.job_name_prefix="nv-test:" \ + training.exp_manager.create_checkpoint_callback=False \ + training.run.name="test" \ + training.trainer.num_nodes=1 \ + training.trainer.devices=8 \ + training.model.global_batch_size=16 \ + training.model.micro_batch_size=2 \ + cluster_type=interactive \ + ++training.cluster_type=BCP \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] \ + ++fault_tolerance.enabled=True \ + diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index ba8f2ebbb0..24d5c78f6b 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,4 +1,4 @@ -partition: null +partition: dgx1v account: null exclusive: True gpus_per_task: null diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 27d3329756..5810378966 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -5,7 +5,7 @@ hydra: run: name: gpt3_126m results_dir: 
${base_results_dir}/${.name} - time_limit: "1-00:00:00" + time_limit: "02:00:00" dependency: "singleton" trainer: @@ -26,7 +26,6 @@ trainer: accumulate_grad_batches: 1 gradient_clip_val: 1.0 - exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -168,6 +167,15 @@ model: constant_steps: 100000 min_lr: 6e-5 + fault_tolerance: + initial_rank_heartbeat_timeout: 120 + rank_heartbeat_timeout: 30 + ipc_timeout: 30 + simulated_fault: + fault_type: rank_killed + rank_to_fail: 1 + base_delay: 180 + data: data_impl: mmap splits_string: "99990,8,2" From 7a955ddce9a78aa07475c4dcbf1896ee8e2f599d Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 3 Nov 2023 08:15:42 -0700 Subject: [PATCH 02/39] inital auto-resume impl-WIP --- ...un_gpt126m_batch_training_on_dlcluster.txt | 3 +-- ..._gpt126m_iteract_training_on_dlcluster.txt | 3 +-- launcher_scripts/conf/cluster/bcm.yaml | 2 +- launcher_scripts/conf/config.yaml | 4 ++-- launcher_scripts/conf/training/gpt3/126m.yaml | 3 ++- .../nemo_launcher/core/launchers.py | 21 +++++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 6 ++++++ 7 files changed, 34 insertions(+), 8 deletions(-) diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt index e228f4737c..92f1b303ba 100644 --- a/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt @@ -17,6 +17,5 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scr cluster_type=bcm \ ++training.cluster_type=bcm \ training.model.data.data_impl="mock" \ - training.model.data.data_prefix=[] \ - ++fault_tolerance.enabled=True \ + training.model.data.data_prefix=[] diff --git a/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt b/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt index e75ae0dbc9..fe80192be5 100644 --- a/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt +++ b/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt @@ -18,5 +18,4 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scr ++training.cluster_type=BCP \ training.model.data.data_impl="mock" \ training.model.data.data_prefix=[] \ - ++fault_tolerance.enabled=True \ - + diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index 24d5c78f6b..2c70c43d04 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,4 +1,4 @@ -partition: dgx1v +partition: DGX1 account: null exclusive: True gpus_per_task: null diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 89e86c652f..09f067a88d 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -38,9 +38,9 @@ launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends data_dir: ${launcher_scripts_path}/data # Location to store and read the data. base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. 
- - null + - /mnt/nvdl/usr/jbieniusiewi/ +#container: /mnt/nvdl/usr/jbieniusiewi/nemo/nemofw-training_230803_fault_tol.sqsh container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.03 - wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. env_vars: diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 5810378966..56482695fd 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -5,7 +5,7 @@ hydra: run: name: gpt3_126m results_dir: ${base_results_dir}/${.name} - time_limit: "02:00:00" + time_limit: "00:30:00" dependency: "singleton" trainer: @@ -175,6 +175,7 @@ model: fault_type: rank_killed rank_to_fail: 1 base_delay: 180 + autoresume_if_interrupted: True data: data_impl: mmap diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 9c9e90d381..73b3cac9ac 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -540,6 +540,7 @@ def _make_sbatch_string( additional_parameters: Optional[Dict[str, Any]] = None, srun_args: Optional[Iterable[str]] = None, heterogeneous: bool = False, + autoresume_if_interrupted: bool = False, ) -> str: """Creates the content of an sbatch file with provided parameters @@ -580,6 +581,7 @@ def _make_sbatch_string( "container_mounts", "srun_args", "heterogeneous", + "autoresume_if_interrupted", ] parameters = {k: v for k, v in locals().items() if v is not None and k not in nonslurm} # rename and reformat parameters @@ -635,6 +637,15 @@ def _make_sbatch_string( if setup is not None: lines += ["", "# setup"] + setup + if autoresume_if_interrupted is True: + lines += [ + '', + '# if the flag file is created by a trainer script, this slurm batch script will be rescheduled', + 'export INTERRUPTED_FLAG_FILE='+str(paths.results_folder / "_interrupted_flag"), + 'rm -f $INTERRUPTED_FLAG_FILE', + '', + ] + # commandline (this will run the function and args specified in the file provided as argument) # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern stderr_flags = [] if stderr_to_stdout else ["--error", stderr] @@ -689,6 +700,16 @@ def _make_sbatch_string( f" {command} \"", "", ] + + if autoresume_if_interrupted is True: + lines += [ + '', '# automatic resumption', + 'if [ -f "$INTERRUPTED_FLAG_FILE" ] ; then ', + 'IS_RESUMED=1 sbatch "$0"', + 'fi', + '', + ] + return "\n".join(lines) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 2c6ac5ae3e..2df3649f5f 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -323,6 +323,12 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: } ) + fault_tol_conf = stage_cfg.get("model").get("fault_tolerance", None) + if fault_tol_conf is not None: + cluster_parameters["autoresume_if_interrupted"] = fault_tol_conf.get("autoresume_if_interrupted", False) + if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": + logging.warn(f"autoresume_if_interrupted has no effect if cluster type is not bcm (current cluster is {cluster})") + return cluster_parameters def _find_optimal_nodes(self, cfg, gpus) -> None: From 0b84305ecc2cd50cfc83fa53dc79e3c558546caa Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 7 Nov 2023 06:46:11 -0800 
Subject: [PATCH 03/39] Auto-resume loop-WIP --- .../run_gpt126m_batch_training_on_selene.txt | 36 +++++++++++++++++++ .../nemo_launcher/core/launchers.py | 16 ++++++--- launcher_scripts/nemo_launcher/core/stages.py | 2 +- 3 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt new file mode 100644 index 0000000000..a696dc9876 --- /dev/null +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt @@ -0,0 +1,36 @@ +LAUNCHER_DIR="/lustre/fsw/joc/jbieniusiewi/nemo/NeMo-Megatron-Launcher" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/126m \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir=/lustre/fsw/joc/big_nlp/gpt3/prepare_dataset/the_pile/train \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[/lustre/fsw/joc/jbieniusiewi/:/lustre/fsw/joc/jbieniusiewi/] \ + cluster.partition=luna \ + cluster.account=coreai_dlalgo_llm \ + cluster.job_name_prefix="coreai_dlalgo_llm-test-interact:" \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + training.exp_manager.resume_if_exists=True \ + training.exp_manager.create_checkpoint_callback=True \ + training.exp_manager.checkpoint_callback_params.save_top_k=1 \ + training.exp_manager.resume_ignore_no_checkpoint=True \ + training.run.name="dummy_run_name_126m" \ + training.run.time_limit=00:12:00 \ + training.trainer.max_time=00:04:00:00 \ + training.trainer.num_nodes=1 \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=1 \ + training.trainer.val_check_interval=1000 \ + training.trainer.enable_checkpointing=False \ + training.model.micro_batch_size=2 \ + training.model.global_batch_size=16 \ + training.model.tensor_model_parallel_size=1 \ + training.model.pipeline_model_parallel_size=1 \ + training.model.transformer_engine=True \ + training.model.fp8=False \ + training.model.fp8_e4m3=False \ + training.model.grad_div_ar_fusion=False \ + training.model.activations_checkpoint_granularity=selective \ + training.model.activations_checkpoint_method=uniform \ diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 73b3cac9ac..97fa3012a3 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -641,9 +641,14 @@ def _make_sbatch_string( lines += [ '', '# if the flag file is created by a trainer script, this slurm batch script will be rescheduled', - 'export INTERRUPTED_FLAG_FILE='+str(paths.results_folder / "_interrupted_flag"), + 'export INTERRUPTED_FLAG_FILE='+str(paths.folder / "_interrupted_flag"), + 'if [ "$RESUMED" = "1" ] && [ ! -f "$INTERRUPTED_FLAG_FILE" ] ; then exit 0 ; fi', + 'CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', + 'if [ $? -ne 0 ] ; then echo "Could not schedule continuation job. Check stderr for details." 
; exit 1 ; fi', + 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', 'rm -f $INTERRUPTED_FLAG_FILE', '', + 'touch $INTERRUPTED_FLAG_FILE', # FIXME TODO this is for debug only ] # commandline (this will run the function and args specified in the file provided as argument) @@ -703,10 +708,11 @@ def _make_sbatch_string( if autoresume_if_interrupted is True: lines += [ - '', '# automatic resumption', - 'if [ -f "$INTERRUPTED_FLAG_FILE" ] ; then ', - 'IS_RESUMED=1 sbatch "$0"', - 'fi', + '', + '# cancel continuation job if no continuation marker file was created', + 'if [ ! -f "$INTERRUPTED_FLAG_FILE" ] && [ ! -z "$CONT_SLURM_JOB_ID" ] ; then', + 'scancel $CONT_SLURM_JOB_ID', + 'fi' '', ] diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 2df3649f5f..05e0826040 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -327,7 +327,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: if fault_tol_conf is not None: cluster_parameters["autoresume_if_interrupted"] = fault_tol_conf.get("autoresume_if_interrupted", False) if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - logging.warn(f"autoresume_if_interrupted has no effect if cluster type is not bcm (current cluster is {cluster})") + raise ValueError(f"autoresume_if_interrupted works only with 'bcm' cluster (current cluster is '{cluster}')") return cluster_parameters From 733ac4cb452189b3a77deed3c48751cc91c7bc35 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 7 Nov 2023 08:38:11 -0800 Subject: [PATCH 04/39] Cleaning dbg code --- .../fault_tolerance/run_gpt126m_batch_training_on_selene.txt | 1 + launcher_scripts/conf/cluster/bcm.yaml | 2 +- launcher_scripts/conf/config.yaml | 4 ++-- launcher_scripts/conf/training/gpt3/126m.yaml | 2 +- launcher_scripts/nemo_launcher/core/launchers.py | 1 - 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt index a696dc9876..2db9461ae7 100644 --- a/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt @@ -7,6 +7,7 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scr data_dir=/lustre/fsw/joc/big_nlp/gpt3/prepare_dataset/the_pile/train \ launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ container_mounts=[/lustre/fsw/joc/jbieniusiewi/:/lustre/fsw/joc/jbieniusiewi/] \ + container="gitlab-master.nvidia.com/dl/nemo/nemo-fw/train:230803_fault_tol" \ cluster.partition=luna \ cluster.account=coreai_dlalgo_llm \ cluster.job_name_prefix="coreai_dlalgo_llm-test-interact:" \ diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index 2c70c43d04..ba8f2ebbb0 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,4 +1,4 @@ -partition: DGX1 +partition: null account: null exclusive: True gpus_per_task: null diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 09f067a88d..89e86c652f 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -38,9 +38,9 @@ launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends data_dir: ${launcher_scripts_path}/data # Location to store and read the data. 
base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. - - /mnt/nvdl/usr/jbieniusiewi/ -#container: /mnt/nvdl/usr/jbieniusiewi/nemo/nemofw-training_230803_fault_tol.sqsh + - null container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.03 + wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. env_vars: diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 56482695fd..97fcb49316 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -5,7 +5,7 @@ hydra: run: name: gpt3_126m results_dir: ${base_results_dir}/${.name} - time_limit: "00:30:00" + time_limit: "1-00:00:00" dependency: "singleton" trainer: diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 97fa3012a3..04110b9e6c 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -648,7 +648,6 @@ def _make_sbatch_string( 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', 'rm -f $INTERRUPTED_FLAG_FILE', '', - 'touch $INTERRUPTED_FLAG_FILE', # FIXME TODO this is for debug only ] # commandline (this will run the function and args specified in the file provided as argument) From adc5b7bc3d0b5881c7bc195daf28d216dff2eb42 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 9 Nov 2023 09:21:37 -0800 Subject: [PATCH 05/39] Fixes after testing on EOS --- .../run_gpt126m_batch_training_on_eos.txt | 40 +++++++++++++++++++ launcher_scripts/conf/training/gpt3/126m.yaml | 17 ++++---- .../nemo_launcher/core/launchers.py | 6 ++- 3 files changed, 53 insertions(+), 10 deletions(-) create mode 100644 examples/fault_tolerance/run_gpt126m_batch_training_on_eos.txt diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_eos.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_eos.txt new file mode 100644 index 0000000000..8ecdfea185 --- /dev/null +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_eos.txt @@ -0,0 +1,40 @@ +LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/126m \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir="/lustre/fsr/datasets/gpt/gpt3" \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/:/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/] \ + container=gitlab-master.nvidia.com/dl/nemo/nemo-fw/train:230803_fault_tol \ + cluster.partition=batch \ + cluster.account=coreai_dlalgo_llm \ + cluster.job_name_prefix="coreai_dlalgo_llm-test-interact:" \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + training.exp_manager.resume_if_exists=True \ + training.exp_manager.create_checkpoint_callback=True \ + training.exp_manager.checkpoint_callback_params.save_top_k=1 \ + training.exp_manager.resume_ignore_no_checkpoint=True \ + training.run.name="dummy_run_name_126m" \ + training.run.time_limit=00:12:00 \ + training.trainer.max_time=00:04:00:00 \ + training.trainer.num_nodes=1 \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=1 \ + training.trainer.val_check_interval=1000 \ + 
training.trainer.enable_checkpointing=False \ + training.model.micro_batch_size=2 \ + training.model.global_batch_size=16 \ + training.model.tensor_model_parallel_size=1 \ + training.model.pipeline_model_parallel_size=1 \ + training.model.transformer_engine=True \ + training.model.fp8=False \ + training.model.fp8_e4m3=False \ + training.model.grad_div_ar_fusion=False \ + training.model.activations_checkpoint_granularity=selective \ + training.model.activations_checkpoint_method=uniform \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] + diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 97fcb49316..ec6900ae8d 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -168,14 +168,15 @@ model: min_lr: 6e-5 fault_tolerance: - initial_rank_heartbeat_timeout: 120 - rank_heartbeat_timeout: 30 - ipc_timeout: 30 - simulated_fault: - fault_type: rank_killed - rank_to_fail: 1 - base_delay: 180 - autoresume_if_interrupted: True + initial_rank_heartbeat_timeout: 120 + rank_heartbeat_timeout: 30 + ipc_timeout: 30 + rank_termination_signal: 9 + simulated_fault: + fault_type: rank_killed + rank_to_fail: 1 + base_delay: 180 + autoresume_if_interrupted: True data: data_impl: mmap diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 04110b9e6c..8760b3ca7f 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -637,6 +637,9 @@ def _make_sbatch_string( if setup is not None: lines += ["", "# setup"] + setup + if srun_args is None: + srun_args = [] + if autoresume_if_interrupted is True: lines += [ '', @@ -649,14 +652,13 @@ def _make_sbatch_string( 'rm -f $INTERRUPTED_FLAG_FILE', '', ] + srun_args += ["--kill-on-bad-exit=0", "--wait=0"] # commandline (this will run the function and args specified in the file provided as argument) # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern stderr_flags = [] if stderr_to_stdout else ["--error", stderr] container_flags = ["--container-image", container_image] if container_image else [] container_flags += ["--container-mounts", container_mounts] if container_mounts else [] - if srun_args is None: - srun_args = [] if NEMO_LAUNCHER_MEMORY_MEASURE: srun_args += ["--overlap"] From cc4f54b1c02940c54558e4f10989e28779486a06 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Wed, 15 Nov 2023 08:42:35 -0800 Subject: [PATCH 06/39] Updated fault tolerance-end of day version --- launcher_scripts/conf/training/gpt3/126m.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index ec6900ae8d..fb76ae4731 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -174,9 +174,8 @@ model: rank_termination_signal: 9 simulated_fault: fault_type: rank_killed - rank_to_fail: 1 base_delay: 180 - autoresume_if_interrupted: True + autoresume_if_interrupted: False data: data_impl: mmap From 2b0d228090dc5531cfd7dafb3378d38b1e7d06f0 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 17 Nov 2023 08:04:35 -0800 Subject: [PATCH 07/39] Read fault tolerance config from exp_manager section --- launcher_scripts/nemo_launcher/core/stages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
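Note: with auto-resume enabled, the sbatch script emitted by `_make_sbatch_string` (patches 02-05 above) follows roughly the flow sketched below. This is a simplified, hand-written illustration of the mechanism, not the literal generated script; RESULTS_DIR and TRAIN_CMD are placeholders for values the launcher fills in, while INTERRUPTED_FLAG_FILE, RESUMED and CONT_SLURM_JOB_ID are the names used in the patches.

#!/bin/bash
# (SBATCH directives emitted by the launcher go here)

RESULTS_DIR="/path/to/results"           # placeholder: the job's results folder
TRAIN_CMD="python -u train.py"           # placeholder: the real training command built by the launcher

# Flag file created by the trainer when this job should be rescheduled.
export INTERRUPTED_FLAG_FILE="${RESULTS_DIR}/_interrupted_flag"

# A continuation job exits immediately unless the previous run asked to be resumed.
if [ "$RESUMED" = "1" ] && [ ! -f "$INTERRUPTED_FLAG_FILE" ]; then exit 0; fi

# Pre-schedule the continuation job; it starts only after this job ends (afterany dependency).
CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")
if [ $? -ne 0 ]; then echo "Could not schedule continuation job."; exit 1; fi
CONT_SLURM_JOB_ID=$(echo "$CONT_SBATCH_OUT" | cut -f1 -d",")
rm -f "$INTERRUPTED_FLAG_FILE"

# Run training; --kill-on-bad-exit=0 and --wait=0 (added in patch 05) keep the step
# alive when a single rank exits, so the fault-tolerance logic can react.
srun --kill-on-bad-exit=0 --wait=0 bash -c "$TRAIN_CMD"

# If the trainer did not request a restart, cancel the already-queued continuation job.
if [ ! -f "$INTERRUPTED_FLAG_FILE" ] && [ -n "$CONT_SLURM_JOB_ID" ]; then
    scancel "$CONT_SLURM_JOB_ID"
fi
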
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 05e0826040..d20ccb9a78 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -323,7 +323,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: } ) - fault_tol_conf = stage_cfg.get("model").get("fault_tolerance", None) + fault_tol_conf = stage_cfg.get("exp_manager").get("fault_tolerance", None) if fault_tol_conf is not None: cluster_parameters["autoresume_if_interrupted"] = fault_tol_conf.get("autoresume_if_interrupted", False) if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": From 63dc9276c1eb2894a120099b9787200fa583bd6e Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 27 Nov 2023 15:24:43 +0100 Subject: [PATCH 08/39] Added autoresume after preemption --- launcher_scripts/conf/training/gpt3/126m.yaml | 11 +---------- launcher_scripts/nemo_launcher/core/stages.py | 6 ++++-- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index fb76ae4731..27d3329756 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -26,6 +26,7 @@ trainer: accumulate_grad_batches: 1 gradient_clip_val: 1.0 + exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -167,16 +168,6 @@ model: constant_steps: 100000 min_lr: 6e-5 - fault_tolerance: - initial_rank_heartbeat_timeout: 120 - rank_heartbeat_timeout: 30 - ipc_timeout: 30 - rank_termination_signal: 9 - simulated_fault: - fault_type: rank_killed - base_delay: 180 - autoresume_if_interrupted: False - data: data_impl: mmap splits_string: "99990,8,2" diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index d20ccb9a78..cfabe1068e 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -325,9 +325,11 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: fault_tol_conf = stage_cfg.get("exp_manager").get("fault_tolerance", None) if fault_tol_conf is not None: - cluster_parameters["autoresume_if_interrupted"] = fault_tol_conf.get("autoresume_if_interrupted", False) + resume_on_fault = fault_tol_conf.get("autoresume_if_faulted", False) + resume_on_preemption = fault_tol_conf.get("autoresume_if_preempted", False) + cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - raise ValueError(f"autoresume_if_interrupted works only with 'bcm' cluster (current cluster is '{cluster}')") + raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` works only with 'bcm' cluster (current cluster is '{cluster}')") return cluster_parameters From 2334995b6aeabaf6955010890ba85006fb4bf027 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 28 Nov 2023 16:24:38 +0000 Subject: [PATCH 09/39] Updated auto-resume params reading --- launcher_scripts/nemo_launcher/core/stages.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index cfabe1068e..cd7da3ab19 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -324,12 +324,11 @@ def 
_make_cluster_parameters(self, cluster: str) -> Dict: ) fault_tol_conf = stage_cfg.get("exp_manager").get("fault_tolerance", None) - if fault_tol_conf is not None: - resume_on_fault = fault_tol_conf.get("autoresume_if_faulted", False) - resume_on_preemption = fault_tol_conf.get("autoresume_if_preempted", False) - cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) - if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` works only with 'bcm' cluster (current cluster is '{cluster}')") + resume_on_fault = fault_tol_conf and fault_tol_conf.get("autoresume_if_faulted", False) + resume_on_preemption = stage_cfg.get("exp_manager").get("autoresume_if_preempted", False) + cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) + if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": + raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` works only with 'bcm' cluster (current cluster is '{cluster}')") return cluster_parameters From e50d3887cd392f9066f5b07faa77ff2584994693 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 5 Dec 2023 11:33:49 -0800 Subject: [PATCH 10/39] Added SC2 config and run script --- .../fault_tolerance/run_sc2_3b_on_eos_FT.txt | 33 + .../conf/training/gpt3/starcoder2_3b.yaml | 771 ++++++++++++++++++ 2 files changed, 804 insertions(+) create mode 100644 examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt create mode 100755 launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt new file mode 100644 index 0000000000..6d13973532 --- /dev/null +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -0,0 +1,33 @@ +USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" +LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/starcoder2_3b \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir=/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[$USR_DIR:$USR_DIR] \ + container="gitlab-master.nvidia.com/dl/nemo/nemo-fw/train:sc2_fault_tol" \ + cluster.partition=batch \ + cluster.account=coreai_dlalgo_llm \ + cluster.job_name_prefix="coreai_dlalgo_llm-sc2_3b-ft:" \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + training.run.name="fault_tol_sc2_3b" \ + training.run.time_limit=04:00:00 \ + training.trainer.max_time=00:04:00:00 \ + training.trainer.num_nodes=2 \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=1 \ + training.trainer.val_check_interval=1000 \ + ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=600 \ + ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=300 \ + ++training.exp_manager.fault_tolerance.ipc_timeout=30 \ + ++training.exp_manager.fault_tolerance.rank_termination_signal=9 + ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \ + ++training.exp_manager.autoresume_if_preempted=True + +# Uncomment to test simulated faults +# ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ +# ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=1800 diff --git 
a/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml b/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml new file mode 100755 index 0000000000..b71d98ed3a --- /dev/null +++ b/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml @@ -0,0 +1,771 @@ +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf +run: + name: starcoder2_3b + results_dir: ${base_results_dir}/${.name} + time_limit: 04:00:00 + dependency: singleton +trainer: + num_nodes: 8 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 114400 + max_time: 02:23:30:00 + log_every_n_steps: 10 + val_check_interval: 500 + limit_val_batches: 25 + limit_test_batches: 25 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${base_results_dir}/${.name} + exp_dir: null + name: megatron_gpt + create_wandb_logger: false + wandb_logger_kwargs: + project: starcoder2 + name: starcoder2_3b + resume_if_exists: true + resume_ignore_no_checkpoint: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 2 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + micro_batch_size: 1 + global_batch_size: 160 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 16384 + max_position_embeddings: 16384 + num_layers: 30 + hidden_size: 3072 + ffn_hidden_size: 12288 + num_attention_heads: 24 + init_method_std: 0.018042 + use_scaled_init_method: true + hidden_dropout: 0.1 + attention_dropout: 0.1 + ffn_dropout: 0.0 + kv_channels: 128 + apply_query_key_layer_scaling: true + normalization: layernorm1p + layernorm_zero_centered_gamma: true + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 0.5 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: bigcode/starcoder2-tokenizer + model: null + delimiter: null + vocab_file: null + merge_file: null + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: false + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + overlap_p2p_comm: false + batch_p2p_comm: true + num_query_groups: null + mcore_gpt: true + transformer_engine: false + fp8: 
false + fp8_e4m3: false + fp8_hybrid: true + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + fp8_wgrad: true + ub_tp_comm_overlap: false + optim: + name: distributed_fused_adam + lr: 0.0003 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 100 + constant_steps: 0 + min_lr: 3.0e-05 + data: + data_impl: mmap + splits_string: 9995,3,2 + seq_length: 16384 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + add_fim: true + fim: + rate: 0.5 + spm_rate: 0.5 + split_sample: + fragment_rate: 0.5 + no_prefix: + extra_tokens: + prefix: + middle: + suffix: + pad: + eod: <|endoftext|> + data_prefix: + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_0/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_1/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_2/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_3/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_4/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_5/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_6/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_7/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_8/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_9/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_0/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_1/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_2/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_3/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_4/gpt2-preprocessed_content_document + - 2.59 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_0/gpt2-preprocessed_content_document + - 2.5 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_1/gpt2-preprocessed_content_document + - 2.46 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_2/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_3/gpt2-preprocessed_content_document + - 2.41 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_4/gpt2-preprocessed_content_document + - 2.36 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_5/gpt2-preprocessed_content_document + - 2.72 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_0/gpt2-preprocessed_content_document + - 2.71 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_1/gpt2-preprocessed_content_document + - 2.71 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_2/gpt2-preprocessed_content_document + - 2.73 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_3/gpt2-preprocessed_content_document + - 2.7 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_4/gpt2-preprocessed_content_document + - 2.71 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_5/gpt2-preprocessed_content_document + - 1.68 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/kaggle_scripts/kaggle_scripts_0/gpt2-preprocessed_content_document + - 1.6 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/documentation/documentation_0/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_0/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_1/gpt2-preprocessed_content_document + - 2.43 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_2/gpt2-preprocessed_content_document + - 2.43 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_3/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_4/gpt2-preprocessed_content_document + - 2.29 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_5/gpt2-preprocessed_content_document + - 3.32 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/stackoverflow/stackoverflow_0/gpt2-preprocessed_content_document + - 3.55 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/stackoverflow/stackoverflow_1/gpt2-preprocessed_content_document + - 3.39 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/stackoverflow/stackoverflow_2/gpt2-preprocessed_content_document + - 0.25 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/lhq_data/lhq_data_0/gpt2-preprocessed_content_document + - 0.28 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/lhq_data/lhq_data_1/gpt2-preprocessed_content_document + - 0.47 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/lhq_data/lhq_data_2/gpt2-preprocessed_content_document + - 1.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/ir_cpp/ir_cpp_0/gpt2-preprocessed_content_document + - 1.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/ir_rust/ir_rust_0/gpt2-preprocessed_content_document + - 1.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/ir_python/ir_python_0/gpt2-preprocessed_content_document + - 3.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/ir_low_resource/ir_low_resource_0/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_0/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_1/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_2/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_3/gpt2-preprocessed_content_document + - 2.08 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_4/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_5/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_6/gpt2-preprocessed_content_document + - 1.85 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_7/gpt2-preprocessed_content_document + - 2.09 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_8/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_9/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_10/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_11/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_12/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_13/gpt2-preprocessed_content_document + - 1.84 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_14/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_15/gpt2-preprocessed_content_document + - 1.85 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_16/gpt2-preprocessed_content_document + - 1.83 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_17/gpt2-preprocessed_content_document + - 1.83 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_18/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_19/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_20/gpt2-preprocessed_content_document + - 2.27 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_21/gpt2-preprocessed_content_document + - 2.25 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_22/gpt2-preprocessed_content_document + - 2.49 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_23/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_24/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_25/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_26/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_27/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_28/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_29/gpt2-preprocessed_content_document + - 2.54 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_30/gpt2-preprocessed_content_document + - 2.28 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_31/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_32/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_33/gpt2-preprocessed_content_document + - 2.26 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_34/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_35/gpt2-preprocessed_content_document + - 2.09 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_36/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_37/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_38/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_39/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_40/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_41/gpt2-preprocessed_content_document + - 
2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_42/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_43/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_44/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_45/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_46/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_47/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_48/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_49/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_50/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_51/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_52/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_53/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_54/gpt2-preprocessed_content_document + - 2.18 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_55/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_56/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_57/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_58/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_59/gpt2-preprocessed_content_document + - 2.11 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_60/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_61/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_62/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_63/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_64/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_65/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_66/gpt2-preprocessed_content_document + - 2.45 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_67/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_68/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_69/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_70/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_71/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_72/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_73/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_74/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_75/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_76/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_77/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_78/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_79/gpt2-preprocessed_content_document + - 2.07 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_80/gpt2-preprocessed_content_document + - 2.17 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_81/gpt2-preprocessed_content_document + - 2.65 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_82/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_83/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_84/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_85/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_86/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_87/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_88/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_89/gpt2-preprocessed_content_document + - 2.25 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_90/gpt2-preprocessed_content_document + - 2.11 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_91/gpt2-preprocessed_content_document + - 2.28 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_92/gpt2-preprocessed_content_document + - 
2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_93/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_94/gpt2-preprocessed_content_document + - 2.37 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_95/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_96/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_97/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_98/gpt2-preprocessed_content_document + - 2.26 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_99/gpt2-preprocessed_content_document + - 2.07 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_100/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_101/gpt2-preprocessed_content_document + - 2.22 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_102/gpt2-preprocessed_content_document + - 1.86 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_103/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_104/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_105/gpt2-preprocessed_content_document + - 2.2 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_106/gpt2-preprocessed_content_document + - 2.28 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_107/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_108/gpt2-preprocessed_content_document + - 2.16 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_109/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_110/gpt2-preprocessed_content_document + - 2.32 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_111/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_112/gpt2-preprocessed_content_document + - 2.46 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_113/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_114/gpt2-preprocessed_content_document + - 2.24 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_115/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_116/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_117/gpt2-preprocessed_content_document + - 1.93 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_118/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_119/gpt2-preprocessed_content_document + - 2.3 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_120/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_121/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_122/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_123/gpt2-preprocessed_content_document + - 2.27 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_124/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_125/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_126/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_127/gpt2-preprocessed_content_document + - 2.18 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_128/gpt2-preprocessed_content_document + - 2.22 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_129/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_130/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_131/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_132/gpt2-preprocessed_content_document + - 2.37 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_133/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_134/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_135/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_136/gpt2-preprocessed_content_document + - 2.44 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_137/gpt2-preprocessed_content_document + - 2.16 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_138/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_139/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_140/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_141/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_142/gpt2-preprocessed_content_document + - 2.15 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_143/gpt2-preprocessed_content_document + - 1.85 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_144/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_145/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_146/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_147/gpt2-preprocessed_content_document + - 1.85 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_148/gpt2-preprocessed_content_document + - 2.49 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_149/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_150/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_151/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_152/gpt2-preprocessed_content_document + - 2.36 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_153/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_154/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_155/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_156/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_157/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_158/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_159/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_160/gpt2-preprocessed_content_document + - 2.08 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_161/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_162/gpt2-preprocessed_content_document + - 2.08 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_163/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_164/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_165/gpt2-preprocessed_content_document + - 2.07 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_166/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_167/gpt2-preprocessed_content_document + - 2.28 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_168/gpt2-preprocessed_content_document + - 2.32 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_169/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_170/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_171/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_172/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_173/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_174/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_175/gpt2-preprocessed_content_document + - 2.19 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_176/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_177/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_178/gpt2-preprocessed_content_document + - 2.23 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_179/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_180/gpt2-preprocessed_content_document + - 2.11 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_181/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_182/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_183/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_184/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_185/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_186/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_187/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_188/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_189/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_190/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_191/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_192/gpt2-preprocessed_content_document + - 1.88 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_193/gpt2-preprocessed_content_document + - 2.63 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_194/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_195/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_196/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_197/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_198/gpt2-preprocessed_content_document + - 2.17 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_199/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_200/gpt2-preprocessed_content_document + - 2.11 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_201/gpt2-preprocessed_content_document + - 2.24 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_202/gpt2-preprocessed_content_document + - 2.19 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_203/gpt2-preprocessed_content_document + - 2.07 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_204/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_205/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_206/gpt2-preprocessed_content_document + - 2.18 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_207/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_208/gpt2-preprocessed_content_document + - 2.37 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_209/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_210/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_211/gpt2-preprocessed_content_document + - 1.86 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_212/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_213/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_214/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_215/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_216/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_217/gpt2-preprocessed_content_document + - 1.99 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_218/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_219/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_220/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_221/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_222/gpt2-preprocessed_content_document + - 2.08 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_223/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_224/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_225/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_226/gpt2-preprocessed_content_document + - 2.22 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_227/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_228/gpt2-preprocessed_content_document + - 2.17 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_229/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_230/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_231/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_232/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_233/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_234/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_235/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_236/gpt2-preprocessed_content_document + - 2.09 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_237/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_238/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_239/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_240/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_241/gpt2-preprocessed_content_document + - 1.86 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_242/gpt2-preprocessed_content_document + - 2.12 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_243/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_244/gpt2-preprocessed_content_document + - 2.41 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_245/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_246/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_247/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_248/gpt2-preprocessed_content_document + - 2.61 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_249/gpt2-preprocessed_content_document + - 1.77 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_250/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_251/gpt2-preprocessed_content_document + - 2.2 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_252/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_253/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_254/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_255/gpt2-preprocessed_content_document From ca7ab16b3c34cc16364edc928a44ed51eaa815b1 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 5 Dec 2023 12:16:53 -0800 Subject: [PATCH 11/39] Fixed launch cmd for Sc2 --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 6d13973532..24663507b7 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -24,10 +24,11 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scri ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=600 \ ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=300 \ ++training.exp_manager.fault_tolerance.ipc_timeout=30 \ - ++training.exp_manager.fault_tolerance.rank_termination_signal=9 + ++training.exp_manager.fault_tolerance.rank_termination_signal=9 \ ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \ ++training.exp_manager.autoresume_if_preempted=True + # Uncomment to test simulated faults # ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ # ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=1800 From 74b2e4227cf6baf1701f1362d0daf283183cb443 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Wed, 6 Dec 2023 10:18:43 -0800 Subject: [PATCH 12/39] Updated launcher cmd --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 24663507b7..57509227e7 100644 --- 
a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -1,7 +1,7 @@ USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" -HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ +NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/starcoder2_3b \ stages=["training"] \ numa_mapping.enable=True \ @@ -26,7 +26,7 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scri ++training.exp_manager.fault_tolerance.ipc_timeout=30 \ ++training.exp_manager.fault_tolerance.rank_termination_signal=9 \ ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \ - ++training.exp_manager.autoresume_if_preempted=True + ++training.exp_manager.autoresume_if_preempted=False # Uncomment to test simulated faults From b8d8a585247835c7b29df0f0aed619d577597550 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Wed, 6 Dec 2023 10:46:13 -0800 Subject: [PATCH 13/39] increased timeouts and removed NVTE_APPLY_QK_LAYER_SCALING --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 57509227e7..99e34fe523 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -1,7 +1,7 @@ USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" -NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/starcoder2_3b \ stages=["training"] \ numa_mapping.enable=True \ @@ -21,9 +21,9 @@ NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 training.trainer.devices=8 \ training.trainer.log_every_n_steps=1 \ training.trainer.val_check_interval=1000 \ - ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=600 \ - ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=300 \ - ++training.exp_manager.fault_tolerance.ipc_timeout=30 \ + ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=720 \ + ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ + ++training.exp_manager.fault_tolerance.ipc_timeout=60 \ ++training.exp_manager.fault_tolerance.rank_termination_signal=9 \ ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \ ++training.exp_manager.autoresume_if_preempted=False From 0d9065ec6f30fdbd96bfa400ac13164003630de3 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 7 Dec 2023 17:55:11 +0100 Subject: [PATCH 14/39] Added back NVTE_APPLY_QK_LAYER_SCALING=1 as error happens without it --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 99e34fe523..71831a30e3 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -1,7 +1,7 @@ USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" 
LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" -HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ +NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/starcoder2_3b \ stages=["training"] \ numa_mapping.enable=True \ From c6367c5fa22c82327bdf91d38715ec842c68c640 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 11 Dec 2023 15:07:00 +0100 Subject: [PATCH 15/39] Added fault tolerance unit tests --- .../config_tests/test_fault_tol_config.py | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py new file mode 100644 index 0000000000..02cb78b17d --- /dev/null +++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py @@ -0,0 +1,111 @@ +import math +import os + +import nemo_launcher.core.launchers +import omegaconf +import pytest +from nemo_launcher.core.stages import Training +from omegaconf import OmegaConf + +# Setup NEMO_LAUNCHER_DEBUG=True, so no 'srun' or 'sbatch' is required +nemo_launcher.core.launchers.NEMO_LAUNCHER_DEBUG = True + +omegaconf.OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True) + +omegaconf.OmegaConf.register_new_resolver( + "divide_ceil", lambda x, y: int(math.ceil(x / y)), replace=True +) + +omegaconf.OmegaConf.register_new_resolver( + "divide_floor", lambda x, y: int(math.floor(x / y)), replace=True +) + +LAUNCHER_SCRIPTS_PATH = "." +TEST_RESULTS_DIR = "test_folder_ft" + + +@pytest.fixture(autouse=True) +def _setup_and_teardown(): + yield + os.system(f"rm -rf {TEST_RESULTS_DIR}") + + +def test_fault_tol_config_no_fault_tol_section(): + """ No fault tolerance section in config: should be fine """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + assert cfg.training.exp_manager.get("fault_tolernace", None) is None + stage = Training(cfg) + _ = stage.run() + + +def test_fault_tol_config_autoresume_if_preempted(): + """ autpresume_if_preempted=True and BCM cluster: should be fine """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.autoresume_if_preempted = True + stage = Training(cfg) + _ = stage.run() + + +def test_fault_tol_config_autoresume_if_preempted_invalid_cluster(): + """ autpresume_if_preempted=True is not allowed with non-BCM cluster """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcp" + cfg.cluster = dict() + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + 
cfg.training.exp_manager.autoresume_if_preempted = True + with pytest.raises(ValueError): + stage = Training(cfg) + _ = stage.run() + + +def test_fault_tol_config_autoresume_if_faulted(): + """ autoresume_if_faulted=True and BCM cluster: should be fine """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.fault_tolerance = OmegaConf.create( + {"autoresume_if_faulted": True} + ) + stage = Training(cfg) + _ = stage.run() + + +def test_fault_tol_config_autoresume_if_faulted_invalid_cluster(): + """ autoresume_if_faulted=True is not allowed with non-BCM cluster """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcp" + cfg.cluster = dict() + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.fault_tolerance = OmegaConf.create( + {"autoresume_if_faulted": True} + ) + with pytest.raises(ValueError): + stage = Training(cfg) + _ = stage.run() From f1dfe54a8e443dac28cf9c6e425690b413ba85ee Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 11 Jan 2024 12:42:57 +0100 Subject: [PATCH 16/39] Use FT launcher, WIP --- .../nemo_launcher/core/launchers.py | 251 ++++++++++++++++-- launcher_scripts/nemo_launcher/core/stages.py | 27 +- .../config_tests/test_fault_tol_config.py | 19 +- 3 files changed, 268 insertions(+), 29 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 7dac967053..2e704d0a36 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -455,9 +455,14 @@ def _make_submission_file_text(self, command_groups: List[List[str]]) -> str: :return: submission script file's text :rtype: str """ - return _make_sbatch_string( - command_groups=command_groups, folder=self.folder, **self.parameters - ) + if getattr(self.parameters, 'use_fault_tolerance', None): + return _make_sbatch_string_ft_launcher( + command_groups=command_groups, folder=self.folder, **self.parameters + ) + else: + return _make_sbatch_string( + command_groups=command_groups, folder=self.folder, **self.parameters + ) @staticmethod def _make_submission_command(submission_file_path: Path) -> List[str]: @@ -586,7 +591,6 @@ def _make_sbatch_string( additional_parameters: Optional[Dict[str, Any]] = None, srun_args: Optional[Iterable[str]] = None, heterogeneous: bool = False, - autoresume_if_interrupted: bool = False, ) -> str: """Creates the content of an sbatch file with provided parameters @@ -627,7 +631,6 @@ def _make_sbatch_string( "container_mounts", "srun_args", "heterogeneous", - "autoresume_if_interrupted", ] parameters = { k: v for k, v in locals().items() if v is not None and k not in nonslurm @@ -689,23 +692,6 @@ def _make_sbatch_string( if setup is not None: lines += ["", "# setup"] + setup - if srun_args is None: - srun_args = [] - - if autoresume_if_interrupted is True: - lines += [ - '', - '# if the flag file is created by a trainer script, this slurm batch script will be rescheduled', - 'export INTERRUPTED_FLAG_FILE='+str(paths.folder / 
"_interrupted_flag"), - 'if [ "$RESUMED" = "1" ] && [ ! -f "$INTERRUPTED_FLAG_FILE" ] ; then exit 0 ; fi', - 'CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', - 'if [ $? -ne 0 ] ; then echo "Could not schedule continuation job. Check stderr for details." ; exit 1 ; fi', - 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', - 'rm -f $INTERRUPTED_FLAG_FILE', - '', - ] - srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] - # commandline (this will run the function and args specified in the file provided as argument) # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern stderr_flags = [] if stderr_to_stdout else ["--error", stderr] @@ -713,6 +699,8 @@ def _make_sbatch_string( container_flags += ( ["--container-mounts", container_mounts] if container_mounts else [] ) + if srun_args is None: + srun_args = [] if NEMO_LAUNCHER_MEMORY_MEASURE: srun_args += ["--overlap"] @@ -787,6 +775,225 @@ def _make_sbatch_string( f' {command} "', "", ] + return "\n".join(lines) + + +# pylint: disable=too-many-arguments,unused-argument, too-many-locals +def _make_sbatch_string_ft_launcher( + command_groups: List[List[str]], + folder: Union[str, Path], + job_name: str = "nemo_launcher", + partition: Optional[str] = None, + time: int = 5, + nodes: Union[int, List[int]] = 1, + ntasks_per_node: Optional[Union[int, List[int]]] = None, + cpus_per_task: Optional[int] = None, + cpus_per_gpu: Optional[int] = None, + num_gpus: Optional[int] = None, # legacy + gpus_per_node: Optional[int] = None, + gpus_per_task: Optional[int] = None, + qos: Optional[str] = None, # quality of service + setup: Optional[List[str]] = None, + mem: Optional[str] = None, + mem_per_gpu: Optional[str] = None, + mem_per_cpu: Optional[str] = None, + dependency: Optional[str] = None, + comment: Optional[str] = None, + constraint: Optional[str] = None, + exclude: Optional[str] = None, + account: Optional[str] = None, + gres: Optional[str] = None, + exclusive: Optional[Union[bool, str]] = None, + array: Optional[str] = None, + stderr_to_stdout: bool = False, + container_image: Optional[str] = None, + container_mounts: Optional[str] = None, + additional_parameters: Optional[Dict[str, Any]] = None, + srun_args: Optional[Iterable[str]] = None, + heterogeneous: bool = False, + autoresume_if_interrupted: bool = False, +) -> str: + + """Creates the content of an sbatch file with provided parameters + + Parameters + ---------- + See slurm sbatch documentation for most parameters: + https://slurm.schedmd.com/sbatch.html + + Below are the parameters that differ from slurm documentation: + + command_groups: + each command group will be assigned one srun + folder: str/Path + folder where print logs and error logs will be written + setup: list + a list of command to run in sbatch before running srun + additional_parameters: dict + Forces any parameter to a given value in sbatch. This can be useful + to add parameters which are not currently available in nemo_launcher. 
+ Eg: {"mail-user": "blublu@nvidia.com", "mail-type": "BEGIN"} + srun_args: List[str] + Add each argument in the list to the srun call + + Raises + ------ + ValueError + In case an erroneous keyword argument is added, a list of all eligible parameters + is printed, with their default values + """ + nonslurm = [ + "nonslurm", + "folder", + "command_groups", + "additional_parameters", + "setup", + "stderr_to_stdout", + "container_image", + "container_mounts", + "srun_args", + "heterogeneous", + "autoresume_if_interrupted", + ] + parameters = { + k: v for k, v in locals().items() if v is not None and k not in nonslurm + } + # rename and reformat parameters + + if num_gpus is not None: + warnings.warn( + '"num_gpus" is deprecated, please use "gpus_per_node" instead (overwritting with num_gpus)' + ) + parameters["gpus_per_node"] = parameters.pop("num_gpus", 0) + if "cpus_per_gpu" in parameters and "gpus_per_task" not in parameters: + warnings.warn( + '"cpus_per_gpu" requires to set "gpus_per_task" to work (and not "gpus_per_node")' + ) + # add necessary parameters + job_name = parameters.get("job_name") + paths = job_utils.JobPaths(folder=folder, job_name=job_name) + stdout = str(paths.stdout) + stderr = str(paths.stderr) + + if array is not None: + stdout = stdout.replace("%j", "%A_%a") + stderr = stderr.replace("%j", "%A_%a") + parameters["output"] = stdout.replace("%t", "0") + + if not stderr_to_stdout: + parameters["error"] = stderr.replace("%t", "0") + + if NEMO_LAUNCHER_CI: # Override output file for slurm + parameters["output"] = parameters["error"] = str(paths.folder / "slurm_%j.out") + stdout = stderr = parameters["output"] + + if additional_parameters is not None: + parameters.update(additional_parameters) + # now create + lines = ["#!/bin/bash", "", "# Parameters"] + if heterogeneous: + raise ValueError("This PoC does not support heterogeneous jobs") + else: + # run 1 FT launcher per node, it will spawn the actual tasks + parameters["ntasks_per_node"] = 1 + for k in sorted(parameters): + lines.append(_as_sbatch_flag(k, parameters[k])) + parameters["ntasks_per_node"] = ntasks_per_node + + lines += ["", "# This script uses experimental fault tolerance launcher", ""] + + # environment setup: + if setup is not None: + lines += ["", "# setup"] + setup + + if srun_args is None: + srun_args = [] + + if autoresume_if_interrupted is True: + lines += [ + '', + 'export INTERRUPTED_FLAG_FILE='+str(paths.folder / "_interrupted_flag"), + 'if [ "$RESUMED" = "1" ] && [ ! -f "$INTERRUPTED_FLAG_FILE" ] ; then exit 0 ; fi', + 'CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', + 'if [ $? -ne 0 ] ; then echo "Could not schedule continuation job. Check stderr for details." 
; exit 1 ; fi', + 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', + 'rm -f $INTERRUPTED_FLAG_FILE', + '', + ] + srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] + + lines += [ + "FT_RDZV_HOST=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)" + ] + + # commandline (this will run the function and args specified in the file provided as argument) + # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern + stderr_flags = [] if stderr_to_stdout else ["--error", stderr] + container_flags = ["--container-image", container_image] if container_image else [] + container_flags += ( + ["--container-mounts", container_mounts] if container_mounts else [] + ) + + if NEMO_LAUNCHER_MEMORY_MEASURE: + srun_args += ["--overlap"] + + mem_stdout = stdout.replace("_%j", "_mem_%j") + mem_stdout = mem_stdout.replace("_%A_%a", "_mem_%A_%a") + mem_srun_cmd = shlex.join( + [ + "srun", + "--ntasks=1", + "--ntasks-per-node=1", + "--output", + mem_stdout, + *container_flags, + *srun_args, + ] + ) + lines += [ + "", + "# run memory measure", + f"{mem_srun_cmd} \\", + f" nvidia-smi --query-gpu=timestamp,index,,memory.total,memory.free,memory.used --format=csv -l 1 & ", + "", + ] + + # Fault tolerance uses Torch Elastic based launcher with SLURM. + # Torch Lightning does not handle that case correctly, + # so we need to force TorchElasticEnvironment over SLURMEnvironment. + # We do this by setting SLURM_JOB_NAME=interactive. + # This is a temporary workaround, until the following PR is merged with NeMo + # https://github.com/Lightning-AI/pytorch-lightning/pull/18618 + ft_launcher_cmd="SLURM_JOB_NAME=interactive ft_launcher " +\ + "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$FT_RDZV_HOST " +\ + f"--nnodes={nodes} --nproc_per_node={ntasks_per_node}" + + for group_ind, command_group in enumerate(command_groups): + if heterogeneous: + raise ValueError("This PoC does not support heterogeneous jobs") + else: + srun_cmd = shlex.join( + [ + "srun", + "--output", + stdout, + *stderr_flags, + *container_flags, + *srun_args, + ] + ) + command = ";\n ".join(command_group) + assert "python3 -u" in command + command = command.replace( + "python3 -u", ft_launcher_cmd, + ) + lines += [ + "", + f"# command {group_ind + 1}", + f'{srun_cmd} bash -c "', + f' {command} "', + "", + ] if autoresume_if_interrupted is True: lines += [ diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 3011b7ebfa..25ce943c5e 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -351,15 +351,30 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: } ) - fault_tol_conf = stage_cfg.get("exp_manager").get("fault_tolerance", None) - resume_on_fault = fault_tol_conf and fault_tol_conf.get("autoresume_if_faulted", False) - resume_on_preemption = stage_cfg.get("exp_manager").get("autoresume_if_preempted", False) - cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) - if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` works only with 'bcm' cluster (current cluster is '{cluster}')") + cluster_parameters = \ + self._update_fault_tolerance_params(stage_cfg, cluster, cluster_parameters) return cluster_parameters + def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): + # TODO: 
cleanup this function + exp_man_conf = stage_cfg.get("exp_manager", None) + resume_on_preemption = exp_man_conf.get("autoresume_if_preempted", False) + ft_conf = exp_man_conf is not None and exp_man_conf.get("fault_tolerance", None) + is_ft_enabled = ft_conf is not None + if is_ft_enabled: + cluster_parameters["use_fault_tolerance"] = True + resume_on_fault = ft_conf.get("autoresume_if_faulted", False) + cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) + if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": + raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` " + f"works only with 'bcm' cluster (current cluster is '{cluster}')") + else: + if resume_on_preemption is True: + raise ValueError(f"`autoresume_if_preempted` works only with fault tolerance enabled") + + return cluster_parameters + def _find_optimal_nodes(self, cfg, gpus) -> None: nodes_scheduler_path = ( f"{cfg.get('training').get('run').get('results_dir')}/nodes_scheduler.json" diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py index 02cb78b17d..9ef6899574 100644 --- a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py +++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py @@ -46,7 +46,7 @@ def test_fault_tol_config_no_fault_tol_section(): def test_fault_tol_config_autoresume_if_preempted(): - """ autpresume_if_preempted=True and BCM cluster: should be fine """ + """ autpresume_if_preempted=True and FT enabled, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -56,9 +56,26 @@ def test_fault_tol_config_autoresume_if_preempted(): cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") cfg.training.exp_manager.autoresume_if_preempted = True + cfg.training.exp_manager.fault_tolerance = OmegaConf.create( + {"autoresume_if_faulted": False} + ) stage = Training(cfg) _ = stage.run() +def test_fault_tol_config_autoresume_if_preempted_no_ft(): + """ autpresume_if_preempted=True without fault tolerance is invalid """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.autoresume_if_preempted = True + with pytest.raises(ValueError): + stage = Training(cfg) + _ = stage.run() def test_fault_tol_config_autoresume_if_preempted_invalid_cluster(): """ autpresume_if_preempted=True is not allowed with non-BCM cluster """ From 87e338632900dd48d7a6767223321ffa8bfc54e2 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 11 Jan 2024 18:11:42 +0100 Subject: [PATCH 17/39] Fix... 
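The configuration handling introduced above boils down to one rule: the launcher reads the optional exp_manager.fault_tolerance section, translates it into cluster parameters, and accepts the auto-resume flags only on a 'bcm' (Slurm) cluster. The snippet below is an illustrative, stand-alone restatement of that rule for quick experimentation; validate_fault_tolerance_cfg is a made-up helper name and not part of the launcher code.

from omegaconf import OmegaConf

def validate_fault_tolerance_cfg(exp_manager_cfg, cluster_type: str) -> dict:
    # Illustrative restatement of the checks added to stages.py above;
    # not the launcher implementation itself.
    params = {}
    ft_cfg = exp_manager_cfg.get("fault_tolerance", None)
    resume_on_preemption = exp_manager_cfg.get("autoresume_if_preempted", False)
    if ft_cfg is not None:
        params["use_fault_tolerance"] = True
        resume_on_fault = ft_cfg.get("autoresume_if_faulted", False)
        params["autoresume_if_interrupted"] = resume_on_fault or resume_on_preemption
        if params["autoresume_if_interrupted"] and cluster_type != "bcm":
            raise ValueError("auto-resume works only with a 'bcm' (Slurm) cluster")
    elif resume_on_preemption:
        raise ValueError("autoresume_if_preempted requires the fault_tolerance section")
    return params

# Mirrors the new unit tests: requesting auto-resume on preemption without a
# fault_tolerance section is rejected.
exp_manager = OmegaConf.create({"autoresume_if_preempted": True})
try:
    validate_fault_tolerance_cfg(exp_manager, "bcm")
except ValueError as err:
    print(f"rejected as expected: {err}")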
--- launcher_scripts/nemo_launcher/core/launchers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 2e704d0a36..b12d09ed88 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -811,6 +811,7 @@ def _make_sbatch_string_ft_launcher( additional_parameters: Optional[Dict[str, Any]] = None, srun_args: Optional[Iterable[str]] = None, heterogeneous: bool = False, + use_fault_tolerance: bool = True, autoresume_if_interrupted: bool = False, ) -> str: @@ -853,6 +854,7 @@ def _make_sbatch_string_ft_launcher( "container_mounts", "srun_args", "heterogeneous", + "use_fault_tolerance", "autoresume_if_interrupted", ] parameters = { From 90fc006d540b0e43acf893345beee15e6ed21603 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 11 Jan 2024 19:54:17 +0100 Subject: [PATCH 18/39] Fix2 --- launcher_scripts/nemo_launcher/core/launchers.py | 5 ++--- launcher_scripts/nemo_launcher/core/stages.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index b12d09ed88..b95b307c2b 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -366,6 +366,7 @@ class SlurmLauncher(Launcher): def __init__(self, folder: Union[Path, str], job_name: str, **kwargs: Any) -> None: super().__init__(folder, job_name) self.parameters = {} + self.use_fault_tolerance = kwargs.pop("use_fault_tolerance", False) self._update_parameters(job_name=job_name, **kwargs) if shutil.which("srun") is None and not NEMO_LAUNCHER_DEBUG: @@ -455,7 +456,7 @@ def _make_submission_file_text(self, command_groups: List[List[str]]) -> str: :return: submission script file's text :rtype: str """ - if getattr(self.parameters, 'use_fault_tolerance', None): + if self.use_fault_tolerance: return _make_sbatch_string_ft_launcher( command_groups=command_groups, folder=self.folder, **self.parameters ) @@ -811,7 +812,6 @@ def _make_sbatch_string_ft_launcher( additional_parameters: Optional[Dict[str, Any]] = None, srun_args: Optional[Iterable[str]] = None, heterogeneous: bool = False, - use_fault_tolerance: bool = True, autoresume_if_interrupted: bool = False, ) -> str: @@ -854,7 +854,6 @@ def _make_sbatch_string_ft_launcher( "container_mounts", "srun_args", "heterogeneous", - "use_fault_tolerance", "autoresume_if_interrupted", ] parameters = { diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 25ce943c5e..5655272e7d 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -362,8 +362,8 @@ def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters) resume_on_preemption = exp_man_conf.get("autoresume_if_preempted", False) ft_conf = exp_man_conf is not None and exp_man_conf.get("fault_tolerance", None) is_ft_enabled = ft_conf is not None + cluster_parameters["use_fault_tolerance"] = is_ft_enabled if is_ft_enabled: - cluster_parameters["use_fault_tolerance"] = True resume_on_fault = ft_conf.get("autoresume_if_faulted", False) cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": From 89362b328222895cc9243a7b90cfef105ff3442f Mon Sep 17 00:00:00 2001 From: Jacek 
Bieniusiewicz Date: Tue, 16 Jan 2024 11:16:43 +0100 Subject: [PATCH 19/39] Updating for FT launcher, wip... --- .../fault_tolerance/run_sc2_3b_on_eos_FT.txt | 2 +- .../nemo_launcher/core/launchers.py | 29 +++++++++++++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 71831a30e3..d39b5bafc0 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -1,5 +1,5 @@ USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" -LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" +LAUNCHER_DIR="/home/jbieniusiewi/nvwork/sc2/NeMo-Megatron-Launcher" NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/starcoder2_3b \ diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index b95b307c2b..40e8ac3452 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -357,16 +357,21 @@ class SlurmLauncher(Launcher): :param Union[Path, str] folder: folder for storing job submission/output and logs. :param str job_name: Name of the job, used as job folder name + :param bool use_fault_tolerance: Use fault tolerance launcher to run the job :param Any **kwargs: See slurm documentation for most parameters. Most useful parameters are: time, mem, gpus_per_node, cpus_per_task, partition Below are the parameters that differ from slurm documentation: setup: a list of command to run in sbatch before running srun """ - def __init__(self, folder: Union[Path, str], job_name: str, **kwargs: Any) -> None: + def __init__(self, + folder: Union[Path, str], + job_name: str, + use_fault_tolerance: bool, + **kwargs: Any) -> None: super().__init__(folder, job_name) self.parameters = {} - self.use_fault_tolerance = kwargs.pop("use_fault_tolerance", False) + self.use_fault_tolerance = use_fault_tolerance self._update_parameters(job_name=job_name, **kwargs) if shutil.which("srun") is None and not NEMO_LAUNCHER_DEBUG: @@ -387,9 +392,12 @@ def _equivalence_dict(cls): } @classmethod - def _valid_parameters(cls) -> Set[str]: + def _valid_parameters(cls, use_fault_tolerance) -> Set[str]: """Parameters that can be set through update_parameters""" - return set(_get_default_parameters()) + if use_fault_tolerance: + return set(_get_default_parameters(_make_sbatch_string_ft_launcher)) + else: + return set(_get_default_parameters(_make_sbatch_string)) def _convert_parameters(self, params: Dict[str, Any]) -> Dict[str, Any]: """translate slurm parameter names""" @@ -413,7 +421,11 @@ def _update_parameters(self, **kwargs: Any) -> None: Below are the parameters that differ from slurm documentation: setup: a list of command to run in sbatch before running srun """ - defaults = _get_default_parameters() + + if self.use_fault_tolerance: + defaults = _get_default_parameters_ft_launcher() + else: + defaults = _get_default_parameters() in_valid_parameters = sorted(set(kwargs) - set(defaults)) if in_valid_parameters: string = "\n - ".join( @@ -779,6 +791,13 @@ def _make_sbatch_string( return "\n".join(lines) +@functools.lru_cache() +def _get_default_parameters_ft_launcher() -> Dict[str, Any]: + """Parameters that can be set through update_parameters""" + specs = inspect.getfullargspec(_make_sbatch_string_ft_launcher) + zipped = 
zip(specs.args[-len(specs.defaults) :], specs.defaults)  # type: ignore
+    return {key: val for key, val in zipped if key not in {"command_groups", "folder"}}
+
 # pylint: disable=too-many-arguments,unused-argument, too-many-locals
 def _make_sbatch_string_ft_launcher(

From b4ee9ed81dc1f305ffaad205c742348e0e3cd2dc Mon Sep 17 00:00:00 2001
From: Jacek Bieniusiewicz
Date: Wed, 17 Jan 2024 12:02:41 +0100
Subject: [PATCH 20/39] Updated FT params reading

---
 .../fault_tolerance/run_sc2_3b_on_eos_FT.txt  |  4 +-
 .../nemo_launcher/core/launchers.py           | 76 +++++++++++++------
 launcher_scripts/nemo_launcher/core/stages.py | 22 ++----
 .../config_tests/test_fault_tol_config.py    | 61 ++++-----------
 4 files changed, 75 insertions(+), 88 deletions(-)

diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt
index d39b5bafc0..f2fd6fcc93 100644
--- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt
+++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt
@@ -25,8 +25,8 @@ NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3
     ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \
     ++training.exp_manager.fault_tolerance.ipc_timeout=60 \
     ++training.exp_manager.fault_tolerance.rank_termination_signal=9 \
-    ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \
-    ++training.exp_manager.autoresume_if_preempted=False
+    ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=2 \
+    ++training.exp_manager.fault_tolerance.max_rank_restarts=1 \


 # Uncomment to test simulated faults

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index 40e8ac3452..86ffd589dc 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -798,6 +798,7 @@ def _get_default_parameters_ft_launcher() -> Dict[str, Any]:
     zipped = zip(specs.args[-len(specs.defaults) :], specs.defaults)  # type: ignore
     return {key: val for key, val in zipped if key not in {"command_groups", "folder"}}
+
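The _get_default_parameters_ft_launcher helper shown above derives the set of accepted launcher parameters from the keyword arguments of the sbatch-string builder itself. The toy example below demonstrates that introspection pattern in isolation; _toy_sbatch_builder is only a stand-in for _make_sbatch_string_ft_launcher and is not part of the launcher.

import functools
import inspect
from typing import Any, Dict

def _toy_sbatch_builder(command_groups, folder, job_name="nemo_launcher",
                        partition=None, max_subsequent_job_failures=0,
                        max_rank_restarts=0):
    """Stand-in with the same signature shape as the real builder."""
    return ""

@functools.lru_cache()
def _toy_default_parameters() -> Dict[str, Any]:
    # Same pattern as _get_default_parameters_ft_launcher: pair each keyword
    # argument with its default value and drop the two positional inputs.
    specs = inspect.getfullargspec(_toy_sbatch_builder)
    zipped = zip(specs.args[-len(specs.defaults):], specs.defaults)
    return {key: val for key, val in zipped if key not in {"command_groups", "folder"}}

print(_toy_default_parameters())
# -> {'job_name': 'nemo_launcher', 'partition': None,
#     'max_subsequent_job_failures': 0, 'max_rank_restarts': 0}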
-f "$INTERRUPTED_FLAG_FILE" ] ; then exit 0 ; fi', - 'CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', - 'if [ $? -ne 0 ] ; then echo "Could not schedule continuation job. Check stderr for details." ; exit 1 ; fi', + '# Automatic job resubmission related items', + f'JOB_RESULTS_FILE="{str(paths.folder / "_job_results")}"', + f'MAX_JOB_FAILURES={max_subsequent_job_failures}', + 'is_job_failures_limit_reached() {', + ' tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | awk "/0/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"', + '}', + 'is_training_finished() {', + ' test -f "$FAULT_TOL_FINISHED_FLAG_FILE"', + '}', + '# Exit immediately if finished flag file exists and this job is a continuation', + 'if [ "$FT_RESUMED" = "1" ] ; then', + ' if is_training_finished ; then echo "Training is finished" ; exit 0 ; fi', + ' if is_job_failures_limit_reached ; then echo "Job failures limit reached ($MAX_JOB_FAILURES)" ; exit 1 ; fi', + 'else', + ' rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"', + 'fi', + '# Pre-schedule continuation job', + 'CONT_SBATCH_OUT=$(FT_RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', + 'if [ $? -ne 0 ] ; then echo "Couldnt schedule continuation job. Check stderr for details." ; exit 1 ; fi', 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', - 'rm -f $INTERRUPTED_FLAG_FILE', - '', ] - srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] - - lines += [ - "FT_RDZV_HOST=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)" - ] - + # commandline (this will run the function and args specified in the file provided as argument) # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern stderr_flags = [] if stderr_to_stdout else ["--error", stderr] @@ -984,9 +1004,9 @@ def _make_sbatch_string_ft_launcher( # We do this by setting SLURM_JOB_NAME=interactive. # This is a temporary workaround, until the following PR is merged with NeMo # https://github.com/Lightning-AI/pytorch-lightning/pull/18618 - ft_launcher_cmd="SLURM_JOB_NAME=interactive ft_launcher " +\ - "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$FT_RDZV_HOST " +\ - f"--nnodes={nodes} --nproc_per_node={ntasks_per_node}" + ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher " +\ + "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\ + f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}" for group_ind, command_group in enumerate(command_groups): if heterogeneous: @@ -1005,7 +1025,7 @@ def _make_sbatch_string_ft_launcher( command = ";\n ".join(command_group) assert "python3 -u" in command command = command.replace( - "python3 -u", ft_launcher_cmd, + "python3 -u", ft_launcher_cmd_part, ) lines += [ "", @@ -1014,15 +1034,21 @@ def _make_sbatch_string_ft_launcher( f' {command} "', "", ] + lines += [ + 'if [ $? -ne 0 ]; then IS_THIS_JOB_SUCCESSFUL=0 ; fi' + ] - if autoresume_if_interrupted is True: + if max_subsequent_job_failures > 0: lines += [ '', - '# cancel continuation job if no continuation marker file was created', - 'if [ ! -f "$INTERRUPTED_FLAG_FILE" ] && [ ! 
-z "$CONT_SLURM_JOB_ID" ] ; then', - 'scancel $CONT_SLURM_JOB_ID', - 'fi' - '', + '# Check if the continuation job can be cancelled', + 'echo $IS_THIS_JOB_SUCCESSFUL >> $JOB_RESULTS_FILE', + 'if is_training_finished ; then', + ' echo "Training is finished" ; scancel $CONT_SLURM_JOB_ID ; exit 0', + 'fi', + 'if is_job_failures_limit_reached ; then', + ' echo "Job failures limit reached ($MAX_JOB_FAILURES)" ; scancel $CONT_SLURM_JOB_ID ; exit 1', + 'fi', ] return "\n".join(lines) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 5655272e7d..8cbdaaee43 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -357,22 +357,16 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: return cluster_parameters def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): - # TODO: cleanup this function exp_man_conf = stage_cfg.get("exp_manager", None) - resume_on_preemption = exp_man_conf.get("autoresume_if_preempted", False) ft_conf = exp_man_conf is not None and exp_man_conf.get("fault_tolerance", None) - is_ft_enabled = ft_conf is not None - cluster_parameters["use_fault_tolerance"] = is_ft_enabled - if is_ft_enabled: - resume_on_fault = ft_conf.get("autoresume_if_faulted", False) - cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) - if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` " - f"works only with 'bcm' cluster (current cluster is '{cluster}')") - else: - if resume_on_preemption is True: - raise ValueError(f"`autoresume_if_preempted` works only with fault tolerance enabled") - + cluster_parameters["use_fault_tolerance"] = ft_conf is not None + if cluster_parameters["use_fault_tolerance"]: + if cluster.lower() != "bcm": + raise ValueError(f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')") + cluster_parameters["max_rank_restarts"] = \ + ft_conf.get('max_rank_restarts', 0) + cluster_parameters["max_subsequent_job_failures"] = \ + ft_conf.get('max_subsequent_job_failures', 0) return cluster_parameters def _find_optimal_nodes(self, cfg, gpus) -> None: diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py index 9ef6899574..d2198fd5fc 100644 --- a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py +++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py @@ -30,8 +30,8 @@ def _setup_and_teardown(): os.system(f"rm -rf {TEST_RESULTS_DIR}") -def test_fault_tol_config_no_fault_tol_section(): - """ No fault tolerance section in config: should be fine """ +def test_fault_tol_config_no_fault_tol_section_bcm(): + """ No fault tolerance section in config, BCM cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -45,40 +45,8 @@ def test_fault_tol_config_no_fault_tol_section(): _ = stage.run() -def test_fault_tol_config_autoresume_if_preempted(): - """ autpresume_if_preempted=True and FT enabled, should be fine """ - cfg = OmegaConf.load("conf/config.yaml") - cfg.stages = ["training"] - cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH - cfg.base_results_dir = TEST_RESULTS_DIR - cfg.cluster_type = "bcm" - cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") - 
cfg.training_config = "gpt3/126m" - cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - cfg.training.exp_manager.autoresume_if_preempted = True - cfg.training.exp_manager.fault_tolerance = OmegaConf.create( - {"autoresume_if_faulted": False} - ) - stage = Training(cfg) - _ = stage.run() - -def test_fault_tol_config_autoresume_if_preempted_no_ft(): - """ autpresume_if_preempted=True without fault tolerance is invalid """ - cfg = OmegaConf.load("conf/config.yaml") - cfg.stages = ["training"] - cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH - cfg.base_results_dir = TEST_RESULTS_DIR - cfg.cluster_type = "bcm" - cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") - cfg.training_config = "gpt3/126m" - cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - cfg.training.exp_manager.autoresume_if_preempted = True - with pytest.raises(ValueError): - stage = Training(cfg) - _ = stage.run() - -def test_fault_tol_config_autoresume_if_preempted_invalid_cluster(): - """ autpresume_if_preempted=True is not allowed with non-BCM cluster """ +def test_fault_tol_config_no_fault_tol_section_bcp(): + """ No fault tolerance section in config, BCP cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -87,14 +55,13 @@ def test_fault_tol_config_autoresume_if_preempted_invalid_cluster(): cfg.cluster = dict() cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - cfg.training.exp_manager.autoresume_if_preempted = True - with pytest.raises(ValueError): - stage = Training(cfg) - _ = stage.run() + assert cfg.training.exp_manager.get("fault_tolernace", None) is None + stage = Training(cfg) + _ = stage.run() -def test_fault_tol_config_autoresume_if_faulted(): - """ autoresume_if_faulted=True and BCM cluster: should be fine """ +def test_fault_tol_config_with_bcm(): + """ Fault tolerance + BCM cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -104,14 +71,13 @@ def test_fault_tol_config_autoresume_if_faulted(): cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") cfg.training.exp_manager.fault_tolerance = OmegaConf.create( - {"autoresume_if_faulted": True} + {"max_subsequent_job_failures": 1} ) stage = Training(cfg) _ = stage.run() - -def test_fault_tol_config_autoresume_if_faulted_invalid_cluster(): - """ autoresume_if_faulted=True is not allowed with non-BCM cluster """ +def test_fault_tol_config_with_bcp(): + """ Fault tolerance + BCP cluster, BCP is not supported """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -121,8 +87,9 @@ def test_fault_tol_config_autoresume_if_faulted_invalid_cluster(): cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") cfg.training.exp_manager.fault_tolerance = OmegaConf.create( - {"autoresume_if_faulted": True} + {"max_subsequent_job_failures": 1} ) with pytest.raises(ValueError): stage = Training(cfg) _ = stage.run() + From 050ccf2a6af1ef12a49b2f61a860207a95610b34 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Wed, 17 Jan 2024 12:15:26 +0000 Subject: [PATCH 21/39] Fixes after testing on DlCluster... 
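For reference, a minimal way to exercise the fault tolerance config tests above, assuming a NeMo-Megatron-Launcher checkout with the launcher requirements and pytest installed (the tests resolve conf/config.yaml relative to the working directory, so they are run from launcher_scripts):

cd NeMo-Megatron-Launcher/launcher_scripts
python3 -m pytest -q tests/unit_tests/config_tests/test_fault_tol_config.py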
--- launcher_scripts/nemo_launcher/core/launchers.py | 2 +- launcher_scripts/nemo_launcher/core/stages.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 86ffd589dc..44da1621bf 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -1004,7 +1004,7 @@ def _make_sbatch_string_ft_launcher( # We do this by setting SLURM_JOB_NAME=interactive. # This is a temporary workaround, until the following PR is merged with NeMo # https://github.com/Lightning-AI/pytorch-lightning/pull/18618 - ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher " +\ + ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher --fault-tol-cfg-path=$FAULT_TOL_CFG_PATH " +\ "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\ f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}" diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 8cbdaaee43..afde2e9241 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -47,7 +47,8 @@ def __init__(self, cfg): self.stage_cfg = None self.setup_stage_vars(cfg) self.job_name = self.stage_cfg.run.get("name") - + if self.cluster.lower() == 'bcm': + self.job_name = cfg.get("cluster").get("job_name_prefix","") + self.job_name self.nodes_scheduler = {} def setup_stage_vars(self, cfg: OmegaConf): @@ -281,7 +282,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: cfg = self.cfg stage_cfg = self.stage_cfg run_cfg = stage_cfg.get("run") - job_name = run_cfg.get("name") + job_name = self.job_name time_limit = run_cfg.get("time_limit") nodes = run_cfg.get("nodes") dependency = run_cfg.get("dependency") @@ -315,7 +316,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: cluster_cfg["srun_args"] = [] cluster_cfg["srun_args"] += ["--mpi=pmix"] slurm_cfg = {**copy.deepcopy(cluster_cfg)} - job_name_prefix = slurm_cfg.pop("job_name_prefix") + slurm_cfg.pop("job_name_prefix") cluster_parameters = {**slurm_cfg} cluster_parameters.update( { @@ -325,9 +326,6 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: "container_mounts": container_mounts, } ) - cluster_parameters["job_name"] = ( - job_name_prefix + cluster_parameters["job_name"] - ) elif cluster == "bcp": cluster_parameters.update( { From 147024b692fbe67616d0506179d7841006df4899 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 18 Jan 2024 13:54:18 +0100 Subject: [PATCH 22/39] Update after 'create_fault_tolerance_callback' param was added --- launcher_scripts/nemo_launcher/core/stages.py | 9 ++--- .../config_tests/test_fault_tol_config.py | 34 ++++++++++++++----- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index afde2e9241..0a36e09c54 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -355,12 +355,13 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: return cluster_parameters def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): - exp_man_conf = stage_cfg.get("exp_manager", None) - ft_conf = exp_man_conf is not None and exp_man_conf.get("fault_tolerance", None) - cluster_parameters["use_fault_tolerance"] = ft_conf is 
not None - if cluster_parameters["use_fault_tolerance"]: + exp_man_conf = stage_cfg.get("exp_manager", dict()) + use_ft = exp_man_conf.get('create_fault_tolerance_callback', False) + cluster_parameters["use_fault_tolerance"] = use_ft + if use_ft: if cluster.lower() != "bcm": raise ValueError(f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')") + ft_conf = exp_man_conf.get("fault_tolerance", dict()) cluster_parameters["max_rank_restarts"] = \ ft_conf.get('max_rank_restarts', 0) cluster_parameters["max_subsequent_job_failures"] = \ diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py index d2198fd5fc..8a9f7a1d70 100644 --- a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py +++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py @@ -30,8 +30,8 @@ def _setup_and_teardown(): os.system(f"rm -rf {TEST_RESULTS_DIR}") -def test_fault_tol_config_no_fault_tol_section_bcm(): - """ No fault tolerance section in config, BCM cluster, should be fine """ +def test_fault_tol_config_fault_tol_disabled_bcm(): + """ No fault tolerance, BCM cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -40,13 +40,14 @@ def test_fault_tol_config_no_fault_tol_section_bcm(): cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - assert cfg.training.exp_manager.get("fault_tolernace", None) is None + assert cfg.training.exp_manager.get("create_fault_tolerance_callback", None) is None + assert cfg.training.exp_manager.get("fault_toleranace", None) is None stage = Training(cfg) _ = stage.run() -def test_fault_tol_config_no_fault_tol_section_bcp(): - """ No fault tolerance section in config, BCP cluster, should be fine """ +def test_fault_tol_config_fault_tol_disabled_bcp(): + """ No fault tolerance, BCP cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -55,7 +56,8 @@ def test_fault_tol_config_no_fault_tol_section_bcp(): cfg.cluster = dict() cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - assert cfg.training.exp_manager.get("fault_tolernace", None) is None + assert cfg.training.exp_manager.get("create_fault_tolerance_callback", None) is None + assert cfg.training.exp_manager.get("fault_toleranace", None) is None stage = Training(cfg) _ = stage.run() @@ -70,12 +72,28 @@ def test_fault_tol_config_with_bcm(): cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.create_fault_tolerance_callback=True cfg.training.exp_manager.fault_tolerance = OmegaConf.create( {"max_subsequent_job_failures": 1} ) stage = Training(cfg) _ = stage.run() +def test_fault_tol_config_with_bcm_no_ft_section(): + """ Fault tolerance + BCM cluster, no "fault_tolerance" section in cfg, should be fine """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") 
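+    # With create_fault_tolerance_callback=True and no "fault_tolerance" section,
+    # _update_fault_tolerance_params() falls back to its defaults
+    # (max_rank_restarts=0, max_subsequent_job_failures=0), so the stage should still build.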
+ cfg.training.exp_manager.create_fault_tolerance_callback=True + stage = Training(cfg) + _ = stage.run() + + def test_fault_tol_config_with_bcp(): """ Fault tolerance + BCP cluster, BCP is not supported """ cfg = OmegaConf.load("conf/config.yaml") @@ -86,9 +104,7 @@ def test_fault_tol_config_with_bcp(): cfg.cluster = dict() cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - cfg.training.exp_manager.fault_tolerance = OmegaConf.create( - {"max_subsequent_job_failures": 1} - ) + cfg.training.exp_manager.create_fault_tolerance_callback=True with pytest.raises(ValueError): stage = Training(cfg) _ = stage.run() From 5105a29311049749d6c7975daf4bf9fc386c6454 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 19 Jan 2024 11:58:50 +0100 Subject: [PATCH 23/39] Improved auto-resume --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 1 + launcher_scripts/nemo_launcher/core/launchers.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index f2fd6fcc93..1ccf8af65e 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -21,6 +21,7 @@ NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 training.trainer.devices=8 \ training.trainer.log_every_n_steps=1 \ training.trainer.val_check_interval=1000 \ + ++training.exp_manager.create_fault_tolerance_callback=True \ ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=720 \ ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ ++training.exp_manager.fault_tolerance.ipc_timeout=60 \ diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 44da1621bf..af59639457 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -938,7 +938,7 @@ def _make_sbatch_string_ft_launcher( f'export FAULT_TOL_CFG_PATH="{str(paths.config_file)}"', f'export FAULT_TOL_FINISHED_FLAG_FILE="{str(paths.folder / "_finished_flag")}"', 'RDZV_HOST=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)', - 'IS_THIS_JOB_SUCCESSFUL=1', + 'ANY_JOB_STEP_FAILED=0', ] if max_subsequent_job_failures > 0: @@ -948,7 +948,8 @@ def _make_sbatch_string_ft_launcher( f'JOB_RESULTS_FILE="{str(paths.folder / "_job_results")}"', f'MAX_JOB_FAILURES={max_subsequent_job_failures}', 'is_job_failures_limit_reached() {', - ' tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | awk "/0/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"', + ' tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | \\', + ' awk "/^[[:alnum:]]+[[:space:]]+F$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"', '}', 'is_training_finished() {', ' test -f "$FAULT_TOL_FINISHED_FLAG_FILE"', @@ -964,6 +965,8 @@ def _make_sbatch_string_ft_launcher( 'CONT_SBATCH_OUT=$(FT_RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', 'if [ $? -ne 0 ] ; then echo "Couldnt schedule continuation job. Check stderr for details." ; exit 1 ; fi', 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', + '# Write failure to the job log, eventually we will fix it at the end', + 'echo "$SLURM_JOB_ID F" >> "$JOB_RESULTS_FILE"', ] # commandline (this will run the function and args specified in the file provided as argument) @@ -1035,14 +1038,17 @@ def _make_sbatch_string_ft_launcher( "", ] lines += [ - 'if [ $? 
-ne 0 ]; then IS_THIS_JOB_SUCCESSFUL=0 ; fi' + 'if [ $? -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi' ] if max_subsequent_job_failures > 0: lines += [ '', + '# Fix the job log entry ("JOB_ID F" -> "JOB_ID S"), if the job was successful', + 'if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then', + ' sed -i "s/^$SLURM_JOB_ID[[:space:]]\+F/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"', + 'fi', '# Check if the continuation job can be cancelled', - 'echo $IS_THIS_JOB_SUCCESSFUL >> $JOB_RESULTS_FILE', 'if is_training_finished ; then', ' echo "Training is finished" ; scancel $CONT_SLURM_JOB_ID ; exit 0', 'fi', From 58741ae068ec9b8049c500114d926b76ccefcfc3 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 19 Jan 2024 13:15:26 +0100 Subject: [PATCH 24/39] Improved auto-resume-cont --- launcher_scripts/nemo_launcher/core/launchers.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index af59639457..75598fa799 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -965,8 +965,8 @@ def _make_sbatch_string_ft_launcher( 'CONT_SBATCH_OUT=$(FT_RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', 'if [ $? -ne 0 ] ; then echo "Couldnt schedule continuation job. Check stderr for details." ; exit 1 ; fi', 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', - '# Write failure to the job log, eventually we will fix it at the end', - 'echo "$SLURM_JOB_ID F" >> "$JOB_RESULTS_FILE"', + '# Write unknown job status to the job log, we will fix it at the end', + 'echo "$SLURM_JOB_ID X" >> "$JOB_RESULTS_FILE"', ] # commandline (this will run the function and args specified in the file provided as argument) @@ -1046,7 +1046,9 @@ def _make_sbatch_string_ft_launcher( '', '# Fix the job log entry ("JOB_ID F" -> "JOB_ID S"), if the job was successful', 'if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then', - ' sed -i "s/^$SLURM_JOB_ID[[:space:]]\+F/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"', + ' sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"', + 'else', + ' sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID F/" "$JOB_RESULTS_FILE"', 'fi', '# Check if the continuation job can be cancelled', 'if is_training_finished ; then', From 44421184f43fee6d7a310974bf60267272b33ac8 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 22 Jan 2024 13:19:05 +0100 Subject: [PATCH 25/39] Use hostname to get the rendezvous host --- launcher_scripts/nemo_launcher/core/launchers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 75598fa799..f6ea5c9f2a 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -937,7 +937,7 @@ def _make_sbatch_string_ft_launcher( '# Fault tolerance related items', f'export FAULT_TOL_CFG_PATH="{str(paths.config_file)}"', f'export FAULT_TOL_FINISHED_FLAG_FILE="{str(paths.folder / "_finished_flag")}"', - 'RDZV_HOST=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)', + 'RDZV_HOST=$(hostname)', 'ANY_JOB_STEP_FAILED=0', ] From 25cc521e2accbb4ac14c8b8b3a45f145e23c53bd Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 23 Jan 2024 12:02:45 +0100 Subject: [PATCH 26/39] Added additional_ft_launcher_args --- launcher_scripts/nemo_launcher/core/launchers.py | 5 ++++- launcher_scripts/nemo_launcher/core/stages.py 
| 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index f6ea5c9f2a..5a2e0ef64b 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -834,6 +834,7 @@ def _make_sbatch_string_ft_launcher( heterogeneous: bool = False, max_subsequent_job_failures: int = 0, max_rank_restarts: int = 0, + additional_ft_launcher_args: str = "", ) -> str: """Creates the content of an sbatch file with provided parameters @@ -877,6 +878,7 @@ def _make_sbatch_string_ft_launcher( "heterogeneous", "max_subsequent_job_failures", "max_rank_restarts", + "additional_ft_launcher_args", ] parameters = { k: v for k, v in locals().items() if v is not None and k not in nonslurm @@ -1007,7 +1009,8 @@ def _make_sbatch_string_ft_launcher( # We do this by setting SLURM_JOB_NAME=interactive. # This is a temporary workaround, until the following PR is merged with NeMo # https://github.com/Lightning-AI/pytorch-lightning/pull/18618 - ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher --fault-tol-cfg-path=$FAULT_TOL_CFG_PATH " +\ + ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher "+\ + f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH {additional_ft_launcher_args} "+\ "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\ f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}" diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 0a36e09c54..b2a9dce0ef 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -366,6 +366,8 @@ def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters) ft_conf.get('max_rank_restarts', 0) cluster_parameters["max_subsequent_job_failures"] = \ ft_conf.get('max_subsequent_job_failures', 0) + cluster_parameters["additional_ft_launcher_args"] = \ + ft_conf.get('additional_ft_launcher_args', "") return cluster_parameters def _find_optimal_nodes(self, cfg, gpus) -> None: From 874b651aa62171635c0233f7a3c2dace65f04c05 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 25 Jan 2024 13:01:32 +0100 Subject: [PATCH 27/39] Restored --kill-on-bad-exit=0, --wait=3600 --- launcher_scripts/nemo_launcher/core/launchers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 5a2e0ef64b..4c2aa41cf8 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -934,6 +934,10 @@ def _make_sbatch_string_ft_launcher( if srun_args is None: srun_args = [] + # FT launcher will terminate failed workers, no need SLURM for that. + # A safety measure, let SLURM kill the job, 1h after any task ended. 
+ srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] + lines += [ '', '# Fault tolerance related items', From 83164111ef36cb7fa3a93ace06d7c6e8b52e107c Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 25 Jan 2024 13:19:55 +0100 Subject: [PATCH 28/39] Fixed comment --- launcher_scripts/nemo_launcher/core/launchers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 4c2aa41cf8..16a83c83cb 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -1051,7 +1051,7 @@ def _make_sbatch_string_ft_launcher( if max_subsequent_job_failures > 0: lines += [ '', - '# Fix the job log entry ("JOB_ID F" -> "JOB_ID S"), if the job was successful', + '# Fix the job log entry ("JOB_ID X" -> "JOB_ID S/F"), depending on the job result', 'if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then', ' sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"', 'else', From fb80ec0bb9125b5d9c68c9e89e6ae277b966d102 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 25 Jan 2024 14:24:17 +0100 Subject: [PATCH 29/39] --kill-on-bad-exit=1 --- launcher_scripts/nemo_launcher/core/launchers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 16a83c83cb..fc514aaee4 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -934,9 +934,10 @@ def _make_sbatch_string_ft_launcher( if srun_args is None: srun_args = [] - # FT launcher will terminate failed workers, no need SLURM for that. - # A safety measure, let SLURM kill the job, 1h after any task ended. - srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] + # A safety measures: + # let SLURM kill all tasks if any FT launcher returns with a failure. + # let SLURM kill the job, 1h after any task ended. 
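+    # (For reference: srun's --kill-on-bad-exit=1 terminates the whole step as soon as any
+    # task exits with a non-zero status, while --wait=3600 gives the remaining tasks up to
+    # 3600 s after the first task ends before srun terminates them.)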
+ srun_args += ["--kill-on-bad-exit=1", "--wait=3600"] lines += [ '', From 79b371f2097771418eb3c06e2bd06fee0493f328 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 5 Feb 2024 15:44:15 +0100 Subject: [PATCH 30/39] Set FT work dir --- launcher_scripts/nemo_launcher/core/stages.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index b2a9dce0ef..f94dd205a4 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -81,6 +81,8 @@ def run(self) -> str: f"global batch size and number of nodes will change following this schedule:\n {self.nodes_scheduler}" ) + self._set_fault_tolerance_work_dir_in_stage_cfg(self.stage_cfg, self.cluster) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config( self.stage_cfg, job_path, self.cfg ) @@ -353,15 +355,19 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: self._update_fault_tolerance_params(stage_cfg, cluster, cluster_parameters) return cluster_parameters - - def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): + + def _get_fault_tol_config_section(self, stage_cfg, cluster): exp_man_conf = stage_cfg.get("exp_manager", dict()) use_ft = exp_man_conf.get('create_fault_tolerance_callback', False) - cluster_parameters["use_fault_tolerance"] = use_ft if use_ft: if cluster.lower() != "bcm": raise ValueError(f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')") - ft_conf = exp_man_conf.get("fault_tolerance", dict()) + return use_ft, exp_man_conf.get("fault_tolerance", dict()) + + def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): + use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster) + cluster_parameters["use_fault_tolerance"] = use_ft + if use_ft: cluster_parameters["max_rank_restarts"] = \ ft_conf.get('max_rank_restarts', 0) cluster_parameters["max_subsequent_job_failures"] = \ @@ -369,6 +375,12 @@ def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters) cluster_parameters["additional_ft_launcher_args"] = \ ft_conf.get('additional_ft_launcher_args', "") return cluster_parameters + + def _set_fault_tolerance_work_dir_in_stage_cfg(self, stage_cfg, cluster): + use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster) + if use_ft: + with omegaconf.open_dict(ft_conf): + ft_conf.work_dir = str(self.get_job_path().folder) def _find_optimal_nodes(self, cfg, gpus) -> None: nodes_scheduler_path = ( From ff6e939eeffcf8067092dd8d4b62b18fc6fd8aae Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 5 Feb 2024 16:18:49 +0100 Subject: [PATCH 31/39] Set FT work dir/fix --- launcher_scripts/nemo_launcher/core/stages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index f94dd205a4..344065b353 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -380,7 +380,7 @@ def _set_fault_tolerance_work_dir_in_stage_cfg(self, stage_cfg, cluster): use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster) if use_ft: with omegaconf.open_dict(ft_conf): - ft_conf.work_dir = str(self.get_job_path().folder) + ft_conf.work_dir = str(self.get_job_path().folder / "_ft_scratch_dir") def _find_optimal_nodes(self, cfg, gpus) -> None: nodes_scheduler_path = ( From 
e5ade772be7e5d5fe4c0dd9d648e81d512f7b1e9 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 5 Feb 2024 08:46:03 -0800 Subject: [PATCH 32/39] Added test script for DracoRNO/wip --- .../run_gpt_on_draco_rno_FT.txt | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt diff --git a/examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt b/examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt new file mode 100644 index 0000000000..98402f3f87 --- /dev/null +++ b/examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt @@ -0,0 +1,59 @@ +USR_DIR="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi" +LAUNCHER_DIR="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/ft/NeMo-Megatron-Launcher" + +# create dummy data this that is required by the launcher +# we will use mock data +mkdir -p ${LAUNCHER_DIR}/dummy_data_dir + + +# USE SC2 container, but train GPT3 5b + +NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface/hub python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/5b \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir=${LAUNCHER_DIR}/dummy_data_dir \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[$USR_DIR:$USR_DIR] \ + container="gitlab-master.nvidia.com/dl/nemo/nemo-fw/train:sc2_fault_tol_elastic" \ + cluster.partition=batch_short_dgx1_m2 \ + cluster.account=coreai_dlalgo_llm \ + cluster.job_name_prefix="coreai_dlalgo_llm-test-ft5b:" \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + ++cluster.nv_meta="ml-model.fault_tol_tests" \ + ++cluster.gres="gpu:8" \ + ++cluster.signal="TERM@180" \ + training.exp_manager.resume_if_exists=True \ + training.exp_manager.create_checkpoint_callback=True \ + training.exp_manager.checkpoint_callback_params.save_top_k=1 \ + training.exp_manager.resume_ignore_no_checkpoint=True \ + training.run.name="fault_tol_gpt3_5b_dbg" \ + training.run.time_limit=00:30:00 \ + training.trainer.max_time=00:01:00:00 \ + training.trainer.num_nodes=4 \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=10 \ + training.trainer.val_check_interval=400 \ + ++training.trainer.precision=16 \ + ++training.model.mcore_gpt=False \ + ++training.model.tokenizer.merge_file="/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt3/bpe/gpt2-merges.txt" \ + ++training.model.tokenizer.vocab_file="/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt3/bpe/gpt2-vocab.txt" \ + training.trainer.enable_checkpointing=False \ + training.model.micro_batch_size=1 \ + training.model.global_batch_size=4 \ + training.model.tensor_model_parallel_size=8 \ + training.model.pipeline_model_parallel_size=1 \ + ++training.exp_manager.create_fault_tolerance_callback=True \ + ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=null \ + ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=null \ + ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \ + ++training.exp_manager.fault_tolerance.max_rank_restarts=0 + + +# +# ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ +# ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=900 +# From c2f8cba16dc5e8dbf97d61c17f264fc2698a738b Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 5 Mar 2024 13:22:36 +0100 Subject: [PATCH 33/39] Working on test scripts/WIP --- 
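For reference, with max_subsequent_job_failures enabled as in the example script above, the generated sbatch script keeps its auto-resume bookkeeping next to the job results. A quick way to inspect that state after a run is sketched below; the results path is illustrative only and depends on base_results_dir and training.run.name:

RESULTS_DIR="${LAUNCHER_DIR}/launcher_scripts/results/fault_tol_gpt3_5b_dbg"   # hypothetical location
cat "${RESULTS_DIR}/_job_results"          # one "<SLURM_JOB_ID> S|F|X" line per job attempt (S=success, F=failed, X=not yet finalized)
ls "${RESULTS_DIR}/_finished_flag" 2>/dev/null && echo "training finished"     # this flag is what stops the continuation-job chain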
examples/fault_tolerance/run_on_cluster.sh | 74 ++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 examples/fault_tolerance/run_on_cluster.sh diff --git a/examples/fault_tolerance/run_on_cluster.sh b/examples/fault_tolerance/run_on_cluster.sh new file mode 100644 index 0000000000..510c5bdbab --- /dev/null +++ b/examples/fault_tolerance/run_on_cluster.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +CLUSTER="draco-rno" +CONTAINER="gitlab-master.nvidia.com:5005/dl/gwe/fault_tolerance_related/nemo-gwe-ft:test" +RUN_NAME="fault_tol_gpt3_5b_dbg_no_err" +NODES=2 + +FT_ARGS=" + ++training.exp_manager.create_fault_tolerance_callback=True \ + ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=900 \ + ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ + ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \ + ++training.exp_manager.fault_tolerance.max_rank_restarts=0 + ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ + ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=900 +" + +if [ "$CLUSTER" == "draco-rno" ]; then + PARTITION="batch_short_dgx1_m2" + ACCOUNT="coreai_dlalgo_llm" + JOB_PREFIX="coreai_dlalgo_llm-test-ft5b:" + CLUSTER_SPECIFIC_ARGS="++cluster.nv_meta=\"ml-model.fault_tol_tests\"" + USR_DIR="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi" + LAUNCHER_DIR="${USR_DIR}/ft/NeMo-Megatron-Launcher" +else + echo "Unknown cluster: $CLUSTER" + exit 1 +fi + +# create dummy data this that is required by the launcher +# we will use mock data +mkdir -p ${LAUNCHER_DIR}/dummy_data_dir + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/5b \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir=${LAUNCHER_DIR}/dummy_data_dir \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[$USR_DIR:$USR_DIR] \ + container=${CONTAINER} \ + cluster.partition=${PARTITION} \ + cluster.account=${ACCOUNT} \ + cluster.job_name_prefix=${JOB_PREFIX} \ + ${CLUSTER_SPECIFIC_ARGS} \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + ++cluster.gres="gpu:8" \ + ++cluster.signal="TERM@300" \ + training.exp_manager.resume_if_exists=True \ + training.exp_manager.create_checkpoint_callback=True \ + training.exp_manager.checkpoint_callback_params.save_top_k=1 \ + training.exp_manager.resume_ignore_no_checkpoint=True \ + training.run.name=${RUN_NAME} \ + training.run.time_limit=00:45:00 \ + training.trainer.max_time=00:03:00:00 \ + training.trainer.num_nodes=${NODES} \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=10 \ + training.trainer.val_check_interval=400 \ + ++training.trainer.precision=16 \ + ++training.model.mcore_gpt=False \ + ++training.model.tokenizer.merge_file="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/bpe/gpt2-merges.txt" \ + ++training.model.tokenizer.vocab_file="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/bpe/gpt2-vocab.json" \ + training.trainer.enable_checkpointing=False \ + training.model.micro_batch_size=1 \ + training.model.global_batch_size=${NODES} \ + training.model.tensor_model_parallel_size=8 \ + training.model.pipeline_model_parallel_size=1 \ + ${FT_ARGS} + + From 76f176af37ff1c899e602ad13ecaaf3c18a1092c Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 8 Mar 2024 04:05:19 -0800 Subject: [PATCH 34/39] Version for testing --- examples/fault_tolerance/run_on_cluster.sh | 56 +- 
.../conf/training/gpt3/starcoder2_3b.yaml | 771 ------------------ .../nemo_launcher/core/launchers.py | 4 +- launcher_scripts/nemo_launcher/core/stages.py | 8 - 4 files changed, 32 insertions(+), 807 deletions(-) mode change 100644 => 100755 examples/fault_tolerance/run_on_cluster.sh delete mode 100755 launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml diff --git a/examples/fault_tolerance/run_on_cluster.sh b/examples/fault_tolerance/run_on_cluster.sh old mode 100644 new mode 100755 index 510c5bdbab..b28bc18597 --- a/examples/fault_tolerance/run_on_cluster.sh +++ b/examples/fault_tolerance/run_on_cluster.sh @@ -1,27 +1,32 @@ #!/bin/bash +# NOTE: NeMo-Megatron-Launcher requirements should be installed +# e.g. cd /NeMo-Megatron-Launcher && pip install -r requirements.txt + CLUSTER="draco-rno" -CONTAINER="gitlab-master.nvidia.com:5005/dl/gwe/fault_tolerance_related/nemo-gwe-ft:test" -RUN_NAME="fault_tol_gpt3_5b_dbg_no_err" -NODES=2 +CONTAINER="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/ft/NeMo-Megatron-Launcher-gwe-ft/dl+gwe+fault_tolerance_related+nemo-gwe-ft+test.sqsh" # "gitlab-master.nvidia.com/dl/gwe/fault_tolerance_related/nemo-gwe-ft:test" +RUN_NAME="fault_tol_gpt3_5b_no_err" +NODES=4 FT_ARGS=" ++training.exp_manager.create_fault_tolerance_callback=True \ - ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=900 \ - ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ - ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \ - ++training.exp_manager.fault_tolerance.max_rank_restarts=0 - ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ - ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=900 + ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=1 " +# ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=900 \ +# ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ +# ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \ +# ++training.exp_manager.fault_tolerance.max_rank_restarts=0 +# ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ +# ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=900 + if [ "$CLUSTER" == "draco-rno" ]; then PARTITION="batch_short_dgx1_m2" ACCOUNT="coreai_dlalgo_llm" - JOB_PREFIX="coreai_dlalgo_llm-test-ft5b:" + JOB_PREFIX="coreai_dlalgo_llm-test:" CLUSTER_SPECIFIC_ARGS="++cluster.nv_meta=\"ml-model.fault_tol_tests\"" USR_DIR="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi" - LAUNCHER_DIR="${USR_DIR}/ft/NeMo-Megatron-Launcher" + LAUNCHER_DIR="${USR_DIR}/ft/NeMo-Megatron-Launcher-gwe-ft" else echo "Unknown cluster: $CLUSTER" exit 1 @@ -31,13 +36,14 @@ fi # we will use mock data mkdir -p ${LAUNCHER_DIR}/dummy_data_dir -HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ +HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface/hub HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/5b \ stages=["training"] \ numa_mapping.enable=True \ data_dir=${LAUNCHER_DIR}/dummy_data_dir \ - training.model.data.data_impl="mock" \ - training.model.data.data_prefix=[] \ + ++training.model.data.mock_dataset=True \ + ++training.model.data.data_impl="mock" \ + ++training.model.data.data_prefix=[] \ launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ container_mounts=[$USR_DIR:$USR_DIR] \ container=${CONTAINER} \ @@ -45,30 +51,26 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 
${LAUNCHER_DIR}/launcher_scri cluster.account=${ACCOUNT} \ cluster.job_name_prefix=${JOB_PREFIX} \ ${CLUSTER_SPECIFIC_ARGS} \ - cluster.gpus_per_task=null \ - cluster.gpus_per_node=null \ ++cluster.gres="gpu:8" \ - ++cluster.signal="TERM@300" \ + ++cluster.signal="TERM@240" \ training.exp_manager.resume_if_exists=True \ training.exp_manager.create_checkpoint_callback=True \ training.exp_manager.checkpoint_callback_params.save_top_k=1 \ training.exp_manager.resume_ignore_no_checkpoint=True \ training.run.name=${RUN_NAME} \ - training.run.time_limit=00:45:00 \ - training.trainer.max_time=00:03:00:00 \ + training.run.time_limit=00:20:00 \ + training.trainer.max_time=00:01:30:00 \ training.trainer.num_nodes=${NODES} \ training.trainer.devices=8 \ training.trainer.log_every_n_steps=10 \ - training.trainer.val_check_interval=400 \ + training.trainer.val_check_interval=50 \ ++training.trainer.precision=16 \ - ++training.model.mcore_gpt=False \ - ++training.model.tokenizer.merge_file="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/bpe/gpt2-merges.txt" \ - ++training.model.tokenizer.vocab_file="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/bpe/gpt2-vocab.json" \ + ++training.model.tokenizer.merge_file="${USR_DIR}/bpe/gpt2-merges.txt" \ + ++training.model.tokenizer.vocab_file="${USR_DIR}/bpe/gpt2-vocab.txt" \ training.trainer.enable_checkpointing=False \ training.model.micro_batch_size=1 \ - training.model.global_batch_size=${NODES} \ - training.model.tensor_model_parallel_size=8 \ - training.model.pipeline_model_parallel_size=1 \ + training.model.global_batch_size=$((${NODES} * 8)) \ + training.model.tensor_model_parallel_size=2 \ + training.model.pipeline_model_parallel_size=4 \ ${FT_ARGS} - diff --git a/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml b/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml deleted file mode 100755 index b71d98ed3a..0000000000 --- a/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml +++ /dev/null @@ -1,771 +0,0 @@ -hydra: - searchpath: - - file:///opt/NeMo/examples/nlp/language_modeling/conf -run: - name: starcoder2_3b - results_dir: ${base_results_dir}/${.name} - time_limit: 04:00:00 - dependency: singleton -trainer: - num_nodes: 8 - devices: 8 - accelerator: gpu - precision: bf16 - logger: false - enable_checkpointing: false - use_distributed_sampler: false - max_epochs: null - max_steps: 114400 - max_time: 02:23:30:00 - log_every_n_steps: 10 - val_check_interval: 500 - limit_val_batches: 25 - limit_test_batches: 25 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 -exp_manager: - explicit_log_dir: ${base_results_dir}/${.name} - exp_dir: null - name: megatron_gpt - create_wandb_logger: false - wandb_logger_kwargs: - project: starcoder2 - name: starcoder2_3b - resume_if_exists: true - resume_ignore_no_checkpoint: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: false - save_nemo_on_train_end: false - filename: megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples} - model_parallel_size: 2 - log_step_timing: true - step_timing_kwargs: - sync_cuda: true - buffer_size: 5 -model: - micro_batch_size: 1 - global_batch_size: 160 - rampup_batch_size: null - tensor_model_parallel_size: 2 - pipeline_model_parallel_size: 1 - virtual_pipeline_model_parallel_size: null - encoder_seq_length: 16384 - max_position_embeddings: 16384 - num_layers: 30 - hidden_size: 3072 - ffn_hidden_size: 12288 - num_attention_heads: 24 - init_method_std: 0.018042 - use_scaled_init_method: true 
- hidden_dropout: 0.1 - attention_dropout: 0.1 - ffn_dropout: 0.0 - kv_channels: 128 - apply_query_key_layer_scaling: true - normalization: layernorm1p - layernorm_zero_centered_gamma: true - layernorm_epsilon: 1.0e-05 - do_layer_norm_weight_decay: false - make_vocab_size_divisible_by: 128 - pre_process: true - post_process: true - persist_layer_norm: true - bias: false - activation: fast-swiglu - headscale: false - transformer_block_type: pre_ln - openai_gelu: false - normalize_attention_scores: true - position_embedding_type: rope - rotary_percentage: 0.5 - attention_type: multihead - share_embeddings_and_output_weights: false - tokenizer: - library: huggingface - type: bigcode/starcoder2-tokenizer - model: null - delimiter: null - vocab_file: null - merge_file: null - native_amp_init_scale: 4294967296 - native_amp_growth_interval: 1000 - hysteresis: 2 - fp32_residual_connection: false - fp16_lm_cross_entropy: false - megatron_amp_O2: true - grad_allreduce_chunk_size_mb: 125 - grad_div_ar_fusion: true - gradient_accumulation_fusion: false - bias_activation_fusion: false - bias_dropout_add_fusion: false - masked_softmax_fusion: true - seed: 1234 - resume_from_checkpoint: null - use_cpu_initialization: false - onnx_safe: false - apex_transformer_log_level: 30 - gradient_as_bucket_view: true - sync_batch_comm: false - activations_checkpoint_granularity: null - activations_checkpoint_method: null - activations_checkpoint_num_layers: null - num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_layers_per_pipeline: null - sequence_parallel: false - overlap_p2p_comm: false - batch_p2p_comm: true - num_query_groups: null - mcore_gpt: true - transformer_engine: false - fp8: false - fp8_e4m3: false - fp8_hybrid: true - fp8_margin: 0 - fp8_interval: 1 - fp8_amax_history_len: 1024 - fp8_amax_compute_algo: max - fp8_wgrad: true - ub_tp_comm_overlap: false - optim: - name: distributed_fused_adam - lr: 0.0003 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - sched: - name: CosineAnnealing - warmup_steps: 100 - constant_steps: 0 - min_lr: 3.0e-05 - data: - data_impl: mmap - splits_string: 9995,3,2 - seq_length: 16384 - skip_warmup: true - num_workers: 2 - dataloader_type: single - reset_position_ids: false - reset_attention_mask: false - eod_mask_loss: false - index_mapping_dir: null - add_fim: true - fim: - rate: 0.5 - spm_rate: 0.5 - split_sample: - fragment_rate: 0.5 - no_prefix: - extra_tokens: - prefix: - middle: - suffix: - pad: - eod: <|endoftext|> - data_prefix: - - 1.96 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_0/gpt2-preprocessed_content_document - - 1.96 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_1/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_2/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_3/gpt2-preprocessed_content_document - - 1.96 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_4/gpt2-preprocessed_content_document - - 1.95 - - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_5/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_6/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_7/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_8/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_9/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_0/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_1/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_2/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_3/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_4/gpt2-preprocessed_content_document - - 2.59 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_0/gpt2-preprocessed_content_document - - 2.5 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_1/gpt2-preprocessed_content_document - - 2.46 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_2/gpt2-preprocessed_content_document - - 2.42 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_3/gpt2-preprocessed_content_document - - 2.41 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_4/gpt2-preprocessed_content_document - - 2.36 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_5/gpt2-preprocessed_content_document - - 2.72 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_0/gpt2-preprocessed_content_document - - 2.71 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_1/gpt2-preprocessed_content_document - - 2.71 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_2/gpt2-preprocessed_content_document - - 2.73 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_3/gpt2-preprocessed_content_document - - 2.7 - - 
[Remaining data blend entries elided: per-shard weights and gpt2-preprocessed_content_document
 paths under /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/ for the
 jupyter_scripts, kaggle_scripts, documentation, owm, stackoverflow, lhq_data, ir_cpp, ir_rust,
 ir_python, ir_low_resource and stack_3b_0 ... stack_3b_255 shards.]
diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index fc514aaee4..ed68abf6d2 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -1014,8 +1014,10 @@ def _make_sbatch_string_ft_launcher(
     # We do this by setting SLURM_JOB_NAME=interactive.
     # This is a temporary workaround, until the following PR is merged with NeMo
     # https://github.com/Lightning-AI/pytorch-lightning/pull/18618
+    # --ignore-missing-fault-tol-cfg is used so FT launcher can handle NeMo YAML without fault_tolerance section
+    # in such case default FT config will be used
     ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher "+\
-        f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH {additional_ft_launcher_args} "+\
+        f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg {additional_ft_launcher_args} "+\
         "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\
         f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}"

diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index 344065b353..02f2411e4a 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -81,8 +81,6 @@ def run(self) -> str:
             f"global batch size and number of nodes will change following this schedule:\n {self.nodes_scheduler}"
         )

-        self._set_fault_tolerance_work_dir_in_stage_cfg(self.stage_cfg, self.cluster)
-
         stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(
             self.stage_cfg, job_path, self.cfg
         )
@@ -376,12 +374,6 @@ def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters)
                 ft_conf.get('additional_ft_launcher_args', "")
         return cluster_parameters

-    def _set_fault_tolerance_work_dir_in_stage_cfg(self, stage_cfg, cluster):
-        use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster)
-        if use_ft:
-            with omegaconf.open_dict(ft_conf):
-                ft_conf.work_dir = str(self.get_job_path().folder / "_ft_scratch_dir")
-
    def _find_optimal_nodes(self, cfg, gpus) -> None:
        nodes_scheduler_path = (
            f"{cfg.get('training').get('run').get('results_dir')}/nodes_scheduler.json"
From 8e0cf6f17f47aa416f8669224e3e26027ccad91d Mon Sep 17 00:00:00 2001
From: Jacek Bieniusiewicz
Date: Tue, 30 Apr 2024 10:12:46 +0200
Subject: [PATCH 35/39] Handle unknown job result as a failure, added some comments

---
 launcher_scripts/nemo_launcher/core/launchers.py | 6 +++---
 launcher_scripts/nemo_launcher/core/stages.py    | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index 4e8b7c0ec5..4a5ffe8488 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -951,8 +951,8 @@ def _make_sbatch_string_ft_launcher(
         srun_args = []

     # A safety measures:
-    # let SLURM kill all tasks if any FT launcher returns with a failure.
-    # let SLURM kill the job, 1h after any task ended.
+    # let SLURM immediately kill all tasks if any FT launcher returns with a failure.
+    # let SLURM kill the job, 1h after any task ended without a failure.
     srun_args += ["--kill-on-bad-exit=1", "--wait=3600"]

     lines += [
@@ -972,7 +972,7 @@ def _make_sbatch_string_ft_launcher(
            f'MAX_JOB_FAILURES={max_subsequent_job_failures}',
            'is_job_failures_limit_reached() {',
            '  tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | \\',
-            '    awk "/^[[:alnum:]]+[[:space:]]+F$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"',
+            '    awk "/^[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"',
            '}',
            'is_training_finished() {',
            '  test -f "$FAULT_TOL_FINISHED_FLAG_FILE"',
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index c0759e5128..6ef1e973d1 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -73,6 +73,8 @@ def __init__(self, cfg):
         self.setup_stage_vars(cfg)
         self.job_name = self.stage_cfg.run.get("name")
         if self.cluster.lower() == 'bcm':
+            # this to ensure that submission filename (.sh) matches the config filename (.yaml)
+            # expected result: _submission.sh, _hydra.yaml
             self.job_name = cfg.get("cluster").get("job_name_prefix","") + self.job_name
         self.nodes_scheduler = {}
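To make the bookkeeping above concrete, here is a small standalone sketch of how the job-results
file and the failure-limit check behave after this patch; the job IDs and the local file name are
invented, while the line format ("<job id> X|S|F") and the awk check are taken verbatim from the patch:

    # hypothetical contents of the results file after three runs:
    #   1001 F   -> run that ended with a failed job step
    #   1002 X   -> run that never rewrote its status (now counted as a failure)
    #   1003 S   -> successful run
    MAX_JOB_FAILURES=2
    JOB_RESULTS_FILE=./_job_results          # assumed local path for the demo
    printf '1001 F\n1002 X\n1003 S\n' > "$JOB_RESULTS_FILE"
    is_job_failures_limit_reached() {
        tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | \
            awk "/^[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"
    }
    # only the last MAX_JOB_FAILURES entries are inspected; here they are "1002 X" and "1003 S",
    # so one failure is counted and the limit of 2 is not reached yet:
    if is_job_failures_limit_reached ; then echo "limit reached" ; else echo "keep going" ; fi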
From 23d03cb739dafccb6d490fd967d96d33c914f201 Mon Sep 17 00:00:00 2001
From: Maanu Grover
Date: Tue, 4 Jun 2024 13:43:02 -0700
Subject: [PATCH 36/39] formatting

Signed-off-by: Maanu Grover
---
 .../nemo_launcher/core/launchers.py           | 102 +++++++++---------
 launcher_scripts/nemo_launcher/core/stages.py |  47 ++++----
 .../config_tests/test_fault_tol_config.py     |   8 +-
 3 files changed, 83 insertions(+), 74 deletions(-)

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index e4197da211..2fb22bd526 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -26,9 +26,9 @@

 import nemo_launcher.utils.job_utils as job_utils
 import yaml
+from hera.workflows import Workflow
 from nemo_launcher.core.logger import logger
 from omegaconf import DictConfig, OmegaConf
-from hera.workflows import Workflow

 NEMO_LAUNCHER_CI = os.getenv("NEMO_LAUNCHER_CI", "False").lower() in ("true", "t", "1")
 NEMO_LAUNCHER_DEBUG = os.getenv("NEMO_LAUNCHER_DEBUG", "False").lower() in (
@@ -367,11 +367,13 @@ class SlurmLauncher(Launcher):
         setup: a list of command to run in sbatch before running srun
     """

-    def __init__(self,
-                 folder: Union[Path, str],
-                 job_name: str,
-                 use_fault_tolerance: bool,
-                 **kwargs: Any) -> None:
+    def __init__(
+        self,
+        folder: Union[Path, str],
+        job_name: str,
+        use_fault_tolerance: bool,
+        **kwargs: Any,
+    ) -> None:
         super().__init__(folder, job_name)
         self.parameters = {}
         self.use_fault_tolerance = use_fault_tolerance
@@ -424,7 +426,7 @@ def _update_parameters(self, **kwargs: Any) -> None:
         Below are the parameters that differ from slurm documentation:
             setup: a list of command to run in sbatch before running srun
         """
-        
+
         if self.use_fault_tolerance:
             defaults = _get_default_parameters_ft_launcher()
         else:
@@ -474,7 +476,7 @@ def _make_submission_file_text(self, command_groups: List[List[str]]) -> str:
         if self.use_fault_tolerance:
             return _make_sbatch_string_ft_launcher(
                 command_groups=command_groups, folder=self.folder, **self.parameters
-            )   
+            )
         else:
             return _make_sbatch_string(
                 command_groups=command_groups, folder=self.folder, **self.parameters
@@ -897,7 +899,7 @@ def _make_sbatch_string_ft_launcher(
     max_rank_restarts: int = 0,
     additional_ft_launcher_args: str = "",
 ) -> str:
-    
+
     """Creates the content of an sbatch file with provided parameters

     Parameters
@@ -985,9 +987,9 @@ def _make_sbatch_string_ft_launcher(
     for k in sorted(parameters):
         lines.append(_as_sbatch_flag(k, parameters[k]))
     parameters["ntasks_per_node"] = ntasks_per_node
-    
+
     lines += ["", "# This script uses experimental fault tolerance launcher", ""]
-    
+
     # environment setup:
     if setup is not None:
         lines += ["", "# setup"] + setup
@@ -1001,42 +1003,42 @@ def _make_sbatch_string_ft_launcher(
     srun_args += ["--kill-on-bad-exit=1", "--wait=3600"]

     lines += [
-        '',
-        '# Fault tolerance related items',
+        "",
+        "# Fault tolerance related items",
         f'export FAULT_TOL_CFG_PATH="{str(paths.config_file)}"',
         f'export FAULT_TOL_FINISHED_FLAG_FILE="{str(paths.folder / "_finished_flag")}"',
-        'RDZV_HOST=$(hostname)',
-        'ANY_JOB_STEP_FAILED=0',
+        "RDZV_HOST=$(hostname)",
+        "ANY_JOB_STEP_FAILED=0",
     ]
-    
+
     if max_subsequent_job_failures > 0:
         lines += [
-            '',
-            '# Automatic job resubmission related items',
+            "",
+            "# Automatic job resubmission related items",
             f'JOB_RESULTS_FILE="{str(paths.folder / "_job_results")}"',
-            f'MAX_JOB_FAILURES={max_subsequent_job_failures}',
-            'is_job_failures_limit_reached() {',
+            f"MAX_JOB_FAILURES={max_subsequent_job_failures}",
+            "is_job_failures_limit_reached() {",
            '  tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | \\',
            '    awk "/^[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"',
-            '}',
-            'is_training_finished() {',
+            "}",
+            "is_training_finished() {",
            '  test -f "$FAULT_TOL_FINISHED_FLAG_FILE"',
-            '}',
-            '# Exit immediately if finished flag file exists and this job is a continuation',
+            "}",
+            "# Exit immediately if finished flag file exists and this job is a continuation",
            'if [ "$FT_RESUMED" = "1" ] ; then',
            '  if is_training_finished ; then echo "Training is finished" ; exit 0 ; fi',
            '  if is_job_failures_limit_reached ; then echo "Job failures limit reached ($MAX_JOB_FAILURES)" ; exit 1 ; fi',
-            'else',
+            "else",
            '  rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"',
-            'fi',
-            '# Pre-schedule continuation job',
+            "fi",
+            "# Pre-schedule continuation job",
            'CONT_SBATCH_OUT=$(FT_RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")',
            'if [ $? -ne 0 ] ; then echo "Couldnt schedule continuation job. Check stderr for details." ; exit 1 ; fi',
            'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")',
-            '# Write unknown job status to the job log, we will fix it at the end',
+            "# Write unknown job status to the job log, we will fix it at the end",
            'echo "$SLURM_JOB_ID X" >> "$JOB_RESULTS_FILE"',
         ]
-    
+
     # commandline (this will run the function and args specified in the file provided as argument)
     # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern
     stderr_flags = [] if stderr_to_stdout else ["--error", stderr]
@@ -1068,19 +1070,21 @@ def _make_sbatch_string_ft_launcher(
        f"  nvidia-smi --query-gpu=timestamp,index,,memory.total,memory.free,memory.used --format=csv -l 1 & ",
        "",
    ]
-    
+
    # Fault tolerance uses Torch Elastic based launcher with SLURM.
-    # Torch Lightning does not handle that case correctly, 
-    # so we need to force TorchElasticEnvironment over SLURMEnvironment. 
-    # We do this by setting SLURM_JOB_NAME=interactive. 
-    # This is a temporary workaround, until the following PR is merged with NeMo 
+    # Torch Lightning does not handle that case correctly,
+    # so we need to force TorchElasticEnvironment over SLURMEnvironment.
+    # We do this by setting SLURM_JOB_NAME=interactive.
+    # This is a temporary workaround, until the following PR is merged with NeMo
    # https://github.com/Lightning-AI/pytorch-lightning/pull/18618
    # --ignore-missing-fault-tol-cfg is used so FT launcher can handle NeMo YAML without fault_tolerance section
    # in such case default FT config will be used
-    ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher "+\
-        f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg {additional_ft_launcher_args} "+\
-        "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\
-        f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}"
+    ft_launcher_cmd_part = (
+        "SLURM_JOB_NAME=interactive ft_launcher "
+        + f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg {additional_ft_launcher_args} "
+        + "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST "
+        + f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}"
+    )

    for group_ind, command_group in enumerate(command_groups):
        if heterogeneous:
@@ -1098,9 +1102,7 @@ def _make_sbatch_string_ft_launcher(
            )
            command = ";\n  ".join(command_group)
            assert "python3 -u" in command
-            command = command.replace(
-                "python3 -u", ft_launcher_cmd_part,
-            )
+            command = command.replace("python3 -u", ft_launcher_cmd_part,)
            lines += [
                "",
                f"# command {group_ind + 1}",
                f'  {command} "',
                "",
            ]
-        lines += [
-            'if [ $? -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi'
-        ]
+        lines += ["if [ $? -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi"]

    if max_subsequent_job_failures > 0:
        lines += [
-            '',
+            "",
            '# Fix the job log entry ("JOB_ID X" -> "JOB_ID S/F"), depending on the job result',
            'if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then',
            '  sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"',
-            'else',
+            "else",
            '  sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID F/" "$JOB_RESULTS_FILE"',
-            'fi',
-            '# Check if the continuation job can be cancelled',
-            'if is_training_finished ; then',
+            "fi",
+            "# Check if the continuation job can be cancelled",
+            "if is_training_finished ; then",
            '  echo "Training is finished" ; scancel $CONT_SLURM_JOB_ID ; exit 0',
-            'fi',
-            'if is_job_failures_limit_reached ; then',
+            "fi",
+            "if is_job_failures_limit_reached ; then",
            '  echo "Job failures limit reached ($MAX_JOB_FAILURES)" ; scancel $CONT_SLURM_JOB_ID ; exit 1',
-            'fi',
+            "fi",
        ]

    return "\n".join(lines)
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index 7b8770183e..724d003c1c 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -17,10 +17,13 @@
 import glob
 import json
 import logging
-import omegaconf
 import os
 import re
 import shutil
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import omegaconf
 from nemo_launcher.core.launchers import AutoLauncher
 from nemo_launcher.utils.data_utils.prepare_squad import (
     prepare_squad_for_fine_tuning,
@@ -28,8 +31,6 @@
 )
 from nemo_launcher.utils.job_utils import JobPaths
 from omegaconf import DictConfig, OmegaConf
-from pathlib import Path
-from typing import Any, Dict, List, Optional

 __LANGUAGE_MODELS_LIST__ = [
     "gpt3",
@@ -76,10 +77,12 @@ def __init__(self, cfg):
         self.stage_cfg = None
         self.setup_stage_vars(cfg)
         self.job_name = self.stage_cfg.run.get("name")
-        if self.cluster.lower() == 'bcm':
+        if self.cluster.lower() == "bcm":
             # this to ensure that submission filename (.sh) matches the config filename (.yaml)
             # expected result: _submission.sh, _hydra.yaml
-            self.job_name = cfg.get("cluster").get("job_name_prefix","") + self.job_name
+            self.job_name = (
+                cfg.get("cluster").get("job_name_prefix", "") + self.job_name
+            )
         self.nodes_scheduler = {}

     def setup_stage_vars(self, cfg: OmegaConf):
@@ -399,31 +402,37 @@ def _make_cluster_parameters(self, cluster: str) -> Dict:
                 }
             )

-        cluster_parameters = \
-            self._update_fault_tolerance_params(stage_cfg, cluster, cluster_parameters)
-        
+        cluster_parameters = self._update_fault_tolerance_params(
+            stage_cfg, cluster, cluster_parameters
+        )
+
         return cluster_parameters
-    
+
     def _get_fault_tol_config_section(self, stage_cfg, cluster):
         exp_man_conf = stage_cfg.get("exp_manager", dict())
-        use_ft = exp_man_conf.get('create_fault_tolerance_callback', False)
+        use_ft = exp_man_conf.get("create_fault_tolerance_callback", False)
         if use_ft:
             if cluster.lower() != "bcm":
-                raise ValueError(f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')")
-        return use_ft, exp_man_conf.get("fault_tolerance", dict()) 
+                raise ValueError(
+                    f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')"
+                )
+        return use_ft, exp_man_conf.get("fault_tolerance", dict())

     def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters):
         use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster)
         cluster_parameters["use_fault_tolerance"] = use_ft
         if use_ft:
-            cluster_parameters["max_rank_restarts"] = \
-                ft_conf.get('max_rank_restarts', 0)
-            cluster_parameters["max_subsequent_job_failures"] = \
-                ft_conf.get('max_subsequent_job_failures', 0)
-            cluster_parameters["additional_ft_launcher_args"] = \
-                ft_conf.get('additional_ft_launcher_args', "")
+            cluster_parameters["max_rank_restarts"] = ft_conf.get(
+                "max_rank_restarts", 0
+            )
+            cluster_parameters["max_subsequent_job_failures"] = ft_conf.get(
+                "max_subsequent_job_failures", 0
+            )
+            cluster_parameters["additional_ft_launcher_args"] = ft_conf.get(
+                "additional_ft_launcher_args", ""
+            )
         return cluster_parameters
-    
+
     def _find_optimal_nodes(self, cfg, gpus) -> None:
         nodes_scheduler_path = (
             f"{cfg.get('training').get('run').get('results_dir')}/nodes_scheduler.json"
diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py
index 8a9f7a1d70..d22a69834d 100644
--- a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py
+++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py
@@ -72,13 +72,14 @@ def test_fault_tol_config_with_bcm():
     cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml")
     cfg.training_config = "gpt3/126m"
     cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml")
-    cfg.training.exp_manager.create_fault_tolerance_callback=True
+    cfg.training.exp_manager.create_fault_tolerance_callback = True
     cfg.training.exp_manager.fault_tolerance = OmegaConf.create(
         {"max_subsequent_job_failures": 1}
     )
     stage = Training(cfg)
     _ = stage.run()

+
 def test_fault_tol_config_with_bcm_no_ft_section():
     """ Fault tolerance + BCM cluster, no "fault_tolerance" section in cfg, should be fine """
     cfg = OmegaConf.load("conf/config.yaml")
@@ -89,7 +90,7 @@ def test_fault_tol_config_with_bcm_no_ft_section():
     cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml")
     cfg.training_config = "gpt3/126m"
     cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml")
-    cfg.training.exp_manager.create_fault_tolerance_callback=True
+    cfg.training.exp_manager.create_fault_tolerance_callback = True
     stage = Training(cfg)
     _ = stage.run()

@@ -104,8 +105,7 @@ def test_fault_tol_config_with_bcp():
     cfg.cluster = dict()
     cfg.training_config = "gpt3/126m"
     cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml")
-    cfg.training.exp_manager.create_fault_tolerance_callback=True
+    cfg.training.exp_manager.create_fault_tolerance_callback = True
     with pytest.raises(ValueError):
         stage = Training(cfg)
         _ = stage.run()
-

From 9a9accf9070403edd8f4d1ebe18118bbae066796 Mon Sep 17 00:00:00 2001
From: Maanu Grover
Date: Tue, 4 Jun 2024 13:44:32 -0700
Subject: [PATCH 37/39] fix unit tests

Signed-off-by: Maanu Grover
---
 launcher_scripts/nemo_launcher/core/launchers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index 2fb22bd526..861f263a7d 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -371,7 +371,7 @@ def __init__(
         self,
         folder: Union[Path, str],
         job_name: str,
-        use_fault_tolerance: bool,
+        use_fault_tolerance: bool = False,
         **kwargs: Any,
     ) -> None:
         super().__init__(folder, job_name)
From eb48956d368dfd4594978e3f1fa5db1531c6f8f2 Mon Sep 17 00:00:00 2001
From: Maanu Grover
Date: Wed, 5 Jun 2024 15:39:03 -0700
Subject: [PATCH 38/39] remove examples

Signed-off-by: Maanu Grover
---
 .../run_gpt_on_draco_rno_FT.txt               | 59 --------------
 examples/fault_tolerance/run_on_cluster.sh    | 76 -------------------
 .../fault_tolerance/run_sc2_3b_on_eos_FT.txt  | 35 ---------
 3 files changed, 170 deletions(-)
 delete mode 100644 examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt
 delete mode 100755 examples/fault_tolerance/run_on_cluster.sh
 delete mode 100644 examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt

[The bodies of the three deleted example scripts are elided here. They were Hydra launch recipes
 for main.py that enabled fault tolerance via ++training.exp_manager.create_fault_tolerance_callback
 and ++training.exp_manager.fault_tolerance.* overrides: a gpt3 5b run on draco-rno, a generic
 run_on_cluster.sh wrapper, and a starcoder2_3b run on eos with optional simulated faults.]
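The deleted examples boiled down to a handful of Hydra overrides passed to the launcher. A minimal
sketch of that usage is shown below; the model choice, paths and values are placeholders rather than
a tested command line, only the override names come from the removed scripts and the launcher code:

    # assumes a NeMo-Megatron-Launcher checkout at $LAUNCHER_DIR and a 'bcm' (Slurm) cluster config
    HYDRA_FULL_ERROR=1 python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \
        training=gpt3/126m \
        stages=["training"] \
        launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \
        training.exp_manager.resume_if_exists=True \
        ++training.exp_manager.create_fault_tolerance_callback=True \
        ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \
        ++training.exp_manager.fault_tolerance.max_rank_restarts=0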
From df4232261e5a7d867aa5b27c81cb7c9dcd989658 Mon Sep 17 00:00:00 2001
From: Jacek Bieniusiewicz
Date: Mon, 1 Jul 2024 13:55:39 +0200
Subject: [PATCH 39/39] Final cleanup

Signed-off-by: Jacek Bieniusiewicz
---
 launcher_scripts/nemo_launcher/core/launchers.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index 3df38215a1..6a7cecda1b 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -980,7 +980,7 @@ def _make_sbatch_string_ft_launcher(
     # now create
     lines = ["#!/bin/bash", "", "# Parameters"]
     if heterogeneous:
-        raise ValueError("This PoC does not support heterogeneous jobs")
+        raise ValueError("Fault tolerance is not supported with heterogeneous jobs.")
     else:
         # run 1 FT launcher per node, it will spawn the actual tasks
         parameters["ntasks_per_node"] = 1
@@ -988,8 +988,6 @@ def _make_sbatch_string_ft_launcher(
     for k in sorted(parameters):
         lines.append(_as_sbatch_flag(k, parameters[k]))
     parameters["ntasks_per_node"] = ntasks_per_node

-    lines += ["", "# This script uses experimental fault tolerance launcher", ""]
-
     # environment setup:
     if setup is not None:
         lines += ["", "# setup"] + setup
@@ -1071,16 +1069,8 @@ def _make_sbatch_string_ft_launcher(
        "",
    ]

-    # Fault tolerance uses Torch Elastic based launcher with SLURM.
-    # Torch Lightning does not handle that case correctly,
-    # so we need to force TorchElasticEnvironment over SLURMEnvironment.
-    # We do this by setting SLURM_JOB_NAME=interactive.
-    # This is a temporary workaround, until the following PR is merged with NeMo
-    # https://github.com/Lightning-AI/pytorch-lightning/pull/18618
-    # --ignore-missing-fault-tol-cfg is used so FT launcher can handle NeMo YAML without fault_tolerance section
-    # in such case default FT config will be used
    ft_launcher_cmd_part = (
-        "SLURM_JOB_NAME=interactive ft_launcher "
+        "ft_launcher "
        + f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg {additional_ft_launcher_args} "
        + "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST "
        + f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}"
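After this cleanup, the sbatch script generated per node wraps the training entry point in
ft_launcher by replacing its "python3 -u" prefix. An illustrative expansion is shown below; the node
and process counts and the training script path are made up, only the flags come from the code above:

    # roughly what ft_launcher_cmd_part expands to for a hypothetical 2-node x 8-GPU job
    ft_launcher \
        --fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg \
        --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST \
        --nnodes=2 --nproc_per_node=8 --max-restarts=0 \
        /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py ...  # hypothetical script; it takes the place of "python3 -u <script>"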