From 28f3e4396111662bb83ec02d9da73e7dc4d9a52e Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 30 Oct 2023 07:46:10 -0700 Subject: [PATCH 01/39] Added fault tolerance config for gpt3 126m --- ...un_gpt126m_batch_training_on_dlcluster.txt | 22 +++++++++++++++++++ ..._gpt126m_iteract_training_on_dlcluster.txt | 22 +++++++++++++++++++ launcher_scripts/conf/cluster/bcm.yaml | 2 +- launcher_scripts/conf/training/gpt3/126m.yaml | 12 ++++++++-- 4 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt create mode 100644 examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt new file mode 100644 index 0000000000..e228f4737c --- /dev/null +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt @@ -0,0 +1,22 @@ +LAUNCHER_DIR="/mnt/nvdl/usr/jbieniusiewi/nemo/NeMo-Megatron-Launcher" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/126m \ + stages=["training"] \ + numa_mapping.enable=True \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + cluster.job_name_prefix="nv-test:" \ + training.exp_manager.create_checkpoint_callback=False \ + training.run.name="test" \ + training.trainer.num_nodes=1 \ + training.trainer.devices=8 \ + training.model.global_batch_size=16 \ + training.model.micro_batch_size=2 \ + cluster_type=bcm \ + ++training.cluster_type=bcm \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] \ + ++fault_tolerance.enabled=True \ + diff --git a/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt b/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt new file mode 100644 index 0000000000..e75ae0dbc9 --- /dev/null +++ b/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt @@ -0,0 +1,22 @@ +LAUNCHER_DIR="/mnt/nvdl/usr/jbieniusiewi/nemo/NeMo-Megatron-Launcher" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/126m \ + stages=["training"] \ + numa_mapping.enable=True \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + cluster.job_name_prefix="nv-test:" \ + training.exp_manager.create_checkpoint_callback=False \ + training.run.name="test" \ + training.trainer.num_nodes=1 \ + training.trainer.devices=8 \ + training.model.global_batch_size=16 \ + training.model.micro_batch_size=2 \ + cluster_type=interactive \ + ++training.cluster_type=BCP \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] \ + ++fault_tolerance.enabled=True \ + diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index ba8f2ebbb0..24d5c78f6b 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,4 +1,4 @@ -partition: null +partition: dgx1v account: null exclusive: True gpus_per_task: null diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 27d3329756..5810378966 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -5,7 +5,7 @@ hydra: run: name: gpt3_126m results_dir: 
${base_results_dir}/${.name} - time_limit: "1-00:00:00" + time_limit: "02:00:00" dependency: "singleton" trainer: @@ -26,7 +26,6 @@ trainer: accumulate_grad_batches: 1 gradient_clip_val: 1.0 - exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -168,6 +167,15 @@ model: constant_steps: 100000 min_lr: 6e-5 + fault_tolerance: + initial_rank_heartbeat_timeout: 120 + rank_heartbeat_timeout: 30 + ipc_timeout: 30 + simulated_fault: + fault_type: rank_killed + rank_to_fail: 1 + base_delay: 180 + data: data_impl: mmap splits_string: "99990,8,2" From 7a955ddce9a78aa07475c4dcbf1896ee8e2f599d Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 3 Nov 2023 08:15:42 -0700 Subject: [PATCH 02/39] inital auto-resume impl-WIP --- ...un_gpt126m_batch_training_on_dlcluster.txt | 3 +-- ..._gpt126m_iteract_training_on_dlcluster.txt | 3 +-- launcher_scripts/conf/cluster/bcm.yaml | 2 +- launcher_scripts/conf/config.yaml | 4 ++-- launcher_scripts/conf/training/gpt3/126m.yaml | 3 ++- .../nemo_launcher/core/launchers.py | 21 +++++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 6 ++++++ 7 files changed, 34 insertions(+), 8 deletions(-) diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt index e228f4737c..92f1b303ba 100644 --- a/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_dlcluster.txt @@ -17,6 +17,5 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scr cluster_type=bcm \ ++training.cluster_type=bcm \ training.model.data.data_impl="mock" \ - training.model.data.data_prefix=[] \ - ++fault_tolerance.enabled=True \ + training.model.data.data_prefix=[] diff --git a/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt b/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt index e75ae0dbc9..fe80192be5 100644 --- a/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt +++ b/examples/fault_tolerance/run_gpt126m_iteract_training_on_dlcluster.txt @@ -18,5 +18,4 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scr ++training.cluster_type=BCP \ training.model.data.data_impl="mock" \ training.model.data.data_prefix=[] \ - ++fault_tolerance.enabled=True \ - + diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index 24d5c78f6b..2c70c43d04 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,4 +1,4 @@ -partition: dgx1v +partition: DGX1 account: null exclusive: True gpus_per_task: null diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 89e86c652f..09f067a88d 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -38,9 +38,9 @@ launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends data_dir: ${launcher_scripts_path}/data # Location to store and read the data. base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. 
- - null + - /mnt/nvdl/usr/jbieniusiewi/ +#container: /mnt/nvdl/usr/jbieniusiewi/nemo/nemofw-training_230803_fault_tol.sqsh container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.03 - wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. env_vars: diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 5810378966..56482695fd 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -5,7 +5,7 @@ hydra: run: name: gpt3_126m results_dir: ${base_results_dir}/${.name} - time_limit: "02:00:00" + time_limit: "00:30:00" dependency: "singleton" trainer: @@ -175,6 +175,7 @@ model: fault_type: rank_killed rank_to_fail: 1 base_delay: 180 + autoresume_if_interrupted: True data: data_impl: mmap diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 9c9e90d381..73b3cac9ac 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -540,6 +540,7 @@ def _make_sbatch_string( additional_parameters: Optional[Dict[str, Any]] = None, srun_args: Optional[Iterable[str]] = None, heterogeneous: bool = False, + autoresume_if_interrupted: bool = False, ) -> str: """Creates the content of an sbatch file with provided parameters @@ -580,6 +581,7 @@ def _make_sbatch_string( "container_mounts", "srun_args", "heterogeneous", + "autoresume_if_interrupted", ] parameters = {k: v for k, v in locals().items() if v is not None and k not in nonslurm} # rename and reformat parameters @@ -635,6 +637,15 @@ def _make_sbatch_string( if setup is not None: lines += ["", "# setup"] + setup + if autoresume_if_interrupted is True: + lines += [ + '', + '# if the flag file is created by a trainer script, this slurm batch script will be rescheduled', + 'export INTERRUPTED_FLAG_FILE='+str(paths.results_folder / "_interrupted_flag"), + 'rm -f $INTERRUPTED_FLAG_FILE', + '', + ] + # commandline (this will run the function and args specified in the file provided as argument) # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern stderr_flags = [] if stderr_to_stdout else ["--error", stderr] @@ -689,6 +700,16 @@ def _make_sbatch_string( f" {command} \"", "", ] + + if autoresume_if_interrupted is True: + lines += [ + '', '# automatic resumption', + 'if [ -f "$INTERRUPTED_FLAG_FILE" ] ; then ', + 'IS_RESUMED=1 sbatch "$0"', + 'fi', + '', + ] + return "\n".join(lines) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 2c6ac5ae3e..2df3649f5f 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -323,6 +323,12 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: } ) + fault_tol_conf = stage_cfg.get("model").get("fault_tolerance", None) + if fault_tol_conf is not None: + cluster_parameters["autoresume_if_interrupted"] = fault_tol_conf.get("autoresume_if_interrupted", False) + if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": + logging.warn(f"autoresume_if_interrupted has no effect if cluster type is not bcm (current cluster is {cluster})") + return cluster_parameters def _find_optimal_nodes(self, cfg, gpus) -> None: From 0b84305ecc2cd50cfc83fa53dc79e3c558546caa Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 7 Nov 2023 06:46:11 -0800 
Subject: [PATCH 03/39] Auto-resume loop-WIP --- .../run_gpt126m_batch_training_on_selene.txt | 36 +++++++++++++++++++ .../nemo_launcher/core/launchers.py | 16 ++++++--- launcher_scripts/nemo_launcher/core/stages.py | 2 +- 3 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt new file mode 100644 index 0000000000..a696dc9876 --- /dev/null +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt @@ -0,0 +1,36 @@ +LAUNCHER_DIR="/lustre/fsw/joc/jbieniusiewi/nemo/NeMo-Megatron-Launcher" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/126m \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir=/lustre/fsw/joc/big_nlp/gpt3/prepare_dataset/the_pile/train \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[/lustre/fsw/joc/jbieniusiewi/:/lustre/fsw/joc/jbieniusiewi/] \ + cluster.partition=luna \ + cluster.account=coreai_dlalgo_llm \ + cluster.job_name_prefix="coreai_dlalgo_llm-test-interact:" \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + training.exp_manager.resume_if_exists=True \ + training.exp_manager.create_checkpoint_callback=True \ + training.exp_manager.checkpoint_callback_params.save_top_k=1 \ + training.exp_manager.resume_ignore_no_checkpoint=True \ + training.run.name="dummy_run_name_126m" \ + training.run.time_limit=00:12:00 \ + training.trainer.max_time=00:04:00:00 \ + training.trainer.num_nodes=1 \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=1 \ + training.trainer.val_check_interval=1000 \ + training.trainer.enable_checkpointing=False \ + training.model.micro_batch_size=2 \ + training.model.global_batch_size=16 \ + training.model.tensor_model_parallel_size=1 \ + training.model.pipeline_model_parallel_size=1 \ + training.model.transformer_engine=True \ + training.model.fp8=False \ + training.model.fp8_e4m3=False \ + training.model.grad_div_ar_fusion=False \ + training.model.activations_checkpoint_granularity=selective \ + training.model.activations_checkpoint_method=uniform \ diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 73b3cac9ac..97fa3012a3 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -641,9 +641,14 @@ def _make_sbatch_string( lines += [ '', '# if the flag file is created by a trainer script, this slurm batch script will be rescheduled', - 'export INTERRUPTED_FLAG_FILE='+str(paths.results_folder / "_interrupted_flag"), + 'export INTERRUPTED_FLAG_FILE='+str(paths.folder / "_interrupted_flag"), + 'if [ "$RESUMED" = "1" ] && [ ! -f "$INTERRUPTED_FLAG_FILE" ] ; then exit 0 ; fi', + 'CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', + 'if [ $? -ne 0 ] ; then echo "Could not schedule continuation job. Check stderr for details." 
; exit 1 ; fi', + 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', 'rm -f $INTERRUPTED_FLAG_FILE', '', + 'touch $INTERRUPTED_FLAG_FILE', # FIXME TODO this is for debug only ] # commandline (this will run the function and args specified in the file provided as argument) @@ -703,10 +708,11 @@ def _make_sbatch_string( if autoresume_if_interrupted is True: lines += [ - '', '# automatic resumption', - 'if [ -f "$INTERRUPTED_FLAG_FILE" ] ; then ', - 'IS_RESUMED=1 sbatch "$0"', - 'fi', + '', + '# cancel continuation job if no continuation marker file was created', + 'if [ ! -f "$INTERRUPTED_FLAG_FILE" ] && [ ! -z "$CONT_SLURM_JOB_ID" ] ; then', + 'scancel $CONT_SLURM_JOB_ID', + 'fi' '', ] diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 2df3649f5f..05e0826040 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -327,7 +327,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: if fault_tol_conf is not None: cluster_parameters["autoresume_if_interrupted"] = fault_tol_conf.get("autoresume_if_interrupted", False) if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - logging.warn(f"autoresume_if_interrupted has no effect if cluster type is not bcm (current cluster is {cluster})") + raise ValueError(f"autoresume_if_interrupted works only with 'bcm' cluster (current cluster is '{cluster}')") return cluster_parameters From 733ac4cb452189b3a77deed3c48751cc91c7bc35 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 7 Nov 2023 08:38:11 -0800 Subject: [PATCH 04/39] Cleaning dbg code --- .../fault_tolerance/run_gpt126m_batch_training_on_selene.txt | 1 + launcher_scripts/conf/cluster/bcm.yaml | 2 +- launcher_scripts/conf/config.yaml | 4 ++-- launcher_scripts/conf/training/gpt3/126m.yaml | 2 +- launcher_scripts/nemo_launcher/core/launchers.py | 1 - 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt index a696dc9876..2db9461ae7 100644 --- a/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_selene.txt @@ -7,6 +7,7 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scr data_dir=/lustre/fsw/joc/big_nlp/gpt3/prepare_dataset/the_pile/train \ launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ container_mounts=[/lustre/fsw/joc/jbieniusiewi/:/lustre/fsw/joc/jbieniusiewi/] \ + container="gitlab-master.nvidia.com/dl/nemo/nemo-fw/train:230803_fault_tol" \ cluster.partition=luna \ cluster.account=coreai_dlalgo_llm \ cluster.job_name_prefix="coreai_dlalgo_llm-test-interact:" \ diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index 2c70c43d04..ba8f2ebbb0 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,4 +1,4 @@ -partition: DGX1 +partition: null account: null exclusive: True gpus_per_task: null diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 09f067a88d..89e86c652f 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -38,9 +38,9 @@ launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends data_dir: ${launcher_scripts_path}/data # Location to store and read the data. 
base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. - - /mnt/nvdl/usr/jbieniusiewi/ -#container: /mnt/nvdl/usr/jbieniusiewi/nemo/nemofw-training_230803_fault_tol.sqsh + - null container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.03 + wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. env_vars: diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 56482695fd..97fcb49316 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -5,7 +5,7 @@ hydra: run: name: gpt3_126m results_dir: ${base_results_dir}/${.name} - time_limit: "00:30:00" + time_limit: "1-00:00:00" dependency: "singleton" trainer: diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 97fa3012a3..04110b9e6c 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -648,7 +648,6 @@ def _make_sbatch_string( 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', 'rm -f $INTERRUPTED_FLAG_FILE', '', - 'touch $INTERRUPTED_FLAG_FILE', # FIXME TODO this is for debug only ] # commandline (this will run the function and args specified in the file provided as argument) From adc5b7bc3d0b5881c7bc195daf28d216dff2eb42 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 9 Nov 2023 09:21:37 -0800 Subject: [PATCH 05/39] Fixes after testing on EOS --- .../run_gpt126m_batch_training_on_eos.txt | 40 +++++++++++++++++++ launcher_scripts/conf/training/gpt3/126m.yaml | 17 ++++---- .../nemo_launcher/core/launchers.py | 6 ++- 3 files changed, 53 insertions(+), 10 deletions(-) create mode 100644 examples/fault_tolerance/run_gpt126m_batch_training_on_eos.txt diff --git a/examples/fault_tolerance/run_gpt126m_batch_training_on_eos.txt b/examples/fault_tolerance/run_gpt126m_batch_training_on_eos.txt new file mode 100644 index 0000000000..8ecdfea185 --- /dev/null +++ b/examples/fault_tolerance/run_gpt126m_batch_training_on_eos.txt @@ -0,0 +1,40 @@ +LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/126m \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir="/lustre/fsr/datasets/gpt/gpt3" \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/:/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/] \ + container=gitlab-master.nvidia.com/dl/nemo/nemo-fw/train:230803_fault_tol \ + cluster.partition=batch \ + cluster.account=coreai_dlalgo_llm \ + cluster.job_name_prefix="coreai_dlalgo_llm-test-interact:" \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + training.exp_manager.resume_if_exists=True \ + training.exp_manager.create_checkpoint_callback=True \ + training.exp_manager.checkpoint_callback_params.save_top_k=1 \ + training.exp_manager.resume_ignore_no_checkpoint=True \ + training.run.name="dummy_run_name_126m" \ + training.run.time_limit=00:12:00 \ + training.trainer.max_time=00:04:00:00 \ + training.trainer.num_nodes=1 \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=1 \ + training.trainer.val_check_interval=1000 \ + 
training.trainer.enable_checkpointing=False \ + training.model.micro_batch_size=2 \ + training.model.global_batch_size=16 \ + training.model.tensor_model_parallel_size=1 \ + training.model.pipeline_model_parallel_size=1 \ + training.model.transformer_engine=True \ + training.model.fp8=False \ + training.model.fp8_e4m3=False \ + training.model.grad_div_ar_fusion=False \ + training.model.activations_checkpoint_granularity=selective \ + training.model.activations_checkpoint_method=uniform \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] + diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 97fcb49316..ec6900ae8d 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -168,14 +168,15 @@ model: min_lr: 6e-5 fault_tolerance: - initial_rank_heartbeat_timeout: 120 - rank_heartbeat_timeout: 30 - ipc_timeout: 30 - simulated_fault: - fault_type: rank_killed - rank_to_fail: 1 - base_delay: 180 - autoresume_if_interrupted: True + initial_rank_heartbeat_timeout: 120 + rank_heartbeat_timeout: 30 + ipc_timeout: 30 + rank_termination_signal: 9 + simulated_fault: + fault_type: rank_killed + rank_to_fail: 1 + base_delay: 180 + autoresume_if_interrupted: True data: data_impl: mmap diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 04110b9e6c..8760b3ca7f 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -637,6 +637,9 @@ def _make_sbatch_string( if setup is not None: lines += ["", "# setup"] + setup + if srun_args is None: + srun_args = [] + if autoresume_if_interrupted is True: lines += [ '', @@ -649,14 +652,13 @@ def _make_sbatch_string( 'rm -f $INTERRUPTED_FLAG_FILE', '', ] + srun_args += ["--kill-on-bad-exit=0", "--wait=0"] # commandline (this will run the function and args specified in the file provided as argument) # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern stderr_flags = [] if stderr_to_stdout else ["--error", stderr] container_flags = ["--container-image", container_image] if container_image else [] container_flags += ["--container-mounts", container_mounts] if container_mounts else [] - if srun_args is None: - srun_args = [] if NEMO_LAUNCHER_MEMORY_MEASURE: srun_args += ["--overlap"] From cc4f54b1c02940c54558e4f10989e28779486a06 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Wed, 15 Nov 2023 08:42:35 -0800 Subject: [PATCH 06/39] Updated fault tolerance-end of day version --- launcher_scripts/conf/training/gpt3/126m.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index ec6900ae8d..fb76ae4731 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -174,9 +174,8 @@ model: rank_termination_signal: 9 simulated_fault: fault_type: rank_killed - rank_to_fail: 1 base_delay: 180 - autoresume_if_interrupted: True + autoresume_if_interrupted: False data: data_impl: mmap From 2b0d228090dc5531cfd7dafb3378d38b1e7d06f0 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 17 Nov 2023 08:04:35 -0800 Subject: [PATCH 07/39] Read fault tolerance config from exp_manager section --- launcher_scripts/nemo_launcher/core/stages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
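Note: with auto-resume enabled, the sbatch script emitted by `_make_sbatch_string` (patches 02-05 above) follows roughly the flow sketched below. This is a simplified, hand-written illustration of the mechanism, not the literal generated script; RESULTS_DIR and TRAIN_CMD are placeholders for values the launcher fills in, while INTERRUPTED_FLAG_FILE, RESUMED and CONT_SLURM_JOB_ID are the names used in the patches.

#!/bin/bash
# (SBATCH directives emitted by the launcher go here)

RESULTS_DIR="/path/to/results"           # placeholder: the job's results folder
TRAIN_CMD="python -u train.py"           # placeholder: the real training command built by the launcher

# Flag file created by the trainer when this job should be rescheduled.
export INTERRUPTED_FLAG_FILE="${RESULTS_DIR}/_interrupted_flag"

# A continuation job exits immediately unless the previous run asked to be resumed.
if [ "$RESUMED" = "1" ] && [ ! -f "$INTERRUPTED_FLAG_FILE" ]; then exit 0; fi

# Pre-schedule the continuation job; it starts only after this job ends (afterany dependency).
CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")
if [ $? -ne 0 ]; then echo "Could not schedule continuation job."; exit 1; fi
CONT_SLURM_JOB_ID=$(echo "$CONT_SBATCH_OUT" | cut -f1 -d",")
rm -f "$INTERRUPTED_FLAG_FILE"

# Run training; --kill-on-bad-exit=0 and --wait=0 (added in patch 05) keep the step
# alive when a single rank exits, so the fault-tolerance logic can react.
srun --kill-on-bad-exit=0 --wait=0 bash -c "$TRAIN_CMD"

# If the trainer did not request a restart, cancel the already-queued continuation job.
if [ ! -f "$INTERRUPTED_FLAG_FILE" ] && [ -n "$CONT_SLURM_JOB_ID" ]; then
    scancel "$CONT_SLURM_JOB_ID"
fi
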
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 05e0826040..d20ccb9a78 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -323,7 +323,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: } ) - fault_tol_conf = stage_cfg.get("model").get("fault_tolerance", None) + fault_tol_conf = stage_cfg.get("exp_manager").get("fault_tolerance", None) if fault_tol_conf is not None: cluster_parameters["autoresume_if_interrupted"] = fault_tol_conf.get("autoresume_if_interrupted", False) if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": From 63dc9276c1eb2894a120099b9787200fa583bd6e Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 27 Nov 2023 15:24:43 +0100 Subject: [PATCH 08/39] Added autoresume after preemption --- launcher_scripts/conf/training/gpt3/126m.yaml | 11 +---------- launcher_scripts/nemo_launcher/core/stages.py | 6 ++++-- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index fb76ae4731..27d3329756 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -26,6 +26,7 @@ trainer: accumulate_grad_batches: 1 gradient_clip_val: 1.0 + exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -167,16 +168,6 @@ model: constant_steps: 100000 min_lr: 6e-5 - fault_tolerance: - initial_rank_heartbeat_timeout: 120 - rank_heartbeat_timeout: 30 - ipc_timeout: 30 - rank_termination_signal: 9 - simulated_fault: - fault_type: rank_killed - base_delay: 180 - autoresume_if_interrupted: False - data: data_impl: mmap splits_string: "99990,8,2" diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index d20ccb9a78..cfabe1068e 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -325,9 +325,11 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: fault_tol_conf = stage_cfg.get("exp_manager").get("fault_tolerance", None) if fault_tol_conf is not None: - cluster_parameters["autoresume_if_interrupted"] = fault_tol_conf.get("autoresume_if_interrupted", False) + resume_on_fault = fault_tol_conf.get("autoresume_if_faulted", False) + resume_on_preemption = fault_tol_conf.get("autoresume_if_preempted", False) + cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - raise ValueError(f"autoresume_if_interrupted works only with 'bcm' cluster (current cluster is '{cluster}')") + raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` works only with 'bcm' cluster (current cluster is '{cluster}')") return cluster_parameters From 2334995b6aeabaf6955010890ba85006fb4bf027 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 28 Nov 2023 16:24:38 +0000 Subject: [PATCH 09/39] Updated auto-resume params reading --- launcher_scripts/nemo_launcher/core/stages.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index cfabe1068e..cd7da3ab19 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -324,12 +324,11 @@ def 
_make_cluster_parameters(self, cluster: str) -> Dict: ) fault_tol_conf = stage_cfg.get("exp_manager").get("fault_tolerance", None) - if fault_tol_conf is not None: - resume_on_fault = fault_tol_conf.get("autoresume_if_faulted", False) - resume_on_preemption = fault_tol_conf.get("autoresume_if_preempted", False) - cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) - if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` works only with 'bcm' cluster (current cluster is '{cluster}')") + resume_on_fault = fault_tol_conf and fault_tol_conf.get("autoresume_if_faulted", False) + resume_on_preemption = stage_cfg.get("exp_manager").get("autoresume_if_preempted", False) + cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) + if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": + raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` works only with 'bcm' cluster (current cluster is '{cluster}')") return cluster_parameters From e50d3887cd392f9066f5b07faa77ff2584994693 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 5 Dec 2023 11:33:49 -0800 Subject: [PATCH 10/39] Added SC2 config and run script --- .../fault_tolerance/run_sc2_3b_on_eos_FT.txt | 33 + .../conf/training/gpt3/starcoder2_3b.yaml | 771 ++++++++++++++++++ 2 files changed, 804 insertions(+) create mode 100644 examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt create mode 100755 launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt new file mode 100644 index 0000000000..6d13973532 --- /dev/null +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -0,0 +1,33 @@ +USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" +LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/starcoder2_3b \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir=/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[$USR_DIR:$USR_DIR] \ + container="gitlab-master.nvidia.com/dl/nemo/nemo-fw/train:sc2_fault_tol" \ + cluster.partition=batch \ + cluster.account=coreai_dlalgo_llm \ + cluster.job_name_prefix="coreai_dlalgo_llm-sc2_3b-ft:" \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + training.run.name="fault_tol_sc2_3b" \ + training.run.time_limit=04:00:00 \ + training.trainer.max_time=00:04:00:00 \ + training.trainer.num_nodes=2 \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=1 \ + training.trainer.val_check_interval=1000 \ + ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=600 \ + ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=300 \ + ++training.exp_manager.fault_tolerance.ipc_timeout=30 \ + ++training.exp_manager.fault_tolerance.rank_termination_signal=9 + ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \ + ++training.exp_manager.autoresume_if_preempted=True + +# Uncomment to test simulated faults +# ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ +# ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=1800 diff --git 
a/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml b/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml new file mode 100755 index 0000000000..b71d98ed3a --- /dev/null +++ b/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml @@ -0,0 +1,771 @@ +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf +run: + name: starcoder2_3b + results_dir: ${base_results_dir}/${.name} + time_limit: 04:00:00 + dependency: singleton +trainer: + num_nodes: 8 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 114400 + max_time: 02:23:30:00 + log_every_n_steps: 10 + val_check_interval: 500 + limit_val_batches: 25 + limit_test_batches: 25 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${base_results_dir}/${.name} + exp_dir: null + name: megatron_gpt + create_wandb_logger: false + wandb_logger_kwargs: + project: starcoder2 + name: starcoder2_3b + resume_if_exists: true + resume_ignore_no_checkpoint: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 2 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + micro_batch_size: 1 + global_batch_size: 160 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 16384 + max_position_embeddings: 16384 + num_layers: 30 + hidden_size: 3072 + ffn_hidden_size: 12288 + num_attention_heads: 24 + init_method_std: 0.018042 + use_scaled_init_method: true + hidden_dropout: 0.1 + attention_dropout: 0.1 + ffn_dropout: 0.0 + kv_channels: 128 + apply_query_key_layer_scaling: true + normalization: layernorm1p + layernorm_zero_centered_gamma: true + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 0.5 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: bigcode/starcoder2-tokenizer + model: null + delimiter: null + vocab_file: null + merge_file: null + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: false + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + overlap_p2p_comm: false + batch_p2p_comm: true + num_query_groups: null + mcore_gpt: true + transformer_engine: false + fp8: 
false + fp8_e4m3: false + fp8_hybrid: true + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + fp8_wgrad: true + ub_tp_comm_overlap: false + optim: + name: distributed_fused_adam + lr: 0.0003 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 100 + constant_steps: 0 + min_lr: 3.0e-05 + data: + data_impl: mmap + splits_string: 9995,3,2 + seq_length: 16384 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + add_fim: true + fim: + rate: 0.5 + spm_rate: 0.5 + split_sample: + fragment_rate: 0.5 + no_prefix: + extra_tokens: + prefix: + middle: + suffix: + pad: + eod: <|endoftext|> + data_prefix: + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_0/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_1/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_2/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_3/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_4/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_5/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_6/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_7/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_8/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_9/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_0/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_1/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_2/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_3/gpt2-preprocessed_content_document + - 2.21 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_4/gpt2-preprocessed_content_document + - 2.59 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_0/gpt2-preprocessed_content_document + - 2.5 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_1/gpt2-preprocessed_content_document + - 2.46 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_2/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_3/gpt2-preprocessed_content_document + - 2.41 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_4/gpt2-preprocessed_content_document + - 2.36 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_5/gpt2-preprocessed_content_document + - 2.72 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_0/gpt2-preprocessed_content_document + - 2.71 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_1/gpt2-preprocessed_content_document + - 2.71 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_2/gpt2-preprocessed_content_document + - 2.73 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_3/gpt2-preprocessed_content_document + - 2.7 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_4/gpt2-preprocessed_content_document + - 2.71 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_5/gpt2-preprocessed_content_document + - 1.68 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/kaggle_scripts/kaggle_scripts_0/gpt2-preprocessed_content_document + - 1.6 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/documentation/documentation_0/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_0/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_1/gpt2-preprocessed_content_document + - 2.43 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_2/gpt2-preprocessed_content_document + - 2.43 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_3/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_4/gpt2-preprocessed_content_document + - 2.29 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/owm/owm_5/gpt2-preprocessed_content_document + - 3.32 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/stackoverflow/stackoverflow_0/gpt2-preprocessed_content_document + - 3.55 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/stackoverflow/stackoverflow_1/gpt2-preprocessed_content_document + - 3.39 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/stackoverflow/stackoverflow_2/gpt2-preprocessed_content_document + - 0.25 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/lhq_data/lhq_data_0/gpt2-preprocessed_content_document + - 0.28 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/lhq_data/lhq_data_1/gpt2-preprocessed_content_document + - 0.47 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/lhq_data/lhq_data_2/gpt2-preprocessed_content_document + - 1.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/ir_cpp/ir_cpp_0/gpt2-preprocessed_content_document + - 1.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/ir_rust/ir_rust_0/gpt2-preprocessed_content_document + - 1.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/ir_python/ir_python_0/gpt2-preprocessed_content_document + - 3.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/ir_low_resource/ir_low_resource_0/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_0/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_1/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_2/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_3/gpt2-preprocessed_content_document + - 2.08 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_4/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_5/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_6/gpt2-preprocessed_content_document + - 1.85 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_7/gpt2-preprocessed_content_document + - 2.09 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_8/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_9/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_10/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_11/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_12/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_13/gpt2-preprocessed_content_document + - 1.84 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_14/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_15/gpt2-preprocessed_content_document + - 1.85 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_16/gpt2-preprocessed_content_document + - 1.83 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_17/gpt2-preprocessed_content_document + - 1.83 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_18/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_19/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_20/gpt2-preprocessed_content_document + - 2.27 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_21/gpt2-preprocessed_content_document + - 2.25 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_22/gpt2-preprocessed_content_document + - 2.49 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_23/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_24/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_25/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_26/gpt2-preprocessed_content_document + - 2.42 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_27/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_28/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_29/gpt2-preprocessed_content_document + - 2.54 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_30/gpt2-preprocessed_content_document + - 2.28 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_31/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_32/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_33/gpt2-preprocessed_content_document + - 2.26 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_34/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_35/gpt2-preprocessed_content_document + - 2.09 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_36/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_37/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_38/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_39/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_40/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_41/gpt2-preprocessed_content_document + - 
2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_42/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_43/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_44/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_45/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_46/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_47/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_48/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_49/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_50/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_51/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_52/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_53/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_54/gpt2-preprocessed_content_document + - 2.18 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_55/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_56/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_57/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_58/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_59/gpt2-preprocessed_content_document + - 2.11 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_60/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_61/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_62/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_63/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_64/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_65/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_66/gpt2-preprocessed_content_document + - 2.45 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_67/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_68/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_69/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_70/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_71/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_72/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_73/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_74/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_75/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_76/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_77/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_78/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_79/gpt2-preprocessed_content_document + - 2.07 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_80/gpt2-preprocessed_content_document + - 2.17 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_81/gpt2-preprocessed_content_document + - 2.65 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_82/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_83/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_84/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_85/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_86/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_87/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_88/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_89/gpt2-preprocessed_content_document + - 2.25 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_90/gpt2-preprocessed_content_document + - 2.11 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_91/gpt2-preprocessed_content_document + - 2.28 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_92/gpt2-preprocessed_content_document + - 
2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_93/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_94/gpt2-preprocessed_content_document + - 2.37 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_95/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_96/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_97/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_98/gpt2-preprocessed_content_document + - 2.26 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_99/gpt2-preprocessed_content_document + - 2.07 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_100/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_101/gpt2-preprocessed_content_document + - 2.22 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_102/gpt2-preprocessed_content_document + - 1.86 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_103/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_104/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_105/gpt2-preprocessed_content_document + - 2.2 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_106/gpt2-preprocessed_content_document + - 2.28 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_107/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_108/gpt2-preprocessed_content_document + - 2.16 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_109/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_110/gpt2-preprocessed_content_document + - 2.32 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_111/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_112/gpt2-preprocessed_content_document + - 2.46 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_113/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_114/gpt2-preprocessed_content_document + - 2.24 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_115/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_116/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_117/gpt2-preprocessed_content_document + - 1.93 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_118/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_119/gpt2-preprocessed_content_document + - 2.3 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_120/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_121/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_122/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_123/gpt2-preprocessed_content_document + - 2.27 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_124/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_125/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_126/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_127/gpt2-preprocessed_content_document + - 2.18 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_128/gpt2-preprocessed_content_document + - 2.22 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_129/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_130/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_131/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_132/gpt2-preprocessed_content_document + - 2.37 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_133/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_134/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_135/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_136/gpt2-preprocessed_content_document + - 2.44 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_137/gpt2-preprocessed_content_document + - 2.16 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_138/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_139/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_140/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_141/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_142/gpt2-preprocessed_content_document + - 2.15 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_143/gpt2-preprocessed_content_document + - 1.85 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_144/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_145/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_146/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_147/gpt2-preprocessed_content_document + - 1.85 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_148/gpt2-preprocessed_content_document + - 2.49 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_149/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_150/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_151/gpt2-preprocessed_content_document + - 1.88 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_152/gpt2-preprocessed_content_document + - 2.36 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_153/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_154/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_155/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_156/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_157/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_158/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_159/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_160/gpt2-preprocessed_content_document + - 2.08 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_161/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_162/gpt2-preprocessed_content_document + - 2.08 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_163/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_164/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_165/gpt2-preprocessed_content_document + - 2.07 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_166/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_167/gpt2-preprocessed_content_document + - 2.28 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_168/gpt2-preprocessed_content_document + - 2.32 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_169/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_170/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_171/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_172/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_173/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_174/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_175/gpt2-preprocessed_content_document + - 2.19 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_176/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_177/gpt2-preprocessed_content_document + - 1.91 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_178/gpt2-preprocessed_content_document + - 2.23 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_179/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_180/gpt2-preprocessed_content_document + - 2.11 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_181/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_182/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_183/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_184/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_185/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_186/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_187/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_188/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_189/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_190/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_191/gpt2-preprocessed_content_document + - 1.89 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_192/gpt2-preprocessed_content_document + - 1.88 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_193/gpt2-preprocessed_content_document + - 2.63 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_194/gpt2-preprocessed_content_document + - 1.87 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_195/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_196/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_197/gpt2-preprocessed_content_document + - 2.0 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_198/gpt2-preprocessed_content_document + - 2.17 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_199/gpt2-preprocessed_content_document + - 2.02 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_200/gpt2-preprocessed_content_document + - 2.11 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_201/gpt2-preprocessed_content_document + - 2.24 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_202/gpt2-preprocessed_content_document + - 2.19 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_203/gpt2-preprocessed_content_document + - 2.07 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_204/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_205/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_206/gpt2-preprocessed_content_document + - 2.18 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_207/gpt2-preprocessed_content_document + - 1.92 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_208/gpt2-preprocessed_content_document + - 2.37 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_209/gpt2-preprocessed_content_document + - 2.03 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_210/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_211/gpt2-preprocessed_content_document + - 1.86 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_212/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_213/gpt2-preprocessed_content_document + - 1.96 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_214/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_215/gpt2-preprocessed_content_document + - 2.1 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_216/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_217/gpt2-preprocessed_content_document + - 1.99 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_218/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_219/gpt2-preprocessed_content_document + - 2.05 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_220/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_221/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_222/gpt2-preprocessed_content_document + - 2.08 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_223/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_224/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_225/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_226/gpt2-preprocessed_content_document + - 2.22 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_227/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_228/gpt2-preprocessed_content_document + - 2.17 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_229/gpt2-preprocessed_content_document + - 2.06 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_230/gpt2-preprocessed_content_document + - 1.98 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_231/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_232/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_233/gpt2-preprocessed_content_document + - 1.97 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_234/gpt2-preprocessed_content_document + - 2.14 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_235/gpt2-preprocessed_content_document + - 2.12 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_236/gpt2-preprocessed_content_document + - 2.09 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_237/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_238/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_239/gpt2-preprocessed_content_document + - 2.01 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_240/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_241/gpt2-preprocessed_content_document + - 1.86 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_242/gpt2-preprocessed_content_document + - 2.12 + - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_243/gpt2-preprocessed_content_document + - 1.99 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_244/gpt2-preprocessed_content_document + - 2.41 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_245/gpt2-preprocessed_content_document + - 2.04 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_246/gpt2-preprocessed_content_document + - 1.95 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_247/gpt2-preprocessed_content_document + - 1.93 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_248/gpt2-preprocessed_content_document + - 2.61 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_249/gpt2-preprocessed_content_document + - 1.77 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_250/gpt2-preprocessed_content_document + - 1.94 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_251/gpt2-preprocessed_content_document + - 2.2 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_252/gpt2-preprocessed_content_document + - 1.9 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_253/gpt2-preprocessed_content_document + - 2.15 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_254/gpt2-preprocessed_content_document + - 2.13 + - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/stack_3b/stack_3b_255/gpt2-preprocessed_content_document From ca7ab16b3c34cc16364edc928a44ed51eaa815b1 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 5 Dec 2023 12:16:53 -0800 Subject: [PATCH 11/39] Fixed launch cmd for Sc2 --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 6d13973532..24663507b7 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -24,10 +24,11 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scri ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=600 \ ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=300 \ ++training.exp_manager.fault_tolerance.ipc_timeout=30 \ - ++training.exp_manager.fault_tolerance.rank_termination_signal=9 + ++training.exp_manager.fault_tolerance.rank_termination_signal=9 \ ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \ ++training.exp_manager.autoresume_if_preempted=True + # Uncomment to test simulated faults # ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ # ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=1800 From 74b2e4227cf6baf1701f1362d0daf283183cb443 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Wed, 6 Dec 2023 10:18:43 -0800 Subject: [PATCH 12/39] Updated launcher cmd --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 24663507b7..57509227e7 100644 --- 
a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -1,7 +1,7 @@ USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" -HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ +NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/starcoder2_3b \ stages=["training"] \ numa_mapping.enable=True \ @@ -26,7 +26,7 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scri ++training.exp_manager.fault_tolerance.ipc_timeout=30 \ ++training.exp_manager.fault_tolerance.rank_termination_signal=9 \ ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \ - ++training.exp_manager.autoresume_if_preempted=True + ++training.exp_manager.autoresume_if_preempted=False # Uncomment to test simulated faults From b8d8a585247835c7b29df0f0aed619d577597550 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Wed, 6 Dec 2023 10:46:13 -0800 Subject: [PATCH 13/39] increased timeouts and removed NVTE_APPLY_QK_LAYER_SCALING --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 57509227e7..99e34fe523 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -1,7 +1,7 @@ USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" -NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/starcoder2_3b \ stages=["training"] \ numa_mapping.enable=True \ @@ -21,9 +21,9 @@ NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 training.trainer.devices=8 \ training.trainer.log_every_n_steps=1 \ training.trainer.val_check_interval=1000 \ - ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=600 \ - ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=300 \ - ++training.exp_manager.fault_tolerance.ipc_timeout=30 \ + ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=720 \ + ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ + ++training.exp_manager.fault_tolerance.ipc_timeout=60 \ ++training.exp_manager.fault_tolerance.rank_termination_signal=9 \ ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \ ++training.exp_manager.autoresume_if_preempted=False From 0d9065ec6f30fdbd96bfa400ac13164003630de3 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 7 Dec 2023 17:55:11 +0100 Subject: [PATCH 14/39] Added back NVTE_APPLY_QK_LAYER_SCALING=1 as error happens without it --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 99e34fe523..71831a30e3 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -1,7 +1,7 @@ USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" 
LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" -HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ +NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/starcoder2_3b \ stages=["training"] \ numa_mapping.enable=True \ From c6367c5fa22c82327bdf91d38715ec842c68c640 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 11 Dec 2023 15:07:00 +0100 Subject: [PATCH 15/39] Added fault tolerance unit tests --- .../config_tests/test_fault_tol_config.py | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py new file mode 100644 index 0000000000..02cb78b17d --- /dev/null +++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py @@ -0,0 +1,111 @@ +import math +import os + +import nemo_launcher.core.launchers +import omegaconf +import pytest +from nemo_launcher.core.stages import Training +from omegaconf import OmegaConf + +# Setup NEMO_LAUNCHER_DEBUG=True, so no 'srun' or 'sbatch' is required +nemo_launcher.core.launchers.NEMO_LAUNCHER_DEBUG = True + +omegaconf.OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True) + +omegaconf.OmegaConf.register_new_resolver( + "divide_ceil", lambda x, y: int(math.ceil(x / y)), replace=True +) + +omegaconf.OmegaConf.register_new_resolver( + "divide_floor", lambda x, y: int(math.floor(x / y)), replace=True +) + +LAUNCHER_SCRIPTS_PATH = "." +TEST_RESULTS_DIR = "test_folder_ft" + + +@pytest.fixture(autouse=True) +def _setup_and_teardown(): + yield + os.system(f"rm -rf {TEST_RESULTS_DIR}") + + +def test_fault_tol_config_no_fault_tol_section(): + """ No fault tolerance section in config: should be fine """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + assert cfg.training.exp_manager.get("fault_tolernace", None) is None + stage = Training(cfg) + _ = stage.run() + + +def test_fault_tol_config_autoresume_if_preempted(): + """ autpresume_if_preempted=True and BCM cluster: should be fine """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.autoresume_if_preempted = True + stage = Training(cfg) + _ = stage.run() + + +def test_fault_tol_config_autoresume_if_preempted_invalid_cluster(): + """ autpresume_if_preempted=True is not allowed with non-BCM cluster """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcp" + cfg.cluster = dict() + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + 
cfg.training.exp_manager.autoresume_if_preempted = True + with pytest.raises(ValueError): + stage = Training(cfg) + _ = stage.run() + + +def test_fault_tol_config_autoresume_if_faulted(): + """ autoresume_if_faulted=True and BCM cluster: should be fine """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.fault_tolerance = OmegaConf.create( + {"autoresume_if_faulted": True} + ) + stage = Training(cfg) + _ = stage.run() + + +def test_fault_tol_config_autoresume_if_faulted_invalid_cluster(): + """ autoresume_if_faulted=True is not allowed with non-BCM cluster """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcp" + cfg.cluster = dict() + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.fault_tolerance = OmegaConf.create( + {"autoresume_if_faulted": True} + ) + with pytest.raises(ValueError): + stage = Training(cfg) + _ = stage.run() From f1dfe54a8e443dac28cf9c6e425690b413ba85ee Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 11 Jan 2024 12:42:57 +0100 Subject: [PATCH 16/39] Use FT launcher, WIP --- .../nemo_launcher/core/launchers.py | 251 ++++++++++++++++-- launcher_scripts/nemo_launcher/core/stages.py | 27 +- .../config_tests/test_fault_tol_config.py | 19 +- 3 files changed, 268 insertions(+), 29 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 7dac967053..2e704d0a36 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -455,9 +455,14 @@ def _make_submission_file_text(self, command_groups: List[List[str]]) -> str: :return: submission script file's text :rtype: str """ - return _make_sbatch_string( - command_groups=command_groups, folder=self.folder, **self.parameters - ) + if getattr(self.parameters, 'use_fault_tolerance', None): + return _make_sbatch_string_ft_launcher( + command_groups=command_groups, folder=self.folder, **self.parameters + ) + else: + return _make_sbatch_string( + command_groups=command_groups, folder=self.folder, **self.parameters + ) @staticmethod def _make_submission_command(submission_file_path: Path) -> List[str]: @@ -586,7 +591,6 @@ def _make_sbatch_string( additional_parameters: Optional[Dict[str, Any]] = None, srun_args: Optional[Iterable[str]] = None, heterogeneous: bool = False, - autoresume_if_interrupted: bool = False, ) -> str: """Creates the content of an sbatch file with provided parameters @@ -627,7 +631,6 @@ def _make_sbatch_string( "container_mounts", "srun_args", "heterogeneous", - "autoresume_if_interrupted", ] parameters = { k: v for k, v in locals().items() if v is not None and k not in nonslurm @@ -689,23 +692,6 @@ def _make_sbatch_string( if setup is not None: lines += ["", "# setup"] + setup - if srun_args is None: - srun_args = [] - - if autoresume_if_interrupted is True: - lines += [ - '', - '# if the flag file is created by a trainer script, this slurm batch script will be rescheduled', - 'export INTERRUPTED_FLAG_FILE='+str(paths.folder / 
"_interrupted_flag"), - 'if [ "$RESUMED" = "1" ] && [ ! -f "$INTERRUPTED_FLAG_FILE" ] ; then exit 0 ; fi', - 'CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', - 'if [ $? -ne 0 ] ; then echo "Could not schedule continuation job. Check stderr for details." ; exit 1 ; fi', - 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', - 'rm -f $INTERRUPTED_FLAG_FILE', - '', - ] - srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] - # commandline (this will run the function and args specified in the file provided as argument) # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern stderr_flags = [] if stderr_to_stdout else ["--error", stderr] @@ -713,6 +699,8 @@ def _make_sbatch_string( container_flags += ( ["--container-mounts", container_mounts] if container_mounts else [] ) + if srun_args is None: + srun_args = [] if NEMO_LAUNCHER_MEMORY_MEASURE: srun_args += ["--overlap"] @@ -787,6 +775,225 @@ def _make_sbatch_string( f' {command} "', "", ] + return "\n".join(lines) + + +# pylint: disable=too-many-arguments,unused-argument, too-many-locals +def _make_sbatch_string_ft_launcher( + command_groups: List[List[str]], + folder: Union[str, Path], + job_name: str = "nemo_launcher", + partition: Optional[str] = None, + time: int = 5, + nodes: Union[int, List[int]] = 1, + ntasks_per_node: Optional[Union[int, List[int]]] = None, + cpus_per_task: Optional[int] = None, + cpus_per_gpu: Optional[int] = None, + num_gpus: Optional[int] = None, # legacy + gpus_per_node: Optional[int] = None, + gpus_per_task: Optional[int] = None, + qos: Optional[str] = None, # quality of service + setup: Optional[List[str]] = None, + mem: Optional[str] = None, + mem_per_gpu: Optional[str] = None, + mem_per_cpu: Optional[str] = None, + dependency: Optional[str] = None, + comment: Optional[str] = None, + constraint: Optional[str] = None, + exclude: Optional[str] = None, + account: Optional[str] = None, + gres: Optional[str] = None, + exclusive: Optional[Union[bool, str]] = None, + array: Optional[str] = None, + stderr_to_stdout: bool = False, + container_image: Optional[str] = None, + container_mounts: Optional[str] = None, + additional_parameters: Optional[Dict[str, Any]] = None, + srun_args: Optional[Iterable[str]] = None, + heterogeneous: bool = False, + autoresume_if_interrupted: bool = False, +) -> str: + + """Creates the content of an sbatch file with provided parameters + + Parameters + ---------- + See slurm sbatch documentation for most parameters: + https://slurm.schedmd.com/sbatch.html + + Below are the parameters that differ from slurm documentation: + + command_groups: + each command group will be assigned one srun + folder: str/Path + folder where print logs and error logs will be written + setup: list + a list of command to run in sbatch before running srun + additional_parameters: dict + Forces any parameter to a given value in sbatch. This can be useful + to add parameters which are not currently available in nemo_launcher. 
+ Eg: {"mail-user": "blublu@nvidia.com", "mail-type": "BEGIN"} + srun_args: List[str] + Add each argument in the list to the srun call + + Raises + ------ + ValueError + In case an erroneous keyword argument is added, a list of all eligible parameters + is printed, with their default values + """ + nonslurm = [ + "nonslurm", + "folder", + "command_groups", + "additional_parameters", + "setup", + "stderr_to_stdout", + "container_image", + "container_mounts", + "srun_args", + "heterogeneous", + "autoresume_if_interrupted", + ] + parameters = { + k: v for k, v in locals().items() if v is not None and k not in nonslurm + } + # rename and reformat parameters + + if num_gpus is not None: + warnings.warn( + '"num_gpus" is deprecated, please use "gpus_per_node" instead (overwritting with num_gpus)' + ) + parameters["gpus_per_node"] = parameters.pop("num_gpus", 0) + if "cpus_per_gpu" in parameters and "gpus_per_task" not in parameters: + warnings.warn( + '"cpus_per_gpu" requires to set "gpus_per_task" to work (and not "gpus_per_node")' + ) + # add necessary parameters + job_name = parameters.get("job_name") + paths = job_utils.JobPaths(folder=folder, job_name=job_name) + stdout = str(paths.stdout) + stderr = str(paths.stderr) + + if array is not None: + stdout = stdout.replace("%j", "%A_%a") + stderr = stderr.replace("%j", "%A_%a") + parameters["output"] = stdout.replace("%t", "0") + + if not stderr_to_stdout: + parameters["error"] = stderr.replace("%t", "0") + + if NEMO_LAUNCHER_CI: # Override output file for slurm + parameters["output"] = parameters["error"] = str(paths.folder / "slurm_%j.out") + stdout = stderr = parameters["output"] + + if additional_parameters is not None: + parameters.update(additional_parameters) + # now create + lines = ["#!/bin/bash", "", "# Parameters"] + if heterogeneous: + raise ValueError("This PoC does not support heterogeneous jobs") + else: + # run 1 FT launcher per node, it will spawn the actual tasks + parameters["ntasks_per_node"] = 1 + for k in sorted(parameters): + lines.append(_as_sbatch_flag(k, parameters[k])) + parameters["ntasks_per_node"] = ntasks_per_node + + lines += ["", "# This script uses experimental fault tolerance launcher", ""] + + # environment setup: + if setup is not None: + lines += ["", "# setup"] + setup + + if srun_args is None: + srun_args = [] + + if autoresume_if_interrupted is True: + lines += [ + '', + 'export INTERRUPTED_FLAG_FILE='+str(paths.folder / "_interrupted_flag"), + 'if [ "$RESUMED" = "1" ] && [ ! -f "$INTERRUPTED_FLAG_FILE" ] ; then exit 0 ; fi', + 'CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', + 'if [ $? -ne 0 ] ; then echo "Could not schedule continuation job. Check stderr for details." 
; exit 1 ; fi', + 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', + 'rm -f $INTERRUPTED_FLAG_FILE', + '', + ] + srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] + + lines += [ + "FT_RDZV_HOST=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)" + ] + + # commandline (this will run the function and args specified in the file provided as argument) + # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern + stderr_flags = [] if stderr_to_stdout else ["--error", stderr] + container_flags = ["--container-image", container_image] if container_image else [] + container_flags += ( + ["--container-mounts", container_mounts] if container_mounts else [] + ) + + if NEMO_LAUNCHER_MEMORY_MEASURE: + srun_args += ["--overlap"] + + mem_stdout = stdout.replace("_%j", "_mem_%j") + mem_stdout = mem_stdout.replace("_%A_%a", "_mem_%A_%a") + mem_srun_cmd = shlex.join( + [ + "srun", + "--ntasks=1", + "--ntasks-per-node=1", + "--output", + mem_stdout, + *container_flags, + *srun_args, + ] + ) + lines += [ + "", + "# run memory measure", + f"{mem_srun_cmd} \\", + f" nvidia-smi --query-gpu=timestamp,index,,memory.total,memory.free,memory.used --format=csv -l 1 & ", + "", + ] + + # Fault tolerance uses Torch Elastic based launcher with SLURM. + # Torch Lightning does not handle that case correctly, + # so we need to force TorchElasticEnvironment over SLURMEnvironment. + # We do this by setting SLURM_JOB_NAME=interactive. + # This is a temporary workaround, until the following PR is merged with NeMo + # https://github.com/Lightning-AI/pytorch-lightning/pull/18618 + ft_launcher_cmd="SLURM_JOB_NAME=interactive ft_launcher " +\ + "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$FT_RDZV_HOST " +\ + f"--nnodes={nodes} --nproc_per_node={ntasks_per_node}" + + for group_ind, command_group in enumerate(command_groups): + if heterogeneous: + raise ValueError("This PoC does not support heterogeneous jobs") + else: + srun_cmd = shlex.join( + [ + "srun", + "--output", + stdout, + *stderr_flags, + *container_flags, + *srun_args, + ] + ) + command = ";\n ".join(command_group) + assert "python3 -u" in command + command = command.replace( + "python3 -u", ft_launcher_cmd, + ) + lines += [ + "", + f"# command {group_ind + 1}", + f'{srun_cmd} bash -c "', + f' {command} "', + "", + ] if autoresume_if_interrupted is True: lines += [ diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 3011b7ebfa..25ce943c5e 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -351,15 +351,30 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: } ) - fault_tol_conf = stage_cfg.get("exp_manager").get("fault_tolerance", None) - resume_on_fault = fault_tol_conf and fault_tol_conf.get("autoresume_if_faulted", False) - resume_on_preemption = stage_cfg.get("exp_manager").get("autoresume_if_preempted", False) - cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) - if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` works only with 'bcm' cluster (current cluster is '{cluster}')") + cluster_parameters = \ + self._update_fault_tolerance_params(stage_cfg, cluster, cluster_parameters) return cluster_parameters + def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): + # TODO: 
cleanup this function + exp_man_conf = stage_cfg.get("exp_manager", None) + resume_on_preemption = exp_man_conf.get("autoresume_if_preempted", False) + ft_conf = exp_man_conf is not None and exp_man_conf.get("fault_tolerance", None) + is_ft_enabled = ft_conf is not None + if is_ft_enabled: + cluster_parameters["use_fault_tolerance"] = True + resume_on_fault = ft_conf.get("autoresume_if_faulted", False) + cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) + if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": + raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` " + f"works only with 'bcm' cluster (current cluster is '{cluster}')") + else: + if resume_on_preemption is True: + raise ValueError(f"`autoresume_if_preempted` works only with fault tolerance enabled") + + return cluster_parameters + def _find_optimal_nodes(self, cfg, gpus) -> None: nodes_scheduler_path = ( f"{cfg.get('training').get('run').get('results_dir')}/nodes_scheduler.json" diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py index 02cb78b17d..9ef6899574 100644 --- a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py +++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py @@ -46,7 +46,7 @@ def test_fault_tol_config_no_fault_tol_section(): def test_fault_tol_config_autoresume_if_preempted(): - """ autpresume_if_preempted=True and BCM cluster: should be fine """ + """ autpresume_if_preempted=True and FT enabled, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -56,9 +56,26 @@ def test_fault_tol_config_autoresume_if_preempted(): cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") cfg.training.exp_manager.autoresume_if_preempted = True + cfg.training.exp_manager.fault_tolerance = OmegaConf.create( + {"autoresume_if_faulted": False} + ) stage = Training(cfg) _ = stage.run() +def test_fault_tol_config_autoresume_if_preempted_no_ft(): + """ autpresume_if_preempted=True without fault tolerance is invalid """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.autoresume_if_preempted = True + with pytest.raises(ValueError): + stage = Training(cfg) + _ = stage.run() def test_fault_tol_config_autoresume_if_preempted_invalid_cluster(): """ autpresume_if_preempted=True is not allowed with non-BCM cluster """ From 87e338632900dd48d7a6767223321ffa8bfc54e2 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 11 Jan 2024 18:11:42 +0100 Subject: [PATCH 17/39] Fix... 
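The configuration handling introduced above boils down to one rule: the launcher reads the optional exp_manager.fault_tolerance section, translates it into cluster parameters, and accepts the auto-resume flags only on a 'bcm' (Slurm) cluster. The snippet below is an illustrative, stand-alone restatement of that rule for quick experimentation; validate_fault_tolerance_cfg is a made-up helper name and not part of the launcher code.

from omegaconf import OmegaConf

def validate_fault_tolerance_cfg(exp_manager_cfg, cluster_type: str) -> dict:
    # Illustrative restatement of the checks added to stages.py above;
    # not the launcher implementation itself.
    params = {}
    ft_cfg = exp_manager_cfg.get("fault_tolerance", None)
    resume_on_preemption = exp_manager_cfg.get("autoresume_if_preempted", False)
    if ft_cfg is not None:
        params["use_fault_tolerance"] = True
        resume_on_fault = ft_cfg.get("autoresume_if_faulted", False)
        params["autoresume_if_interrupted"] = resume_on_fault or resume_on_preemption
        if params["autoresume_if_interrupted"] and cluster_type != "bcm":
            raise ValueError("auto-resume works only with a 'bcm' (Slurm) cluster")
    elif resume_on_preemption:
        raise ValueError("autoresume_if_preempted requires the fault_tolerance section")
    return params

# Mirrors the new unit tests: requesting auto-resume on preemption without a
# fault_tolerance section is rejected.
exp_manager = OmegaConf.create({"autoresume_if_preempted": True})
try:
    validate_fault_tolerance_cfg(exp_manager, "bcm")
except ValueError as err:
    print(f"rejected as expected: {err}")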
--- launcher_scripts/nemo_launcher/core/launchers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 2e704d0a36..b12d09ed88 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -811,6 +811,7 @@ def _make_sbatch_string_ft_launcher( additional_parameters: Optional[Dict[str, Any]] = None, srun_args: Optional[Iterable[str]] = None, heterogeneous: bool = False, + use_fault_tolerance: bool = True, autoresume_if_interrupted: bool = False, ) -> str: @@ -853,6 +854,7 @@ def _make_sbatch_string_ft_launcher( "container_mounts", "srun_args", "heterogeneous", + "use_fault_tolerance", "autoresume_if_interrupted", ] parameters = { From 90fc006d540b0e43acf893345beee15e6ed21603 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 11 Jan 2024 19:54:17 +0100 Subject: [PATCH 18/39] Fix2 --- launcher_scripts/nemo_launcher/core/launchers.py | 5 ++--- launcher_scripts/nemo_launcher/core/stages.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index b12d09ed88..b95b307c2b 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -366,6 +366,7 @@ class SlurmLauncher(Launcher): def __init__(self, folder: Union[Path, str], job_name: str, **kwargs: Any) -> None: super().__init__(folder, job_name) self.parameters = {} + self.use_fault_tolerance = kwargs.pop("use_fault_tolerance", False) self._update_parameters(job_name=job_name, **kwargs) if shutil.which("srun") is None and not NEMO_LAUNCHER_DEBUG: @@ -455,7 +456,7 @@ def _make_submission_file_text(self, command_groups: List[List[str]]) -> str: :return: submission script file's text :rtype: str """ - if getattr(self.parameters, 'use_fault_tolerance', None): + if self.use_fault_tolerance: return _make_sbatch_string_ft_launcher( command_groups=command_groups, folder=self.folder, **self.parameters ) @@ -811,7 +812,6 @@ def _make_sbatch_string_ft_launcher( additional_parameters: Optional[Dict[str, Any]] = None, srun_args: Optional[Iterable[str]] = None, heterogeneous: bool = False, - use_fault_tolerance: bool = True, autoresume_if_interrupted: bool = False, ) -> str: @@ -854,7 +854,6 @@ def _make_sbatch_string_ft_launcher( "container_mounts", "srun_args", "heterogeneous", - "use_fault_tolerance", "autoresume_if_interrupted", ] parameters = { diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 25ce943c5e..5655272e7d 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -362,8 +362,8 @@ def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters) resume_on_preemption = exp_man_conf.get("autoresume_if_preempted", False) ft_conf = exp_man_conf is not None and exp_man_conf.get("fault_tolerance", None) is_ft_enabled = ft_conf is not None + cluster_parameters["use_fault_tolerance"] = is_ft_enabled if is_ft_enabled: - cluster_parameters["use_fault_tolerance"] = True resume_on_fault = ft_conf.get("autoresume_if_faulted", False) cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": From 89362b328222895cc9243a7b90cfef105ff3442f Mon Sep 17 00:00:00 2001 From: Jacek 
Bieniusiewicz Date: Tue, 16 Jan 2024 11:16:43 +0100 Subject: [PATCH 19/39] Updating for FT launcher, wip... --- .../fault_tolerance/run_sc2_3b_on_eos_FT.txt | 2 +- .../nemo_launcher/core/launchers.py | 29 +++++++++++++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index 71831a30e3..d39b5bafc0 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -1,5 +1,5 @@ USR_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi" -LAUNCHER_DIR="/lustre/fsw/coreai_dlalgo_llm/jbieniusiewi/nemo/NeMo-Megatron-Launcher/" +LAUNCHER_DIR="/home/jbieniusiewi/nvwork/sc2/NeMo-Megatron-Launcher" NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/starcoder2_3b \ diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index b95b307c2b..40e8ac3452 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -357,16 +357,21 @@ class SlurmLauncher(Launcher): :param Union[Path, str] folder: folder for storing job submission/output and logs. :param str job_name: Name of the job, used as job folder name + :param bool use_fault_tolerance: Use fault tolerance launcher to run the job :param Any **kwargs: See slurm documentation for most parameters. Most useful parameters are: time, mem, gpus_per_node, cpus_per_task, partition Below are the parameters that differ from slurm documentation: setup: a list of command to run in sbatch before running srun """ - def __init__(self, folder: Union[Path, str], job_name: str, **kwargs: Any) -> None: + def __init__(self, + folder: Union[Path, str], + job_name: str, + use_fault_tolerance: bool, + **kwargs: Any) -> None: super().__init__(folder, job_name) self.parameters = {} - self.use_fault_tolerance = kwargs.pop("use_fault_tolerance", False) + self.use_fault_tolerance = use_fault_tolerance self._update_parameters(job_name=job_name, **kwargs) if shutil.which("srun") is None and not NEMO_LAUNCHER_DEBUG: @@ -387,9 +392,12 @@ def _equivalence_dict(cls): } @classmethod - def _valid_parameters(cls) -> Set[str]: + def _valid_parameters(cls, use_fault_tolerance) -> Set[str]: """Parameters that can be set through update_parameters""" - return set(_get_default_parameters()) + if use_fault_tolerance: + return set(_get_default_parameters(_make_sbatch_string_ft_launcher)) + else: + return set(_get_default_parameters(_make_sbatch_string)) def _convert_parameters(self, params: Dict[str, Any]) -> Dict[str, Any]: """translate slurm parameter names""" @@ -413,7 +421,11 @@ def _update_parameters(self, **kwargs: Any) -> None: Below are the parameters that differ from slurm documentation: setup: a list of command to run in sbatch before running srun """ - defaults = _get_default_parameters() + + if self.use_fault_tolerance: + defaults = _get_default_parameters_ft_launcher() + else: + defaults = _get_default_parameters() in_valid_parameters = sorted(set(kwargs) - set(defaults)) if in_valid_parameters: string = "\n - ".join( @@ -779,6 +791,13 @@ def _make_sbatch_string( return "\n".join(lines) +@functools.lru_cache() +def _get_default_parameters_ft_launcher() -> Dict[str, Any]: + """Parameters that can be set through update_parameters""" + specs = inspect.getfullargspec(_make_sbatch_string_ft_launcher) + zipped = 
zip(specs.args[-len(specs.defaults) :], specs.defaults)  # type: ignore
+    return {key: val for key, val in zipped if key not in {"command_groups", "folder"}}
+
 # pylint: disable=too-many-arguments,unused-argument, too-many-locals
 def _make_sbatch_string_ft_launcher(

From b4ee9ed81dc1f305ffaad205c742348e0e3cd2dc Mon Sep 17 00:00:00 2001
From: Jacek Bieniusiewicz
Date: Wed, 17 Jan 2024 12:02:41 +0100
Subject: [PATCH 20/39] Updated FT params reading

---
 .../fault_tolerance/run_sc2_3b_on_eos_FT.txt  |  4 +-
 .../nemo_launcher/core/launchers.py           | 76 +++++++++++++------
 launcher_scripts/nemo_launcher/core/stages.py | 22 ++----
 .../config_tests/test_fault_tol_config.py    | 61 ++++-----------
 4 files changed, 75 insertions(+), 88 deletions(-)

diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt
index d39b5bafc0..f2fd6fcc93 100644
--- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt
+++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt
@@ -25,8 +25,8 @@ NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3
     ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \
     ++training.exp_manager.fault_tolerance.ipc_timeout=60 \
     ++training.exp_manager.fault_tolerance.rank_termination_signal=9 \
-    ++training.exp_manager.fault_tolerance.autoresume_if_faulted=True \
-    ++training.exp_manager.autoresume_if_preempted=False
+    ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=2 \
+    ++training.exp_manager.fault_tolerance.max_rank_restarts=1 \


 # Uncomment to test simulated faults

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index 40e8ac3452..86ffd589dc 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -798,6 +798,7 @@ def _get_default_parameters_ft_launcher() -> Dict[str, Any]:
     zipped = zip(specs.args[-len(specs.defaults) :], specs.defaults)  # type: ignore
     return {key: val for key, val in zipped if key not in {"command_groups", "folder"}}
+
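The _get_default_parameters_ft_launcher helper shown above derives the set of accepted launcher parameters from the keyword arguments of the sbatch-string builder itself. The toy example below demonstrates that introspection pattern in isolation; _toy_sbatch_builder is only a stand-in for _make_sbatch_string_ft_launcher and is not part of the launcher.

import functools
import inspect
from typing import Any, Dict

def _toy_sbatch_builder(command_groups, folder, job_name="nemo_launcher",
                        partition=None, max_subsequent_job_failures=0,
                        max_rank_restarts=0):
    """Stand-in with the same signature shape as the real builder."""
    return ""

@functools.lru_cache()
def _toy_default_parameters() -> Dict[str, Any]:
    # Same pattern as _get_default_parameters_ft_launcher: pair each keyword
    # argument with its default value and drop the two positional inputs.
    specs = inspect.getfullargspec(_toy_sbatch_builder)
    zipped = zip(specs.args[-len(specs.defaults):], specs.defaults)
    return {key: val for key, val in zipped if key not in {"command_groups", "folder"}}

print(_toy_default_parameters())
# -> {'job_name': 'nemo_launcher', 'partition': None,
#     'max_subsequent_job_failures': 0, 'max_rank_restarts': 0}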
-f "$INTERRUPTED_FLAG_FILE" ] ; then exit 0 ; fi', - 'CONT_SBATCH_OUT=$(RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', - 'if [ $? -ne 0 ] ; then echo "Could not schedule continuation job. Check stderr for details." ; exit 1 ; fi', + '# Automatic job resubmission related items', + f'JOB_RESULTS_FILE="{str(paths.folder / "_job_results")}"', + f'MAX_JOB_FAILURES={max_subsequent_job_failures}', + 'is_job_failures_limit_reached() {', + ' tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | awk "/0/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"', + '}', + 'is_training_finished() {', + ' test -f "$FAULT_TOL_FINISHED_FLAG_FILE"', + '}', + '# Exit immediately if finished flag file exists and this job is a continuation', + 'if [ "$FT_RESUMED" = "1" ] ; then', + ' if is_training_finished ; then echo "Training is finished" ; exit 0 ; fi', + ' if is_job_failures_limit_reached ; then echo "Job failures limit reached ($MAX_JOB_FAILURES)" ; exit 1 ; fi', + 'else', + ' rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"', + 'fi', + '# Pre-schedule continuation job', + 'CONT_SBATCH_OUT=$(FT_RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', + 'if [ $? -ne 0 ] ; then echo "Couldnt schedule continuation job. Check stderr for details." ; exit 1 ; fi', 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', - 'rm -f $INTERRUPTED_FLAG_FILE', - '', ] - srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] - - lines += [ - "FT_RDZV_HOST=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)" - ] - + # commandline (this will run the function and args specified in the file provided as argument) # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern stderr_flags = [] if stderr_to_stdout else ["--error", stderr] @@ -984,9 +1004,9 @@ def _make_sbatch_string_ft_launcher( # We do this by setting SLURM_JOB_NAME=interactive. # This is a temporary workaround, until the following PR is merged with NeMo # https://github.com/Lightning-AI/pytorch-lightning/pull/18618 - ft_launcher_cmd="SLURM_JOB_NAME=interactive ft_launcher " +\ - "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$FT_RDZV_HOST " +\ - f"--nnodes={nodes} --nproc_per_node={ntasks_per_node}" + ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher " +\ + "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\ + f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}" for group_ind, command_group in enumerate(command_groups): if heterogeneous: @@ -1005,7 +1025,7 @@ def _make_sbatch_string_ft_launcher( command = ";\n ".join(command_group) assert "python3 -u" in command command = command.replace( - "python3 -u", ft_launcher_cmd, + "python3 -u", ft_launcher_cmd_part, ) lines += [ "", @@ -1014,15 +1034,21 @@ def _make_sbatch_string_ft_launcher( f' {command} "', "", ] + lines += [ + 'if [ $? -ne 0 ]; then IS_THIS_JOB_SUCCESSFUL=0 ; fi' + ] - if autoresume_if_interrupted is True: + if max_subsequent_job_failures > 0: lines += [ '', - '# cancel continuation job if no continuation marker file was created', - 'if [ ! -f "$INTERRUPTED_FLAG_FILE" ] && [ ! 
-z "$CONT_SLURM_JOB_ID" ] ; then', - 'scancel $CONT_SLURM_JOB_ID', - 'fi' - '', + '# Check if the continuation job can be cancelled', + 'echo $IS_THIS_JOB_SUCCESSFUL >> $JOB_RESULTS_FILE', + 'if is_training_finished ; then', + ' echo "Training is finished" ; scancel $CONT_SLURM_JOB_ID ; exit 0', + 'fi', + 'if is_job_failures_limit_reached ; then', + ' echo "Job failures limit reached ($MAX_JOB_FAILURES)" ; scancel $CONT_SLURM_JOB_ID ; exit 1', + 'fi', ] return "\n".join(lines) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 5655272e7d..8cbdaaee43 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -357,22 +357,16 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: return cluster_parameters def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): - # TODO: cleanup this function exp_man_conf = stage_cfg.get("exp_manager", None) - resume_on_preemption = exp_man_conf.get("autoresume_if_preempted", False) ft_conf = exp_man_conf is not None and exp_man_conf.get("fault_tolerance", None) - is_ft_enabled = ft_conf is not None - cluster_parameters["use_fault_tolerance"] = is_ft_enabled - if is_ft_enabled: - resume_on_fault = ft_conf.get("autoresume_if_faulted", False) - cluster_parameters["autoresume_if_interrupted"] = (resume_on_fault or resume_on_preemption) - if cluster_parameters["autoresume_if_interrupted"] is True and cluster != "bcm": - raise ValueError(f"`autoresume_if_faulted` and `autoresume_if_preempted` " - f"works only with 'bcm' cluster (current cluster is '{cluster}')") - else: - if resume_on_preemption is True: - raise ValueError(f"`autoresume_if_preempted` works only with fault tolerance enabled") - + cluster_parameters["use_fault_tolerance"] = ft_conf is not None + if cluster_parameters["use_fault_tolerance"]: + if cluster.lower() != "bcm": + raise ValueError(f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')") + cluster_parameters["max_rank_restarts"] = \ + ft_conf.get('max_rank_restarts', 0) + cluster_parameters["max_subsequent_job_failures"] = \ + ft_conf.get('max_subsequent_job_failures', 0) return cluster_parameters def _find_optimal_nodes(self, cfg, gpus) -> None: diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py index 9ef6899574..d2198fd5fc 100644 --- a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py +++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py @@ -30,8 +30,8 @@ def _setup_and_teardown(): os.system(f"rm -rf {TEST_RESULTS_DIR}") -def test_fault_tol_config_no_fault_tol_section(): - """ No fault tolerance section in config: should be fine """ +def test_fault_tol_config_no_fault_tol_section_bcm(): + """ No fault tolerance section in config, BCM cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -45,40 +45,8 @@ def test_fault_tol_config_no_fault_tol_section(): _ = stage.run() -def test_fault_tol_config_autoresume_if_preempted(): - """ autpresume_if_preempted=True and FT enabled, should be fine """ - cfg = OmegaConf.load("conf/config.yaml") - cfg.stages = ["training"] - cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH - cfg.base_results_dir = TEST_RESULTS_DIR - cfg.cluster_type = "bcm" - cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") - 
cfg.training_config = "gpt3/126m" - cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - cfg.training.exp_manager.autoresume_if_preempted = True - cfg.training.exp_manager.fault_tolerance = OmegaConf.create( - {"autoresume_if_faulted": False} - ) - stage = Training(cfg) - _ = stage.run() - -def test_fault_tol_config_autoresume_if_preempted_no_ft(): - """ autpresume_if_preempted=True without fault tolerance is invalid """ - cfg = OmegaConf.load("conf/config.yaml") - cfg.stages = ["training"] - cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH - cfg.base_results_dir = TEST_RESULTS_DIR - cfg.cluster_type = "bcm" - cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") - cfg.training_config = "gpt3/126m" - cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - cfg.training.exp_manager.autoresume_if_preempted = True - with pytest.raises(ValueError): - stage = Training(cfg) - _ = stage.run() - -def test_fault_tol_config_autoresume_if_preempted_invalid_cluster(): - """ autpresume_if_preempted=True is not allowed with non-BCM cluster """ +def test_fault_tol_config_no_fault_tol_section_bcp(): + """ No fault tolerance section in config, BCP cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -87,14 +55,13 @@ def test_fault_tol_config_autoresume_if_preempted_invalid_cluster(): cfg.cluster = dict() cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - cfg.training.exp_manager.autoresume_if_preempted = True - with pytest.raises(ValueError): - stage = Training(cfg) - _ = stage.run() + assert cfg.training.exp_manager.get("fault_tolernace", None) is None + stage = Training(cfg) + _ = stage.run() -def test_fault_tol_config_autoresume_if_faulted(): - """ autoresume_if_faulted=True and BCM cluster: should be fine """ +def test_fault_tol_config_with_bcm(): + """ Fault tolerance + BCM cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -104,14 +71,13 @@ def test_fault_tol_config_autoresume_if_faulted(): cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") cfg.training.exp_manager.fault_tolerance = OmegaConf.create( - {"autoresume_if_faulted": True} + {"max_subsequent_job_failures": 1} ) stage = Training(cfg) _ = stage.run() - -def test_fault_tol_config_autoresume_if_faulted_invalid_cluster(): - """ autoresume_if_faulted=True is not allowed with non-BCM cluster """ +def test_fault_tol_config_with_bcp(): + """ Fault tolerance + BCP cluster, BCP is not supported """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -121,8 +87,9 @@ def test_fault_tol_config_autoresume_if_faulted_invalid_cluster(): cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") cfg.training.exp_manager.fault_tolerance = OmegaConf.create( - {"autoresume_if_faulted": True} + {"max_subsequent_job_failures": 1} ) with pytest.raises(ValueError): stage = Training(cfg) _ = stage.run() + From 050ccf2a6af1ef12a49b2f61a860207a95610b34 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Wed, 17 Jan 2024 12:15:26 +0000 Subject: [PATCH 21/39] Fixes after testing on DlCluster... 
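For reference, a minimal way to exercise the fault tolerance config tests above, assuming a NeMo-Megatron-Launcher checkout with the launcher requirements and pytest installed (the tests resolve conf/config.yaml relative to the working directory, so they are run from launcher_scripts):

cd NeMo-Megatron-Launcher/launcher_scripts
python3 -m pytest -q tests/unit_tests/config_tests/test_fault_tol_config.py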
--- launcher_scripts/nemo_launcher/core/launchers.py | 2 +- launcher_scripts/nemo_launcher/core/stages.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 86ffd589dc..44da1621bf 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -1004,7 +1004,7 @@ def _make_sbatch_string_ft_launcher( # We do this by setting SLURM_JOB_NAME=interactive. # This is a temporary workaround, until the following PR is merged with NeMo # https://github.com/Lightning-AI/pytorch-lightning/pull/18618 - ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher " +\ + ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher --fault-tol-cfg-path=$FAULT_TOL_CFG_PATH " +\ "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\ f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}" diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 8cbdaaee43..afde2e9241 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -47,7 +47,8 @@ def __init__(self, cfg): self.stage_cfg = None self.setup_stage_vars(cfg) self.job_name = self.stage_cfg.run.get("name") - + if self.cluster.lower() == 'bcm': + self.job_name = cfg.get("cluster").get("job_name_prefix","") + self.job_name self.nodes_scheduler = {} def setup_stage_vars(self, cfg: OmegaConf): @@ -281,7 +282,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: cfg = self.cfg stage_cfg = self.stage_cfg run_cfg = stage_cfg.get("run") - job_name = run_cfg.get("name") + job_name = self.job_name time_limit = run_cfg.get("time_limit") nodes = run_cfg.get("nodes") dependency = run_cfg.get("dependency") @@ -315,7 +316,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: cluster_cfg["srun_args"] = [] cluster_cfg["srun_args"] += ["--mpi=pmix"] slurm_cfg = {**copy.deepcopy(cluster_cfg)} - job_name_prefix = slurm_cfg.pop("job_name_prefix") + slurm_cfg.pop("job_name_prefix") cluster_parameters = {**slurm_cfg} cluster_parameters.update( { @@ -325,9 +326,6 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: "container_mounts": container_mounts, } ) - cluster_parameters["job_name"] = ( - job_name_prefix + cluster_parameters["job_name"] - ) elif cluster == "bcp": cluster_parameters.update( { From 147024b692fbe67616d0506179d7841006df4899 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 18 Jan 2024 13:54:18 +0100 Subject: [PATCH 22/39] Update after 'create_fault_tolerance_callback' param was added --- launcher_scripts/nemo_launcher/core/stages.py | 9 ++--- .../config_tests/test_fault_tol_config.py | 34 ++++++++++++++----- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index afde2e9241..0a36e09c54 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -355,12 +355,13 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: return cluster_parameters def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): - exp_man_conf = stage_cfg.get("exp_manager", None) - ft_conf = exp_man_conf is not None and exp_man_conf.get("fault_tolerance", None) - cluster_parameters["use_fault_tolerance"] = ft_conf is 
not None - if cluster_parameters["use_fault_tolerance"]: + exp_man_conf = stage_cfg.get("exp_manager", dict()) + use_ft = exp_man_conf.get('create_fault_tolerance_callback', False) + cluster_parameters["use_fault_tolerance"] = use_ft + if use_ft: if cluster.lower() != "bcm": raise ValueError(f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')") + ft_conf = exp_man_conf.get("fault_tolerance", dict()) cluster_parameters["max_rank_restarts"] = \ ft_conf.get('max_rank_restarts', 0) cluster_parameters["max_subsequent_job_failures"] = \ diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py index d2198fd5fc..8a9f7a1d70 100644 --- a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py +++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py @@ -30,8 +30,8 @@ def _setup_and_teardown(): os.system(f"rm -rf {TEST_RESULTS_DIR}") -def test_fault_tol_config_no_fault_tol_section_bcm(): - """ No fault tolerance section in config, BCM cluster, should be fine """ +def test_fault_tol_config_fault_tol_disabled_bcm(): + """ No fault tolerance, BCM cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -40,13 +40,14 @@ def test_fault_tol_config_no_fault_tol_section_bcm(): cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - assert cfg.training.exp_manager.get("fault_tolernace", None) is None + assert cfg.training.exp_manager.get("create_fault_tolerance_callback", None) is None + assert cfg.training.exp_manager.get("fault_toleranace", None) is None stage = Training(cfg) _ = stage.run() -def test_fault_tol_config_no_fault_tol_section_bcp(): - """ No fault tolerance section in config, BCP cluster, should be fine """ +def test_fault_tol_config_fault_tol_disabled_bcp(): + """ No fault tolerance, BCP cluster, should be fine """ cfg = OmegaConf.load("conf/config.yaml") cfg.stages = ["training"] cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH @@ -55,7 +56,8 @@ def test_fault_tol_config_no_fault_tol_section_bcp(): cfg.cluster = dict() cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - assert cfg.training.exp_manager.get("fault_tolernace", None) is None + assert cfg.training.exp_manager.get("create_fault_tolerance_callback", None) is None + assert cfg.training.exp_manager.get("fault_toleranace", None) is None stage = Training(cfg) _ = stage.run() @@ -70,12 +72,28 @@ def test_fault_tol_config_with_bcm(): cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") + cfg.training.exp_manager.create_fault_tolerance_callback=True cfg.training.exp_manager.fault_tolerance = OmegaConf.create( {"max_subsequent_job_failures": 1} ) stage = Training(cfg) _ = stage.run() +def test_fault_tol_config_with_bcm_no_ft_section(): + """ Fault tolerance + BCM cluster, no "fault_tolerance" section in cfg, should be fine """ + cfg = OmegaConf.load("conf/config.yaml") + cfg.stages = ["training"] + cfg.launcher_scripts_path = LAUNCHER_SCRIPTS_PATH + cfg.base_results_dir = TEST_RESULTS_DIR + cfg.cluster_type = "bcm" + cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml") + cfg.training_config = "gpt3/126m" + cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") 
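+    # With create_fault_tolerance_callback=True and no "fault_tolerance" section,
+    # _update_fault_tolerance_params() falls back to its defaults
+    # (max_rank_restarts=0, max_subsequent_job_failures=0), so the stage should still build.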
+ cfg.training.exp_manager.create_fault_tolerance_callback=True + stage = Training(cfg) + _ = stage.run() + + def test_fault_tol_config_with_bcp(): """ Fault tolerance + BCP cluster, BCP is not supported """ cfg = OmegaConf.load("conf/config.yaml") @@ -86,9 +104,7 @@ def test_fault_tol_config_with_bcp(): cfg.cluster = dict() cfg.training_config = "gpt3/126m" cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml") - cfg.training.exp_manager.fault_tolerance = OmegaConf.create( - {"max_subsequent_job_failures": 1} - ) + cfg.training.exp_manager.create_fault_tolerance_callback=True with pytest.raises(ValueError): stage = Training(cfg) _ = stage.run() From 5105a29311049749d6c7975daf4bf9fc386c6454 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 19 Jan 2024 11:58:50 +0100 Subject: [PATCH 23/39] Improved auto-resume --- examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt | 1 + launcher_scripts/nemo_launcher/core/launchers.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt index f2fd6fcc93..1ccf8af65e 100644 --- a/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt +++ b/examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt @@ -21,6 +21,7 @@ NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 training.trainer.devices=8 \ training.trainer.log_every_n_steps=1 \ training.trainer.val_check_interval=1000 \ + ++training.exp_manager.create_fault_tolerance_callback=True \ ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=720 \ ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ ++training.exp_manager.fault_tolerance.ipc_timeout=60 \ diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 44da1621bf..af59639457 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -938,7 +938,7 @@ def _make_sbatch_string_ft_launcher( f'export FAULT_TOL_CFG_PATH="{str(paths.config_file)}"', f'export FAULT_TOL_FINISHED_FLAG_FILE="{str(paths.folder / "_finished_flag")}"', 'RDZV_HOST=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)', - 'IS_THIS_JOB_SUCCESSFUL=1', + 'ANY_JOB_STEP_FAILED=0', ] if max_subsequent_job_failures > 0: @@ -948,7 +948,8 @@ def _make_sbatch_string_ft_launcher( f'JOB_RESULTS_FILE="{str(paths.folder / "_job_results")}"', f'MAX_JOB_FAILURES={max_subsequent_job_failures}', 'is_job_failures_limit_reached() {', - ' tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | awk "/0/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"', + ' tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | \\', + ' awk "/^[[:alnum:]]+[[:space:]]+F$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"', '}', 'is_training_finished() {', ' test -f "$FAULT_TOL_FINISHED_FLAG_FILE"', @@ -964,6 +965,8 @@ def _make_sbatch_string_ft_launcher( 'CONT_SBATCH_OUT=$(FT_RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', 'if [ $? -ne 0 ] ; then echo "Couldnt schedule continuation job. Check stderr for details." ; exit 1 ; fi', 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', + '# Write failure to the job log, eventually we will fix it at the end', + 'echo "$SLURM_JOB_ID F" >> "$JOB_RESULTS_FILE"', ] # commandline (this will run the function and args specified in the file provided as argument) @@ -1035,14 +1038,17 @@ def _make_sbatch_string_ft_launcher( "", ] lines += [ - 'if [ $? 
-ne 0 ]; then IS_THIS_JOB_SUCCESSFUL=0 ; fi' + 'if [ $? -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi' ] if max_subsequent_job_failures > 0: lines += [ '', + '# Fix the job log entry ("JOB_ID F" -> "JOB_ID S"), if the job was successful', + 'if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then', + ' sed -i "s/^$SLURM_JOB_ID[[:space:]]\+F/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"', + 'fi', '# Check if the continuation job can be cancelled', - 'echo $IS_THIS_JOB_SUCCESSFUL >> $JOB_RESULTS_FILE', 'if is_training_finished ; then', ' echo "Training is finished" ; scancel $CONT_SLURM_JOB_ID ; exit 0', 'fi', From 58741ae068ec9b8049c500114d926b76ccefcfc3 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 19 Jan 2024 13:15:26 +0100 Subject: [PATCH 24/39] Improved auto-resume-cont --- launcher_scripts/nemo_launcher/core/launchers.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index af59639457..75598fa799 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -965,8 +965,8 @@ def _make_sbatch_string_ft_launcher( 'CONT_SBATCH_OUT=$(FT_RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")', 'if [ $? -ne 0 ] ; then echo "Couldnt schedule continuation job. Check stderr for details." ; exit 1 ; fi', 'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")', - '# Write failure to the job log, eventually we will fix it at the end', - 'echo "$SLURM_JOB_ID F" >> "$JOB_RESULTS_FILE"', + '# Write unknown job status to the job log, we will fix it at the end', + 'echo "$SLURM_JOB_ID X" >> "$JOB_RESULTS_FILE"', ] # commandline (this will run the function and args specified in the file provided as argument) @@ -1046,7 +1046,9 @@ def _make_sbatch_string_ft_launcher( '', '# Fix the job log entry ("JOB_ID F" -> "JOB_ID S"), if the job was successful', 'if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then', - ' sed -i "s/^$SLURM_JOB_ID[[:space:]]\+F/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"', + ' sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"', + 'else', + ' sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID F/" "$JOB_RESULTS_FILE"', 'fi', '# Check if the continuation job can be cancelled', 'if is_training_finished ; then', From 44421184f43fee6d7a310974bf60267272b33ac8 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 22 Jan 2024 13:19:05 +0100 Subject: [PATCH 25/39] Use hostname to get the rendezvous host --- launcher_scripts/nemo_launcher/core/launchers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 75598fa799..f6ea5c9f2a 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -937,7 +937,7 @@ def _make_sbatch_string_ft_launcher( '# Fault tolerance related items', f'export FAULT_TOL_CFG_PATH="{str(paths.config_file)}"', f'export FAULT_TOL_FINISHED_FLAG_FILE="{str(paths.folder / "_finished_flag")}"', - 'RDZV_HOST=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)', + 'RDZV_HOST=$(hostname)', 'ANY_JOB_STEP_FAILED=0', ] From 25cc521e2accbb4ac14c8b8b3a45f145e23c53bd Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 23 Jan 2024 12:02:45 +0100 Subject: [PATCH 26/39] Added additional_ft_launcher_args --- launcher_scripts/nemo_launcher/core/launchers.py | 5 ++++- launcher_scripts/nemo_launcher/core/stages.py 
| 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index f6ea5c9f2a..5a2e0ef64b 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -834,6 +834,7 @@ def _make_sbatch_string_ft_launcher( heterogeneous: bool = False, max_subsequent_job_failures: int = 0, max_rank_restarts: int = 0, + additional_ft_launcher_args: str = "", ) -> str: """Creates the content of an sbatch file with provided parameters @@ -877,6 +878,7 @@ def _make_sbatch_string_ft_launcher( "heterogeneous", "max_subsequent_job_failures", "max_rank_restarts", + "additional_ft_launcher_args", ] parameters = { k: v for k, v in locals().items() if v is not None and k not in nonslurm @@ -1007,7 +1009,8 @@ def _make_sbatch_string_ft_launcher( # We do this by setting SLURM_JOB_NAME=interactive. # This is a temporary workaround, until the following PR is merged with NeMo # https://github.com/Lightning-AI/pytorch-lightning/pull/18618 - ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher --fault-tol-cfg-path=$FAULT_TOL_CFG_PATH " +\ + ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher "+\ + f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH {additional_ft_launcher_args} "+\ "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\ f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}" diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 0a36e09c54..b2a9dce0ef 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -366,6 +366,8 @@ def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters) ft_conf.get('max_rank_restarts', 0) cluster_parameters["max_subsequent_job_failures"] = \ ft_conf.get('max_subsequent_job_failures', 0) + cluster_parameters["additional_ft_launcher_args"] = \ + ft_conf.get('additional_ft_launcher_args', "") return cluster_parameters def _find_optimal_nodes(self, cfg, gpus) -> None: From 874b651aa62171635c0233f7a3c2dace65f04c05 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 25 Jan 2024 13:01:32 +0100 Subject: [PATCH 27/39] Restored --kill-on-bad-exit=0, --wait=3600 --- launcher_scripts/nemo_launcher/core/launchers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 5a2e0ef64b..4c2aa41cf8 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -934,6 +934,10 @@ def _make_sbatch_string_ft_launcher( if srun_args is None: srun_args = [] + # FT launcher will terminate failed workers, no need SLURM for that. + # A safety measure, let SLURM kill the job, 1h after any task ended. 
+ srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] + lines += [ '', '# Fault tolerance related items', From 83164111ef36cb7fa3a93ace06d7c6e8b52e107c Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 25 Jan 2024 13:19:55 +0100 Subject: [PATCH 28/39] Fixed comment --- launcher_scripts/nemo_launcher/core/launchers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 4c2aa41cf8..16a83c83cb 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -1051,7 +1051,7 @@ def _make_sbatch_string_ft_launcher( if max_subsequent_job_failures > 0: lines += [ '', - '# Fix the job log entry ("JOB_ID F" -> "JOB_ID S"), if the job was successful', + '# Fix the job log entry ("JOB_ID X" -> "JOB_ID S/F"), depending on the job result', 'if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then', ' sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"', 'else', From fb80ec0bb9125b5d9c68c9e89e6ae277b966d102 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Thu, 25 Jan 2024 14:24:17 +0100 Subject: [PATCH 29/39] --kill-on-bad-exit=1 --- launcher_scripts/nemo_launcher/core/launchers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index 16a83c83cb..fc514aaee4 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -934,9 +934,10 @@ def _make_sbatch_string_ft_launcher( if srun_args is None: srun_args = [] - # FT launcher will terminate failed workers, no need SLURM for that. - # A safety measure, let SLURM kill the job, 1h after any task ended. - srun_args += ["--kill-on-bad-exit=0", "--wait=3600"] + # A safety measures: + # let SLURM kill all tasks if any FT launcher returns with a failure. + # let SLURM kill the job, 1h after any task ended. 
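+    # (For reference: srun's --kill-on-bad-exit=1 terminates the whole step as soon as any
+    # task exits with a non-zero status, while --wait=3600 gives the remaining tasks up to
+    # 3600 s after the first task ends before srun terminates them.)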
+ srun_args += ["--kill-on-bad-exit=1", "--wait=3600"] lines += [ '', From 79b371f2097771418eb3c06e2bd06fee0493f328 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 5 Feb 2024 15:44:15 +0100 Subject: [PATCH 30/39] Set FT work dir --- launcher_scripts/nemo_launcher/core/stages.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index b2a9dce0ef..f94dd205a4 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -81,6 +81,8 @@ def run(self) -> str: f"global batch size and number of nodes will change following this schedule:\n {self.nodes_scheduler}" ) + self._set_fault_tolerance_work_dir_in_stage_cfg(self.stage_cfg, self.cluster) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config( self.stage_cfg, job_path, self.cfg ) @@ -353,15 +355,19 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: self._update_fault_tolerance_params(stage_cfg, cluster, cluster_parameters) return cluster_parameters - - def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): + + def _get_fault_tol_config_section(self, stage_cfg, cluster): exp_man_conf = stage_cfg.get("exp_manager", dict()) use_ft = exp_man_conf.get('create_fault_tolerance_callback', False) - cluster_parameters["use_fault_tolerance"] = use_ft if use_ft: if cluster.lower() != "bcm": raise ValueError(f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')") - ft_conf = exp_man_conf.get("fault_tolerance", dict()) + return use_ft, exp_man_conf.get("fault_tolerance", dict()) + + def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters): + use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster) + cluster_parameters["use_fault_tolerance"] = use_ft + if use_ft: cluster_parameters["max_rank_restarts"] = \ ft_conf.get('max_rank_restarts', 0) cluster_parameters["max_subsequent_job_failures"] = \ @@ -369,6 +375,12 @@ def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters) cluster_parameters["additional_ft_launcher_args"] = \ ft_conf.get('additional_ft_launcher_args', "") return cluster_parameters + + def _set_fault_tolerance_work_dir_in_stage_cfg(self, stage_cfg, cluster): + use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster) + if use_ft: + with omegaconf.open_dict(ft_conf): + ft_conf.work_dir = str(self.get_job_path().folder) def _find_optimal_nodes(self, cfg, gpus) -> None: nodes_scheduler_path = ( From ff6e939eeffcf8067092dd8d4b62b18fc6fd8aae Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 5 Feb 2024 16:18:49 +0100 Subject: [PATCH 31/39] Set FT work dir/fix --- launcher_scripts/nemo_launcher/core/stages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index f94dd205a4..344065b353 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -380,7 +380,7 @@ def _set_fault_tolerance_work_dir_in_stage_cfg(self, stage_cfg, cluster): use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster) if use_ft: with omegaconf.open_dict(ft_conf): - ft_conf.work_dir = str(self.get_job_path().folder) + ft_conf.work_dir = str(self.get_job_path().folder / "_ft_scratch_dir") def _find_optimal_nodes(self, cfg, gpus) -> None: nodes_scheduler_path = ( From 
e5ade772be7e5d5fe4c0dd9d648e81d512f7b1e9 Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Mon, 5 Feb 2024 08:46:03 -0800 Subject: [PATCH 32/39] Added test script for DracoRNO/wip --- .../run_gpt_on_draco_rno_FT.txt | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt diff --git a/examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt b/examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt new file mode 100644 index 0000000000..98402f3f87 --- /dev/null +++ b/examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt @@ -0,0 +1,59 @@ +USR_DIR="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi" +LAUNCHER_DIR="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/ft/NeMo-Megatron-Launcher" + +# create dummy data this that is required by the launcher +# we will use mock data +mkdir -p ${LAUNCHER_DIR}/dummy_data_dir + + +# USE SC2 container, but train GPT3 5b + +NVTE_APPLY_QK_LAYER_SCALING=1 HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface/hub python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/5b \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir=${LAUNCHER_DIR}/dummy_data_dir \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[$USR_DIR:$USR_DIR] \ + container="gitlab-master.nvidia.com/dl/nemo/nemo-fw/train:sc2_fault_tol_elastic" \ + cluster.partition=batch_short_dgx1_m2 \ + cluster.account=coreai_dlalgo_llm \ + cluster.job_name_prefix="coreai_dlalgo_llm-test-ft5b:" \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + ++cluster.nv_meta="ml-model.fault_tol_tests" \ + ++cluster.gres="gpu:8" \ + ++cluster.signal="TERM@180" \ + training.exp_manager.resume_if_exists=True \ + training.exp_manager.create_checkpoint_callback=True \ + training.exp_manager.checkpoint_callback_params.save_top_k=1 \ + training.exp_manager.resume_ignore_no_checkpoint=True \ + training.run.name="fault_tol_gpt3_5b_dbg" \ + training.run.time_limit=00:30:00 \ + training.trainer.max_time=00:01:00:00 \ + training.trainer.num_nodes=4 \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=10 \ + training.trainer.val_check_interval=400 \ + ++training.trainer.precision=16 \ + ++training.model.mcore_gpt=False \ + ++training.model.tokenizer.merge_file="/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt3/bpe/gpt2-merges.txt" \ + ++training.model.tokenizer.vocab_file="/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt3/bpe/gpt2-vocab.txt" \ + training.trainer.enable_checkpointing=False \ + training.model.micro_batch_size=1 \ + training.model.global_batch_size=4 \ + training.model.tensor_model_parallel_size=8 \ + training.model.pipeline_model_parallel_size=1 \ + ++training.exp_manager.create_fault_tolerance_callback=True \ + ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=null \ + ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=null \ + ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \ + ++training.exp_manager.fault_tolerance.max_rank_restarts=0 + + +# +# ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ +# ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=900 +# From c2f8cba16dc5e8dbf97d61c17f264fc2698a738b Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Tue, 5 Mar 2024 13:22:36 +0100 Subject: [PATCH 33/39] Working on test scripts/WIP --- 
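For reference, with max_subsequent_job_failures enabled as in the example script above, the generated sbatch script keeps its auto-resume bookkeeping next to the job results. A quick way to inspect that state after a run is sketched below; the results path is illustrative only and depends on base_results_dir and training.run.name:

RESULTS_DIR="${LAUNCHER_DIR}/launcher_scripts/results/fault_tol_gpt3_5b_dbg"   # hypothetical location
cat "${RESULTS_DIR}/_job_results"          # one "<SLURM_JOB_ID> S|F|X" line per job attempt (S=success, F=failed, X=not yet finalized)
ls "${RESULTS_DIR}/_finished_flag" 2>/dev/null && echo "training finished"     # this flag is what stops the continuation-job chain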
examples/fault_tolerance/run_on_cluster.sh | 74 ++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 examples/fault_tolerance/run_on_cluster.sh diff --git a/examples/fault_tolerance/run_on_cluster.sh b/examples/fault_tolerance/run_on_cluster.sh new file mode 100644 index 0000000000..510c5bdbab --- /dev/null +++ b/examples/fault_tolerance/run_on_cluster.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +CLUSTER="draco-rno" +CONTAINER="gitlab-master.nvidia.com:5005/dl/gwe/fault_tolerance_related/nemo-gwe-ft:test" +RUN_NAME="fault_tol_gpt3_5b_dbg_no_err" +NODES=2 + +FT_ARGS=" + ++training.exp_manager.create_fault_tolerance_callback=True \ + ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=900 \ + ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ + ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \ + ++training.exp_manager.fault_tolerance.max_rank_restarts=0 + ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ + ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=900 +" + +if [ "$CLUSTER" == "draco-rno" ]; then + PARTITION="batch_short_dgx1_m2" + ACCOUNT="coreai_dlalgo_llm" + JOB_PREFIX="coreai_dlalgo_llm-test-ft5b:" + CLUSTER_SPECIFIC_ARGS="++cluster.nv_meta=\"ml-model.fault_tol_tests\"" + USR_DIR="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi" + LAUNCHER_DIR="${USR_DIR}/ft/NeMo-Megatron-Launcher" +else + echo "Unknown cluster: $CLUSTER" + exit 1 +fi + +# create dummy data this that is required by the launcher +# we will use mock data +mkdir -p ${LAUNCHER_DIR}/dummy_data_dir + +HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ + training=gpt3/5b \ + stages=["training"] \ + numa_mapping.enable=True \ + data_dir=${LAUNCHER_DIR}/dummy_data_dir \ + training.model.data.data_impl="mock" \ + training.model.data.data_prefix=[] \ + launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ + container_mounts=[$USR_DIR:$USR_DIR] \ + container=${CONTAINER} \ + cluster.partition=${PARTITION} \ + cluster.account=${ACCOUNT} \ + cluster.job_name_prefix=${JOB_PREFIX} \ + ${CLUSTER_SPECIFIC_ARGS} \ + cluster.gpus_per_task=null \ + cluster.gpus_per_node=null \ + ++cluster.gres="gpu:8" \ + ++cluster.signal="TERM@300" \ + training.exp_manager.resume_if_exists=True \ + training.exp_manager.create_checkpoint_callback=True \ + training.exp_manager.checkpoint_callback_params.save_top_k=1 \ + training.exp_manager.resume_ignore_no_checkpoint=True \ + training.run.name=${RUN_NAME} \ + training.run.time_limit=00:45:00 \ + training.trainer.max_time=00:03:00:00 \ + training.trainer.num_nodes=${NODES} \ + training.trainer.devices=8 \ + training.trainer.log_every_n_steps=10 \ + training.trainer.val_check_interval=400 \ + ++training.trainer.precision=16 \ + ++training.model.mcore_gpt=False \ + ++training.model.tokenizer.merge_file="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/bpe/gpt2-merges.txt" \ + ++training.model.tokenizer.vocab_file="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/bpe/gpt2-vocab.json" \ + training.trainer.enable_checkpointing=False \ + training.model.micro_batch_size=1 \ + training.model.global_batch_size=${NODES} \ + training.model.tensor_model_parallel_size=8 \ + training.model.pipeline_model_parallel_size=1 \ + ${FT_ARGS} + + From 76f176af37ff1c899e602ad13ecaaf3c18a1092c Mon Sep 17 00:00:00 2001 From: Jacek Bieniusiewicz Date: Fri, 8 Mar 2024 04:05:19 -0800 Subject: [PATCH 34/39] Version for testing --- examples/fault_tolerance/run_on_cluster.sh | 56 +- 
.../conf/training/gpt3/starcoder2_3b.yaml | 771 ------------------ .../nemo_launcher/core/launchers.py | 4 +- launcher_scripts/nemo_launcher/core/stages.py | 8 - 4 files changed, 32 insertions(+), 807 deletions(-) mode change 100644 => 100755 examples/fault_tolerance/run_on_cluster.sh delete mode 100755 launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml diff --git a/examples/fault_tolerance/run_on_cluster.sh b/examples/fault_tolerance/run_on_cluster.sh old mode 100644 new mode 100755 index 510c5bdbab..b28bc18597 --- a/examples/fault_tolerance/run_on_cluster.sh +++ b/examples/fault_tolerance/run_on_cluster.sh @@ -1,27 +1,32 @@ #!/bin/bash +# NOTE: NeMo-Megatron-Launcher requirements should be installed +# e.g. cd /NeMo-Megatron-Launcher && pip install -r requirements.txt + CLUSTER="draco-rno" -CONTAINER="gitlab-master.nvidia.com:5005/dl/gwe/fault_tolerance_related/nemo-gwe-ft:test" -RUN_NAME="fault_tol_gpt3_5b_dbg_no_err" -NODES=2 +CONTAINER="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/ft/NeMo-Megatron-Launcher-gwe-ft/dl+gwe+fault_tolerance_related+nemo-gwe-ft+test.sqsh" # "gitlab-master.nvidia.com/dl/gwe/fault_tolerance_related/nemo-gwe-ft:test" +RUN_NAME="fault_tol_gpt3_5b_no_err" +NODES=4 FT_ARGS=" ++training.exp_manager.create_fault_tolerance_callback=True \ - ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=900 \ - ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ - ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \ - ++training.exp_manager.fault_tolerance.max_rank_restarts=0 - ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ - ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=900 + ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=1 " +# ++training.exp_manager.fault_tolerance.initial_rank_heartbeat_timeout=900 \ +# ++training.exp_manager.fault_tolerance.rank_heartbeat_timeout=600 \ +# ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \ +# ++training.exp_manager.fault_tolerance.max_rank_restarts=0 +# ++training.exp_manager.fault_tolerance.simulated_fault.fault_type=random \ +# ++training.exp_manager.fault_tolerance.simulated_fault.base_delay=900 + if [ "$CLUSTER" == "draco-rno" ]; then PARTITION="batch_short_dgx1_m2" ACCOUNT="coreai_dlalgo_llm" - JOB_PREFIX="coreai_dlalgo_llm-test-ft5b:" + JOB_PREFIX="coreai_dlalgo_llm-test:" CLUSTER_SPECIFIC_ARGS="++cluster.nv_meta=\"ml-model.fault_tol_tests\"" USR_DIR="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi" - LAUNCHER_DIR="${USR_DIR}/ft/NeMo-Megatron-Launcher" + LAUNCHER_DIR="${USR_DIR}/ft/NeMo-Megatron-Launcher-gwe-ft" else echo "Unknown cluster: $CLUSTER" exit 1 @@ -31,13 +36,14 @@ fi # we will use mock data mkdir -p ${LAUNCHER_DIR}/dummy_data_dir -HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ +HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface/hub HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \ training=gpt3/5b \ stages=["training"] \ numa_mapping.enable=True \ data_dir=${LAUNCHER_DIR}/dummy_data_dir \ - training.model.data.data_impl="mock" \ - training.model.data.data_prefix=[] \ + ++training.model.data.mock_dataset=True \ + ++training.model.data.data_impl="mock" \ + ++training.model.data.data_prefix=[] \ launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \ container_mounts=[$USR_DIR:$USR_DIR] \ container=${CONTAINER} \ @@ -45,30 +51,26 @@ HYDRA_FULL_ERROR=1 PYTHONWARNINGS="ignore" python3 
${LAUNCHER_DIR}/launcher_scri cluster.account=${ACCOUNT} \ cluster.job_name_prefix=${JOB_PREFIX} \ ${CLUSTER_SPECIFIC_ARGS} \ - cluster.gpus_per_task=null \ - cluster.gpus_per_node=null \ ++cluster.gres="gpu:8" \ - ++cluster.signal="TERM@300" \ + ++cluster.signal="TERM@240" \ training.exp_manager.resume_if_exists=True \ training.exp_manager.create_checkpoint_callback=True \ training.exp_manager.checkpoint_callback_params.save_top_k=1 \ training.exp_manager.resume_ignore_no_checkpoint=True \ training.run.name=${RUN_NAME} \ - training.run.time_limit=00:45:00 \ - training.trainer.max_time=00:03:00:00 \ + training.run.time_limit=00:20:00 \ + training.trainer.max_time=00:01:30:00 \ training.trainer.num_nodes=${NODES} \ training.trainer.devices=8 \ training.trainer.log_every_n_steps=10 \ - training.trainer.val_check_interval=400 \ + training.trainer.val_check_interval=50 \ ++training.trainer.precision=16 \ - ++training.model.mcore_gpt=False \ - ++training.model.tokenizer.merge_file="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/bpe/gpt2-merges.txt" \ - ++training.model.tokenizer.vocab_file="/gpfs/fs1/projects/ent_joc/users/jbieniusiewi/bpe/gpt2-vocab.json" \ + ++training.model.tokenizer.merge_file="${USR_DIR}/bpe/gpt2-merges.txt" \ + ++training.model.tokenizer.vocab_file="${USR_DIR}/bpe/gpt2-vocab.txt" \ training.trainer.enable_checkpointing=False \ training.model.micro_batch_size=1 \ - training.model.global_batch_size=${NODES} \ - training.model.tensor_model_parallel_size=8 \ - training.model.pipeline_model_parallel_size=1 \ + training.model.global_batch_size=$((${NODES} * 8)) \ + training.model.tensor_model_parallel_size=2 \ + training.model.pipeline_model_parallel_size=4 \ ${FT_ARGS} - diff --git a/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml b/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml deleted file mode 100755 index b71d98ed3a..0000000000 --- a/launcher_scripts/conf/training/gpt3/starcoder2_3b.yaml +++ /dev/null @@ -1,771 +0,0 @@ -hydra: - searchpath: - - file:///opt/NeMo/examples/nlp/language_modeling/conf -run: - name: starcoder2_3b - results_dir: ${base_results_dir}/${.name} - time_limit: 04:00:00 - dependency: singleton -trainer: - num_nodes: 8 - devices: 8 - accelerator: gpu - precision: bf16 - logger: false - enable_checkpointing: false - use_distributed_sampler: false - max_epochs: null - max_steps: 114400 - max_time: 02:23:30:00 - log_every_n_steps: 10 - val_check_interval: 500 - limit_val_batches: 25 - limit_test_batches: 25 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 -exp_manager: - explicit_log_dir: ${base_results_dir}/${.name} - exp_dir: null - name: megatron_gpt - create_wandb_logger: false - wandb_logger_kwargs: - project: starcoder2 - name: starcoder2_3b - resume_if_exists: true - resume_ignore_no_checkpoint: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: false - save_nemo_on_train_end: false - filename: megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples} - model_parallel_size: 2 - log_step_timing: true - step_timing_kwargs: - sync_cuda: true - buffer_size: 5 -model: - micro_batch_size: 1 - global_batch_size: 160 - rampup_batch_size: null - tensor_model_parallel_size: 2 - pipeline_model_parallel_size: 1 - virtual_pipeline_model_parallel_size: null - encoder_seq_length: 16384 - max_position_embeddings: 16384 - num_layers: 30 - hidden_size: 3072 - ffn_hidden_size: 12288 - num_attention_heads: 24 - init_method_std: 0.018042 - use_scaled_init_method: true 
- hidden_dropout: 0.1 - attention_dropout: 0.1 - ffn_dropout: 0.0 - kv_channels: 128 - apply_query_key_layer_scaling: true - normalization: layernorm1p - layernorm_zero_centered_gamma: true - layernorm_epsilon: 1.0e-05 - do_layer_norm_weight_decay: false - make_vocab_size_divisible_by: 128 - pre_process: true - post_process: true - persist_layer_norm: true - bias: false - activation: fast-swiglu - headscale: false - transformer_block_type: pre_ln - openai_gelu: false - normalize_attention_scores: true - position_embedding_type: rope - rotary_percentage: 0.5 - attention_type: multihead - share_embeddings_and_output_weights: false - tokenizer: - library: huggingface - type: bigcode/starcoder2-tokenizer - model: null - delimiter: null - vocab_file: null - merge_file: null - native_amp_init_scale: 4294967296 - native_amp_growth_interval: 1000 - hysteresis: 2 - fp32_residual_connection: false - fp16_lm_cross_entropy: false - megatron_amp_O2: true - grad_allreduce_chunk_size_mb: 125 - grad_div_ar_fusion: true - gradient_accumulation_fusion: false - bias_activation_fusion: false - bias_dropout_add_fusion: false - masked_softmax_fusion: true - seed: 1234 - resume_from_checkpoint: null - use_cpu_initialization: false - onnx_safe: false - apex_transformer_log_level: 30 - gradient_as_bucket_view: true - sync_batch_comm: false - activations_checkpoint_granularity: null - activations_checkpoint_method: null - activations_checkpoint_num_layers: null - num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_layers_per_pipeline: null - sequence_parallel: false - overlap_p2p_comm: false - batch_p2p_comm: true - num_query_groups: null - mcore_gpt: true - transformer_engine: false - fp8: false - fp8_e4m3: false - fp8_hybrid: true - fp8_margin: 0 - fp8_interval: 1 - fp8_amax_history_len: 1024 - fp8_amax_compute_algo: max - fp8_wgrad: true - ub_tp_comm_overlap: false - optim: - name: distributed_fused_adam - lr: 0.0003 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - sched: - name: CosineAnnealing - warmup_steps: 100 - constant_steps: 0 - min_lr: 3.0e-05 - data: - data_impl: mmap - splits_string: 9995,3,2 - seq_length: 16384 - skip_warmup: true - num_workers: 2 - dataloader_type: single - reset_position_ids: false - reset_attention_mask: false - eod_mask_loss: false - index_mapping_dir: null - add_fim: true - fim: - rate: 0.5 - spm_rate: 0.5 - split_sample: - fragment_rate: 0.5 - no_prefix: - extra_tokens: - prefix: - middle: - suffix: - pad: - eod: <|endoftext|> - data_prefix: - - 1.96 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_0/gpt2-preprocessed_content_document - - 1.96 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_1/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_2/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_3/gpt2-preprocessed_content_document - - 1.96 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_4/gpt2-preprocessed_content_document - - 1.95 - - 
/lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_5/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_6/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_7/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_8/gpt2-preprocessed_content_document - - 1.95 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/pull_requests/pull_requests_9/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_0/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_1/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_2/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_3/gpt2-preprocessed_content_document - - 2.21 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/issues/issues_4/gpt2-preprocessed_content_document - - 2.59 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_0/gpt2-preprocessed_content_document - - 2.5 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_1/gpt2-preprocessed_content_document - - 2.46 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_2/gpt2-preprocessed_content_document - - 2.42 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_3/gpt2-preprocessed_content_document - - 2.41 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_4/gpt2-preprocessed_content_document - - 2.36 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_structured/jupyter_structured_5/gpt2-preprocessed_content_document - - 2.72 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_0/gpt2-preprocessed_content_document - - 2.71 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_1/gpt2-preprocessed_content_document - - 2.71 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_2/gpt2-preprocessed_content_document - - 2.73 - - /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/tokenized_stack_v2_final/jupyter_scripts/jupyter_scripts_3/gpt2-preprocessed_content_document - - 2.7 - - 
[Remaining data blend entries elided: per-shard weights and gpt2-preprocessed_content_document
 paths under /lustre/fsw/coreai_dlalgo_llm/aot/datasets/stack_v2_final_tokenized/ for the
 jupyter_scripts, kaggle_scripts, documentation, owm, stackoverflow, lhq_data, ir_cpp, ir_rust,
 ir_python, ir_low_resource and stack_3b_0 ... stack_3b_255 shards.]
diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index fc514aaee4..ed68abf6d2 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -1014,8 +1014,10 @@ def _make_sbatch_string_ft_launcher(
     # We do this by setting SLURM_JOB_NAME=interactive.
     # This is a temporary workaround, until the following PR is merged with NeMo
     # https://github.com/Lightning-AI/pytorch-lightning/pull/18618
+    # --ignore-missing-fault-tol-cfg is used so FT launcher can handle NeMo YAML without fault_tolerance section
+    # in such case default FT config will be used
     ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher "+\
-        f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH {additional_ft_launcher_args} "+\
+        f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg {additional_ft_launcher_args} "+\
         "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\
         f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}"

diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index 344065b353..02f2411e4a 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -81,8 +81,6 @@ def run(self) -> str:
             f"global batch size and number of nodes will change following this schedule:\n {self.nodes_scheduler}"
         )

-        self._set_fault_tolerance_work_dir_in_stage_cfg(self.stage_cfg, self.cluster)
-
         stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(
             self.stage_cfg, job_path, self.cfg
         )
@@ -376,12 +374,6 @@ def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters)
                 ft_conf.get('additional_ft_launcher_args', "")
         return cluster_parameters

-    def _set_fault_tolerance_work_dir_in_stage_cfg(self, stage_cfg, cluster):
-        use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster)
-        if use_ft:
-            with omegaconf.open_dict(ft_conf):
-                ft_conf.work_dir = str(self.get_job_path().folder / "_ft_scratch_dir")
-
    def _find_optimal_nodes(self, cfg, gpus) -> None:
        nodes_scheduler_path = (
            f"{cfg.get('training').get('run').get('results_dir')}/nodes_scheduler.json"
From 8e0cf6f17f47aa416f8669224e3e26027ccad91d Mon Sep 17 00:00:00 2001
From: Jacek Bieniusiewicz
Date: Tue, 30 Apr 2024 10:12:46 +0200
Subject: [PATCH 35/39] Handle unknown job result as a failure, added some comments

---
 launcher_scripts/nemo_launcher/core/launchers.py | 6 +++---
 launcher_scripts/nemo_launcher/core/stages.py    | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index 4e8b7c0ec5..4a5ffe8488 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -951,8 +951,8 @@ def _make_sbatch_string_ft_launcher(
         srun_args = []

     # A safety measures:
-    # let SLURM kill all tasks if any FT launcher returns with a failure.
-    # let SLURM kill the job, 1h after any task ended.
+    # let SLURM immediately kill all tasks if any FT launcher returns with a failure.
+    # let SLURM kill the job, 1h after any task ended without a failure.
     srun_args += ["--kill-on-bad-exit=1", "--wait=3600"]

     lines += [
@@ -972,7 +972,7 @@ def _make_sbatch_string_ft_launcher(
            f'MAX_JOB_FAILURES={max_subsequent_job_failures}',
            'is_job_failures_limit_reached() {',
            '  tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | \\',
-            '    awk "/^[[:alnum:]]+[[:space:]]+F$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"',
+            '    awk "/^[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"',
            '}',
            'is_training_finished() {',
            '  test -f "$FAULT_TOL_FINISHED_FLAG_FILE"',
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index c0759e5128..6ef1e973d1 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -73,6 +73,8 @@ def __init__(self, cfg):
         self.setup_stage_vars(cfg)
         self.job_name = self.stage_cfg.run.get("name")
         if self.cluster.lower() == 'bcm':
+            # this to ensure that submission filename (.sh) matches the config filename (.yaml)
+            # expected result: _submission.sh, _hydra.yaml
             self.job_name = cfg.get("cluster").get("job_name_prefix","") + self.job_name
         self.nodes_scheduler = {}
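To make the bookkeeping above concrete, here is a small standalone sketch of how the job-results
file and the failure-limit check behave after this patch; the job IDs and the local file name are
invented, while the line format ("<job id> X|S|F") and the awk check are taken verbatim from the patch:

    # hypothetical contents of the results file after three runs:
    #   1001 F   -> run that ended with a failed job step
    #   1002 X   -> run that never rewrote its status (now counted as a failure)
    #   1003 S   -> successful run
    MAX_JOB_FAILURES=2
    JOB_RESULTS_FILE=./_job_results          # assumed local path for the demo
    printf '1001 F\n1002 X\n1003 S\n' > "$JOB_RESULTS_FILE"
    is_job_failures_limit_reached() {
        tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | \
            awk "/^[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"
    }
    # only the last MAX_JOB_FAILURES entries are inspected; here they are "1002 X" and "1003 S",
    # so one failure is counted and the limit of 2 is not reached yet:
    if is_job_failures_limit_reached ; then echo "limit reached" ; else echo "keep going" ; fi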
From 23d03cb739dafccb6d490fd967d96d33c914f201 Mon Sep 17 00:00:00 2001
From: Maanu Grover
Date: Tue, 4 Jun 2024 13:43:02 -0700
Subject: [PATCH 36/39] formatting

Signed-off-by: Maanu Grover
---
 .../nemo_launcher/core/launchers.py           | 102 +++++++++---------
 launcher_scripts/nemo_launcher/core/stages.py |  47 ++++----
 .../config_tests/test_fault_tol_config.py     |   8 +-
 3 files changed, 83 insertions(+), 74 deletions(-)

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index e4197da211..2fb22bd526 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -26,9 +26,9 @@

 import nemo_launcher.utils.job_utils as job_utils
 import yaml
+from hera.workflows import Workflow
 from nemo_launcher.core.logger import logger
 from omegaconf import DictConfig, OmegaConf
-from hera.workflows import Workflow

 NEMO_LAUNCHER_CI = os.getenv("NEMO_LAUNCHER_CI", "False").lower() in ("true", "t", "1")
 NEMO_LAUNCHER_DEBUG = os.getenv("NEMO_LAUNCHER_DEBUG", "False").lower() in (
@@ -367,11 +367,13 @@ class SlurmLauncher(Launcher):
         setup: a list of command to run in sbatch before running srun
     """

-    def __init__(self,
-                 folder: Union[Path, str],
-                 job_name: str,
-                 use_fault_tolerance: bool,
-                 **kwargs: Any) -> None:
+    def __init__(
+        self,
+        folder: Union[Path, str],
+        job_name: str,
+        use_fault_tolerance: bool,
+        **kwargs: Any,
+    ) -> None:
         super().__init__(folder, job_name)
         self.parameters = {}
         self.use_fault_tolerance = use_fault_tolerance
@@ -424,7 +426,7 @@ def _update_parameters(self, **kwargs: Any) -> None:
         Below are the parameters that differ from slurm documentation:
             setup: a list of command to run in sbatch before running srun
         """
-        
+
         if self.use_fault_tolerance:
             defaults = _get_default_parameters_ft_launcher()
         else:
@@ -474,7 +476,7 @@ def _make_submission_file_text(self, command_groups: List[List[str]]) -> str:
         if self.use_fault_tolerance:
             return _make_sbatch_string_ft_launcher(
                 command_groups=command_groups, folder=self.folder, **self.parameters
-            )   
+            )
         else:
             return _make_sbatch_string(
                 command_groups=command_groups, folder=self.folder, **self.parameters
@@ -897,7 +899,7 @@ def _make_sbatch_string_ft_launcher(
     max_rank_restarts: int = 0,
     additional_ft_launcher_args: str = "",
 ) -> str:
-    
+
     """Creates the content of an sbatch file with provided parameters

     Parameters
@@ -985,9 +987,9 @@ def _make_sbatch_string_ft_launcher(
     for k in sorted(parameters):
         lines.append(_as_sbatch_flag(k, parameters[k]))
     parameters["ntasks_per_node"] = ntasks_per_node
-    
+
     lines += ["", "# This script uses experimental fault tolerance launcher", ""]
-    
+
     # environment setup:
     if setup is not None:
         lines += ["", "# setup"] + setup
@@ -1001,42 +1003,42 @@ def _make_sbatch_string_ft_launcher(
     srun_args += ["--kill-on-bad-exit=1", "--wait=3600"]

     lines += [
-        '',
-        '# Fault tolerance related items',
+        "",
+        "# Fault tolerance related items",
         f'export FAULT_TOL_CFG_PATH="{str(paths.config_file)}"',
         f'export FAULT_TOL_FINISHED_FLAG_FILE="{str(paths.folder / "_finished_flag")}"',
-        'RDZV_HOST=$(hostname)',
-        'ANY_JOB_STEP_FAILED=0',
+        "RDZV_HOST=$(hostname)",
+        "ANY_JOB_STEP_FAILED=0",
     ]
-    
+
     if max_subsequent_job_failures > 0:
         lines += [
-            '',
-            '# Automatic job resubmission related items',
+            "",
+            "# Automatic job resubmission related items",
             f'JOB_RESULTS_FILE="{str(paths.folder / "_job_results")}"',
-            f'MAX_JOB_FAILURES={max_subsequent_job_failures}',
-            'is_job_failures_limit_reached() {',
+            f"MAX_JOB_FAILURES={max_subsequent_job_failures}",
+            "is_job_failures_limit_reached() {",
            '  tail -n $MAX_JOB_FAILURES "$JOB_RESULTS_FILE" | \\',
            '    awk "/^[[:alnum:]]+[[:space:]]+[XF]$/{f++} END{exit !(f>=$MAX_JOB_FAILURES)}"',
-            '}',
-            'is_training_finished() {',
+            "}",
+            "is_training_finished() {",
            '  test -f "$FAULT_TOL_FINISHED_FLAG_FILE"',
-            '}',
-            '# Exit immediately if finished flag file exists and this job is a continuation',
+            "}",
+            "# Exit immediately if finished flag file exists and this job is a continuation",
            'if [ "$FT_RESUMED" = "1" ] ; then',
            '  if is_training_finished ; then echo "Training is finished" ; exit 0 ; fi',
            '  if is_job_failures_limit_reached ; then echo "Job failures limit reached ($MAX_JOB_FAILURES)" ; exit 1 ; fi',
-            'else',
+            "else",
            '  rm -f "$FAULT_TOL_FINISHED_FLAG_FILE" "$JOB_RESULTS_FILE"',
-            'fi',
-            '# Pre-schedule continuation job',
+            "fi",
+            "# Pre-schedule continuation job",
            'CONT_SBATCH_OUT=$(FT_RESUMED=1 sbatch --parsable --dependency=afterany:"$SLURM_JOB_ID" "$0")',
            'if [ $? -ne 0 ] ; then echo "Couldnt schedule continuation job. Check stderr for details." ; exit 1 ; fi',
            'CONT_SLURM_JOB_ID=$(echo $CONT_SBATCH_OUT | cut -f1 -d",")',
-            '# Write unknown job status to the job log, we will fix it at the end',
+            "# Write unknown job status to the job log, we will fix it at the end",
            'echo "$SLURM_JOB_ID X" >> "$JOB_RESULTS_FILE"',
         ]
-    
+
     # commandline (this will run the function and args specified in the file provided as argument)
     # We pass --output and --error here, because the SBATCH command doesn't work as expected with a filename pattern
     stderr_flags = [] if stderr_to_stdout else ["--error", stderr]
@@ -1068,19 +1070,21 @@ def _make_sbatch_string_ft_launcher(
        f"  nvidia-smi --query-gpu=timestamp,index,,memory.total,memory.free,memory.used --format=csv -l 1 & ",
        "",
    ]
-    
+
    # Fault tolerance uses Torch Elastic based launcher with SLURM.
-    # Torch Lightning does not handle that case correctly, 
-    # so we need to force TorchElasticEnvironment over SLURMEnvironment. 
-    # We do this by setting SLURM_JOB_NAME=interactive. 
-    # This is a temporary workaround, until the following PR is merged with NeMo 
+    # Torch Lightning does not handle that case correctly,
+    # so we need to force TorchElasticEnvironment over SLURMEnvironment.
+    # We do this by setting SLURM_JOB_NAME=interactive.
+    # This is a temporary workaround, until the following PR is merged with NeMo
    # https://github.com/Lightning-AI/pytorch-lightning/pull/18618
    # --ignore-missing-fault-tol-cfg is used so FT launcher can handle NeMo YAML without fault_tolerance section
    # in such case default FT config will be used
-    ft_launcher_cmd_part="SLURM_JOB_NAME=interactive ft_launcher "+\
-        f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg {additional_ft_launcher_args} "+\
-        "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST " +\
-        f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}"
+    ft_launcher_cmd_part = (
+        "SLURM_JOB_NAME=interactive ft_launcher "
+        + f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg {additional_ft_launcher_args} "
+        + "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST "
+        + f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}"
+    )

    for group_ind, command_group in enumerate(command_groups):
        if heterogeneous:
@@ -1098,9 +1102,7 @@ def _make_sbatch_string_ft_launcher(
            )
            command = ";\n  ".join(command_group)
            assert "python3 -u" in command
-            command = command.replace(
-                "python3 -u", ft_launcher_cmd_part,
-            )
+            command = command.replace("python3 -u", ft_launcher_cmd_part,)
            lines += [
                "",
                f"# command {group_ind + 1}",
                f'  {command} "',
                "",
            ]
-        lines += [
-            'if [ $? -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi'
-        ]
+        lines += ["if [ $? -ne 0 ]; then ANY_JOB_STEP_FAILED=1 ; fi"]

    if max_subsequent_job_failures > 0:
        lines += [
-            '',
+            "",
            '# Fix the job log entry ("JOB_ID X" -> "JOB_ID S/F"), depending on the job result',
            'if [ "$ANY_JOB_STEP_FAILED" = "0" ] ; then',
            '  sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID S/" "$JOB_RESULTS_FILE"',
-            'else',
+            "else",
            '  sed -i "s/$SLURM_JOB_ID X/$SLURM_JOB_ID F/" "$JOB_RESULTS_FILE"',
-            'fi',
-            '# Check if the continuation job can be cancelled',
-            'if is_training_finished ; then',
+            "fi",
+            "# Check if the continuation job can be cancelled",
+            "if is_training_finished ; then",
            '  echo "Training is finished" ; scancel $CONT_SLURM_JOB_ID ; exit 0',
-            'fi',
-            'if is_job_failures_limit_reached ; then',
+            "fi",
+            "if is_job_failures_limit_reached ; then",
            '  echo "Job failures limit reached ($MAX_JOB_FAILURES)" ; scancel $CONT_SLURM_JOB_ID ; exit 1',
-            'fi',
+            "fi",
        ]

    return "\n".join(lines)
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index 7b8770183e..724d003c1c 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -17,10 +17,13 @@
 import glob
 import json
 import logging
-import omegaconf
 import os
 import re
 import shutil
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import omegaconf
 from nemo_launcher.core.launchers import AutoLauncher
 from nemo_launcher.utils.data_utils.prepare_squad import (
     prepare_squad_for_fine_tuning,
@@ -28,8 +31,6 @@
 )
 from nemo_launcher.utils.job_utils import JobPaths
 from omegaconf import DictConfig, OmegaConf
-from pathlib import Path
-from typing import Any, Dict, List, Optional

 __LANGUAGE_MODELS_LIST__ = [
     "gpt3",
@@ -76,10 +77,12 @@ def __init__(self, cfg):
         self.stage_cfg = None
         self.setup_stage_vars(cfg)
         self.job_name = self.stage_cfg.run.get("name")
-        if self.cluster.lower() == 'bcm':
+        if self.cluster.lower() == "bcm":
             # this to ensure that submission filename (.sh) matches the config filename (.yaml)
             # expected result: _submission.sh, _hydra.yaml
-            self.job_name = cfg.get("cluster").get("job_name_prefix","") + self.job_name
+            self.job_name = (
+                cfg.get("cluster").get("job_name_prefix", "") + self.job_name
+            )
         self.nodes_scheduler = {}

     def setup_stage_vars(self, cfg: OmegaConf):
@@ -399,31 +402,37 @@ def _make_cluster_parameters(self, cluster: str) -> Dict:
                 }
             )

-        cluster_parameters = \
-            self._update_fault_tolerance_params(stage_cfg, cluster, cluster_parameters)
-        
+        cluster_parameters = self._update_fault_tolerance_params(
+            stage_cfg, cluster, cluster_parameters
+        )
+
         return cluster_parameters
-    
+
     def _get_fault_tol_config_section(self, stage_cfg, cluster):
         exp_man_conf = stage_cfg.get("exp_manager", dict())
-        use_ft = exp_man_conf.get('create_fault_tolerance_callback', False)
+        use_ft = exp_man_conf.get("create_fault_tolerance_callback", False)
         if use_ft:
             if cluster.lower() != "bcm":
-                raise ValueError(f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')")
-        return use_ft, exp_man_conf.get("fault_tolerance", dict()) 
+                raise ValueError(
+                    f"Fault tolerance requires 'bcm' cluster, but it's '{cluster}')"
+                )
+        return use_ft, exp_man_conf.get("fault_tolerance", dict())

     def _update_fault_tolerance_params(self, stage_cfg, cluster, cluster_parameters):
         use_ft, ft_conf = self._get_fault_tol_config_section(stage_cfg, cluster)
         cluster_parameters["use_fault_tolerance"] = use_ft
         if use_ft:
-            cluster_parameters["max_rank_restarts"] = \
-                ft_conf.get('max_rank_restarts', 0)
-            cluster_parameters["max_subsequent_job_failures"] = \
-                ft_conf.get('max_subsequent_job_failures', 0)
-            cluster_parameters["additional_ft_launcher_args"] = \
-                ft_conf.get('additional_ft_launcher_args', "")
+            cluster_parameters["max_rank_restarts"] = ft_conf.get(
+                "max_rank_restarts", 0
+            )
+            cluster_parameters["max_subsequent_job_failures"] = ft_conf.get(
+                "max_subsequent_job_failures", 0
+            )
+            cluster_parameters["additional_ft_launcher_args"] = ft_conf.get(
+                "additional_ft_launcher_args", ""
+            )
         return cluster_parameters
-    
+
     def _find_optimal_nodes(self, cfg, gpus) -> None:
         nodes_scheduler_path = (
             f"{cfg.get('training').get('run').get('results_dir')}/nodes_scheduler.json"
diff --git a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py
index 8a9f7a1d70..d22a69834d 100644
--- a/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py
+++ b/launcher_scripts/tests/unit_tests/config_tests/test_fault_tol_config.py
@@ -72,13 +72,14 @@ def test_fault_tol_config_with_bcm():
     cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml")
     cfg.training_config = "gpt3/126m"
     cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml")
-    cfg.training.exp_manager.create_fault_tolerance_callback=True
+    cfg.training.exp_manager.create_fault_tolerance_callback = True
     cfg.training.exp_manager.fault_tolerance = OmegaConf.create(
         {"max_subsequent_job_failures": 1}
     )
     stage = Training(cfg)
     _ = stage.run()

+
 def test_fault_tol_config_with_bcm_no_ft_section():
     """ Fault tolerance + BCM cluster, no "fault_tolerance" section in cfg, should be fine """
     cfg = OmegaConf.load("conf/config.yaml")
@@ -89,7 +90,7 @@ def test_fault_tol_config_with_bcm_no_ft_section():
     cfg.cluster = OmegaConf.load("conf/cluster/bcm.yaml")
     cfg.training_config = "gpt3/126m"
     cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml")
-    cfg.training.exp_manager.create_fault_tolerance_callback=True
+    cfg.training.exp_manager.create_fault_tolerance_callback = True
     stage = Training(cfg)
     _ = stage.run()

@@ -104,8 +105,7 @@ def test_fault_tol_config_with_bcp():
     cfg.cluster = dict()
     cfg.training_config = "gpt3/126m"
     cfg.training = OmegaConf.load("conf/training/gpt3/126m.yaml")
-    cfg.training.exp_manager.create_fault_tolerance_callback=True
+    cfg.training.exp_manager.create_fault_tolerance_callback = True
     with pytest.raises(ValueError):
         stage = Training(cfg)
         _ = stage.run()
-

From 9a9accf9070403edd8f4d1ebe18118bbae066796 Mon Sep 17 00:00:00 2001
From: Maanu Grover
Date: Tue, 4 Jun 2024 13:44:32 -0700
Subject: [PATCH 37/39] fix unit tests

Signed-off-by: Maanu Grover
---
 launcher_scripts/nemo_launcher/core/launchers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index 2fb22bd526..861f263a7d 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -371,7 +371,7 @@ def __init__(
         self,
         folder: Union[Path, str],
         job_name: str,
-        use_fault_tolerance: bool,
+        use_fault_tolerance: bool = False,
         **kwargs: Any,
     ) -> None:
         super().__init__(folder, job_name)
From eb48956d368dfd4594978e3f1fa5db1531c6f8f2 Mon Sep 17 00:00:00 2001
From: Maanu Grover
Date: Wed, 5 Jun 2024 15:39:03 -0700
Subject: [PATCH 38/39] remove examples

Signed-off-by: Maanu Grover
---
 .../run_gpt_on_draco_rno_FT.txt               | 59 --------------
 examples/fault_tolerance/run_on_cluster.sh    | 76 -------------------
 .../fault_tolerance/run_sc2_3b_on_eos_FT.txt  | 35 ---------
 3 files changed, 170 deletions(-)
 delete mode 100644 examples/fault_tolerance/run_gpt_on_draco_rno_FT.txt
 delete mode 100755 examples/fault_tolerance/run_on_cluster.sh
 delete mode 100644 examples/fault_tolerance/run_sc2_3b_on_eos_FT.txt

[The bodies of the three deleted example scripts are elided here. They were Hydra launch recipes
 for main.py that enabled fault tolerance via ++training.exp_manager.create_fault_tolerance_callback
 and ++training.exp_manager.fault_tolerance.* overrides: a gpt3 5b run on draco-rno, a generic
 run_on_cluster.sh wrapper, and a starcoder2_3b run on eos with optional simulated faults.]
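The deleted examples boiled down to a handful of Hydra overrides passed to the launcher. A minimal
sketch of that usage is shown below; the model choice, paths and values are placeholders rather than
a tested command line, only the override names come from the removed scripts and the launcher code:

    # assumes a NeMo-Megatron-Launcher checkout at $LAUNCHER_DIR and a 'bcm' (Slurm) cluster config
    HYDRA_FULL_ERROR=1 python3 ${LAUNCHER_DIR}/launcher_scripts/main.py \
        training=gpt3/126m \
        stages=["training"] \
        launcher_scripts_path=${LAUNCHER_DIR}/launcher_scripts \
        training.exp_manager.resume_if_exists=True \
        ++training.exp_manager.create_fault_tolerance_callback=True \
        ++training.exp_manager.fault_tolerance.max_subsequent_job_failures=3 \
        ++training.exp_manager.fault_tolerance.max_rank_restarts=0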
From df4232261e5a7d867aa5b27c81cb7c9dcd989658 Mon Sep 17 00:00:00 2001
From: Jacek Bieniusiewicz
Date: Mon, 1 Jul 2024 13:55:39 +0200
Subject: [PATCH 39/39] Final cleanup

Signed-off-by: Jacek Bieniusiewicz
---
 launcher_scripts/nemo_launcher/core/launchers.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py
index 3df38215a1..6a7cecda1b 100755
--- a/launcher_scripts/nemo_launcher/core/launchers.py
+++ b/launcher_scripts/nemo_launcher/core/launchers.py
@@ -980,7 +980,7 @@ def _make_sbatch_string_ft_launcher(
     # now create
     lines = ["#!/bin/bash", "", "# Parameters"]
     if heterogeneous:
-        raise ValueError("This PoC does not support heterogeneous jobs")
+        raise ValueError("Fault tolerance is not supported with heterogeneous jobs.")
     else:
         # run 1 FT launcher per node, it will spawn the actual tasks
         parameters["ntasks_per_node"] = 1
@@ -988,8 +988,6 @@ def _make_sbatch_string_ft_launcher(
     for k in sorted(parameters):
         lines.append(_as_sbatch_flag(k, parameters[k]))
     parameters["ntasks_per_node"] = ntasks_per_node

-    lines += ["", "# This script uses experimental fault tolerance launcher", ""]
-
     # environment setup:
     if setup is not None:
         lines += ["", "# setup"] + setup
@@ -1071,16 +1069,8 @@ def _make_sbatch_string_ft_launcher(
        "",
    ]

-    # Fault tolerance uses Torch Elastic based launcher with SLURM.
-    # Torch Lightning does not handle that case correctly,
-    # so we need to force TorchElasticEnvironment over SLURMEnvironment.
-    # We do this by setting SLURM_JOB_NAME=interactive.
-    # This is a temporary workaround, until the following PR is merged with NeMo
-    # https://github.com/Lightning-AI/pytorch-lightning/pull/18618
-    # --ignore-missing-fault-tol-cfg is used so FT launcher can handle NeMo YAML without fault_tolerance section
-    # in such case default FT config will be used
    ft_launcher_cmd_part = (
-        "SLURM_JOB_NAME=interactive ft_launcher "
+        "ft_launcher "
        + f"--fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg {additional_ft_launcher_args} "
        + "--rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST "
        + f"--nnodes={nodes} --nproc_per_node={ntasks_per_node} --max-restarts={max_rank_restarts}"
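After this cleanup, the sbatch script generated per node wraps the training entry point in
ft_launcher by replacing its "python3 -u" prefix. An illustrative expansion is shown below; the node
and process counts and the training script path are made up, only the flags come from the code above:

    # roughly what ft_launcher_cmd_part expands to for a hypothetical 2-node x 8-GPU job
    ft_launcher \
        --fault-tol-cfg-path=$FAULT_TOL_CFG_PATH --ignore-missing-fault-tol-cfg \
        --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST \
        --nnodes=2 --nproc_per_node=8 --max-restarts=0 \
        /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py ...  # hypothetical script; it takes the place of "python3 -u <script>"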